In [6]:
## Imports
# Data processing
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Read datasets

users_colnames=['userId', 'gender', 'age', 'profession']
movies_colnames=['movieId', 'year', 'title']
ratings_colnames=['userId', 'movieId', 'rating']

users = pd.read_csv('cse2525-reccommender-systems-challenge/users.csv', sep=';', names=users_colnames)
movies = pd.read_csv('cse2525-reccommender-systems-challenge/movies_v2.csv', sep=';', names=movies_colnames) # changed line 3601 from dataset : ';' -> '_'
ratings = pd.read_csv('cse2525-reccommender-systems-challenge/ratings.csv', sep=';', names=ratings_colnames)

# Check
users.head()
movies.head()
ratings.head()


Unnamed: 0,movieId,year,title
0,1,1995,Toy_Story_(1995)
1,2,1995,Jumanji_(1995)
2,3,1995,Grumpier_Old_Men_(1995)
3,4,1995,Waiting_to_Exhale_(1995)
4,5,1995,Father_of_the_Bride_Part_II_(1995)


In [50]:
# Count the # of ratings per movie
ratings_by_movie_id = ratings.groupby(['movieId'])['movieId']\
    .count()\
    .reset_index(name='counts')

ratings_by_movie_id.head()


Unnamed: 0,movieId,counts
0,1,1896
1,2,635
2,3,443
3,4,155
4,5,270


In [97]:
# Number of rows remaining after saving only the ones that have `>= threshold` ratings
movies_threshold = 100
ratings_by_movie_id_processed = ratings_by_movie_id[ratings_by_movie_id['counts'] >= movies_threshold] # drop in another variable
len(ratings_by_movie_id_processed)


1940

In [98]:
# Titles of the most popular (in # of ratings) movies
relevant_movie_ids = ratings_by_movie_id.sort_values(by='counts', ascending=False)['movieId'].array
# relevant_movie_ids.array
# type(relevant_movie_ids.array)
joined_ids = list(set(movies['movieId']) & set(relevant_movie_ids))
joined_ids = [x - 1 for x in joined_ids] # 0-indexed
movies.loc[joined_ids]

Unnamed: 0,movieId,year,title
0,1,1995,Toy_Story_(1995)
1,2,1995,Jumanji_(1995)
2,3,1995,Grumpier_Old_Men_(1995)
3,4,1995,Waiting_to_Exhale_(1995)
4,5,1995,Father_of_the_Bride_Part_II_(1995)
...,...,...,...
3689,3690,1943,"Phantom_of_the_Opera,_The_(1943)"
3690,3691,1984,Runaway_(1984)
3701,3702,2000,Meet_the_Parents_(2000)
3702,3703,2000,Requiem_for_a_Dream_(2000)


In [93]:
# Create utility (user-movie) matrix
# rows - users, cols - movies
matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')
matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,3697,3698,3699,3700,3701,3702,3703,3704,3705,3706
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [96]:
# Statistics on ratings values

# Number of users
print('The ratings dataset has', ratings['userId'].nunique(), 'unique users')# Number of movies
print('The ratings dataset has', ratings['movieId'].nunique(), 'unique movies')# Number of ratings
print('The ratings dataset has', ratings['rating'].nunique(), 'unique ratings')# List of unique ratings
print('The unique ratings are', sorted(ratings['rating'].unique()))

The ratings dataset has 6040 unique users
The ratings dataset has 3695 unique movies
The ratings dataset has 5 unique ratings
The unique ratings are [1, 2, 3, 4, 5]
