In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the raw ratings table; the path is relative to the notebook's working directory.
ratings_df = pd.read_csv('../ratings.csv')

In [3]:
# Preview the first rows to check which columns were loaded and how values are formatted.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1.0,1.0,4.0,964982703
1,1.0,3.0,4.0,964981247
2,1.0,6.0,4.0,964982224
3,1.0,47.0,5.0,964983815
4,1.0,50.0,5.0,964982931


In [4]:
# Summary statistics for every numeric column.
# NOTE(review): in the recorded output, `rating` ranges from -8 to 99 and `userId` has fewer
# non-null values than the other columns. Presumably the extreme ratings are invalid entries;
# confirm the valid rating scale and filter them, since they widen the per-user (max - min) range
# used for normalization later on.
ratings_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,99032.0,100836.0,100836.0,100836.0
mean,327.736368,19435.295718,3.752177,1205946000.0
std,182.869519,35530.987199,4.306037,216261000.0
min,1.0,1.0,-8.0,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,328.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,99.0,1537799000.0


In [5]:
# Count missing values per column. Ratings without a user id cannot be attributed
# to anyone, so those rows are unusable for per-user normalization.
ratings_df.isnull().sum()

userId       1804
movieId         0
rating          0
timestamp       0
dtype: int64

In [6]:
# Report how many distinct movies exist before and after dropping rows that
# contain any missing value, to see how many titles the cleanup removes.
movies_before = ratings_df["movieId"].nunique()
print(f'there are {movies_before} movies before cleanup')

ratings_df.dropna(inplace=True)

movies_after = ratings_df["movieId"].nunique()
print(f'there are {movies_after} movies AFTER cleanup')

there are 9724 movies before cleanup
there are 9696 movies AFTER cleanup


In [7]:
# Persist the ids of the movies that still have ratings after cleanup.
# Each id is written once (its first occurrence) instead of once per rating row;
# the file layout (unnamed index column followed by 'movieId') is unchanged.
# TODO: an id-only file (index=False) would be cleaner; confirm how downstream
# readers parse this file before changing the layout.
ratings_df['movieId'].drop_duplicates().to_csv('../usable_movie_ids.csv', index=True)

In [8]:
def normalize(x):
    """Mean-center a group of ratings and scale them by the group's rating range.

    Parameters
    ----------
    x : pandas.Series
        Numeric ratings belonging to one group (one user when used with
        ``groupby('userId')['rating'].transform``).

    Returns
    -------
    pandas.Series or float
        ``(x - x.mean()) / (x.max() - x.min())`` aligned with ``x``'s index.
        The scalar ``0.0`` is returned when the range is zero or undefined
        (a single rating, all ratings equal, or an empty group), because the
        scaled value cannot be computed in that case.
    """
    x = x.astype(float)
    spread = x.max() - x.min()

    # Guard on the actual divisor. `not (spread > 0)` is also true when the
    # spread is NaN (empty group), so no division by zero/NaN can happen.
    if not spread > 0:
        return 0.0

    # Use the true mean over all ratings; counting only non-zero entries would
    # skew the mean whenever a rating of 0 is present.
    return (x - x.mean()) / spread

In [9]:
# Normalize every rating relative to the user who gave it: `normalize` is applied
# to each user's ratings (mean-centering, scaled by that user's max - min) and the
# result is stored row-aligned in the 'avg' column.
ratings_by_user = ratings_df.groupby('userId')['rating']
ratings_df['avg'] = ratings_by_user.transform(normalize)

In [10]:
# Make both id columns categorical so that each distinct id gets a dense integer code.
ratings_df = ratings_df.astype({'userId': 'category', 'movieId': 'category'})

In [11]:
# Re-inspect the frame: it now carries the per-user normalized rating in the 'avg' column.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,avg
0,1.0,1.0,4.0,964982703,-0.091595
1,1.0,3.0,4.0,964981247,-0.091595
2,1.0,6.0,4.0,964982224,-0.091595
3,1.0,47.0,5.0,964983815,0.158405
4,1.0,50.0,5.0,964982931,0.158405


In [12]:
# Build a sparse movie-by-user matrix of normalized ratings.
# 'avg' holds the rating normalized per user, so the same star value can map to
# different numbers for different users (e.g. 3 stars may be 0.3 for user A but
# 0.5 for user B).
# `.cat.codes` gives, for every row of the frame, the integer code of that row's
# category value; the codes serve as row (movie) and column (user) indices.
movie_codes = ratings_df['movieId'].cat.codes.copy()
user_codes = ratings_df['userId'].cat.codes.copy()
normalized_ratings = ratings_df['avg'].astype(float)

coo = coo_matrix((normalized_ratings, (movie_codes, user_codes)))

In [13]:
# Matrix dimensions: rows are indexed by movie codes, columns by user codes.
coo.shape

(9696, 590)

In [15]:
# Pairwise cosine similarity between the rows of `coo`, i.e. between the movies'
# normalized-rating vectors across users.
movie_similarity = cosine_similarity(coo)

In [16]:
# Sanity check: the movie-to-movie similarity matrix should be square, with both
# dimensions equal to the number of movies.
movie_similarity.shape, movie_similarity.dtype

((9696, 9696), dtype('float64'))

In [17]:
# `cat.categories` holds the actual movie id values behind the category codes,
# so using it for both axes labels every row and column of the similarity
# matrix with its movie id.
movie_ids = ratings_df['movieId'].cat.categories
similarity_df = pd.DataFrame(movie_similarity, index=movie_ids, columns=movie_ids)

In [20]:
# Persist the labeled similarity table for later use.
# NOTE: pickle files should only ever be loaded from trusted sources.
similarity_df.to_pickle('../similarity_df.pkl')