In [1]:
# import the movielens small dataset from ./ml-latest-small/ml-latest-small/ratings.csv
import pandas as pd

ratings = pd.read_csv('ml-latest-small/ml-latest-small/ratings.csv')

print(ratings.head())
print(ratings.shape)

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
(100836, 4)


In [2]:
# import the movies.csv file for movie names
movies = pd.read_csv('ml-latest-small/ml-latest-small/movies.csv')

print(movies.head())
print(movies.shape)

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
(9742, 3)


In [3]:
# import the imdb links from the links.csv file
links = pd.read_csv('ml-latest-small/ml-latest-small/links.csv')

print(links.head())
print(links.shape)

   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0
(9742, 3)


In [4]:
# temporarily create a list of imdb ids to use for recommendation (liked movies), ironman, avengers, and dark knight
liked_movies = ['tt0371746', 'tt0848228', 'tt0468569']

# create a list of movie ids for the liked movies
liked_movie_ids = []
for movie in liked_movies:
    liked_movie_ids.append(links[links['imdbId'] == int(movie[2:])]['movieId'].values[0])

print(liked_movie_ids)

[59315, 89745, 58559]


In [5]:
# Analysis of the data

print("There are {} unique users in the ratings list".format(ratings['userId'].nunique()))
print("There are {} unique movies in the ratings list".format(ratings['movieId'].nunique()))
print("There are {} ratings in the ratings list".format(ratings.shape[0]))

There are 610 unique users in the ratings list
There are 9724 unique movies in the ratings list
There are 100836 ratings in the ratings list


In [6]:
# construct the user-item matrix
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

# normalize ratings from 1-5 to -2.5 to 2.5, replace NaN with 0
user_item_matrix = user_item_matrix.apply(lambda x: (x - 2.5) / 2.5).fillna(0)

print(user_item_matrix.head())

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           0.6     0.0     0.6     0.0     0.0     0.6     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           0.6     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

In [7]:
# convert the user-item matrix to a numpy array with mappings for user and movie ids
class Mapping:
    def __init__(self, ids):
        self.ids = ids
        self.id_to_index = {id: index for index, id in enumerate(ids)}
        self.index_to_id = {index: id for index, id in enumerate(ids)}

    def get_index(self, id):
        return self.id_to_index[id]
    
    def get_id(self, index):
        return self.index_to_id[index]
    
    def get_indices(self, ids):
        return [self.id_to_index[id] for id in ids]
    
    def get_ids(self, indices):
        return [self.index_to_id[index] for index in indices]

mapping_user = Mapping(user_item_matrix.index)
mapping_movie = Mapping(user_item_matrix.columns)

user_item_matrix = user_item_matrix.values
print(user_item_matrix)

[[ 0.6  0.   0.6 ...  0.   0.   0. ]
 [ 0.   0.   0.  ...  0.   0.   0. ]
 [ 0.   0.   0.  ...  0.   0.   0. ]
 ...
 [ 0.  -0.2 -0.2 ...  0.   0.   0. ]
 [ 0.2  0.   0.  ...  0.   0.   0. ]
 [ 1.   0.   0.  ...  0.   0.   0. ]]


In [8]:
# create a user's interaction based on the liked movies using liked_movie_ids

import numpy as np

user_interaction = np.zeros(user_item_matrix.shape[1])

for movie_id in liked_movie_ids:
    user_interaction[mapping_movie.get_index(movie_id)] = 2.5

print(user_interaction)

[0. 0. 0. ... 0. 0. 0.]


In [9]:
# simple knn model solution
from sklearn.metrics.pairwise import cosine_similarity

# calculate the cosine similarity between the user-item matrix (item/movie similarity)
cosine_sim_model = cosine_similarity(user_item_matrix.T)

In [10]:
# calculate the scores for the user interaction
scores = cosine_sim_model.dot(user_interaction)

# sort the scores and get the top 10 movie ids
top_10_indices = scores.argsort()[-10:][::-1]

# get the top 10 movie ids
top_10_movie_ids = mapping_movie.get_ids(top_10_indices)

# get the top 10 movie names
top_10_movie_names = movies[movies['movieId'].isin(top_10_movie_ids)]['title'].values

print(top_10_movie_names)


['Dark Knight, The (2008)' 'Iron Man (2008)' 'WALL·E (2008)'
 'Iron Man 2 (2010)' 'Inception (2010)' 'X-Men: First Class (2011)'
 'Avengers, The (2012)' 'Dark Knight Rises, The (2012)'
 'Iron Man 3 (2013)' 'Guardians of the Galaxy (2014)']


In [11]:
# we can see above that the top 10 contains the liked movies, to avoid this we can give the liked movies a score of -1 so that they are not recommended
for movie_id in liked_movie_ids:
    scores[mapping_movie.get_index(movie_id)] = -1

# sort the scores and get the top 10 movie ids
top_10_indices = scores.argsort()[-10:][::-1]

top_10_movie_ids = mapping_movie.get_ids(top_10_indices)
top_10_movie_names = movies[movies['movieId'].isin(top_10_movie_ids)]['title'].values

print(top_10_movie_names)

['Batman Begins (2005)' 'WALL·E (2008)' 'Star Trek (2009)' 'Up (2009)'
 'Iron Man 2 (2010)' 'Inception (2010)' 'X-Men: First Class (2011)'
 'Dark Knight Rises, The (2012)' 'Iron Man 3 (2013)'
 'Guardians of the Galaxy (2014)']


In [12]:
# simple pearson correlation model solution
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

# fill NaN with 0
user_item_matrix = user_item_matrix.fillna(0)

# calculate the pearson correlation between the user-item matrix using corrcoef (item/movie similarity)
pearson_corr_model = np.corrcoef(user_item_matrix.T)


In [13]:
# get the scores, remove the liked movies, and get the top 10 movie names
scores = pearson_corr_model.dot(user_interaction)
for movie_id in liked_movie_ids:
    scores[mapping_movie.get_index(movie_id)] = -1
top_10_indices = scores.argsort()[-10:][::-1]
top_10_movie_ids = mapping_movie.get_ids(top_10_indices)
top_10_movie_names = movies[movies['movieId'].isin(top_10_movie_ids)]['title'].values

print(top_10_movie_names)

['WALL·E (2008)' 'Up (2009)' 'Avatar (2009)' 'Iron Man 2 (2010)'
 'Inception (2010)' 'Thor (2011)' 'X-Men: First Class (2011)'
 'Dark Knight Rises, The (2012)' 'Iron Man 3 (2013)'
 'Guardians of the Galaxy (2014)']


## Notes on Pearson Correlation vs Cosine Coefficient

### Cosine Similarity

Cosine similarity measures the angle between two vectors. The downside is that it does not account for the magnitude of the vectors. This means that if one user rates all movies very high and another user rates all movies very low, the cosine similarity will be very high. This is a problem for our specific use case because there are users that rate all movies very high or very low.

However, we can remedy this by subtracting the average rating of each user from all their ratings. This way, the magnitude of the vectors will be taken into account. This is called `centered cosine similarity`.

### Pearson Correlation

Pearson correlation coefficient is a measure of the strength and direction of association that exists between two continuous variables, which ranges from -1 to 1, where 1 means that there is a strong positive correlation between the variables, -1 means that there is a strong negative correlation between the variables, and 0 means that there is no correlation between the variables. However, Pearson correlation coefficient can be more sensitive to outliers than cosine similarity. This is because Pearson correlation coefficient is based on the mean of the data, which can be affected by outliers. This is probably not too much of an issue for us because we are dealing with ratings, which are not continuous variables.

### Conclusion

In conclusion, both cosine similarity and pearson correlation should work well for our case. This is evident in the result from our experiment above, where both models produced similar recommendations for the same user. So we are going to conduct a more thorough experiment to determine which model is better for our case, by comparing the RMSE of the two models.