In [31]:
#Initialization

import numpy as np
import pandas as pd
import os
import re
#os.getcwd()

In [2]:
#Build a recommendation system based on ratings from users
file_path = 'C:/Users/User/Machine Learning/Movie_Rank_Recommendation_Lite/ratings_lite.csv'
ratings = pd.read_csv(file_path)

#Make sure movie id is string for primary key to join df
ratings['movieId'] = ratings['movieId'].astype(str)
ratings.head(5)

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [3]:
file_path1 = 'C:/Users/User/Machine Learning/Movie_Rank_Recommendation_Lite/movies.csv'
movie = pd.read_csv(file_path1)
movie['movieId'] = movie['movieId'].astype(str)
#movie = movie.merge(ratings, how = 'inner', on = 'movieId')
movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
#Clean the title of the movies
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [5]:
movie["clean_title"] = movie["title"].apply(clean_title)
movie.head(5)

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [6]:
#Generate a list of watched movie
mymovie = movie.sample(n = 520)
movie_set = set(mymovie['movieId'])

In [7]:
#Find similar users
overlap_users = {}
for i in range(ratings.shape[0]):
    if ratings['movieId'][i] in movie_set:
        if ratings['userId'][i] not in overlap_users:
            overlap_users[ratings['userId'][i]] = 1
        else:
            overlap_users[ratings['userId'][i]] += 1


In [8]:
len(overlap_users)

2847

In [9]:
#Further find only users who have read some of the same books (more than 5%) as us
filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > len(movie_set)*0.05])
filtered_overlap_users

{847, 997, 1748, 2177, 5114}

In [10]:
#Get the movie id and ratings from the overlap users
interactions = ratings[(ratings["userId"].isin(filtered_overlap_users))][["userId", "movieId", "rating"]]
interactions

Unnamed: 0,userId,movieId,rating
113099,847,1,4.0
113100,847,6,4.5
113101,847,7,3.0
113102,847,10,2.5
113103,847,16,2.5
...,...,...,...
753127,5114,204542,2.5
753128,5114,204692,1.5
753129,5114,204698,2.0
753130,5114,205279,3.0


In [11]:
#Get my movie ratings
my_movie_ratings = ratings[(ratings["movieId"].isin(movie_set))][["movieId", "rating"]]
myuserId = pd.Series(-1, index = range(len(my_movie_ratings)))
my_movie_ratings = my_movie_ratings.merge(myuserId.rename("userId"), left_index = True, right_index = True)
my_movie_ratings = my_movie_ratings.reindex(columns = ['userId', 'movieId', 'rating'])
my_movie_ratings.head(5)

Unnamed: 0,userId,movieId,rating
62,-1,8786,4.0
175,-1,3114,4.5
245,-1,31923,3.5
780,-1,95441,3.5
830,-1,110102,4.0


In [12]:
#Adding own movie ratings into the ratings list
interactions = pd.concat([my_movie_ratings[["userId", "movieId", "rating"]], interactions])
interactions['userId'] = interactions['userId'].astype(str)
interactions['movieId'] = interactions['movieId'].astype(str)
interactions['rating'] = pd.to_numeric(interactions['rating'])

In [13]:
#Want each user id to correspond to single row in the collaboration filter matrix
#Assigne a number to each category (user id AND movie id)
interactions['user_index'] = interactions['userId'].astype('category').cat.codes
interactions['movie_index'] = interactions['movieId'].astype('category').cat.codes

In [14]:
from scipy.sparse import coo_matrix
import scipy.sparse as sparse

#Memory efficient using sparse matrix
ratings_mat_coo = coo_matrix((interactions['rating'], (interactions['user_index'], interactions['movie_index'])))
ratings_mat_coo

<6x8909 sparse matrix of type '<class 'numpy.float64'>'
	with 15409 stored elements in COOrdinate format>

In [15]:
ratings_mat = ratings_mat_coo.tocsr()
print(ratings_mat.toarray())

[[0.  0.  0.  ... 0.  0.  0. ]
 [4.5 4.5 3.  ... 0.  0.  0. ]
 [4.  4.  4.  ... 0.  0.  0. ]
 [3.5 0.  0.  ... 2.5 0.  0. ]
 [4.  2.5 3.5 ... 4.5 0.  0. ]
 [4.5 3.  0.  ... 0.  3.5 3. ]]


In [16]:
interactions[interactions['userId'] == '-1']

Unnamed: 0,userId,movieId,rating,user_index,movie_index
62,-1,8786,4.0,0,8327
175,-1,3114,4.5,0,3618
245,-1,31923,3.5,0,3724
780,-1,95441,3.5,0,8758
830,-1,110102,4.0,0,349
841,-1,112370,3.0,0,412
864,-1,122906,4.0,0,680
904,-1,176101,3.5,0,1894
950,-1,3114,3.0,0,3618
1030,-1,72226,4.0,0,7400


In [17]:
my_index = 0

In [18]:
#Quantify how similar the other users to us
#Get movie recommendations from them (collaborating with similar users)
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()
similarity

array([1.        , 0.03400318, 0.03237031, 0.04827243, 0.03846844,
       0.05977109])

In [19]:
#Get user id for the users most similar to us
if (len(similarity) - 1) % 2 == 0:
    number = len(similarity - 1)
else:
    number = len(similarity)
display(number)
#indices = np.argpartition(similarity, -1*(number / 2))[(number / 2):]
#n users that are most similar to us
round_num = round(number/2)
indices = np.argpartition(similarity, -round_num)[round_num:]

similar_users = interactions[interactions['user_index'].isin(indices)].copy()
similar_users = similar_users[similar_users['userId'] != '-1']

6

In [20]:
indices

array([3, 5, 0], dtype=int64)

In [21]:
interactions

Unnamed: 0,userId,movieId,rating,user_index,movie_index
62,-1,8786,4.0,0,8327
175,-1,3114,4.5,0,3618
245,-1,31923,3.5,0,3724
780,-1,95441,3.5,0,8758
830,-1,110102,4.0,0,349
...,...,...,...,...,...
753127,5114,204542,2.5,3,2459
753128,5114,204692,1.5,3,2461
753129,5114,204698,2.0,3,2462
753130,5114,205279,3.0,3,2472


In [22]:
similar_users

Unnamed: 0,userId,movieId,rating,user_index,movie_index
140679,997,1,4.5,5,0
140680,997,2,3.5,5,2389
140681,997,5,3.0,5,5500
140682,997,6,4.5,5,6396
140683,997,10,3.0,5,1
...,...,...,...,...,...
753127,5114,204542,2.5,3,2459
753128,5114,204692,1.5,3,2461
753129,5114,204698,2.0,3,2462
753130,5114,205279,3.0,3,2472


In [23]:
#Find how many times each movies appear in this recommendation and the its mean rating
movie_recs = similar_users.groupby('movieId').rating.agg(['count' , 'mean'])
movie_recs

Unnamed: 0_level_0,count,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,4.0
10,1,3.0
100044,1,3.5
100075,1,3.5
100083,2,2.5
...,...,...
99910,1,3.0
99912,1,2.0
99917,1,2.5
99957,1,3.5


In [24]:
#Get movie title to the recommendation list
movie_recs = movie_recs.merge(movie, how = 'inner', on = 'movieId')
movie_recs

Unnamed: 0,movieId,count,mean,title,genres,clean_title
0,1,2,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,10,1,3.0,GoldenEye (1995),Action|Adventure|Thriller,GoldenEye 1995
2,100044,1,3.5,Human Planet (2011),Documentary,Human Planet 2011
3,100075,1,3.5,Choose (2010),Crime|Drama|Horror,Choose 2010
4,100083,2,2.5,Movie 43 (2013),Comedy,Movie 43 2013
...,...,...,...,...,...,...
3997,99910,1,3.0,"Last Stand, The (2013)",Action|Crime|Thriller,Last Stand The 2013
3998,99912,1,2.0,Mama (2013),Horror,Mama 2013
3999,99917,1,2.5,Upstream Color (2013),Romance|Sci-Fi|Thriller,Upstream Color 2013
4000,99957,1,3.5,Broken City (2013),Crime|Drama|Thriller,Broken City 2013


In [25]:
#Find the number of times a movie is rated
total_ratings_per_movie = ratings.groupby('movieId').rating.agg(['count'])
total_ratings_per_movie.head(5)

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,2465
10,1173
100,162
1000,10
100008,1


In [26]:
total_ratings_per_movie.shape

(22240, 1)

In [27]:
movie_recs = movie_recs.merge(total_ratings_per_movie, how = 'inner', on = 'movieId')
movie_recs.rename(columns={'count_x': 'count', 'count_y': 'rating'}, inplace = True)
movie_recs

Unnamed: 0,movieId,count,mean,title,genres,clean_title,rating
0,1,2,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995,2465
1,10,1,3.0,GoldenEye (1995),Action|Adventure|Thriller,GoldenEye 1995,1173
2,100044,1,3.5,Human Planet (2011),Documentary,Human Planet 2011,20
3,100075,1,3.5,Choose (2010),Crime|Drama|Horror,Choose 2010,1
4,100083,2,2.5,Movie 43 (2013),Comedy,Movie 43 2013,34
...,...,...,...,...,...,...,...
3997,99910,1,3.0,"Last Stand, The (2013)",Action|Crime|Thriller,Last Stand The 2013,33
3998,99912,1,2.0,Mama (2013),Horror,Mama 2013,51
3999,99917,1,2.5,Upstream Color (2013),Romance|Sci-Fi|Thriller,Upstream Color 2013,29
4000,99957,1,3.5,Broken City (2013),Crime|Drama|Thriller,Broken City 2013,10


In [28]:
#Ranking recommendations
movie_recs['adjusted_count'] = movie_recs['count'] * (movie_recs['count'] / movie_recs['rating'])

In [29]:
movie_recs['score'] = movie_recs['mean'] * movie_recs['adjusted_count']
#Take out any movie we have already watched
movie_recs = movie_recs[~movie_recs['movieId'].isin(mymovie['movieId'])]
#Take out any movie title too because this is not the cleanest data, different movies might have the same movie id
movie_recs = movie_recs[~movie_recs['clean_title'].isin(mymovie['clean_title'])]
#At least 2 users who are similar to us had watch the movie and like it in order to put into the movie reco list
movie_recs = movie_recs[movie_recs['count'] > 1]
movie_recs = movie_recs[movie_recs['mean'] > 3]

top_movie_recs = movie_recs.sort_values('score', ascending = False)
top_movie_recs

Unnamed: 0,movieId,count,mean,title,genres,clean_title,rating,adjusted_count,score
3177,68288,2,3.75,"Informers, The (2008)",Crime|Drama|Thriller,Informers The 2008,3,1.333333,5.000000
3245,70015,2,3.25,Polytechnique (2009),Crime|Drama,Polytechnique 2009,5,0.800000,2.600000
1015,168288,2,3.25,The Belko Experiment (2017),Action|Horror|Thriller,The Belko Experiment 2017,8,0.500000,1.625000
233,111235,2,4.25,Jodorowsky's Dune (2013),Documentary|Sci-Fi,Jodorowskys Dune 2013,14,0.285714,1.214286
3977,98239,2,3.25,Red Dawn (2012),Action|War,Red Dawn 2012,15,0.266667,0.866667
...,...,...,...,...,...,...,...,...,...
213,110,2,3.50,Braveheart (1995),Action|Drama|War,Braveheart 1995,2516,0.001590,0.005564
1774,260,2,4.00,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,Star Wars Episode IV A New Hope 1977,2931,0.001365,0.005459
1927,296,2,4.00,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,Pulp Fiction 1994,3418,0.001170,0.004681
2127,356,2,3.75,Forrest Gump (1994),Comedy|Drama|Romance|War,Forrest Gump 1994,3518,0.001137,0.004264


In [30]:
top_movie_recs[top_movie_recs['score'] > 1]

Unnamed: 0,movieId,count,mean,title,genres,clean_title,rating,adjusted_count,score
3177,68288,2,3.75,"Informers, The (2008)",Crime|Drama|Thriller,Informers The 2008,3,1.333333,5.0
3245,70015,2,3.25,Polytechnique (2009),Crime|Drama,Polytechnique 2009,5,0.8,2.6
1015,168288,2,3.25,The Belko Experiment (2017),Action|Horror|Thriller,The Belko Experiment 2017,8,0.5,1.625
233,111235,2,4.25,Jodorowsky's Dune (2013),Documentary|Sci-Fi,Jodorowskys Dune 2013,14,0.285714,1.214286
