# Project Goal

In this project, you will build a proof of concept: A web application that showcases different movie recommendation algorithms.

# Import the relevant packages and libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import NMF 
from sklearn.impute import KNNImputer

import pickle
import random
from sklearn.metrics.pairwise import cosine_similarity


# Load the datas

In [2]:
#load the movies table
movies = pd.read_csv('data/movies.csv')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
#load the ratings table
ratings = pd.read_csv('data/ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
#merge the tables in respect of movieId
ratings_movies = pd.merge(ratings,movies, how='left',on='movieId')

#save as csv file for further process
ratings_movies.to_csv('data/ratings_movies.csv', index=False)

ratings_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [5]:
#check the missing values
ratings_movies.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

# Implement a simple recommender

In [6]:
#Calculate the average rating for each movie in the dataset
ratings_movies.groupby(['title'])[['rating']].mean().sort_values(by='rating',ascending=False).round(2)

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Gena the Crocodile (1969),5.0
True Stories (1986),5.0
Cosmic Scrat-tastrophe (2015),5.0
Love and Pigeons (1985),5.0
Red Sorghum (Hong gao liang) (1987),5.0
...,...
Don't Look Now (1973),0.5
Journey 2: The Mysterious Island (2012),0.5
Joe Dirt 2: Beautiful Loser (2015),0.5
Jesus Christ Vampire Hunter (2001),0.5


In [7]:
#Filter out movies that have been watched by less than 20 users
movies_count = ratings_movies.groupby(['title'])[['movieId']].count()
less_than_20 = movies_count[movies_count['movieId']<20].sort_values(by='movieId')
less_than_20

Unnamed: 0_level_0,movieId
title,Unnamed: 1_level_1
'71 (2014),1
Lola Versus (2012),1
Lola Montès (1955),1
Logan Lucky (2017),1
Live Wire (1992),1
...,...
Virtuosity (1995),19
Grease 2 (1982),19
"Red Violin, The (Violon rouge, Le) (1998)",19
50/50 (2011),19


In [8]:
#top ten movies by mean of ratings
ratings_movies.groupby('title')[['rating']].mean().sort_values(by='rating',ascending=False)[:10]

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Gena the Crocodile (1969),5.0
True Stories (1986),5.0
Cosmic Scrat-tastrophe (2015),5.0
Love and Pigeons (1985),5.0
Red Sorghum (Hong gao liang) (1987),5.0
"Thin Line Between Love and Hate, A (1996)",5.0
Lesson Faust (1994),5.0
Eva (2011),5.0
Who Killed Chea Vichea? (2010),5.0
Siam Sunset (1999),5.0


In [9]:
#top ten movies by count of ratings
ratings_movies.groupby('title')[['rating']].count().sort_values(by='rating',ascending=False)[:10]

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Forrest Gump (1994),329
"Shawshank Redemption, The (1994)",317
Pulp Fiction (1994),307
"Silence of the Lambs, The (1991)",279
"Matrix, The (1999)",278
Star Wars: Episode IV - A New Hope (1977),251
Jurassic Park (1993),238
Braveheart (1995),237
Terminator 2: Judgment Day (1991),224
Schindler's List (1993),220


In [10]:
#merge these two tables to find the best movies which were seen a large number of people
ratings_movies_best = pd.DataFrame(ratings_movies.groupby('title')[['rating']].mean().round(2))
ratings_movies_best['count_rating'] = ratings_movies.groupby('title')[['rating']].count()
ratings_movies_best.rename(columns={'rating':'mean_rating'}, inplace=True)
ratings_movies_best.sort_values(['count_rating','mean_rating'], ascending=False)[:10]

Unnamed: 0_level_0,mean_rating,count_rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump (1994),4.16,329
"Shawshank Redemption, The (1994)",4.43,317
Pulp Fiction (1994),4.2,307
"Silence of the Lambs, The (1991)",4.16,279
"Matrix, The (1999)",4.19,278
Star Wars: Episode IV - A New Hope (1977),4.23,251
Jurassic Park (1993),3.75,238
Braveheart (1995),4.03,237
Terminator 2: Judgment Day (1991),3.97,224
Schindler's List (1993),4.22,220


In [11]:
#create a seen movie list for a given user
userId = 1
seen_movies_list = ratings_movies.loc[ratings_movies['userId'] == userId, :]['title'].to_list()

#create a best movie list which not seen by given userId
best_movie_list_not_seen = []
for i in ratings_movies_best.index.to_list():
    if i not in seen_movies_list:
        best_movie_list_not_seen.append(i)
    else:
        continue

In [12]:
#check the lists
len(best_movie_list_not_seen), len(seen_movies_list), len(ratings_movies_best)

(9487, 232, 9719)

In [13]:
#recommend the top ten movies that a user has not seen yet
ratings_movies_best.loc[best_movie_list_not_seen,:].sort_values(['count_rating','mean_rating'], ascending=False)[:10]


Unnamed: 0_level_0,mean_rating,count_rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Shawshank Redemption, The (1994)",4.43,317
Terminator 2: Judgment Day (1991),3.97,224
Apollo 13 (1995),3.85,201
"Lord of the Rings: The Fellowship of the Ring, The (2001)",4.11,198
"Godfather, The (1972)",4.29,192
"Lord of the Rings: The Two Towers, The (2002)",4.02,188
"Lord of the Rings: The Return of the King, The (2003)",4.12,185
Aladdin (1992),3.79,183
"Sixth Sense, The (1999)",3.89,179
True Lies (1994),3.5,178


In [14]:
movie_dict = pd.Series(ratings_movies.title, index=ratings_movies.movieId.values).to_dict()

In [15]:
def recommend_popular(query, ratings, k=10):
    '''The function gets a user query of rated movie-ids and the ratings table as input. It returns a list of k movie-ids.
    '''
    # 1.candidate generation
       
    # filter out movies that the user has already seen
    seen_movie_list = []
    for movie in query.keys():
        seen_movie_list.append(movie)
           
    # filter out movies that have been watched by less than 100 users
    ratings_2 = ratings.groupby('movieId')[['rating']].count()
    ratings_2 = ratings_2[ratings_2['rating'] >=100]
    movies_more_than_100 = ratings_2.index.to_list()
    
    #create a best movie list which have not seen before and watched by more than 100 users
    best_movie_list = []
    for i in movies_more_than_100:
        if i not in seen_movie_list:
            best_movie_list.append(i)
        else:
            continue
            
    ratings = ratings.reset_index().set_index('movieId')
    ratings_3 = ratings.loc[best_movie_list,:]
    
    # 2. scoring
    # calculate the average rating and wachted by users for each movie
    ratings_3 = ratings_3.reset_index()
    # merge them to a new dataframe
    ratings_4 =  pd.DataFrame(ratings_3.groupby('movieId')[['rating']].mean().round(2))
    ratings_4['count_rating'] = ratings_3.groupby('movieId')[['rating']].count()
    ratings_4.rename(columns={'rating':'mean_rating'}, inplace=True)
    
    # 3. ranking
    ratings_4.sort_values(['mean_rating','count_rating'], ascending=False)
    
    # return the top-k highest rated movie ids or titles
    return ratings_4.sort_values(['mean_rating','count_rating'], ascending=False)[:k]

In [16]:
#one recommend for given query
query = {12: 5, 234: 1}
recommend_popular(query,ratings_movies,k=5)

Unnamed: 0_level_0,mean_rating,count_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
318,4.43,317
858,4.29,192
2959,4.27,218
1221,4.26,129
1213,4.25,126


# Collaborative Filtering with Non Negative Matrix Factorisation

## Fill the missing values

In [17]:
#pivot the ratings_movies table
ratings_movies_pivot = pd.pivot_table(ratings_movies, 
                                      index='userId', 
                                      columns='title', 
                                      values='rating'
                                     )
ratings_movies_pivot.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [18]:
user_list  = ratings_movies_pivot.index.to_list()
movie_list = ratings_movies_pivot.columns.to_list()

In [19]:
knn_imputer = KNNImputer(n_neighbors=2)

In [20]:
knn_imputer.fit_transform(ratings_movies_pivot).shape

(610, 9719)

In [21]:
ratings = pd.DataFrame(data=knn_imputer.fit_transform(ratings_movies_pivot), 
                       index=user_list, 
                       columns=movie_list
                      )

In [22]:
ratings.head()

Unnamed: 0,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
1,4.0,4.0,3.5,5.0,4.0,1.5,4.0,3.0,4.5,3.75,...,1.5,4.0,3.25,3.0,3.0,3.75,2.0,2.0,4.0,1.0
2,4.0,4.0,3.5,5.0,4.0,1.5,4.25,3.0,4.5,3.0,...,1.5,4.0,3.25,3.0,3.0,4.0,2.25,1.75,2.0,1.0
3,4.0,4.0,3.5,5.0,4.0,1.5,2.5,3.0,3.0,2.75,...,1.5,4.5,3.25,3.0,3.0,2.75,3.25,1.75,2.5,1.0
4,4.0,4.0,3.5,5.0,4.0,1.5,3.75,3.0,4.25,3.25,...,1.5,3.25,3.25,3.0,3.0,3.5,2.25,2.25,3.0,1.0
5,4.0,4.0,3.5,5.0,4.0,1.5,2.5,3.0,4.5,3.25,...,1.5,4.25,4.0,3.0,3.0,4.5,3.5,1.75,3.0,1.0


## Create a model

In [23]:
#determine the component number of model
genre = []
for i in ratings_movies.genres:
    for j in i.split("|"):
        if j not in genre:
            genre.append(j)
genre, len(genre)

(['Adventure',
  'Animation',
  'Children',
  'Comedy',
  'Fantasy',
  'Romance',
  'Action',
  'Crime',
  'Thriller',
  'Mystery',
  'Horror',
  'Drama',
  'War',
  'Western',
  'Sci-Fi',
  'Musical',
  'Film-Noir',
  'IMAX',
  'Documentary',
  '(no genres listed)'],
 20)

In [24]:
# determine the parameters of NMF model
nmf_model = NMF(n_components=20, init='nndsvda', max_iter=10000)

In [25]:
#fit the model on the full imputed user/movie dataframe/matrix
nmf_model.fit(ratings)

NMF(init='nndsvda', max_iter=10000, n_components=20)

In [26]:
# create movie feature matrix Q from the model components
Q_matrix = nmf_model.components_
Q_matrix.shape

(20, 9719)

In [27]:
features = []
for i in range(20):
    features.append(f'feature{i}')
    i += 1

In [28]:
#get the movie-feature dataframe Q
Q = pd.DataFrame(data=Q_matrix, 
                 index=features, 
                 columns=movie_list
                )
Q

Unnamed: 0,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
feature0,2.400814,2.400814,2.100712,3.001017,2.343763,0.900305,1.553042,1.80061,0.826383,1.956889,...,0.900305,1.893146,2.6559,1.80061,1.80061,2.570309,2.040509,1.315405,2.163994,0.600203
feature1,22.262092,22.262092,19.47933,27.827615,22.274522,8.348284,8.296811,16.696568,18.890661,14.355245,...,8.348284,21.898129,19.995896,16.696568,16.696568,14.605955,8.02687,8.025507,11.585124,5.565523
feature2,1.135844,1.135844,0.993864,1.419806,1.149269,0.425942,1.757259,0.851883,2.33197,0.904993,...,0.425942,1.109766,1.310077,0.851883,0.851883,0.494223,0.0,0.981837,0.780994,0.283961
feature3,2.383502,2.383502,2.085565,2.979378,2.317426,0.893813,2.864978,1.787627,1.465574,3.540297,...,0.893813,1.555766,1.421468,1.787627,1.787627,1.221168,0.0,0.793535,3.543914,0.595876
feature4,117.041744,117.041744,102.411525,146.302181,117.5464,43.890654,0.0,87.781307,135.897454,94.468782,...,43.890654,151.697763,85.576494,87.781307,87.781307,74.915227,76.090648,50.949933,116.242665,29.260436
feature5,74.32379,74.32379,65.033314,92.904741,76.462746,27.871419,155.178387,55.742839,35.278478,60.568379,...,27.871419,177.307971,88.419511,55.742839,55.742839,0.0,34.018258,47.203698,134.202723,18.580946
feature6,561.600584,561.600584,491.400504,702.000742,563.313981,210.600212,590.393886,421.200425,591.839084,1181.746561,...,210.600212,248.296483,659.770107,421.200425,421.200425,992.351008,909.945974,714.240369,664.366787,140.400141
feature7,3169.58343,3169.58343,2773.385468,3961.979346,3299.312646,1188.593755,8482.510722,2377.187511,4693.390181,777.958507,...,1188.593755,697.619222,3227.560246,2377.187511,2377.187511,1339.034466,6196.978667,2697.755314,1761.645668,792.395835
feature8,24.888901,24.888901,21.777787,31.111128,24.960366,9.333337,3.272621,18.666674,111.586797,32.85801,...,9.333337,52.132441,4.888976,18.666674,18.666674,5.568391,19.082158,5.810493,15.419266,6.222225
feature9,23.190055,23.190055,20.291297,28.987571,24.608759,8.69627,44.97513,17.39254,95.926765,41.944941,...,8.69627,25.558621,59.039854,17.39254,17.39254,10.72087,76.890157,9.301858,36.858929,5.797513


In [29]:
#create user feature matrix P
P_matrix = nmf_model.transform(ratings)
P_matrix.shape

(610, 20)

In [30]:
##get the user-feature dataframe P
P = pd.DataFrame(data=P_matrix, 
                 index=user_list, 
                 columns=features
                )
P.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19
1,0.165295,0.006955,0.358474,0.125259,0.000511,0.002211,3e-06,1.6e-05,0.001826,0.002387,0.030759,7e-05,5.6e-05,9.1e-05,0.004968,0.0,0.035669,0.117441,0.0,0.524309
2,0.05145,0.004514,0.208436,0.06329,0.002877,0.0,0.000137,1.2e-05,0.00238,0.003469,0.006078,0.000337,0.000123,0.000168,0.001949,0.026824,0.076671,0.036361,0.0,0.61675
3,0.041558,0.033591,0.043592,0.0,0.006041,0.001038,0.0,2.2e-05,0.0,0.0,0.015028,0.000104,0.000257,2.7e-05,0.002909,0.001366,0.004896,0.000212,0.015296,0.51914
4,0.039489,0.024131,0.049475,0.074424,0.000617,0.001008,0.0,0.000104,0.00684,0.003692,0.001855,3e-05,4e-05,0.0,0.0,0.023138,0.132915,0.134723,0.282022,0.360401
5,0.204985,0.024359,0.048121,0.010236,0.001495,0.001759,2.4e-05,2.9e-05,0.001662,0.00179,0.017878,0.00014,2.6e-05,0.000126,0.000489,0.027009,0.058488,0.219771,0.252701,0.308386


In [31]:
#reconstruct the ratings matrix
ratings_rec_matrix = np.dot(P_matrix,Q_matrix)
ratings_rec_matrix.shape

(610, 9719)

In [32]:
#get the ratings reconstructed dataframe
ratings_rec = pd.DataFrame(data=ratings_rec_matrix, index=user_list, columns=movie_list)
ratings_rec.round(2).head()

Unnamed: 0,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
1,4.0,4.0,3.5,5.0,4.0,1.5,4.06,3.0,4.16,3.51,...,1.5,4.1,3.5,3.0,3.0,3.41,2.45,2.11,3.4,1.0
2,4.0,4.0,3.5,5.0,4.01,1.5,3.52,3.0,3.72,3.26,...,1.5,3.96,3.52,3.0,3.0,4.09,2.49,1.99,3.32,1.0
3,4.01,4.01,3.51,5.02,4.01,1.5,2.42,3.01,2.64,3.22,...,1.5,4.01,3.25,3.01,3.01,3.36,2.22,1.7,2.97,1.0
4,4.0,4.0,3.5,5.0,4.0,1.5,3.38,3.0,3.9,3.0,...,1.5,3.5,3.43,3.0,3.0,3.4,2.45,2.02,3.07,1.0
5,4.0,4.0,3.5,5.01,4.0,1.5,3.43,3.0,3.97,2.94,...,1.5,4.33,3.64,3.0,3.0,3.96,2.59,2.09,3.23,1.0


In [33]:
#get the reconstructed error R and R^
nmf_model.reconstruction_err_

779.9410279811871

In [34]:
np.sqrt(np.square(abs(ratings - ratings_rec)).sum().sum())

779.9348303022867

## Save the model with pickle

In [35]:
binary = pickle.dumps(nmf_model)
print(binary)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [36]:
file = open('model/nmf_model.bin', mode='wb')
file.write(binary)
file.close()

# Recommendations for one user

In [37]:
new_user_query = { "'Til There Was You (1997)": 4,
                   "'Tis the Season for Love (2015)": 2,
                   "'burbs, The (1989)": 3,
                   "'night Mother (1986)": 5,
                   '(500) Days of Summer (2009)': 1}

In [38]:
def nmf_recommender(new_user_query, k_top=5):
    '''The function gets k-top movie recommadations for a given user matrix based on nmf model
    '''
    #get new_user-item dataframe with the previous dictionary
    new_user_ratings = pd.DataFrame(
                                    data=new_user_query,
                                    columns=movie_list, 
                                    index=['new_user']
                                   )
    #fill the missing value
    new_user_ratings_imputed = pd.DataFrame(
                                            data=knn_imputer.transform(new_user_ratings), 
                                            columns=movie_list, 
                                            index=['new_user']
                                           )
    #load the pickled model
    file = open('model/nmf_model.bin', mode='rb')
    binary = file.read()
    file.close()
    nmf_model = pickle.loads(binary)
    
    #create movie feature matrix Q
    Q_matrix = nmf_model.components_
    
    
    #create user feature matrix P
    P_new_user_matrix = nmf_model.transform(new_user_ratings_imputed)
    
    ##get the ratings reconstructed dataframe
    new_user_ratings_rec = pd.DataFrame(data=np.dot(P_new_user_matrix,Q_matrix),
                                    columns=new_user_ratings.columns,
                                    index=new_user_ratings.index.to_list()
                                       )
    #exclude the rated movies
    mask = new_user_ratings.T.isna()
    new_user_ratings_rec_2 = new_user_ratings_rec.T[mask].T
   
    #get top k rated movies
    recommend_list = new_user_ratings_rec_2.sort_values(new_user_ratings.index.to_list(), axis=1, ascending=False).T.index.to_list()[:k_top]
    
    #Create a loop to print the first k movies from the sorted movies list
    j=0
    print(f'The {k_top} most recommended movies are:\n')
    for item in recommend_list:
        print(j+1,item)
        j = j+1
        if j > k_top-1:
            break

In [39]:
nmf_recommender(new_user_query, k_top=5)

The 5 most recommended movies are:

1 Big Top Scooby-Doo! (2012)
2 Umberto D. (1952)
3 Decalogue, The (Dekalog) (1989)
4 On the Other Side of the Tracks (De l'autre côté du périph) (2012)
5 Tyler Perry's I Can Do Bad All by Myself (2009)


# Neighbourhood based Collaborative Filtering

## User based collaborative filter

In [40]:
#transpose the pivot table
user_based = ratings_movies_pivot.T
user_based.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,


In [41]:
#assign to 0 for unrated movies 
user_based_imputed = user_based.fillna(value=0)
user_based_imputed.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
#apply cosine similarity to create a similarity matrix
user_based_similarity_matrix = pd.DataFrame(cosine_similarity(user_based_imputed.T))

In [43]:
user_based_similarity_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,600,601,602,603,604,605,606,607,608,609
0,1.000000,0.027283,0.059720,0.194395,0.129080,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
1,0.027283,1.000000,0.000000,0.003726,0.016614,0.025333,0.027585,0.027257,0.000000,0.067445,...,0.202671,0.016866,0.011997,0.000000,0.000000,0.028429,0.012948,0.046211,0.027565,0.102427
2,0.059720,0.000000,1.000000,0.002251,0.005020,0.003936,0.000000,0.004941,0.000000,0.000000,...,0.005048,0.004892,0.024992,0.000000,0.010694,0.012993,0.019247,0.021128,0.000000,0.032119
3,0.194395,0.003726,0.002251,1.000000,0.128659,0.088491,0.115120,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
4,0.129080,0.016614,0.005020,0.128659,1.000000,0.300349,0.108342,0.429075,0.000000,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,0.164191,0.028429,0.012993,0.200395,0.106435,0.102123,0.200035,0.099388,0.075898,0.088963,...,0.178084,0.116534,0.300669,0.066032,0.148141,1.000000,0.153063,0.262558,0.069622,0.201104
606,0.269389,0.012948,0.019247,0.131746,0.152866,0.162182,0.186114,0.185142,0.011844,0.010451,...,0.092525,0.199910,0.203540,0.137834,0.118780,0.153063,1.000000,0.283081,0.149190,0.139114
607,0.291097,0.046211,0.021128,0.149858,0.135535,0.178809,0.323541,0.187233,0.100435,0.077424,...,0.158355,0.197514,0.232771,0.155306,0.178142,0.262558,0.283081,1.000000,0.121993,0.322055
608,0.093572,0.027565,0.000000,0.032198,0.261232,0.214234,0.090840,0.423993,0.000000,0.021766,...,0.035653,0.335231,0.061941,0.236601,0.097610,0.069622,0.149190,0.121993,1.000000,0.053225


In [44]:
#choose an active user
active_user = 5

In [45]:
#create a list of unseen movies for this user
unseen_mask = user_based[active_user].isna()

In [46]:
unseen_movies = user_based[unseen_mask].index
unseen_movies

Index([''71 (2014)', ''Hellboy': The Seeds of Creation (2004)',
       ''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''Tis the Season for Love (2015)',
       ''burbs, The (1989)', ''night Mother (1986)',
       '(500) Days of Summer (2009)', '*batteries not included (1987)',
       ...
       'Zulu (2013)', '[REC] (2007)', '[REC]² (2009)',
       '[REC]³ 3 Génesis (2012)',
       'anohana: The Flower We Saw That Day - The Movie (2013)',
       'eXistenZ (1999)', 'xXx (2002)', 'xXx: State of the Union (2005)',
       '¡Three Amigos! (1986)', 'À nous la liberté (Freedom for Us) (1931)'],
      dtype='object', name='title', length=9675)

In [47]:
#create a list of top 20 similar users
top_20_users = user_based_similarity_matrix[active_user].sort_values(ascending=False).index[1:21]
top_20_users

Int64Index([116, 180,  57, 239, 410,  42, 435, 558, 591, 135, 446, 583, 403,
            469, 283, 445, 173, 601, 320, 475],
           dtype='int64')

In [48]:
user_based_similarity_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,600,601,602,603,604,605,606,607,608,609
0,1.000000,0.027283,0.059720,0.194395,0.129080,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
1,0.027283,1.000000,0.000000,0.003726,0.016614,0.025333,0.027585,0.027257,0.000000,0.067445,...,0.202671,0.016866,0.011997,0.000000,0.000000,0.028429,0.012948,0.046211,0.027565,0.102427
2,0.059720,0.000000,1.000000,0.002251,0.005020,0.003936,0.000000,0.004941,0.000000,0.000000,...,0.005048,0.004892,0.024992,0.000000,0.010694,0.012993,0.019247,0.021128,0.000000,0.032119
3,0.194395,0.003726,0.002251,1.000000,0.128659,0.088491,0.115120,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
4,0.129080,0.016614,0.005020,0.128659,1.000000,0.300349,0.108342,0.429075,0.000000,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,0.164191,0.028429,0.012993,0.200395,0.106435,0.102123,0.200035,0.099388,0.075898,0.088963,...,0.178084,0.116534,0.300669,0.066032,0.148141,1.000000,0.153063,0.262558,0.069622,0.201104
606,0.269389,0.012948,0.019247,0.131746,0.152866,0.162182,0.186114,0.185142,0.011844,0.010451,...,0.092525,0.199910,0.203540,0.137834,0.118780,0.153063,1.000000,0.283081,0.149190,0.139114
607,0.291097,0.046211,0.021128,0.149858,0.135535,0.178809,0.323541,0.187233,0.100435,0.077424,...,0.158355,0.197514,0.232771,0.155306,0.178142,0.262558,0.283081,1.000000,0.121993,0.322055
608,0.093572,0.027565,0.000000,0.032198,0.261232,0.214234,0.090840,0.423993,0.000000,0.021766,...,0.035653,0.335231,0.061941,0.236601,0.097610,0.069622,0.149190,0.121993,1.000000,0.053225


In [49]:
#calculate the top-20 users average ratings
ratings_movies.set_index('userId').loc[top_20_users,:]['rating'].mean()

3.705345060893099

In [50]:
for movie in unseen_movies:
    others_user = user_based.columns[~user_based.loc[movie].isna()]
    others_user = set(others_user)
    for user in set(top_20_users).intersection(others_user):
        print(movie,user)

10 Things I Hate About You (1999) 583
12 Angry Men (1957) 601
12 Angry Men (1957) 135
13 Ghosts (1960) 116
1984 (Nineteen Eighty-Four) (1984) 469
2 Days in the Valley (1996) 57
20 Dates (1998) 42
20,000 Leagues Under the Sea (1916) 116
2001: A Space Odyssey (1968) 57
2001: A Space Odyssey (1968) 42
2001: A Space Odyssey (1968) 469
2001: A Space Odyssey (1968) 239
2010: The Year We Make Contact (1984) 57
2010: The Year We Make Contact (1984) 42
2010: The Year We Make Contact (1984) 469
2012 (2009) 601
28 Days (2000) 320
28 Days (2000) 42
28 Days (2000) 558
28 Days Later (2002) 239
300 (2007) 239
3000 Miles to Graceland (2001) 558
3:10 to Yuma (2007) 239
40-Year-Old Virgin, The (2005) 239
40-Year-Old Virgin, The (2005) 116
40-Year-Old Virgin, The (2005) 445
40-Year-Old Virgin, The (2005) 583
400 Blows, The (Les quatre cents coups) (1959) 410
400 Blows, The (Les quatre cents coups) (1959) 469
50 First Dates (2004) 583
8 1/2 (8½) (1963) 410
8 Seconds (1994) 42
8MM (1999) 42
8MM (1999) 135


Born on the Fourth of July (1989) 42
Born on the Fourth of July (1989) 239
Bounce (2000) 42
Bounce (2000) 410
Bound (1996) 57
Bourne Identity, The (2002) 475
Bourne Identity, The (2002) 239
Bourne Supremacy, The (2004) 475
Bourne Supremacy, The (2004) 239
Bourne Ultimatum, The (2007) 601
Bourne Ultimatum, The (2007) 475
Bourne Ultimatum, The (2007) 445
Bourne Ultimatum, The (2007) 239
Bowfinger (1999) 57
Bowfinger (1999) 410
Bowfinger (1999) 42
Bowfinger (1999) 591
Bowling for Columbine (2002) 239
Boxing Helena (1993) 410
Boxing Helena (1993) 135
Boys Don't Cry (1999) 42
Boys Don't Cry (1999) 469
Boys Don't Cry (1999) 591
Boyz N the Hood (1991) 239
Brady Bunch Movie, The (1995) 446
Brave (2012) 601
Brazil (1985) 469
Brazil (1985) 135
Breakdown (1997) 57
Breakdown (1997) 469
Breaker Morant (1980) 57
Breakfast Club, The (1985) 135
Breakfast Club, The (1985) 42
Breakfast Club, The (1985) 558
Breakfast Club, The (1985) 469
Breakfast Club, The (1985) 57
Bridge on the River Kwai, The (1957) 

Double Indemnity (1944) 57
Double Jeopardy (1999) 239
Down Periscope (1996) 42
Down to Earth (2001) 42
Downfall (Untergang, Der) (2004) 445
Dr. No (1962) 57
Dr. No (1962) 469
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) 469
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) 57
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) 410
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) 445
Dracula (Bram Stoker's Dracula) (1992) 42
Dracula (Bram Stoker's Dracula) (1992) 116
Dragonfly (2002) 558
Dragonheart (1996) 57
Dreamscape (1984) 469
Drop Zone (1994) 57
Drop Zone (1994) 42
Duck Soup (1933) 57
Duets (2000) 57
Dumb & Dumber (Dumb and Dumber) (1994) 135
Dumb & Dumber (Dumb and Dumber) (1994) 42
Dumb & Dumber (Dumb and Dumber) (1994) 239
Dumb & Dumber (Dumb and Dumber) (1994) 116
Dumb & Dumber (Dumb and Dumber) (1994) 446
Dumbo (1941) 116
Dune (1984) 57
Dune (1984) 180
Dune (1984) 469
Du

Glory (1989) 57
Glory (1989) 180
Go (1999) 57
Go (1999) 42
Godfather, The (1972) 135
Godfather, The (1972) 42
Godfather, The (1972) 239
Godfather, The (1972) 601
Godfather, The (1972) 435
Godfather, The (1972) 116
Godfather, The (1972) 469
Godfather, The (1972) 57
Godfather: Part II, The (1974) 42
Godfather: Part II, The (1974) 239
Godfather: Part II, The (1974) 435
Godfather: Part II, The (1974) 469
Godfather: Part II, The (1974) 601
Godfather: Part II, The (1974) 57
Godfather: Part III, The (1990) 42
Godfather: Part III, The (1990) 239
Godfather: Part III, The (1990) 435
Godfather: Part III, The (1990) 469
Godfather: Part III, The (1990) 57
Gods Must Be Crazy, The (1980) 57
Gods Must Be Crazy, The (1980) 410
Gods Must Be Crazy, The (1980) 469
Godzilla (1998) 57
Golden Child, The (1986) 42
Golden Child, The (1986) 135
Golden Compass, The (2007) 475
GoldenEye (1995) 42
GoldenEye (1995) 173
GoldenEye (1995) 469
GoldenEye (1995) 57
GoldenEye (1995) 446
Goldfinger (1964) 57
Goldfinger (19

Key Largo (1948) 57
Key Largo (1948) 410
Kids in the Hall: Brain Candy (1996) 135
Kill Bill: Vol. 1 (2003) 435
Kill Bill: Vol. 1 (2003) 239
Kill Bill: Vol. 2 (2004) 435
Kill Bill: Vol. 2 (2004) 239
Killing Fields, The (1984) 116
Killing Fields, The (1984) 469
Killing, The (1956) 469
King Kong (1933) 469
King's Speech, The (2010) 601
Kingpin (1996) 42
Kingpin (1996) 283
Kingpin (1996) 180
Kingpin (1996) 135
Kiss Kiss Bang Bang (2005) 239
Kiss of the Dragon (2001) 320
Kiss the Girls (1997) 57
Knocked Up (2007) 239
Kung Fu Panda (2008) 475
L.A. Confidential (1997) 57
L.A. Confidential (1997) 42
L.A. Confidential (1997) 469
L.A. Story (1991) 57
Labyrinth (1986) 135
Lady Eve, The (1941) 57
Lady and the Tramp (1955) 57
Lady and the Tramp (1955) 116
Ladyhawke (1985) 57
Ladyhawke (1985) 558
Ladyhawke (1985) 135
Lake Placid (1999) 42
Lara Croft Tomb Raider: The Cradle of Life (2003) 475
Lara Croft: Tomb Raider (2001) 42
Lara Croft: Tomb Raider (2001) 475
Last Action Hero (1993) 57
Last Action H

My Neighbor Totoro (Tonari no Totoro) (1988) 601
My Neighbor Totoro (Tonari no Totoro) (1988) 445
Mystery Men (1999) 57
Mystery Men (1999) 180
Mystery Men (1999) 135
Mystery Science Theater 3000: The Movie (1996) 57
Mystery Science Theater 3000: The Movie (1996) 283
Mystery Science Theater 3000: The Movie (1996) 469
Mystery Science Theater 3000: The Movie (1996) 135
Mystic Pizza (1988) 42
Mystic Pizza (1988) 558
Mystic Pizza (1988) 410
Mystic River (2003) 239
Naked Gun 2 1/2: The Smell of Fear, The (1991) 42
Naked Gun 2 1/2: The Smell of Fear, The (1991) 135
Naked Gun 33 1/3: The Final Insult (1994) 239
Napoleon Dynamite (2004) 239
National Lampoon's Vacation (1983) 42
National Lampoon's Vacation (1983) 469
National Lampoon's Van Wilder (2002) 558
Natural Born Killers (1994) 135
Natural Born Killers (1994) 42
Natural Born Killers (1994) 239
Natural Born Killers (1994) 469
Natural Born Killers (1994) 446
Natural, The (1984) 42
Natural, The (1984) 135
Navy Seals (1990) 42
Negotiator, The

Rumble in the Bronx (Hont faan kui) (1995) 57
Rumble in the Bronx (Hont faan kui) (1995) 469
Rumble in the Bronx (Hont faan kui) (1995) 135
Run Lola Run (Lola rennt) (1998) 135
Runaway (1984) 57
Runaway Bride (1999) 42
Runaway Bride (1999) 403
Runaway Bride (1999) 116
Runaway Bride (1999) 591
Running Man, The (1987) 469
Running Man, The (1987) 135
Rush Hour (1998) 57
Rush Hour (1998) 469
Rush Hour (1998) 135
Rush Hour 2 (2001) 475
Rushmore (1998) 410
Rushmore (1998) 469
Rushmore (1998) 135
Saboteur (1942) 469
Sabrina (1954) 57
Sabrina (1954) 410
Sabrina (1995) 42
Saint, The (1997) 57
Saint, The (1997) 42
Salton Sea, The (2002) 558
Santa Clause, The (1994) 446
Santa Clause, The (1994) 583
Saturn 3 (1980) 469
Saving Grace (2000) 558
Saving Private Ryan (1998) 135
Saving Private Ryan (1998) 42
Saving Private Ryan (1998) 558
Saving Private Ryan (1998) 239
Saving Private Ryan (1998) 601
Saving Private Ryan (1998) 469
Saving Private Ryan (1998) 57
Saving Private Ryan (1998) 445
Say Anything.

Stealing Beauty (1996) 57
Steamboat Willie (1928) 57
Step Brothers (2008) 583
Stepford Wives, The (1975) 469
Steve Jobs: The Man in the Machine (2015) 601
Stigmata (1999) 239
Stigmata (1999) 591
Sting, The (1973) 57
Sting, The (1973) 469
Stir of Echoes (1999) 135
Story of Us, The (1999) 57
Straight Story, The (1999) 469
Strange Days (1995) 57
Stranger Than Paradise (1984) 410
Stranger than Fiction (2006) 239
Strangers on a Train (1951) 410
Strangers on a Train (1951) 469
Street Fighter (1994) 135
Streetcar Named Desire, A (1951) 469
Strictly Ballroom (1992) 410
Stripes (1981) 410
Stripes (1981) 403
Stripes (1981) 42
Stripes (1981) 469
Striptease (1996) 57
Summer Rental (1985) 42
Summer of Sam (1999) 42
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 57
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 410
Superbad (2007) 239
Supercop (Police Story 3: Supercop) (Jing cha gu shi III: Chao ji jing cha) (1992) 57
Superman (1978) 57
Superman (1978) 469
Superman II (1980) 42
Superman II (1980) 403

Welcome to Sarajevo (1997) 57
Westworld (1973) 469
What About Bob? (1991) 469
What About Bob? (1991) 135
What Dreams May Come (1998) 57
What Dreams May Come (1998) 42
What Dreams May Come (1998) 558
What Dreams May Come (1998) 239
What Lies Beneath (2000) 591
What Planet Are You From? (2000) 42
What Women Want (2000) 42
What's Eating Gilbert Grape (1993) 446
When Harry Met Sally... (1989) 42
When Harry Met Sally... (1989) 469
When Harry Met Sally... (1989) 239
When a Man Loves a Woman (1994) 42
While You Were Sleeping (1995) 116
While You Were Sleeping (1995) 446
Whiplash (2013) 601
Whiplash (2014) 445
White Men Can't Jump (1992) 410
White Men Can't Jump (1992) 42
White Men Can't Jump (1992) 135
White Squall (1996) 42
Who Framed Roger Rabbit? (1988) 135
Who Framed Roger Rabbit? (1988) 42
Who Framed Roger Rabbit? (1988) 558
Who Framed Roger Rabbit? (1988) 239
Who Framed Roger Rabbit? (1988) 469
Who Framed Roger Rabbit? (1988) 57
Who Framed Roger Rabbit? (1988) 410
Who's Afraid of Virgin

In [51]:
#create ratings for the active user
#predict the rating based on the (weighted) average ratings of the other user
# sum(ratings)/no.users OR sum(ratings*similarity)/sum(similarities)

pred_ratings_list = []

for movie in unseen_movies:
    others_user = user_based.columns[~user_based.loc[movie].isna()]
    others_user = set(others_user)
    num = 0
    den = 0
    pred_ratings = 0
    for user in set(top_20_users).intersection(others_user):
        ratings = user_based[user][movie]    
        sim = user_based_similarity_matrix[active_user][user]
        #print(user, ratings, sim, movie)
        
        num = num + (ratings*sim)
        den = den + sim + 0.000001
        
        pred_ratings = round(num/den,3)
        
        
    pred_ratings_list.append((movie, pred_ratings))
    
    
    
pred_ratings_list

[("'71 (2014)", 0),
 ("'Hellboy': The Seeds of Creation (2004)", 0),
 ("'Round Midnight (1986)", 0),
 ("'Salem's Lot (2004)", 0),
 ("'Til There Was You (1997)", 0),
 ("'Tis the Season for Love (2015)", 0),
 ("'burbs, The (1989)", 0),
 ("'night Mother (1986)", 0),
 ('(500) Days of Summer (2009)', 0),
 ('*batteries not included (1987)', 0),
 ('...All the Marbles (1981)', 0),
 ('...And Justice for All (1979)', 0),
 ('00 Schneider - Jagd auf Nihil Baxter (1994)', 0),
 ('1-900 (06) (1994)', 0),
 ('10 (1979)', 0),
 ('10 Cent Pistol (2015)', 0),
 ('10 Cloverfield Lane (2016)', 0),
 ('10 Items or Less (2006)', 0),
 ('10 Things I Hate About You (1999)', 5.0),
 ('10 Years (2011)', 0),
 ('10,000 BC (2008)', 0),
 ('100 Girls (2000)', 0),
 ('100 Streets (2016)', 0),
 ('101 Dalmatians (1996)', 0),
 ('101 Dalmatians (One Hundred and One Dalmatians) (1961)', 0),
 ("101 Dalmatians II: Patch's London Adventure (2003)", 0),
 ('101 Reykjavik (101 Reykjavík) (2000)', 0),
 ('102 Dalmatians (2000)', 0),
 ('1

In [52]:
def user_based_cosine_similarity_movie_recommendation(ratings_movies_pivot, active_user, similar_users, k_top):
    '''The function gets k-top movie recommadations for a given user based on cosine similarity matrix
    '''
    #create user based ratings movies table
    user_based = ratings_movies_pivot.T
    
    #assign to 0 for unrated movies 
    user_based_imputed = user_based.fillna(value=0)
      
    #apply cosine similarity to create a similarity matrix
    user_based_similarity_matrix = pd.DataFrame(cosine_similarity(user_based_imputed.T))
    
    #create a list of unseen movies for active user
    unseen_mask = user_based[active_user].isna()
    unseen_movies = user_based[unseen_mask].index
    
    #create a list of top n similar users
    x = similar_users + 1
    similar_top_n_users = user_based_similarity_matrix[active_user].sort_values(ascending=False).index[1:x]
   
    #create ratings for the active user
    #predict the rating based on the (weighted) average ratings of the other user
    #sum(ratings*similarity)/sum(similarities)

    pred_ratings_list = []

    for movie in unseen_movies:
        others_user = user_based.columns[~user_based.loc[movie].isna()]
        others_user = set(others_user)
        num = 0
        den = 0
        pred_ratings = 0
        for user in set(similar_top_n_users).intersection(others_user):
            ratings = user_based[user][movie]    
            sim = user_based_similarity_matrix[active_user][user]
              
            num = num + (ratings*sim)
            den = den + sim + 0.000001
        
            pred_ratings = round(num/den,3)
        
        
        pred_ratings_list.append((movie, pred_ratings))
        
        
    #look at the rating and choose n of them
    recommend_list = sorted(pred_ratings_list, key=lambda tup: tup[1], reverse=True)[0:k_top]
    
  
    
    #return recommend_list
    

In [53]:
def cosim_recommender(new_user_query, k_top=5):
    '''The function gets k-top movie recommadations for a given user based on cosine similarity matrix
    '''
   
    #get new_user-item dataframe with the previous dictionary
    new_user_ratings = pd.DataFrame(
                                    data=new_user_query,
                                    columns=movie_list, 
                                    index=['new_user']
                                   )

    #add new user to user_based dataframe
    ratings_movies_pivot_2 = ratings_movies_pivot.append(new_user_ratings, ignore_index=False)

    #transpose the pivot table
    user_based = ratings_movies_pivot_2.T

    #assign to 0 for unrated movies  
    user_based_imputed = user_based.fillna(value=0)

    #apply cosine similarity to create a similarity matrix
    user_based_similarity_matrix = pd.DataFrame(cosine_similarity(user_based_imputed.T))
      
    #create a list of unseen movies for new user
    unseen_mask = user_based['new_user'].isna()
    unseen_movies = user_based[unseen_mask].index
    
    #create a list of top n=20 similar users
    similar_top_n_users = user_based_similarity_matrix[610].sort_values(ascending=False).index[1:21]
   
    #create ratings for the active user
    #predict the rating based on the (weighted) average ratings of the other user
    #sum(ratings*similarity)/sum(similarities)

    pred_ratings_list = []

    for movie in unseen_movies:
        others_user = user_based.columns[~user_based.loc[movie].isna()]
        others_user = set(others_user)
        num = 0
        den = 0
        pred_ratings = 0
        for user in set(similar_top_n_users).intersection(others_user):
            rating = user_based[user][movie]    
            sim = user_based_similarity_matrix[610][user]
              
            num = num + (rating*sim)
            den = den + sim + 0.000001
        
            pred_ratings = round(num/den,3)
        
        
        pred_ratings_list.append((movie, pred_ratings))
        pred_ratings = 0
        
        
    #look at the rating and choose n of them
    recommend_list = sorted(pred_ratings_list, key=lambda tup: tup[1], reverse=True)[0:k_top]
    
     #Create a loop to print the first k movies from the sorted movies list
    j=0
    print(f'The {k_top} most recommended movies are:\n')
    for item in recommend_list:
        print(j+1,item[0])
        j = j+1
        if j > k_top-1:
            break
        
   # return recommend_list

In [54]:
cosim_recommender(new_user_query, k_top=5)

The 5 most recommended movies are:

1 101 Dalmatians (1996)
2 Adventures of Buckaroo Banzai Across the 8th Dimension, The (1984)
3 American Gangster (2007)
4 Amistad (1997)
5 Anne of Green Gables (1985)


# Recommedation functions

In [55]:
movie_list = ratings_movies_pivot.columns.to_list()

In [56]:
#save the movie list 
binary_2 = pickle.dumps(movie_list)
file = open('data/movie_list.bin', mode='wb')
file.write(binary_2)
file.close()

In [57]:
new_user_query = { 'Project X (1968)':2,
 'Munich (2005)':3,
 'Remember Me (2010)':4,
 'What Happened, Miss Simone? (2015)':3,
 'Triumph of the Spirit (1989)':2}

In [58]:
def nmf_recommender(new_user_query, k_top=5):
    '''The function gets k-top movie recommadations for a given user matrix based on nmf model
    '''
    #get new_user-item dataframe with the previous dictionary
    new_user_ratings = pd.DataFrame(
                                    data=new_user_query,
                                    columns=movie_list, 
                                    index=['new_user']
                                   )
    #fill the missing value
    new_user_ratings_imputed = pd.DataFrame(
                                            data=knn_imputer.transform(new_user_ratings), 
                                            columns=movie_list, 
                                            index=['new_user']
                                           )
    #load the pickled model
    file = open('model/nmf_model.bin', mode='rb')
    binary = file.read()
    file.close()
    nmf_model = pickle.loads(binary)
    
    #create movie feature matrix Q
    Q_matrix = nmf_model.components_
    
    
    #create user feature matrix P
    P_new_user_matrix = nmf_model.transform(new_user_ratings_imputed)
    
    ##get the ratings reconstructed dataframe
    new_user_ratings_rec = pd.DataFrame(data=np.dot(P_new_user_matrix,Q_matrix),
                                    columns=new_user_ratings.columns,
                                    index=new_user_ratings.index.to_list()
                                       )
    #exclude the rated movies
    mask = new_user_ratings.T.isna()
    new_user_ratings_rec_2 = new_user_ratings_rec.T[mask].T
   
    #get top k rated movies
    recommend_list = new_user_ratings_rec_2.sort_values(new_user_ratings.index.to_list(), axis=1, ascending=False).T.index.to_list()[:k_top]
        
        
    return recommend_list

In [59]:
nmf_recommender(new_user_query, k_top=5)

["'Salem's Lot (2004)",
 'Connections (1978)',
 'Palindromes (2004)',
 'Meantime (1984)',
 'Mephisto (1981)']

In [60]:
def cosim_recommender(new_user_query, k_top=5):
    '''The function gets k-top movie recommadations for a given user based on cosine similarity matrix
    '''
   
    #get new_user-item dataframe with the previous dictionary
    new_user_ratings = pd.DataFrame(
                                    data=new_user_query,
                                    columns=movie_list, 
                                    index=['new_user']
                                   )

    #add new user to user_based dataframe
    ratings_movies_pivot_2 = ratings_movies_pivot.append(new_user_ratings, ignore_index=False)

    #transpose the pivot table
    user_based = ratings_movies_pivot_2.T

    #assign to 0 for unrated movies  
    user_based_imputed = user_based.fillna(value=0)

    #apply cosine similarity to create a similarity matrix
    user_based_similarity_matrix = pd.DataFrame(cosine_similarity(user_based_imputed.T))
      
    #create a list of unseen movies for new user
    unseen_mask = user_based['new_user'].isna()
    unseen_movies = user_based[unseen_mask].index
    
    #create a list of top n=20 similar users
    similar_top_n_users = user_based_similarity_matrix[610].sort_values(ascending=False).index[1:21]
   
    #create ratings for the active user
    #predict the rating based on the (weighted) average ratings of the other user
    #sum(ratings*similarity)/sum(similarities)

    pred_ratings_list = []

    for movie in unseen_movies:
        others_user = user_based.columns[~user_based.loc[movie].isna()]
        others_user = set(others_user)
        num = 0
        den = 0
        pred_ratings = 0
        for user in set(similar_top_n_users).intersection(others_user):
            rating = user_based[user][movie]    
            sim = user_based_similarity_matrix[610][user]
              
            num = num + (rating*sim)
            den = den + sim + 0.000001
        
            pred_ratings = round(num/den,3)
        
        
        pred_ratings_list.append((movie, pred_ratings))
        pred_ratings = 0
        
        
    #look at the rating and choose n of them
    recommend_list = sorted(pred_ratings_list, key=lambda tup: tup[1], reverse=True)[0:k_top]
        
    return recommend_list

In [61]:
cosim_recommender(new_user_query, k_top=5)

[('12 Angry Men (1957)', 5.0),
 ('3:10 to Yuma (2007)', 5.0),
 ('84 Charing Cross Road (1987)', 5.0),
 ('A Detective Story (2003)', 5.0),
 ('Adaptation (2002)', 5.0)]

In [62]:
def most_popular_movie_recommender(new_user_query,k_top=5):
    '''The function gets k-top most popular movie recommadations 
    '''
    #get new_user-item dataframe with the previous dictionary
    new_user_ratings = pd.DataFrame(
                                    data=new_user_query,
                                    columns=movie_list, 
                                    index=['new_user']
                                   )

    #add new user to user_based dataframe
    ratings_movies_pivot_2 = ratings_movies_pivot.append(new_user_ratings, ignore_index=False)

    #transpose the pivot table
    user_based = ratings_movies_pivot_2.T

    #create a list of unseen movies for new user
    unseen_mask = user_based['new_user'].isna()
    unseen_movies = user_based[unseen_mask].index

    # filter out movies that have been watched by less than 100 users
    ratings = ratings_movies.groupby('title')[['rating']].count()
    ratings = ratings[ratings['rating'] >=100]
    movies_more_than_100 = ratings.index.to_list()
    
    #create a best movie list which have not seen before and watched by more than 100 users
    best_movie_list = []
    for i in movies_more_than_100:
        if i in unseen_movies:
            best_movie_list.append(i)
        else:
            continue
            
    ratings_2 = ratings_movies.set_index('title').loc[best_movie_list,:]
    
    # 2. scoring
    # calculate the average rating and wachted by users for each movie
    ratings_3 = ratings_2.reset_index()
    # merge them to a new dataframe
    ratings_4 = pd.DataFrame(ratings_3.groupby('title')[['rating']].mean().round(2))
    ratings_4['count_rating'] = ratings_3.groupby('title')[['rating']].count()
    ratings_4.rename(columns={'rating':'mean_rating'}, inplace=True)
    
    # 3. ranking
    recommend_list = ratings_4.sort_values(['mean_rating','count_rating'], ascending=False).index.to_list()[:5]
   
    return recommend_list


In [63]:
most_popular_movie_recommender(new_user_query,k_top=5)

['Shawshank Redemption, The (1994)',
 'Godfather, The (1972)',
 'Fight Club (1999)',
 'Godfather: Part II, The (1974)',
 'Goodfellas (1990)']

In [64]:
def random_recommender(new_user_query,k_top=5):
    '''The function gets k-top random movie recommadations
    '''
    #get new_user-item dataframe with the previous dictionary
    new_user_ratings = pd.DataFrame(
                                    data=new_user_query,
                                    columns=movie_list, 
                                    index=['new_user']
                                   )

    #add new user to user_based dataframe
    ratings_movies_pivot_2 = ratings_movies_pivot.append(new_user_ratings, ignore_index=False)


    #transpose the pivot table
    user_based = ratings_movies_pivot_2.T

    #create a list of unseen movies for new user
    unseen_mask = user_based['new_user'].isna()
    unseen_movies = user_based[unseen_mask].index

    recommend_list = []
    
    for i in range(5):

        movie = random.choice(unseen_movies)
        recommend_list.append(movie)

    return recommend_list

In [65]:
random_recommender(new_user_query,k_top=5)

['Scanner Darkly, A (2006)',
 'Scary Movie (2000)',
 'Herbie Goes Bananas (1980)',
 'RocketMan (a.k.a. Rocket Man) (1997)',
 'Diamonds Are Forever (1971)']