In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import Reader, Dataset
from surprise import accuracy

# importing relevant libraries
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

### Import Movies and Ratings as DataFrame

In [71]:
movies_df = pd.read_csv('../ml-latest-small/movies.csv')
print('Dataset - Movies')
print('-------------------------')
print('Number of Rows: ' + str(movies_df.shape[0]))
print('Number of Columns: ' + str(movies_df.shape[1]))
movies_df.head()

Dataset - Movies
-------------------------
Number of Rows: 9742
Number of Columns: 3


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [72]:
ratings_df = pd.read_csv('../ml-latest-small/ratings.csv',index_col=0).reset_index().drop(['timestamp'],axis=1)
print('Dataset - Ratings')
print('-------------------------')
print('Number of Rows: ' + str(ratings_df.shape[0]))
print('Number of Columns: ' + str(ratings_df.shape[1]))
ratings_df.head()

Dataset - Ratings
-------------------------
Number of Rows: 100836
Number of Columns: 3


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [73]:
print('Number of Unique Movies: ', len(movies_df))
print('Number of Unique Users: ', ratings_df['userId'].nunique())

Number of Unique Movies:  9742
Number of Unique Users:  610


### Load Dataset
Load a dataset from a pandas dataframe using load_from_df method with a reader object.

In [74]:
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(ratings_df,reader)

In [75]:
dataset = data.build_full_trainset()

### Determine Best Model
Here we will try out different algorithms to see which of them can provide the best RMSE.
* **KNNBasic** - Basic collaboriative filtering algorithm
* **KNNBaseline** - Basic collaborative  filtering algorithm taking into acount a baseline rating
* **SVD** - matrix factorization-based algorithm

In [76]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), KNNBaseline(), KNNBasic()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline,0.881718,0.327165,3.720249
SVD,0.881911,5.006302,0.350382
KNNBasic,0.958952,0.105856,2.814734


# Model-Based Collaborative Filtering (SVD)

SVD is used as a collaborative filtering technique for matrix decomposition. It uses a matrix structure where each row represents a user, and each column represents an item. The elements of this matrix are the ratings that are given to items by users.

### GridSearchCV

It helps to loop through predefined hyperparameters and fit your estimator (model) on your training set. So, in the end, you can select the best parameters from the listed hyperparameters.

In [50]:
# # Perform a gridsearch with SVD
# # Obtain optimal values of model hyperparameters

# params = {'n_factors': [20, 50, 100],
#          'reg_all': [0.02, 0.05, 0.1]}
# g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
# g_s_svd.fit(data)

In [51]:
# print(g_s_svd.best_params)
# # output = {'rmse': {'n_factors': 20, 'reg_all': 0.02}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}

### Fit Data

In [52]:
svd = SVD(n_factors= 20, reg_all= 0.02)
svd.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11e2395f8>

### Predict New User

In [53]:
def movie_rocommendation_system(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movies_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movies_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return pd.DataFrame(rating_list)

In [54]:
user_rating = movie_rocommendation_system(movies_df,5, genre='Adventure')
user_rating

      movieId            title                          genres
6207    45442  Poseidon (2006)  Action|Adventure|Thriller|IMAX
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
     movieId                              title                      genres
109      126  NeverEnding Story III, The (1994)  Adventure|Children|Fantasy
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
      movieId                         title                      genres
8462   112460  Planes: Fire & Rescue (2014)  Adventure|Animation|Comedy
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      movieId                              title                   genres
7832    93139  Mega Shark vs. Crocosaurus (2010)  Action|Adventure|Horror
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
      movieId                         title                      genres
9127   146309  The Boy and the Beast (2015

Unnamed: 0,userId,movieId,rating
0,1000,45442,5
1,1000,126,4
2,1000,112460,5
3,1000,93139,2
4,1000,146309,2


In [55]:
## add the new ratings to the original ratings DataFrame
new_ratings_df = ratings_df.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

### Fit Data with New User

In [56]:
svd.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11e2395f8>

### Make Predictions for the New User

In [57]:
# make predictions for the user
# you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in ratings_df['movieId'].unique():
    list_of_movies.append( (m_id,svd.predict(1000,m_id)[3]))

In [58]:
# order the predictions from highest to lowest rated
ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)

In [59]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break

### Recommendation Results (SVD)

In [60]:
recommended_movies(ranked_movies,movies_df,5)

Recommendation #  1 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 

Recommendation #  2 :  899    Princess Bride, The (1987)
Name: title, dtype: object 

Recommendation #  3 :  906    Lawrence of Arabia (1962)
Name: title, dtype: object 

Recommendation #  4 :  686    Rear Window (1954)
Name: title, dtype: object 

Recommendation #  5 :  659    Godfather, The (1972)
Name: title, dtype: object 



# Model-Based Collaborative Filtering (ALS)

### GridSearchCV

It helps to loop through predefined hyperparameters and fit your estimator (model) on your training set. So, in the end, you can select the best parameters from the listed hyperparameters.

In [61]:
# Perform a gridsearch with SVD
# Obtain optimal values of model hyperparameters

params = {'k': [10,20,30],'min_k': [1,3,5]}

g_s_KNNBasic = GridSearchCV(KNNBasic,param_grid=params,n_jobs=-1)
g_s_KNNBasic.fit(data)

In [33]:
print(g_s_KNNBasic.best_params)

{'rmse': {'k': 10, 'min_k': 3}, 'mae': {'k': 10, 'min_k': 3}}


### Fit Data

In [62]:
KNNBasic_options = {'k': 10, 'min_k': 3, 'sim_options':{'method': 'als','name':'pearson','user_based':True}}
KNNbasic = KNNBasic(KNNbasic_options=KNNBasic_options)
KNNbasic.fit(data.build_full_trainset())

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x12f2d90f0>

### Make Predictions for the New User

In [64]:
# make predictions for the user
# you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in ratings_df['movieId'].unique():
    list_of_movies.append( (m_id,KNNbasic.predict(1000,m_id)[3]))

In [65]:
# order the predictions from highest to lowest rated
ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)

In [66]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break

### Recommendation Results (KNNBasic)

In [67]:
recommended_movies(ranked_movies,movies_df,5)

Recommendation #  1 :  0    Toy Story (1995)
Name: title, dtype: object 

Recommendation #  2 :  2    Grumpier Old Men (1995)
Name: title, dtype: object 

Recommendation #  3 :  5    Heat (1995)
Name: title, dtype: object 

Recommendation #  4 :  43    Seven (a.k.a. Se7en) (1995)
Name: title, dtype: object 

Recommendation #  5 :  46    Usual Suspects, The (1995)
Name: title, dtype: object 

