# Model-Based Collaborative Filtering (SVD)

SVD is used as a collaborative filtering technique for matrix decomposition. It uses a matrix structure where each row represents a user, and each column represents an item. The elements of this matrix are the ratings that are given to items by users.

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy

# importing relevant libraries
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [54]:
movies_df = pd.read_csv('../ml-latest-small/movies.csv')
print('Dataset - Movies')
print('-------------------------')
print('Number of Rows: ' + str(movies_df.shape[0]))
print('Number of Columns: ' + str(movies_df.shape[1]))
movies_df.head()

Dataset - Movies
-------------------------
Number of Rows: 9742
Number of Columns: 3


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [55]:
ratings_df = pd.read_csv('../ml-latest-small/ratings.csv',index_col=0).reset_index().drop(['timestamp'],axis=1)
print('Dataset - Ratings')
print('-------------------------')
print('Number of Rows: ' + str(ratings_df.shape[0]))
print('Number of Columns: ' + str(ratings_df.shape[1]))
ratings_df.head()

Dataset - Ratings
-------------------------
Number of Rows: 100836
Number of Columns: 3


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [56]:
print('Number of Unique Movies: ', len(movies_df))
print('Number of Unique Users: ', ratings_df['userId'].nunique())

Number of Unique Movies:  9742
Number of Unique Users:  610


### SVD GridSearch

The GridSearchCV class computes accuracy metrics for an algorithm on various combinations of parameters, over a cross-validation procedure. This is useful for finding the best set of parameters for a prediction algorithm. It is analogous to GridSearchCV from scikit-learn.

In [57]:
# Transform dataframe for compatibility with surprise
from surprise import Reader, Dataset
reader = Reader()
data = Dataset.load_from_df(ratings_df,reader)

In [58]:
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


In [59]:
# # Perform a gridsearch with SVD
# # Obtain optimal values of model hyperparameters

# params = {'n_factors': [20, 50, 100],
#          'reg_all': [0.02, 0.05, 0.1]}
# g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
# g_s_svd.fit(data)

In [60]:
# print(g_s_svd.best_score)
# print(g_s_svd.best_params)

# # Output
#     # {'rmse': 0.8688267437363242, 'mae': 0.6675428126693821}
#     # {'rmse': {'n_factors': 20, 'reg_all': 0.02}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}

The model had an RMSE of about 0.86, meaning it was off by roughly 1 point for each guess it made for ratings.

### KNNBasic Cross Validation

In [61]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [62]:
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

('test_rmse', array([0.95991688, 0.96941681, 0.97554537, 0.98977275, 0.97011492]))
('test_mae', array([0.74304783, 0.7483459 , 0.75047962, 0.76427803, 0.75037607]))
('fit_time', (0.46140503883361816, 0.6600701808929443, 0.5352401733398438, 1.3418431282043457, 0.4863731861114502))
('test_time', (1.7324450016021729, 1.6640348434448242, 2.5715088844299316, 1.4913089275360107, 1.4116618633270264))
-----------------------
0.9729533450017295


### KNNBaseline Cross Validation

In [63]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':True})
cv_knn_baseline = cross_validate(knn_baseline,data)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [64]:
for i in cv_knn_baseline.items():
    print(i)

np.mean(cv_knn_baseline['test_rmse'])

('test_rmse', array([0.86805491, 0.87330398, 0.88249303, 0.88941961, 0.87082244]))
('test_mae', array([0.66328372, 0.66694539, 0.67464347, 0.67830089, 0.66502456]))
('fit_time', (0.853564977645874, 0.8250541687011719, 0.8064038753509521, 0.8327629566192627, 0.8053739070892334))
('test_time', (2.2995381355285645, 2.819133996963501, 2.2748639583587646, 2.1510238647460938, 2.8423280715942383))


0.8768187957190496

### Fit SVD Model

In [65]:
# Using the best parameters from the Gridsearch result
svd = SVD(n_factors= 20, reg_all=0.02)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12c7089e8>

In [66]:
svd.predict(2, 4)

Prediction(uid=2, iid=4, r_ui=None, est=2.9664547272365613, details={'was_impossible': False})

### Create Personal User Ratings

In [67]:
def movie_rocommendation_system(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movies_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movies_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return pd.DataFrame(rating_list)

In [68]:
user_rating = movie_rocommendation_system(movies_df,5, genre='Adventure')
user_rating

      movieId                               title                    genres
2823     3771  The Golden Voyage of Sinbad (1973)  Action|Adventure|Fantasy
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      movieId            title            genres
8478   112911  Hercules (2014)  Action|Adventure
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
      movieId               title                        genres
7842    93363  John Carter (2012)  Action|Adventure|Sci-Fi|IMAX
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
      movieId                           title  \
5161     8361  Day After Tomorrow, The (2004)   

                                      genres  
5161  Action|Adventure|Drama|Sci-Fi|Thriller  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
     movieId                                   title                    genres
863     1136  Monty Python and the

Unnamed: 0,userId,movieId,rating
0,1000,3771,5
1,1000,112911,4
2,1000,93363,3
3,1000,8361,4
4,1000,1136,5


In [69]:
## add the new ratings to the original ratings DataFrame
new_ratings_df = ratings_df.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

### Train New Model with Added User Ratings

In [71]:
# train a model using the new combined DataFrame
svd_ = SVD(n_factors= 20, reg_all=0.02)
svd_.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1305b7320>

In [72]:
# make predictions for the user
# you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in ratings_df['movieId'].unique():
    list_of_movies.append( (m_id,svd_.predict(1000,m_id)[3]))

In [73]:
# order the predictions from highest to lowest rated
ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)

In [74]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break
            
recommended_movies(ranked_movies,df_movies,5)

Recommendation #  1 :  602    Dr. Strangelove or: How I Learned to Stop Worr...
Name: title, dtype: object 

Recommendation #  2 :  906    Lawrence of Arabia (1962)
Name: title, dtype: object 

Recommendation #  3 :  257    Pulp Fiction (1994)
Name: title, dtype: object 

Recommendation #  4 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 

Recommendation #  5 :  841    Streetcar Named Desire, A (1951)
Name: title, dtype: object 

