In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import Reader, Dataset
from surprise import accuracy

# importing relevant libraries
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline
from surprise.prediction_algorithms import KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

### Import Movies and Ratings as DataFrame

In [76]:
movies_df = pd.read_csv('../ml-latest-small/movies.csv')
print('Dataset - Movies')
print('-------------------------')
print('Number of Rows: ' + str(movies_df.shape[0]))
print('Number of Columns: ' + str(movies_df.shape[1]))
movies_df.head()

Dataset - Movies
-------------------------
Number of Rows: 9742
Number of Columns: 3


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [77]:
ratings_df = pd.read_csv('../ml-latest-small/ratings.csv',index_col=0).reset_index().drop(['timestamp'],axis=1)
print('Dataset - Ratings')
print('-------------------------')
print('Number of Rows: ' + str(ratings_df.shape[0]))
print('Number of Columns: ' + str(ratings_df.shape[1]))
ratings_df.head()

Dataset - Ratings
-------------------------
Number of Rows: 100836
Number of Columns: 3


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [78]:
print('Number of Unique Movies: ', len(movies_df))
print('Number of Unique Users: ', ratings_df['userId'].nunique())

Number of Unique Movies:  9742
Number of Unique Users:  610


### Load Dataset
Load a dataset from a pandas dataframe using load_from_df method with a reader object.

In [79]:
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(ratings_df,reader)

In [80]:
dataset = data.build_full_trainset()

### Determine Best Model
Here we will try out different algorithms to see which of them can provide the best RMSE. We use “RMSE” as our accuracy metric for the predictions.

#### Basic Algorithms
* **NormalPredictor**: algorithm predicts a random rating based on the distribution of the training set, which is assumed to be normal. This is one of the most basic algorithms that do not do much work.
* **BaselineOnly**: algorithm predicts the baseline estimate for given user and item.

#### K-NN Algorithms
* **KNNBasic**: KNNBasic is a basic collaborative filtering algorithm.
* **KNNWithMeans**: basic collaborative filtering algorithm, taking into account the mean ratings of each user.
* **KNNWithZScore**: basic collaborative filtering algorithm, taking into account the z-score normalization of each user.
* **KNNBaseline**: basic collaborative filtering algorithm taking into account a baseline rating.

#### Matrix Factorization-Based Algorithms
* **SVD**: algorithm is equivalent to Probabilistic Matrix Factorization
* **SVDpp**: algorithm is an extension of SVD that takes into account implicit ratings.
* **NMF**: collaborative filtering algorithm based on Non-negative Matrix Factorization. It is very similar with SVD.
* **Slope One**: straightforward implementation of the SlopeOne algorithm.
* **Co-clustering**: collaborative filtering algorithm based on co-clustering.



In [81]:
# benchmark = []
# # Iterate over all algorithms
# for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
#     # Perform cross validation
#     results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
#     # Get results & append algorithm name
#     tmp = pd.DataFrame.from_dict(results).mean(axis=0)
#     tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
#     benchmark.append(tmp)
    
# pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 

SVDpp has the best predicted RMSE, but test time is very large. Instead we will be using the next best RMSE which is BaselineOnly algorithm.

### Predict New User

Introduce a new user with ratings to a rating matrix. This user will be appended to the ratings_df dataframe.

In [82]:
# function that creates a new user with manually input ratings
def movie_rocommendation_system(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movies_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movies_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return pd.DataFrame(rating_list)

In [83]:
# create new user
user_rating = movie_rocommendation_system(movies_df,5, genre='Comedy')
user_rating

      movieId                  title          genres
8863   133712  Office Romance (1977)  Comedy|Romance
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      movieId               title  genres
2824     3773  House Party (1990)  Comedy
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
      movieId                                     title               genres
6810    60904  Heart of a Dog (Sobachye serdtse) (1988)  Comedy|Drama|Sci-Fi
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      movieId              title          genres
1787     2385  Home Fries (1998)  Comedy|Romance
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
      movieId           title               genres
8530   114818  Stretch (2014)  Action|Comedy|Crime
How do you rate this movie on a scale of 1-5, press n if you have not seen :
1


Unnamed: 0,userId,movieId,rating
0,1000,133712,5
1,1000,3773,4
2,1000,60904,5
3,1000,2385,2
4,1000,114818,1


In [84]:
## add the new ratings to the original ratings dataframe
new_ratings_df = ratings_df.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

# Model-Based Collaborative Filtering (SVD)

SVD is used as a collaborative filtering technique for matrix decomposition. It uses a matrix structure where each row represents a user, and each column represents an item. The elements of this matrix are the ratings that are given to items by users.

### GridSearchCV

It helps to loop through predefined hyperparameters and fit your estimator (model) on your training set. So, in the end, you can select the best parameters from the listed hyperparameters.

In [85]:
# # Perform a gridsearch with SVD
# # Obtain optimal values of model hyperparameters

# params = {'n_factors': [20, 50, 100],
#          'reg_all': [0.02, 0.05, 0.1]}
# g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
# g_s_svd.fit(data)

In [86]:
# # outputs best parameters for SVD
# print(g_s_svd.best_params)
# # output = {'rmse': {'n_factors': 20, 'reg_all': 0.02}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}

### Fit Data into SVD Model

In [87]:
svd = SVD(n_factors= 20, reg_all= 0.02)
svd.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12194cf98>

### Make Predictions for the New User

In [88]:
# make predictions for the user
list_of_movies = []
for m_id in ratings_df['movieId'].unique():
    list_of_movies.append( (m_id,svd.predict(1000,m_id)[3]))
    
# movie_id, predicted_rating
list_of_movies[0:5]

[(1, 4.0564262731296825),
 (3, 3.2287457201008416),
 (6, 3.987614948434257),
 (47, 3.960791581742847),
 (50, 4.2704246306223865)]

In [89]:
# order the predictions from highest to lowest rated
ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)

# movie_id, sorted predicted_rating
ranked_movies[0:5]

[(750, 4.423177797318697),
 (1104, 4.415501406264085),
 (912, 4.39786379455249),
 (1204, 4.3899475733754105),
 (1213, 4.358891267388496)]

In [90]:
# return the top n recommendations
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break

### Recommendation Results (SVD)

In [91]:
recommended_movies(ranked_movies,movies_df,5)

Recommendation #  1 :  602    Dr. Strangelove or: How I Learned to Stop Worr...
Name: title, dtype: object 

Recommendation #  2 :  841    Streetcar Named Desire, A (1951)
Name: title, dtype: object 

Recommendation #  3 :  694    Casablanca (1942)
Name: title, dtype: object 

Recommendation #  4 :  906    Lawrence of Arabia (1962)
Name: title, dtype: object 

Recommendation #  5 :  914    Goodfellas (1990)
Name: title, dtype: object 



# Model-Based Collaborative Filtering (BaselineOnly)

### Fit Data into BaselineOnly Model

In [92]:
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
BaselineOnly = BaselineOnly(bsl_options=bsl_options)
BaselineOnly.fit(new_data.build_full_trainset())

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1262466d8>

In [93]:
# BaselineOnly_options = {'k': 10, 'min_k': 3, 'sim_options':{'method': 'als','name':'pearson','user_based':True}}
# BaselineOnly = BaselineOnly(BaselineOnly_options=BaselineOnly_options)
# preditions = BaselineOnly.fit(data.build_full_trainset())

### Make Predictions for the New User

In [94]:
# make predictions for the user
list_of_movies = []
for m_id in ratings_df['movieId'].unique():
    list_of_movies.append( (m_id,BaselineOnly.predict(1000,m_id)[3]))
    
# movie_id, predicted_rating
list_of_movies[0:5]

[(1, 3.939098126840147),
 (3, 3.311289804945486),
 (6, 3.972964226907635),
 (47, 4.03584380449924),
 (50, 4.2911930465962325)]

In [95]:
# order the predictions from highest to lowest rated
ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)

# movie_id, sorted predicted_rating
ranked_movies[0:5]

[(318, 4.425313199124609),
 (1204, 4.364694645455563),
 (750, 4.363658638040587),
 (177593, 4.327239841755363),
 (1104, 4.326295063062852)]

In [96]:
# return the top n recommendations
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break

### Recommendation Results (BaselineOnly)

In [97]:
recommended_movies(ranked_movies,movies_df,5)

Recommendation #  1 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 

Recommendation #  2 :  906    Lawrence of Arabia (1962)
Name: title, dtype: object 

Recommendation #  3 :  602    Dr. Strangelove or: How I Learned to Stop Worr...
Name: title, dtype: object 

Recommendation #  4 :  9618    Three Billboards Outside Ebbing, Missouri (2017)
Name: title, dtype: object 

Recommendation #  5 :  841    Streetcar Named Desire, A (1951)
Name: title, dtype: object 

