### Data Description: 
The ratings data set contains 26024289 ratings of 270896 on 45115 movies. 


### Goal
finding a collaborative recommender model with an appropriate accuracy.  


In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans

from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise import SVD
from pandasql import sqldf

from six.moves import cPickle as pickle

In [2]:
#df_rating=pd.read_csv('input/ratings_small.csv')

In [3]:
#df_cb=pd.read_csv('content_based_data.csv')

### 1  Importing data and cleaning data

In [4]:
df_rating=pd.read_csv('input/ratings.csv')
df_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB


In [5]:
num_users = len(df_rating.userId.unique())
num_users

270896

In [6]:
num_movies = len(df_rating.movieId.unique())
num_movies

45115

#### 1.1 considering users with number of ratings in top 5%

In [7]:
df_user_cnt = pd.DataFrame(df_rating.groupby('userId').size(), columns=['count'])
df_user_cnt

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,27
2,22
3,10
4,62
5,26
...,...
270892,75
270893,190
270894,148
270895,20


In [8]:
m=df_user_cnt['count'].quantile(.95)
m

398.0

In [9]:
index2 = list(set(df_user_cnt.query('count >= @m').index))
df_rating_new= df_rating[df_rating.userId.isin(index2)]
df_rating_new

Unnamed: 0,userId,movieId,rating,timestamp
1710,24,1,4.0,979869938
1711,24,2,3.0,979974023
1712,24,6,4.0,979870499
1713,24,16,3.0,979870379
1714,24,17,3.0,979974163
...,...,...,...,...
26023517,270887,171439,1.0,1500925554
26023518,270887,171755,3.0,1493850948
26023519,270887,173149,5.0,1497402632
26023520,270887,173405,5.0,1496285130


In [10]:
df_rating_new=df_rating_new[['userId','movieId','rating']].reset_index()

#### 1.2 Comparing updated ratings data and data used in content based recommender file.

Here we make sure we use the same set of movies as metadata data used for content based recommender, to develope a collaborative filtering.

In [11]:
df_cb=pd.read_csv('content_based_data.csv')

In [12]:
df_cb['id']=df_cb['id'].astype('int')

In [13]:
movie_index=list(df_cb['id'].value_counts().index)

In [14]:
df= df_rating_new[df_rating_new.movieId.isin(movie_index)]

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1100760 entries, 6 to 10363829
Data columns (total 4 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   index    1100760 non-null  int64  
 1   userId   1100760 non-null  int64  
 2   movieId  1100760 non-null  int64  
 3   rating   1100760 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 42.0 MB


In [16]:
print(df.rating.value_counts())


4.0    278184
3.0    245991
3.5    138549
5.0    114814
2.0    100510
4.5     71493
2.5     67795
1.0     44909
1.5     21517
0.5     16998
Name: rating, dtype: int64


In [17]:
df=df.drop(['index'],axis=1)

In [18]:
df.to_csv('colaborative_data.csv')

### 2 Model base

#### 2.1 Learning Latent factor models: Matrix Factorization-based algorithm (SVD)


In [19]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
print(data)

<surprise.dataset.DatasetAutoFolds object at 0x7feee42e31c0>


In [20]:
#Tunning parameters for svd model

param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

0.8700387724707199
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [21]:
algo_svd = gs.best_estimator['rmse']
algo_svd.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7feee43dd130>

In [22]:
algo_svd.predict(uid=4, iid=10, r_ui=4, verbose=True)

user: 4          item: 10         r_ui = 4.00   est = 3.35   {'was_impossible': False}


Prediction(uid=4, iid=10, r_ui=4, est=3.350058141647589, details={'was_impossible': False})

In [23]:
pickle.dump(algo_svd, open('recommender-svd', 'wb'))

### 3 Memory base

#### 3.1 Learning neighborhood model: k-NN inspired algorithm

In [28]:
param_grid = {'bsl_options': {'method': ['als', 'sgd'],
                              'reg': [1, 2]},
              'k': [2, 3],
              'sim_options': {'name': ['msd', 'cosine'],
                              'min_support': [1, 5],
                              'user_based': [False]}
              }
gs = GridSearchCV(KNNWithMeans,param_grid , measures=["rmse", "mae"], cv=3)

gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matr

In [29]:
algo_knn = gs.best_estimator['rmse']
algo_knn.fit(data.build_full_trainset())

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7feee42c59d0>

In [30]:
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

0.871713725579134
{'bsl_options': {'method': 'als', 'reg': 1}, 'k': 3, 'sim_options': {'name': 'msd', 'min_support': 5, 'user_based': False}}


In [31]:
algo_knn.predict(uid=4, iid=10, r_ui=4, verbose=True)

user: 4          item: 10         r_ui = 4.00   est = 3.35   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


Prediction(uid=4, iid=10, r_ui=4, est=3.350058141647589, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [32]:
pickle.dump(algo_knn, open('recommender-knn', 'wb'))

### 4 Conclusion

The score of knn model is slightly better than svd model with accuracy .87. But it takes much longer to learn the knn model. Therefore overall the svd model is better. 

https://www.inf.unibz.it/~ricci/ISR/papers/ieeecomputer.pdf)
https://surprise.readthedocs.io/en/stable/matrix_factorization.html
https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans