# Surprise is a library for collaborative filtering

## Input data for each algorithm (columns in this order): `user_id`, `item_id`, `rating`
#### https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b
#### https://nbviewer.jupyter.org/github/NicolasHug/Surprise/blob/master/examples/notebooks/KNNBasic_analysis.ipynb
#### https://blog.cambridgespark.com/tutorial-practical-introduction-to-recommender-systems-dbe22848392b

In [33]:
import pandas as pd

from surprise import NormalPredictor
from surprise import SVD
from surprise import SlopeOne
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, BaselineOnly, CoClustering, SVDpp
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split as surprise_train_test_split

# Shared keyword arguments: a fixed seed so that seeded steps are reproducible
kwargs = {'random_state': 42}

In [2]:
# Load the raw, comma-separated ratings table into a DataFrame
ratings = pd.read_csv(
    '../data/raw/ratings.csv',
    sep=',',
)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
ratings.describe()

Unnamed: 0,user_id,movieID,rating
count,812818.0,812818.0,812818.0
mean,689.692353,8706.134093,3.437447
std,498.190021,14439.659629,1.002791
min,0.0,1.0,0.5
25%,273.0,1367.0,3.0
50%,593.0,3249.0,3.5
75%,1040.0,6534.0,4.0
max,2112.0,65133.0,5.0


In [4]:
# The Reader tells Surprise the rating scale as (lowest, highest) possible rating
reader = Reader(rating_scale=(0.5, 5))

# Surprise expects exactly three columns, ordered: user id, item id, rating
data = Dataset.load_from_df(
    df=ratings[['user_id', 'movieID', 'rating']],
    reader=reader,
)

Cross-validate different collaborative-filtering algorithms

In [5]:
# Basic prediction algorithms

In [6]:
# NormalPredictor: predicts a random rating drawn from the training-set rating
# distribution, which is assumed to be normal. Evaluated with 5-fold cross-validation.
cross_validate(algo=NormalPredictor(), data=data, cv=5)

{'test_rmse': array([1.38293621, 1.37909338, 1.38348484, 1.38148618, 1.38095311]),
 'test_mae': array([1.10207133, 1.09935957, 1.10195359, 1.10153856, 1.09985882]),
 'fit_time': (1.1751799583435059,
  1.5376050472259521,
  1.6252198219299316,
  1.4448628425598145,
  1.628993034362793),
 'test_time': (2.0391452312469482,
  1.6880578994750977,
  1.6698150634765625,
  1.581482172012329,
  1.6205618381500244)}

In [7]:
# SVD (Singular Value Decomposition) with default settings, 5-fold cross-validation
cross_validate(algo=SVD(), data=data, cv=5)

{'test_rmse': array([0.77433474, 0.77338096, 0.7749435 , 0.77210478, 0.77261608]),
 'test_mae': array([0.58586467, 0.58530208, 0.58636314, 0.58458287, 0.58564174]),
 'fit_time': (38.377469062805176,
  37.70374917984009,
  38.74200391769409,
  38.26911783218384,
  36.012696981430054),
 'test_time': (1.667863130569458,
  1.607652187347412,
  2.136852979660034,
  1.7461907863616943,
  1.5753850936889648)}

In [8]:
# KNNBasic: the plain k-nearest-neighbours algorithm, 5-fold cross-validation
cross_validate(algo=KNNBasic(), data=data, cv=5)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.82068863, 0.82049977, 0.81970893, 0.82205131, 0.82250005]),
 'test_mae': array([0.61990558, 0.61952323, 0.6197012 , 0.62223963, 0.62119363]),
 'fit_time': (7.732069969177246,
  8.274495124816895,
  8.068937301635742,
  10.549314975738525,
  8.069509983062744),
 'test_time': (54.254112005233765,
  52.63852906227112,
  54.59881520271301,
  51.43642210960388,
  55.17534112930298)}

In [9]:
# KNNWithZScore: KNN that takes each user's z-score normalization into account,
# 5-fold cross-validation
cross_validate(algo=KNNWithZScore(), data=data, cv=5)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.80539733, 0.80865978, 0.80628723, 0.80802487, 0.80712037]),
 'test_mae': array([0.61110584, 0.6128266 , 0.61228491, 0.61254773, 0.61334773]),
 'fit_time': (9.30230188369751,
  9.643959045410156,
  8.63787293434143,
  9.81911587715149,
  9.302175998687744),
 'test_time': (62.991698026657104,
  56.422497034072876,
  55.804354190826416,
  57.11968803405762,
  54.328469038009644)}

In [None]:
# SlopeOne: straightforward implementation of the SlopeOne algorithm
# (note: this cell uses 3 folds, the other cells use 5)
cross_validate(algo=SlopeOne(), data=data, cv=3)

In [12]:
# KNNBaseline: basic collaborative filtering that takes a baseline rating into
# account, 5-fold cross-validation
cross_validate(algo=KNNBaseline(), data=data, cv=5)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.7907402 , 0.79252282, 0.79127624, 0.79440177, 0.79364287]),
 'test_mae': array([0.59920897, 0.60156972, 0.60022048, 0.60243432, 0.6019286 ]),
 'fit_time': (12.045432090759277,
  11.182394027709961,
  11.519973039627075,
  11.812057971954346,
  11.283203840255737),
 'test_time': (58.60267186164856,
  57.21177077293396,
  60.355955839157104,
  57.6170608997345,
  57.28627610206604)}

In [16]:
# BaselineOnly: predicts the baseline estimate for the given user and item,
# 5-fold cross-validation
cross_validate(algo=BaselineOnly(), data=data, cv=5)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.81001586, 0.80904531, 0.81305484, 0.81356832, 0.80727184]),
 'test_mae': array([0.61666153, 0.61523479, 0.61869101, 0.61837147, 0.61487407]),
 'fit_time': (3.4693751335144043,
  3.6895430088043213,
  3.604027271270752,
  3.622332811355591,
  3.591384172439575),
 'test_time': (1.95843505859375,
  1.3004589080810547,
  1.163538932800293,
  1.1517188549041748,
  1.162735939025879)}

In [17]:
# CoClustering: collaborative filtering based on co-clustering, 5-fold cross-validation
cross_validate(algo=CoClustering(), data=data, cv=5)

{'test_rmse': array([0.83921567, 0.84548405, 0.84056895, 0.84266923, 0.84621581]),
 'test_mae': array([0.65359291, 0.65859025, 0.65428987, 0.65673432, 0.65862049]),
 'fit_time': (16.482234001159668,
  16.66210412979126,
  16.770533800125122,
  16.69052505493164,
  16.774842262268066),
 'test_time': (1.4695608615875244,
  1.2712979316711426,
  1.24924898147583,
  1.2563130855560303,
  1.4417059421539307)}

In [None]:
# SVDpp: extension of SVD that also takes implicit ratings into account,
# 5-fold cross-validation
cross_validate(algo=SVDpp(), data=data, cv=5)

In [19]:
# Hold out 20% of the ratings as a test set; kwargs carries the fixed random_state
trainset, testset = surprise_train_test_split(
    data=data,
    test_size=0.2,
    shuffle=True,
    **kwargs,
)

In [20]:
# SVD with no hyperparameters overridden; kwargs only supplies the fixed random_state
svd = SVD(**kwargs)

In [21]:
# Train on the training split; the cell displays the algorithm object that fit() returns
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10e5ca190>

In [22]:
# User factors learned by the fitted SVD model (2-D array)
svd.pu

array([[-0.07550305,  0.19653254,  0.00471276, ..., -0.28346341,
         0.23844732, -0.12534409],
       [-0.07150064, -0.19183905, -0.06341324, ..., -0.10770481,
        -0.00287729, -0.06020949],
       [-0.05673481,  0.11206512,  0.0986272 , ...,  0.17376617,
         0.15469672,  0.23185501],
       ...,
       [-0.23490498, -0.07949418, -0.01095694, ...,  0.0006677 ,
         0.03078363,  0.0474292 ],
       [ 0.04696989,  0.13926831,  0.01537944, ..., -0.04143001,
         0.02971297,  0.05937526],
       [ 0.12462804, -0.05820714, -0.07581964, ..., -0.00284997,
        -0.06214073, -0.03067043]])

In [23]:
# Item factors learned by the fitted SVD model (2-D array)
svd.qi

array([[-0.03509496, -0.12382632,  0.03998815, ..., -0.11375698,
        -0.00573124,  0.01972498],
       [-0.1034999 ,  0.13044891,  0.24004721, ...,  0.26433661,
         0.19439505, -0.10171066],
       [-0.07827134,  0.15405811, -0.13564332, ..., -0.02430594,
        -0.11488402,  0.0640317 ],
       ...,
       [ 0.04413867,  0.09839711, -0.08025985, ..., -0.1002068 ,
         0.01406884, -0.18818064],
       [-0.09074666, -0.01419026,  0.03995792, ...,  0.13409375,
        -0.07822657, -0.23921678],
       [-0.01613894,  0.09141754, -0.00869151, ..., -0.17679153,
        -0.01110383,  0.16239869]])

In [24]:
# User biases: inspect the shape of the 1-D bias vector
# (presumably one entry per user in the trainset)
svd.bu.shape

(2113,)

In [25]:
# Item biases: inspect the shape of the 1-D bias vector
# (presumably one entry per item in the trainset)
svd.bi.shape

(9901,)

In [26]:
# Predict a rating for every (user, item) pair of the held-out test set
preds = svd.test(testset)

In [29]:
# Print the first 50 predictions for an overview.
# Iterating over a slice (instead of indexing 0..49) avoids an IndexError
# when the test set yields fewer than 50 predictions.
for pred in preds[:50]:
    print(pred)

user: 364        item: 2542       r_ui = 3.00   est = 4.00   {'was_impossible': False}
user: 204        item: 49822      r_ui = 3.50   est = 3.28   {'was_impossible': False}
user: 692        item: 1407       r_ui = 2.50   est = 2.21   {'was_impossible': False}
user: 387        item: 34         r_ui = 4.00   est = 3.59   {'was_impossible': False}
user: 1254       item: 550        r_ui = 3.50   est = 2.99   {'was_impossible': False}
user: 227        item: 7154       r_ui = 3.50   est = 2.74   {'was_impossible': False}
user: 515        item: 527        r_ui = 4.00   est = 4.30   {'was_impossible': False}
user: 433        item: 923        r_ui = 4.00   est = 3.78   {'was_impossible': False}
user: 656        item: 6264       r_ui = 2.00   est = 1.85   {'was_impossible': False}
user: 1465       item: 367        r_ui = 4.50   est = 3.30   {'was_impossible': False}
user: 975        item: 6662       r_ui = 3.50   est = 3.75   {'was_impossible': False}
user: 1057       item: 33493      r_ui = 4.

## Finding optimal hyperparameters

In [40]:
# Grid search over SVD hyperparameters: every combination of learning rate,
# regularization term and number of epochs is evaluated with 5-fold cross-validation.
param_grid = {
    'lr_all': [0.001, 0.01],
    'reg_all': [0.1, 0.5],
    'n_epochs': [5, 20],
}
print("Starting GridSearch")
gs = GridSearchCV(algo_class=SVD, param_grid=param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

# Report the best RMSE score, then the parameter combination that achieved it
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')

Starting GridSearch
0.7996727216489399
{'lr_all': 0.01, 'reg_all': 0.1, 'n_epochs': 10}


In [None]:
# Recreate the 80/20 train/test split, again seeded through kwargs
trainset, testset = surprise_train_test_split(
    data=data,
    test_size=0.2,
    shuffle=True,
    **kwargs,
)

In [None]:
# Re-instantiate SVD with the best-RMSE parameter combination found by the grid
# search (gs.best_params['rmse'] is a dict of constructor keyword arguments).
# kwargs supplies the fixed random_state; merging the two dicts first avoids a
# duplicate-keyword error should both ever define the same key.
svd = SVD(**{**kwargs, **gs.best_params['rmse']})

In [None]:
# Train the re-instantiated SVD model on the training split
svd.fit(trainset)

In [None]:
# Predict ratings for the held-out test set with the newly fitted model
preds = svd.test(testset)