# Surprise is a library for collaborative filtering

In [None]:
# Input data for each algorithm must have its columns in this order: user_id, item_id, rating

In [43]:
import pandas as pd

from surprise import NormalPredictor
from surprise import SVD
from surprise import SlopeOne
from surprise import Dataset
from surprise import Reader
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.accuracy import rmse
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split as surprise_train_test_split

# Shared keyword arguments: a fixed seed so that stochastic steps are repeatable.
kwargs = {'random_state': 42}

In [2]:
# Load the raw ratings table (comma-separated, which is read_csv's default delimiter).
ratings = pd.read_csv(filepath_or_buffer='../data/raw/ratings.csv')

In [3]:
# Summary statistics per numeric column: a quick sanity check of the id ranges
# and of the minimum/maximum rating used for the Reader's rating scale.
ratings.describe()

Unnamed: 0,user_id,movieID,rating
count,812818.0,812818.0,812818.0
mean,689.692353,8706.134093,3.437447
std,498.190021,14439.659629,1.002791
min,0.0,1.0,0.5
25%,273.0,1367.0,3.0
50%,593.0,3249.0,3.5
75%,1040.0,6534.0,4.0
max,2112.0,65133.0,5.0


In [5]:
# Tell Surprise the bounds of the rating scale: (lowest, highest) possible rating.
reader = Reader(rating_scale=(0.5, 5))

# load_from_df expects exactly three columns, ordered: user id, item id, rating.
data = Dataset.load_from_df(
    df=ratings.loc[:, ['user_id', 'movieID', 'rating']],
    reader=reader,
)

Cross-validate different collaborative-filtering algorithms

In [None]:
# Basic prediction algorithms

In [13]:
# NormalPredictor: baseline that predicts a random rating drawn from a normal
# distribution fitted to the ratings of the training set.
# The folds are built with a seeded KFold so the train/test partition is the
# same on every run. NOTE: the predictor's own random draws are not seeded
# here, so its scores can still vary slightly between runs.
cross_validate(NormalPredictor(), data, cv=KFold(n_splits=5, shuffle=True, **kwargs))

{'test_rmse': array([1.37887753, 1.3822837 , 1.38455046, 1.37976877, 1.37834124]),
 'test_mae': array([1.09888479, 1.10241847, 1.10189494, 1.09934343, 1.09819222]),
 'fit_time': (0.9818909168243408,
  1.1790714263916016,
  1.1470410823822021,
  1.1470415592193604,
  1.1520462036132812),
 'test_time': (1.2611455917358398,
  1.4423105716705322,
  1.3882603645324707,
  1.5403988361358643,
  1.4052760601043701)}

In [14]:
# Singular Value Decomposition (matrix factorization).
# Both the fold assignment (KFold) and the model's random initialisation (SVD)
# take the seed from `kwargs`, so repeated runs of this cell give the same scores.
cross_validate(SVD(**kwargs), data, cv=KFold(n_splits=5, shuffle=True, **kwargs))

{'test_rmse': array([0.77295206, 0.77452598, 0.77456965, 0.77336247, 0.77383235]),
 'test_mae': array([0.58639007, 0.58638703, 0.58594585, 0.58580674, 0.58629899]),
 'fit_time': (35.7544732093811,
  37.15017008781433,
  35.74535894393921,
  36.005547285079956,
  35.5472846031189),
 'test_time': (1.4603257179260254,
  1.4353036880493164,
  1.421290636062622,
  1.5964505672454834,
  1.429297924041748)}

In [15]:
# Simple K-Nearest-Neighbor (KNN) algorithm.
# A seeded KFold makes the 5-fold partition, and therefore the reported
# scores, repeatable across runs.
cross_validate(KNNBasic(), data, cv=KFold(n_splits=5, shuffle=True, **kwargs))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.82025873, 0.82042603, 0.82259845, 0.81998164, 0.82060395]),
 'test_mae': array([0.61968453, 0.61950368, 0.62178122, 0.61977187, 0.62085926]),
 'fit_time': (13.00881552696228,
  13.048851490020752,
  13.144938945770264,
  13.609359979629517,
  13.314763307571411),
 'test_time': (48.62015676498413,
  48.607144832611084,
  50.255642890930176,
  49.795599699020386,
  49.35782718658447)}

In [45]:
# KNN, taking into account the z-score normalization of each user.
# A seeded KFold makes the 5-fold partition, and therefore the reported
# scores, repeatable across runs.
cross_validate(KNNWithZScore(), data, cv=KFold(n_splits=5, shuffle=True, **kwargs))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.80766735, 0.80804213, 0.8070759 , 0.80715646, 0.8064727 ]),
 'test_mae': array([0.6128293 , 0.61273382, 0.61258087, 0.61263462, 0.61229324]),
 'fit_time': (13.479241609573364,
  13.577330827713013,
  13.713453769683838,
  13.650396585464478,
  13.456221103668213),
 'test_time': (50.90723514556885,
  51.03935503959656,
  51.20250391960144,
  51.366652488708496,
  51.35363984107971)}

In [44]:
# SlopeOne: item-based collaborative filtering that predicts from the average
# rating difference between pairs of items.
# A seeded KFold makes the 5-fold partition, and therefore the reported
# scores, repeatable across runs.
cross_validate(SlopeOne(), data, cv=KFold(n_splits=5, shuffle=True, **kwargs))

{'test_rmse': array([0.81015878, 0.80904046, 0.8093389 , 0.81221125, 0.81261829]),
 'test_mae': array([0.61616261, 0.6151213 , 0.61537192, 0.61728232, 0.61760081]),
 'fit_time': (35.6103413105011,
  35.55429148674011,
  35.771488428115845,
  36.03873085975647,
  36.22289705276489),
 'test_time': (97.64968609809875,
  96.1202974319458,
  97.58863115310669,
  97.49154257774353,
  96.86697626113892)}

In [21]:
# Hold out 20% of the ratings for evaluation; the seed in `kwargs` keeps the
# shuffled split repeatable.
trainset, testset = surprise_train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    **kwargs
)

In [22]:
# Instantiate SVD with the shared seed from `kwargs`; no other hyperparameters
# are passed, so the library defaults apply.
svd = SVD(**kwargs)

In [23]:
# Train the model on the training split. The call's return value (the SVD
# object) is echoed as the cell output.
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16ceb266730>

In [37]:
# Latent user factors learned by the fitted SVD model (attribute `pu`).
svd.pu

array([[-0.07550305,  0.19653254,  0.00471276, ..., -0.28346341,
         0.23844732, -0.12534409],
       [-0.07150064, -0.19183905, -0.06341324, ..., -0.10770481,
        -0.00287729, -0.06020949],
       [-0.05673481,  0.11206512,  0.0986272 , ...,  0.17376617,
         0.15469672,  0.23185501],
       ...,
       [-0.23490498, -0.07949418, -0.01095694, ...,  0.0006677 ,
         0.03078363,  0.0474292 ],
       [ 0.04696989,  0.13926831,  0.01537944, ..., -0.04143001,
         0.02971297,  0.05937526],
       [ 0.12462804, -0.05820714, -0.07581964, ..., -0.00284997,
        -0.06214073, -0.03067043]])

In [38]:
# Latent item factors learned by the fitted SVD model (attribute `qi`).
svd.qi

array([[-0.03509496, -0.12382632,  0.03998815, ..., -0.11375698,
        -0.00573124,  0.01972498],
       [-0.1034999 ,  0.13044891,  0.24004721, ...,  0.26433661,
         0.19439505, -0.10171066],
       [-0.07827134,  0.15405811, -0.13564332, ..., -0.02430594,
        -0.11488402,  0.0640317 ],
       ...,
       [ 0.04413867,  0.09839711, -0.08025985, ..., -0.1002068 ,
         0.01406884, -0.18818064],
       [-0.09074666, -0.01419026,  0.03995792, ...,  0.13409375,
        -0.07822657, -0.23921678],
       [-0.01613894,  0.09141754, -0.00869151, ..., -0.17679153,
        -0.01110383,  0.16239869]])

In [31]:
# Shape of the learned user-bias vector (`bu`); only the shape is displayed,
# not the bias values themselves.
svd.bu.shape

(2113,)

In [32]:
# Shape of the learned item-bias vector (`bi`); only the shape is displayed,
# not the bias values themselves.
svd.bi.shape

(9901,)

In [41]:
# Run the fitted model over the held-out test set. The result is a list of
# Prediction objects (uid, iid, true rating r_ui, estimate est, details).
preds = svd.test(testset)

In [42]:
# Preview only the first few predictions: the full list holds one entry per
# test rating and would flood the notebook output.
preds[:10]

[Prediction(uid=364, iid=2542, r_ui=3.0, est=4.004661458297131, details={'was_impossible': False}),
 Prediction(uid=204, iid=49822, r_ui=3.5, est=3.2847449194039315, details={'was_impossible': False}),
 Prediction(uid=692, iid=1407, r_ui=2.5, est=2.2138481013784554, details={'was_impossible': False}),
 Prediction(uid=387, iid=34, r_ui=4.0, est=3.593047336537698, details={'was_impossible': False}),
 Prediction(uid=1254, iid=550, r_ui=3.5, est=2.985528691657416, details={'was_impossible': False}),
 Prediction(uid=227, iid=7154, r_ui=3.5, est=2.7443258792751486, details={'was_impossible': False}),
 Prediction(uid=515, iid=527, r_ui=4.0, est=4.3034408073208885, details={'was_impossible': False}),
 Prediction(uid=433, iid=923, r_ui=4.0, est=3.77505254190117, details={'was_impossible': False}),
 Prediction(uid=656, iid=6264, r_ui=2.0, est=1.8496008526595022, details={'was_impossible': False}),
 Prediction(uid=1465, iid=367, r_ui=4.5, est=3.302003149922238, details={'was_impossible': False}),