In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

!pip install scikit-surprise

  import pandas.util.testing as tm




In [0]:
from surprise import Reader, Dataset
from surprise import NormalPredictor, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import KNNBaseline, SVD, BaselineOnly, SVDpp, NMF, SlopeOne
from surprise import CoClustering
from surprise import accuracy
from surprise.accuracy import rmse
from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import GridSearchCV

Importing the ratings data and filtering out rarely rated books and inactive users (those with 5 or fewer ratings) in order to reduce the size of the dataset.

In [6]:
# Load the raw ratings table.
df = pd.read_csv('ratings.csv')

# A book / user is kept only when it has strictly more ratings than its threshold.
min_book_rating = 5
min_user_rating = 5

# Number of ratings per book, and the ids of the books that pass the threshold.
book_counts = df['book_id'].value_counts()
filter_books = book_counts[book_counts > min_book_rating].index.tolist()

# Number of ratings per user, and the ids of the users that pass the threshold.
user_counts = df['user_id'].value_counts()
filter_users = user_counts[user_counts > min_user_rating].index.tolist()

# Both id lists are derived from the unfiltered frame and applied together.
keep_book = df['book_id'].isin(filter_books)
keep_user = df['user_id'].isin(filter_users)
df_new = df[keep_book & keep_user]

print(f'the original dataset: {df.shape}')
print(f'the new one: {df_new.shape}')

the original dataset: (981756, 3)
the new one: (916880, 3)


Choosing the algorithm: each candidate is evaluated with 3-fold cross-validation (RMSE) on the first 500,000 rows of the filtered dataset.


In [0]:
# Tell Surprise that ratings span 1 to 5.
reader = Reader(rating_scale=(1, 5))

# Only the first 500000 rows of the filtered frame are used for the benchmark.
# NOTE(review): Surprise documents the expected column order of load_from_df as
# (user ids, item ids, ratings). Here 'book_id' comes first, so books take the
# "user" role inside Surprise — confirm this is intended before using
# algo.predict(uid, iid) or user-based similarity options downstream.
ratings_subset = df_new[['book_id', 'user_id', 'rating']].iloc[:500000]
data = Dataset.load_from_df(ratings_subset, reader)

# Candidate algorithms, all with default hyper-parameters. Later cells pick
# entries by position, so the order of this list must not change.
algorithms = [
    SVD(),
    SVDpp(),
    SlopeOne(),
    NMF(),
    NormalPredictor(),
    KNNBaseline(),
    KNNBasic(),
    KNNWithMeans(),
    KNNWithZScore(),
    BaselineOnly(),
    CoClustering(),
]


SVD.

In [11]:
# 3-fold cross-validation of SVD (algorithms[0]); the result dict holds the
# per-fold test RMSE together with the fit and test times.
res_svd = cross_validate(algorithms[0], data, measures=['RMSE'], cv=3,
                         verbose=False)

# Average every metric over the folds and tag the summary with the class name.
# pd.concat is used because Series.append was removed in pandas 2.0.
rs = pd.concat([
    pd.DataFrame.from_dict(res_svd).mean(axis=0),
    pd.Series([type(algorithms[0]).__name__], index=['algorithm']),
])

print(rs)

test_rmse    0.858182
fit_time      21.0569
test_time     2.07682
algorithm         SVD
dtype: object


SVDpp.

In [14]:
# 3-fold cross-validation of SVDpp (algorithms[1]); the result dict holds the
# per-fold test RMSE together with the fit and test times.
res_svd_pp = cross_validate(algorithms[1], data, measures=['RMSE'], cv=3,
                            verbose=False)

# Average every metric over the folds and tag the summary with the class name.
# pd.concat is used because Series.append was removed in pandas 2.0.
rsp = pd.concat([
    pd.DataFrame.from_dict(res_svd_pp).mean(axis=0),
    pd.Series([type(algorithms[1]).__name__], index=['algorithm']),
])

print(rsp)

test_rmse    0.842579
fit_time      299.102
test_time     16.4027
algorithm       SVDpp
dtype: object


SlopeOne.

In [15]:
# 3-fold cross-validation of SlopeOne (algorithms[2]); the result dict holds the
# per-fold test RMSE together with the fit and test times.
res_slope_one = cross_validate(algorithms[2], data, measures=['RMSE'], cv=3,
                               verbose=False)

# Average every metric over the folds and tag the summary with the class name.
# pd.concat is used because Series.append was removed in pandas 2.0.
rsl = pd.concat([
    pd.DataFrame.from_dict(res_slope_one).mean(axis=0),
    pd.Series([type(algorithms[2]).__name__], index=['algorithm']),
])

print(rsl)

test_rmse    0.867739
fit_time      29.4719
test_time     12.1835
algorithm    SlopeOne
dtype: object


NMF.

In [16]:
# 3-fold cross-validation of NMF (algorithms[3]); the result dict holds the
# per-fold test RMSE together with the fit and test times.
res_nmf = cross_validate(algorithms[3], data, measures=['RMSE'], cv=3,
                         verbose=False)

# Average every metric over the folds and tag the summary with the class name.
# pd.concat is used because Series.append was removed in pandas 2.0.
rn = pd.concat([
    pd.DataFrame.from_dict(res_nmf).mean(axis=0),
    pd.Series([type(algorithms[3]).__name__], index=['algorithm']),
])

print(rn)

test_rmse    0.911373
fit_time      28.1495
test_time     1.78277
algorithm         NMF
dtype: object


NormalPredictor.

In [17]:
# 3-fold cross-validation of NormalPredictor (algorithms[4]); the result dict
# holds the per-fold test RMSE together with the fit and test times.
res_np = cross_validate(algorithms[4], data, measures=['RMSE'], cv=3,
                        verbose=False)

# Average every metric over the folds and tag the summary with the class name.
# pd.concat is used because Series.append was removed in pandas 2.0.
rnp = pd.concat([
    pd.DataFrame.from_dict(res_np).mean(axis=0),
    pd.Series([type(algorithms[4]).__name__], index=['algorithm']),
])

print(rnp)

test_rmse            1.33274
fit_time            0.622067
test_time            1.85176
algorithm    NormalPredictor
dtype: object


KNNBaseline.

In [18]:
# 3-fold cross-validation of KNNBaseline (algorithms[5]); the result dict holds
# the per-fold test RMSE together with the fit and test times.
res_knnb = cross_validate(algorithms[5], data, measures=['RMSE'], cv=3,
                          verbose=False)

# Average every metric over the folds and tag the summary with the class name.
# pd.concat is used because Series.append was removed in pandas 2.0.
rknnb = pd.concat([
    pd.DataFrame.from_dict(res_knnb).mean(axis=0),
    pd.Series([type(algorithms[5]).__name__], index=['algorithm']),
])

print(rknnb)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
test_rmse       0.867567
fit_time         3.71329
test_time        16.3515
algorithm    KNNBaseline
dtype: object


KNNBasic.

In [19]:
# 3-fold cross-validation of KNNBasic (algorithms[6]); the result dict holds the
# per-fold test RMSE together with the fit and test times.
res_kb = cross_validate(algorithms[6], data, measures=['RMSE'], cv=3,
                        verbose=False)

# Average every metric over the folds and tag the summary with the class name.
# pd.concat is used because Series.append was removed in pandas 2.0.
rkb = pd.concat([
    pd.DataFrame.from_dict(res_kb).mean(axis=0),
    pd.Series([type(algorithms[6]).__name__], index=['algorithm']),
])

print(rkb)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
test_rmse    0.900036
fit_time      2.11614
test_time     13.2837
algorithm    KNNBasic
dtype: object


KNNWithMeans.

In [20]:
# 3-fold cross-validation of KNNWithMeans (algorithms[7]); the result dict holds
# the per-fold test RMSE together with the fit and test times.
res_knnm = cross_validate(algorithms[7], data, measures=['RMSE'], cv=3,
                          verbose=False)

# Average every metric over the folds and tag the summary with the class name.
# pd.concat is used because Series.append was removed in pandas 2.0.
rknnm = pd.concat([
    pd.DataFrame.from_dict(res_knnm).mean(axis=0),
    pd.Series([type(algorithms[7]).__name__], index=['algorithm']),
])

print(rknnm)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
test_rmse        0.871225
fit_time          2.24745
test_time         14.0407
algorithm    KNNWithMeans
dtype: object


KNNWithZScore.

In [21]:
# 3-fold cross-validation of KNNWithZScore (algorithms[8]); the result dict
# holds the per-fold test RMSE together with the fit and test times.
res_knns = cross_validate(algorithms[8], data, measures=['RMSE'], cv=3,
                          verbose=False)

# Average every metric over the folds and tag the summary with the class name.
# pd.concat is used because Series.append was removed in pandas 2.0.
rknns = pd.concat([
    pd.DataFrame.from_dict(res_knns).mean(axis=0),
    pd.Series([type(algorithms[8]).__name__], index=['algorithm']),
])

print(rknns)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
test_rmse         0.874366
fit_time           2.52029
test_time          15.3096
algorithm    KNNWithZScore
dtype: object


BaselineOnly.

In [22]:
# 3-fold cross-validation of BaselineOnly (algorithms[9]); the result dict holds
# the per-fold test RMSE together with the fit and test times.
res_bl = cross_validate(algorithms[9], data, measures=['RMSE'], cv=3,
                        verbose=False)

# Average every metric over the folds and tag the summary with the class name.
# pd.concat is used because Series.append was removed in pandas 2.0.
rbl = pd.concat([
    pd.DataFrame.from_dict(res_bl).mean(axis=0),
    pd.Series([type(algorithms[9]).__name__], index=['algorithm']),
])

print(rbl)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
test_rmse        0.854996
fit_time          1.75228
test_time         1.93467
algorithm    BaselineOnly
dtype: object


CoClustering.

In [23]:
# 3-fold cross-validation of CoClustering (algorithms[10]); the result dict
# holds the per-fold test RMSE together with the fit and test times.
res_cocl = cross_validate(algorithms[10], data, measures=['RMSE'], cv=3,
                          verbose=False)

# Average every metric over the folds and tag the summary with the class name.
# pd.concat is used because Series.append was removed in pandas 2.0.
rc = pd.concat([
    pd.DataFrame.from_dict(res_cocl).mean(axis=0),
    pd.Series([type(algorithms[10]).__name__], index=['algorithm']),
])

print(rc)

test_rmse         0.88906
fit_time          10.1093
test_time         1.83549
algorithm    CoClustering
dtype: object
