# **The comparison of the algorithms used for implementing Collaborative Filtering with Surprise**

## **Installing Surprise**

In [2]:
!pip install scikit-surprise



## **Importing necessary libraries**

In [0]:
import pandas as pd
from surprise import Reader, Dataset
from surprise import CoClustering
from surprise import KNNBaseline
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import NMF
from surprise import SlopeOne
from surprise import SVD
from surprise import SVDpp
from surprise.model_selection import cross_validate

## **Importing data**

In [0]:
df = pd.read_csv('ratings.csv')

## **Diminishing the dimensionality**
Filtering out rarely rated books and inactive users and reducing the dimensionality.

In [5]:
min_book_rating = 5
filter_books = df['book_id'].value_counts() > min_book_rating
filter_books = filter_books[filter_books].index.tolist()

min_user_rating = 5
filter_users = df['user_id'].value_counts() > min_user_rating
filter_users = filter_users[filter_users].index.tolist()

df_new = df[(df['book_id'].isin(filter_books)) & 
            (df['user_id'].isin(filter_users))][0:560980]
print(f'the original dataset: {df.shape}')
print(f'the new one: {df_new.shape}')

the original dataset: (981756, 3)
the new one: (560980, 3)


## **Comparison of algorithms**

### **Performing cross-validation**

In [0]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_new[['book_id', 'user_id', 'rating']], reader)

algorithms = [KNNBasic(), KNNWithMeans(), KNNWithZScore(), KNNBaseline(), 
              SVD(), SVDpp(), NMF(), SlopeOne(), CoClustering()]


In [0]:
issue = []

#### *k-NN inspired algorithms*

**KNNBasic**

In [25]:
res_kb = cross_validate(algorithms[0], data, measures=['rmse'], cv=3, 
                         verbose=False)
rkb = pd.DataFrame.from_dict(res_kb).mean(axis=0)
rkb = rkb.append(pd.Series([str(algorithms[0]).split(' ')[0].split('.')[-1]], 
                         index=['algorithm']))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [26]:
rkb

test_rmse      0.8977
fit_time      2.42987
test_time      14.686
algorithm    KNNBasic
dtype: object

In [0]:
issue.append(rkb)

**KNNWithMeans**

In [23]:
res_knnm = cross_validate(algorithms[1], data, measures=['rmse'], cv=3, 
                         verbose=False)
rknnm = pd.DataFrame.from_dict(res_knnm).mean(axis=0)
rknnm = rknnm.append(pd.Series([str(algorithms[1]).split(' ')[0].split('.')[-1]], 
                         index=['algorithm']))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [24]:
rknnm

test_rmse        0.868269
fit_time          2.51598
test_time         16.7736
algorithm    KNNWithMeans
dtype: object

In [0]:
issue.append(rknnm)

**KNNWithZScore**

In [21]:
res_knns = cross_validate(algorithms[2], data, measures=['rmse'], cv=3, 
                         verbose=False)
rknns = pd.DataFrame.from_dict(res_knns).mean(axis=0)
rknns = rknns.append(pd.Series([str(algorithms[2]).split(' ')[0].split('.')[-1]], 
                         index=['algorithm']))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [22]:
rknns

test_rmse         0.871762
fit_time           3.00555
test_time          16.8211
algorithm    KNNWithZScore
dtype: object

In [0]:
issue.append(rknns)

**KNNBaseline**

In [19]:
res_knnb = cross_validate(algorithms[3], data, measures=['rmse'], cv=3, 
                         verbose=False)
rknnb = pd.DataFrame.from_dict(res_knnb).mean(axis=0)
rknnb = rknnb.append(pd.Series([str(algorithms[3]).split(' ')[0].split('.')[-1]], 
                         index=['algorithm']))

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


In [20]:
rknnb

test_rmse       0.863546
fit_time         5.12099
test_time        18.7219
algorithm    KNNBaseline
dtype: object

In [0]:
issue.append(rknnb)

#### *Matrix Factorization-based algorithms*

**SVD**

In [0]:
res_svd = cross_validate(algorithms[4], data, measures=['rmse'], cv=3, 
                         verbose=False)
rs = pd.DataFrame.from_dict(res_svd).mean(axis=0)
rs = rs.append(pd.Series([str(algorithms[4]).split(' ')[0].split('.')[-1]], 
                         index=['algorithm']))

In [16]:
rs

test_rmse    0.85566
fit_time     24.2881
test_time     2.1937
algorithm        SVD
dtype: object

In [0]:
issue.append(rs)

**SVD++**

In [0]:
res_svd_pp = cross_validate(algorithms[5], data, measures=['rmse'], cv=3, 
                         verbose=False)
rsp = pd.DataFrame.from_dict(res_svd_pp).mean(axis=0)
rsp = rsp.append(pd.Series([str(algorithms[5]).split(' ')[0].split('.')[-1]], 
                         index=['algorithm']))

In [18]:
rsp

test_rmse    0.840411
fit_time      331.189
test_time     17.6463
algorithm       SVDpp
dtype: object

In [0]:
issue.append(rsp)

**NMF**

In [0]:
res_nmf = cross_validate(algorithms[6], data, measures=['rmse'], cv=3, 
                         verbose=False)
rn = pd.DataFrame.from_dict(res_nmf).mean(axis=0)
rn = rn.append(pd.Series([str(algorithms[6]).split(' ')[0].split('.')[-1]], 
                         index=['algorithm']))

In [13]:
rn

test_rmse    0.907911
fit_time      29.4093
test_time     1.91551
algorithm         NMF
dtype: object

In [0]:
issue.append(rn)

#### *Slope One*

**SlopeOne**

In [0]:
res_slope_one = cross_validate(algorithms[7], data, measures=['rmse'], 
                               cv=3, verbose=False)
rsl = pd.DataFrame.from_dict(res_slope_one).mean(axis=0)
rsl = rsl.append(pd.Series([str(algorithms[7]).split(' ')[0].split('.')[-1]], 
                         index=['algorithm']))

In [9]:
rsl

test_rmse    0.865982
fit_time       32.229
test_time     12.7152
algorithm    SlopeOne
dtype: object

In [0]:
issue.append(rsl)

#### *Co-clustering*

**CoClustering**

In [0]:
res_cocl = cross_validate(algorithms[8], data, measures=['rmse'], cv=3, 
                         verbose=False)
rc = pd.DataFrame.from_dict(res_cocl).mean(axis=0)
rc = rc.append(pd.Series([str(algorithms[8]).split(' ')[0].split('.')[-1]], 
                         index=['algorithm']))

In [11]:
rc

test_rmse         0.88531
fit_time          10.9854
test_time         1.72785
algorithm    CoClustering
dtype: object

In [0]:
issue.append(rc)

### **Comparing results**

In [0]:
benchmark = pd.DataFrame(issue).set_index('algorithm')

In [37]:
benchmark

Unnamed: 0_level_0,test_rmse,fit_time,test_time
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBasic,0.8977,2.429874,14.685977
KNNWithMeans,0.868269,2.515981,16.773579
KNNWithZScore,0.871762,3.005555,16.821147
KNNBaseline,0.863546,5.120994,18.72188
SVD,0.85566,24.288146,2.193698
SVDpp,0.840411,331.188911,17.646263
NMF,0.907911,29.409344,1.915507
SlopeOne,0.865982,32.229018,12.71524
CoClustering,0.88531,10.98538,1.727848


The best one by RMSE is SVD++ but it takes too much time => the winner is SVD. 