In [1]:
%load_ext lab_black

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
import pickle
import os

import pandas as pd

from surprise import SVD
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import PredefinedKFold
from surprise import dump
from surprise.accuracy import rmse

In [3]:
# Train and evaluate on the predefined u1.base / u1.test split of the
# movielens-100k dataset. The files must already be on disk; if they are not,
# download them manually or run this once:
#
#     Dataset.load_builtin('ml-100k')

# Locate the two fold files under the current user's home directory.
home_dir = os.path.expanduser("~")
train_file = home_dir + "/.surprise_data/ml-100k/ml-100k/u1.base"
test_file = home_dir + "/.surprise_data/ml-100k/ml-100k/u1.test"

# A single (train, test) pair, read with the built-in "ml-100k" reader.
folds = [(train_file, test_file)]
reader = Reader("ml-100k")
data = Dataset.load_from_folds(folds, reader)

pkf = PredefinedKFold()

# Two algorithms to compare: the well-known SVD and a basic nearest-neighbors
# approach.
algo_svd = SVD()
algo_knn = KNNBasic()

for trainset, testset in pkf.split(data):
    # Fit and predict with SVD first, then with KNN.
    algo_svd.fit(trainset)
    predictions_svd = algo_svd.test(testset)

    algo_knn.fit(trainset)
    predictions_knn = algo_knn.test(testset)

    fold_results = (
        ("./dump_SVD", predictions_svd, algo_svd),
        ("./dump_KNN", predictions_knn, algo_knn),
    )

    # Report the RMSE of each algorithm (SVD, then KNN).
    for dump_path, fold_predictions, fold_algo in fold_results:
        rmse(fold_predictions)

    # Save predictions and fitted algorithm together so later cells can reload them.
    for dump_path, fold_predictions, fold_algo in fold_results:
        dump.dump(dump_path, fold_predictions, fold_algo)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9537
RMSE: 0.9889


In [4]:
# The dumps are on disk now, so they can be reloaded whenever needed.

predictions_svd, algo_svd = dump.load("./dump_SVD")
predictions_knn, algo_knn = dump.load("./dump_KNN")

# One row per prediction, with the same column layout for both algorithms.
prediction_fields = ["uid", "iid", "rui", "est", "details"]
df_svd = pd.DataFrame(predictions_svd, columns=prediction_fields)
df_knn = pd.DataFrame(predictions_knn, columns=prediction_fields)

# Absolute error between the estimated rating and the true rating.
for frame in (df_svd, df_knn):
    frame["err"] = (frame["est"] - frame["rui"]).abs()

In [5]:
# Peek at the first five SVD predictions.
df_svd.head(n=5)

Unnamed: 0,uid,iid,rui,est,details,err
0,1,6,5.0,3.437263,{'was_impossible': False},1.562737
1,1,10,3.0,3.978653,{'was_impossible': False},0.978653
2,1,12,5.0,4.430843,{'was_impossible': False},0.569157
3,1,14,5.0,4.010679,{'was_impossible': False},0.989321
4,1,17,3.0,3.719548,{'was_impossible': False},0.719548


In [6]:
# Peek at the first five KNN predictions.
df_knn.head(n=5)

Unnamed: 0,uid,iid,rui,est,details,err
0,1,6,5.0,3.468613,"{'actual_k': 20, 'was_impossible': False}",1.531387
1,1,10,3.0,3.86629,"{'actual_k': 40, 'was_impossible': False}",0.86629
2,1,12,5.0,4.538194,"{'actual_k': 40, 'was_impossible': False}",0.461806
3,1,14,5.0,4.235741,"{'actual_k': 40, 'was_impossible': False}",0.764259
4,1,17,3.0,3.228002,"{'actual_k': 40, 'was_impossible': False}",0.228002


In [7]:
# Let's check how good the KNN predictions are when SVD has a huge error:
svd_large_error = df_svd["err"] >= 3.5
df_knn.loc[svd_large_error]

Unnamed: 0,uid,iid,rui,est,details,err
1087,14,176,1.0,4.010531,"{'actual_k': 40, 'was_impossible': False}",3.010531
1905,38,211,1.0,4.136955,"{'actual_k': 40, 'was_impossible': False}",3.136955
1912,38,257,1.0,3.989741,"{'actual_k': 40, 'was_impossible': False}",2.989741
1930,38,526,1.0,4.115078,"{'actual_k': 40, 'was_impossible': False}",3.115078
4960,98,173,1.0,4.109546,"{'actual_k': 40, 'was_impossible': False}",3.109546
7390,167,169,1.0,4.664991,"{'actual_k': 40, 'was_impossible': False}",3.664991
9289,206,895,5.0,2.996326,"{'actual_k': 40, 'was_impossible': False}",2.003674
15306,312,265,1.0,4.131875,"{'actual_k': 40, 'was_impossible': False}",3.131875
19140,405,575,5.0,2.410506,"{'actual_k': 36, 'was_impossible': False}",2.589494


In [8]:
# Well... Not much better.
# Now, let's look at the SVD predictions for the 10 worst KNN predictions.
# The rows are picked by index *label*, so select them with label-based `.loc`
# rather than positional `.iloc`: the two only agree while both frames keep
# their default 0..n-1 index.
worst_knn_index = df_knn.sort_values(by="err").index[-10:]
df_svd.loc[worst_knn_index]

Unnamed: 0,uid,iid,rui,est,details,err
9406,208,302,1.0,4.234734,{'was_impossible': False},3.234734
19089,405,169,1.0,3.421209,{'was_impossible': False},2.421209
19785,436,132,1.0,4.168776,{'was_impossible': False},3.168776
157,2,315,1.0,4.19116,{'was_impossible': False},3.19116
8503,193,56,1.0,4.264564,{'was_impossible': False},3.264564
5531,113,976,5.0,2.885436,{'was_impossible': False},2.114564
7917,181,408,1.0,2.504647,{'was_impossible': False},1.504647
7390,167,169,1.0,4.573502,{'was_impossible': False},3.573502
7412,167,1306,5.0,3.486075,{'was_impossible': False},1.513925
5553,114,1104,5.0,3.225325,{'was_impossible': False},1.774675


In [9]:
# Question: when a user has rated only a small number of items (fewer than 10), which
# algorithm gives the better predictions on average?


def get_Iu(uid):
    """Count the items a user has rated in the training set.

    Reads the notebook-level ``trainset`` variable.

    Args:
        uid: The raw id of the user.
    Returns:
        The number of items rated by the user, or 0 when looking the user
        up raises ``ValueError`` (the user is not part of the trainset).
    """

    try:
        inner_uid = trainset.to_inner_uid(uid)
        user_ratings = trainset.ur[inner_uid]
        n_rated = len(user_ratings)
    except ValueError:  # raw id unknown to the trainset
        n_rated = 0
    return n_rated


# Attach to every prediction the number of items its user rated in training.
for frame in (df_knn, df_svd):
    frame["Iu"] = frame["uid"].apply(get_Iu)

# Mean absolute error restricted to users with fewer than 10 training ratings,
# shown as (KNN, SVD).
knn_few_ratings_err = df_knn.loc[df_knn["Iu"] < 10, "err"].mean()
svd_few_ratings_err = df_svd.loc[df_svd["Iu"] < 10, "err"].mean()
knn_few_ratings_err, svd_few_ratings_err

(1.0382962702232326, 1.0259307702712503)