# Get data

In [None]:
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split

# df_movies = pd.read_csv('../data_2/lens-movies.csv')
df_movies = pd.read_csv('../data_big/movies.csv')
# df_rating = pd.read_csv('../data_2/lens-ratings.csv')
df_rating = pd.read_csv('../data_big/ratings.csv')
# Restrict the study to movie IDs 1..1000 to keep the item-item matrices small.
df_rating = df_rating[df_rating['movieId'].between(1, 1000)]
df_rating = df_rating.drop('timestamp', axis=1)

# Derive the rating scale from the data rather than hard-coding (1, 5):
# Surprise clips every prediction to this interval, so a scale narrower than
# the real one (e.g. datasets with half-star ratings starting at 0.5) would
# silently distort the low end of the predictions.
rating_scale = (float(df_rating['rating'].min()), float(df_rating['rating'].max()))
reader = Reader(rating_scale=rating_scale)
data = Dataset.load_from_df(df_rating[['userId', 'movieId', 'rating']], reader)

# Hold out 25% of the ratings for evaluation; fixed seed for reproducibility.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

# Non weighted

For the non-weighted version of the algorithm, we can simply reuse the unmodified implementation from the `surprise` package, which does all the work for us.

In [None]:
from surprise import SlopeOne

# Baseline: the stock (non-weighted) SlopeOne, fitted on the training split.
slop_one = SlopeOne()
slop_one.fit(trainset)


In [None]:
import pickle

# Serialize the fitted baseline model. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way.
with open('serialized/slop_one_regular.pkl', 'wb') as model_file:
    pickle.dump(slop_one, model_file)


# Weighted

For the weighted version, we will copy the code from the `surprise` package and slightly tweak the calculation in the `estimate` function in order to add the weights.

See 'Mathematics' section for details about the calculation.

Then, we will compare the results from both versions of the algorithm.

In [None]:
import numpy as np

from surprise import AlgoBase
from surprise.prediction_algorithms import PredictionImpossible

class WeightedSlopOne(AlgoBase):
    """Weighted Slope One rating predictor.

    For every pair of items the model stores how many users rated both
    (``freq``) and the average rating difference between them (``dev``).
    A prediction for item ``i`` is the average of ``dev[i, j] + r_uj`` over
    the items ``j`` rated by the user, each term being weighted by
    ``freq[i, j]``.

    Attributes set by ``fit``:
        freq: (n_items, n_items) integer array; ``freq[i, j]`` is the number
            of users who rated both ``i`` and ``j``.
        dev: (n_items, n_items) float array; ``dev[i, j]`` is the mean of
            ``r_ui - r_uj`` over those users (0 where ``freq[i, j] == 0``).
        user_mean: list with the mean rating of each user of the trainset.
    """

    def __init__(self):
        super().__init__()
        self.user_mean = None
        self.dev = None
        self.freq = None

    def estimate(self, u, i):
        """Return the estimated rating of user ``u`` for item ``i``.

        ``u`` and ``i`` are the ids used to index ``trainset.ur`` and the
        ``freq``/``dev`` arrays.

        Raises:
            PredictionImpossible: if the trainset does not know the user or
                the item.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        # Accumulate over the items rated by u that were co-rated with i by
        # at least one user. NOTE: fit() also counts the pair (i, i), so if u
        # already rated i, that rating takes part in the average with dev 0.
        weighted_sum = 0.0
        total_weight = 0
        for i_id, i_r in self.trainset.ur[u]:
            weight = self.freq[i, i_id]
            if weight > 0:
                weighted_sum += (self.dev[i, i_id] + i_r) * weight
                total_weight += weight

        if total_weight:
            return weighted_sum / total_weight

        # No usable item: fall back to the user's mean rating as a (poor)
        # estimation.
        return self.user_mean[u]

    def fit(self, trainset):
        """Compute the ``freq``, ``dev`` and ``user_mean`` tables.

        Args:
            trainset: the training set; its ``ur`` mapping is iterated as
                ``user -> [(item, rating), ...]``.

        Returns:
            self, so that calls can be chained.
        """
        # This will put trainset in self.trainset of AlgoBase
        AlgoBase.fit(self, trainset)

        n_items = trainset.n_items

        freq = np.zeros((n_items, n_items), int)
        dev = np.zeros((n_items, n_items), np.double)

        # Accumulate co-rating counts and summed rating differences.
        # NB These loops are much slower than the ones from the surprise
        # package, because they are plain Python rather than compiled code.
        for u_ratings in trainset.ur.values():
            for i, r_ui in u_ratings:
                for j, r_uj in u_ratings:
                    freq[i, j] += 1
                    dev[i, j] += r_ui - r_uj

        # Turn the sums into means. Pairs that were never co-rated have
        # freq == 0: they are skipped (and keep their initial 0.0) instead of
        # triggering a division by zero that would fill dev with NaN.
        np.divide(dev, freq, out=dev, where=freq > 0)
        np.fill_diagonal(dev, 0)

        self.freq = freq
        self.dev = dev

        # mean ratings of all users: mu_u
        self.user_mean = [np.mean([r for (_, r) in trainset.ur[u]])
                          for u in trainset.all_users()]

        return self

    def one_user(self, uid, item_ids):
        """Return ``self.predict(uid, iid)`` for every ``iid`` in ``item_ids``, in order."""
        return [self.predict(uid, iid) for iid in item_ids]


In [None]:
# Build and fit the weighted model in one go (fit() returns the instance).
# Expect this to take much longer than the stock SlopeOne: the training loops
# are plain Python rather than compiled code.
weighted_slop_one = WeightedSlopOne().fit(trainset)

In [None]:
import pickle

# Serialize the fitted weighted model. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way.
with open('serialized/slop_one_1003.pkl', 'wb') as model_file:
    pickle.dump(weighted_slop_one, model_file)


# Comparison

Comparing both models, we see that weighting the ratings improves the predictions by roughly 1%.

In [None]:
from surprise import accuracy

In [None]:
# Score the baseline model on the held-out test split.
predictions_base = slop_one.test(testset)
rmse_base = accuracy.rmse(predictions_base)

# Bare expression: shown as the cell output.
rmse_base

In [None]:
# Score the weighted model on the same held-out test split.
predictions_weighted = weighted_slop_one.test(testset)
rmse_weighted = accuracy.rmse(predictions_weighted)

# Bare expression: shown as the cell output.
rmse_weighted

# Cross-validation

To get a better idea of how well the models will perform, we can run a 5-fold cross-validation.

In [None]:
from surprise.model_selection import cross_validate

# Cross-validation refits the estimator it receives on every fold, so hand it
# a fresh instance: `slop_one` stays the model fitted on `trainset`.
cross_validate(SlopeOne(), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
# Cross-validation refits the estimator it receives on every fold, so hand it
# a fresh instance: `weighted_slop_one` (used for the recommendations below)
# stays the model fitted on `trainset`.
cross_validate(WeightedSlopOne(), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

With cross-validation :
- RMSE SlopOne : 0.8998 (mean of 5 cv)
- RMSE WeightedSlopOne : 0.8832 (mean of 5 cv)

The improvement is about 1.84% with the weighted version of the algorithm.

# Recommendations

Let's test our reco engine on some random user

In [None]:
# User x movie rating matrix; movies a user did not rate are filled with 0.
pivot_df = (
    df_rating
    .pivot_table(values='rating', index='userId', columns='movieId')
    .fillna(0)
)

raw_user_id = 4
# .loc selects by label rather than by position
# (see: https://stackoverflow.com/questions/31593201/how-are-iloc-and-loc-different)
user_ratings = pivot_df.loc[raw_user_id, :]
# Keep only the movies this user actually rated (non-zero cells), so they can
# be excluded from the recommendations later.
user_ratings = user_ratings[user_ratings.ne(0)]

In [None]:
# pivot_df.columns.to_numpy() is the list of all movies IDs
predictions = weighted_slop_one.one_user(raw_user_id, pivot_df.columns.to_numpy())
# exclude the movies he.she has already rated
predictions = [(iid, est) for (_,iid,_,est,_) in predictions if iid not in user_ratings]

# grab the top 10 predictions
predictions = sorted(predictions, key=lambda p: p[1], reverse=True)
predictions = predictions[:10]

# match the movie titles and show results
df_movies[df_movies['movieId'].isin([iid for (iid, _) in predictions])]

Here we see that the recommendations are pretty consistent: the same genres appear in several of the recommended movies.

# Mathematics

L'algo qu'on va implémenter, pour TOUS les ratings :

![](medias/slop_one_with_weight.png)

Cette formule permet de calculer la note prédite pour UN utilisateur sur UN film.

- $u$ est un tableau incomplet comprenant les notes de l'utilisateur $U$. Incomplet, car un utilisateur ne note pas tous les items.
- $S(u)$ est l'ensemble des items que l'user $U$ a effectivement notés
- $S(u)-\{j\}$ : l'ensemble des items notés par l'user $U$, moins le singleton formé par l'item $j$
- $j$ est l'item dont on cherche à prédire la note pour l'utilisateur $U$
- $u_i$ est la note laissée par l'utilisateur $U$ à l'item $i$
- $dev_{j,i}$ est la déviation moyenne entre les notes en commun des items $j$ et $i$. Elle se calcule avec `mean(r_uj - r_ui for u in U_ji)`, où `U_ji` est l'ensemble des utilisateurs ayant noté les deux items. C'est une constante qui doit être calculée lors de la phase de `fit` (*training*)
- $c_{j,i}$ : la quantité (cardinalité d'un ensemble) d'utilisateurs ayant noté à la fois l'item `i` et l'item `j`
- $\sum_{i \in S(u)-\{j\}}$ : boucler sur l'ensemble des items $i$ de l'évaluation $u$, en excluant l'item $j$


Le calcul de la déviation entre les items i et j se fait avec :


![](medias/slop_one_deviation.png)

- $u \in S_{j,i}(X)$ : l'ensemble des évaluations $u$ contenant à la fois les items $u_j$ et $u_i$
- Il s'agit donc de boucler sur cet ensemble, et de calculer la déviation entre $u_j$ et $u_i$ à partir des notes laissées par les utilisateurs ayant noté les deux items en même temps

> À partir de cette formule, et en guise d'étape d'entraînement, on bouclera sur tous les couples $(j,i)$ pour calculer leurs déviations respectives.
> On stockera la matrice antisymétrique ainsi obtenue ($dev_{i,j} = -dev_{j,i}$), afin de ne pas avoir à la recalculer par la suite
> Il ne nous restera plus qu'à aller piocher dedans pour prédire la note de $j$ (cf. première formule)

Comme on le voit, cet algorithme prend en considération les informations des autres utilisateurs qui ont noté le même item (comme une recherche KNN user-user basée sur le cosinus), mais également les informations provenant des autres items notés par l'utilisateur. C'est cela qui en fait un algorithme riche et efficace (?).

Le `SlopeOne` du package `surprise` implémente par défaut la version non pondérée de l'algorithme :

![image](medias/slop_one_without_weight.png)