In [10]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise import Reader
from surprise import evaluate, print_perf
from surprise import KNNBasic
from surprise import AlgoBase, BaselineOnly

# Model based

In [2]:
#Load data
dt = pd.read_csv('sample_data.csv',index_col=0)
dt.head()

Unnamed: 0,reviewerID,productID,rating,date
0,A1AIWGGQNVTXVA,B0001CXUHW,5.0,2014-01-29
1,A3B81VS0DT31M8,B0001CXUHW,4.0,2014-03-16
2,A2YEHSCW4TFXRU,B0001CXUHW,5.0,2014-04-03
3,AB8Z395DD49YP,B0001CXUHW,5.0,2014-03-15
4,A6S9AKRSORG2G,B0001CXUHW,5.0,2014-02-05


In [3]:
reader = Reader(rating_scale=(1,5))
dt = Dataset.load_from_df(dt[['reviewerID','productID','rating']],reader)
dt.split(n_folds=3)

In [4]:
#Baseline model: User's bias & item's bias
class BaselineModel(AlgoBase):

    def __init__(self):
        AlgoBase.__init__(self)

    def compute_bias(self):
        bu = np.zeros(self.trainset.n_users)
        bi = np.zeros(self.trainset.n_items)
        global_mean = self.trainset.global_mean
        
        for i in self.trainset.all_items():
            dev_i = 0
            for (u, r) in self.trainset.ir[i]:
                dev_i += r - global_mean
            bi[i] = dev_i/(len(self.trainset.ir[i]))
        
        for u in self.trainset.all_users():
            dev_u = 0
            for (i, r) in self.trainset.ur[u]:
                dev_u += r - global_mean
            bu[u] = dev_u/(len(self.trainset.ur[u]))
        return bu, bi
    
    def train(self, trainset):
        AlgoBase.train(self, trainset)
        self.bu, self.bi = self.compute_bias()

    def estimate(self, u, i):

        est = self.trainset.global_mean
        if self.trainset.knows_user(u):
            est += self.bu[u]
        if self.trainset.knows_item(i):
            est += self.bi[i]

        return est

algo1 = BaselineModel()

evaluate(algo1, dt,measures=['RMSE', 'MAE'],verbose=1)

Evaluating RMSE, MAE of algorithm BaselineModel.

------------
Fold 1
RMSE: 1.1825
MAE:  0.8760
------------
Fold 2
RMSE: 1.1962
MAE:  0.8845
------------
Fold 3
RMSE: 1.1521
MAE:  0.8701
------------
------------
Mean RMSE: 1.1769
Mean MAE : 0.8769
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.87602954696918434,
                             0.88452849585449023,
                             0.87009752380908534],
                            'rmse': [1.1825368824107447,
                             1.1962119294285771,
                             1.1520977628827491]})

In [5]:
#SVD
algo2 = SVD()
perf = evaluate(algo2, dt, measures=['RMSE', 'MAE'])
print_perf(perf)

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 1.1732
MAE:  0.8807
------------
Fold 2
RMSE: 1.1790
MAE:  0.8835
------------
Fold 3
RMSE: 1.1381
MAE:  0.8715
------------
------------
Mean RMSE: 1.1635
Mean MAE : 0.8786
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    1.1732  1.1790  1.1381  1.1635  
MAE     0.8807  0.8835  0.8715  0.8786  


In [8]:
#SVD++
algo3 = SVDpp()
evaluate(algo3, dt, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
RMSE: 1.1729
MAE:  0.8883
------------
Fold 2
RMSE: 1.1818
MAE:  0.8894
------------
Fold 3
RMSE: 1.1406
MAE:  0.8757
------------
------------
Mean RMSE: 1.1651
Mean MAE : 0.8845
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.88834229401081366,
                             0.88935761439944749,
                             0.87574152224637514],
                            'rmse': [1.1729020666672803,
                             1.1817647372555327,
                             1.1405590928295857]})

In [11]:
#Non-negative matrix factorization
algo4 = NMF()
evaluate(algo3, dt, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
RMSE: 1.1727
MAE:  0.8853
------------
Fold 2
RMSE: 1.1786
MAE:  0.8849
------------
Fold 3
RMSE: 1.1404
MAE:  0.8814
------------
------------
Mean RMSE: 1.1639
Mean MAE : 0.8839
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.88528584636752927,
                             0.88494679573368806,
                             0.8814034880749444],
                            'rmse': [1.1727075414555548,
                             1.1786141465728615,
                             1.1404371257923989]})

# Metrics

In [None]:
#Coverage, Serendipity

#Try different model parameters

#Plots
