In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise import Reader
from surprise import evaluate, print_perf
from surprise import KNNBasic
from surprise import AlgoBase, BaselineOnly
from get_top_n import get_top_n
from surprise.accuracy import rmse, mae

# Model based

In [2]:
#Load data
dt = pd.read_csv('sample_data.csv',index_col=0)
dt.head()

Unnamed: 0,reviewerID,productID,rating,date
0,A1AIWGGQNVTXVA,B0001CXUHW,5.0,2014-01-29
1,A3B81VS0DT31M8,B0001CXUHW,4.0,2014-03-16
2,A2YEHSCW4TFXRU,B0001CXUHW,5.0,2014-04-03
3,AB8Z395DD49YP,B0001CXUHW,5.0,2014-03-15
4,A6S9AKRSORG2G,B0001CXUHW,5.0,2014-02-05


In [3]:
reader = Reader(rating_scale=(1,5))
dt = Dataset.load_from_df(dt[['reviewerID','productID','rating']],reader)

In [4]:
raw_ratings = dt.raw_ratings
threshold = int(.8 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

In [5]:
dt.raw_ratings = A_raw_ratings  # data is now the set A
dt.split(n_folds=3)

In [6]:
#Baseline model: User's bias & item's bias
class BaselineModel(AlgoBase):

    def __init__(self):
        AlgoBase.__init__(self)

    def compute_bias(self):
        bu = np.zeros(self.trainset.n_users)
        bi = np.zeros(self.trainset.n_items)
        global_mean = self.trainset.global_mean
        
        for i in self.trainset.all_items():
            dev_i = 0
            for (u, r) in self.trainset.ir[i]:
                dev_i += r - global_mean
            bi[i] = dev_i/(len(self.trainset.ir[i]))
        
        for u in self.trainset.all_users():
            dev_u = 0
            for (i, r) in self.trainset.ur[u]:
                dev_u += r - global_mean
            bu[u] = dev_u/(len(self.trainset.ur[u]))
        return bu, bi
    
    def train(self, trainset):
        AlgoBase.train(self, trainset)
        self.bu, self.bi = self.compute_bias()

    def estimate(self, u, i):

        est = self.trainset.global_mean
        if self.trainset.knows_user(u):
            est += self.bu[u]
        if self.trainset.knows_item(i):
            est += self.bi[i]

        return est

algo1 = BaselineModel()

In [7]:
trainset = dt.build_full_trainset()
algo1.train(trainset)

In [8]:
testset = dt.construct_testset(B_raw_ratings)  # testset is now the set B
baseline_predictions = algo1.test(testset)

baseline_predictions = algo1.test(testset)
top_n = get_top_n(baseline_predictions, n=5)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

('A2IDFT2FZKJ5PZ', ['B00GYWSDPM'])
('A1V6EY6KARATVC', ['B00B1HL0H8'])
('AQZH7YTWQPOBE', ['B00DRA8HSM'])
('A17SN5WJ6RRKV1', ['B00EHJPT1U'])
('A305XM71TF83CY', ['B00GLP9JI2'])
('AX5Y4X8GRVS6V', ['B00FJEJWGA'])
('A2WPBNE8TB160D', ['B00B3PJHCS'])
('AFCQPI0XYODZD', ['B00GYWSDPM'])
('A1Y87C9PZLSCK3', ['B00GYWSDPM'])
('A133IM5LCXOX1P', ['B00DRA8HSM'])
('A2D20ZDSHJPE07', ['B00DRA8HSM'])
('ATABYJH3EVPI8', ['B00DHETWR8'])
('A3JA0ENH5WFQKN', ['B00B1HL0H8'])
('A2VG0HFAH13MTG', ['B00GYWSDPM'])
('A1EA3QWLRO24X8', ['B00DRA8HSM'])
('A11C0JC8J6YGLI', ['B00ESK1IU4'])
('A1JSH7VIOE8HSW', ['B00CISBK4C'])
('A3K1PSYVVH4N2K', ['B00DRA8HSM'])
('A1Z3RE4WITD7AY', ['B00FE6EJRA'])
('A2UGBLFGPF49U5', ['B00DHETWR8'])
('A3KZCNV1N8R9GC', ['B00FPGL6S4'])
('A3KPJ1MOGTZVGC', ['B00KCJRVO2'])
('A1ACM1CBGORBN1', ['B00DRA8HSM'])
('AFVMF3L8PSHQU', ['B00E0VQGK8'])
('A27S9FH40J97TT', ['B00DRA8HSM'])
('A3V0Y58IQI4DCD', ['B00DRA8HSM'])
('A2GRDQJ4Z7053K', ['B00GLP9JI2'])
('A7RS6A5RQ3DUY', ['B00FJEJWGA'])
('A3VYKXHQDICC6', ['B00KCJ

In [9]:
rmse(baseline_predictions)

RMSE: 1.1264


1.1263765946032394

In [10]:
mae(baseline_predictions)

MAE:  0.8854


0.88540833472613545

In [11]:
#SVD
algo2 = SVD()
algo2.train(trainset)
svd_predictions = algo2.test(testset)
rmse(svd_predictions)
mae(svd_predictions)

RMSE: 1.1231
MAE:  0.8819


0.88194522756965099

In [12]:
svd_predictions

[Prediction(uid='A6TXBHX0O7ATF', iid='B008VQ1RE8', r_ui=5.0, est=4.4031855317301254, details={u'was_impossible': False}),
 Prediction(uid='A3NJR99Q8J2INU', iid='B008VQ1RE8', r_ui=5.0, est=4.4031855317301254, details={u'was_impossible': False}),
 Prediction(uid='A2JOEQP6PAH22N', iid='B008VQ1RE8', r_ui=3.0, est=4.4031855317301254, details={u'was_impossible': False}),
 Prediction(uid='A2N9AGR2QT2UXG', iid='B008VQ1RE8', r_ui=5.0, est=4.4031855317301254, details={u'was_impossible': False}),
 Prediction(uid='A2JQ681HN76SRB', iid='B008VQ1RE8', r_ui=5.0, est=4.4031855317301254, details={u'was_impossible': False}),
 Prediction(uid='AVY406XUHAN72', iid='B00A66Q1WO', r_ui=3.0, est=4.3295687885010263, details={u'was_impossible': False}),
 Prediction(uid='A2XXPTN1VN0LDW', iid='B00A66Q1WO', r_ui=4.0, est=4.3295687885010263, details={u'was_impossible': False}),
 Prediction(uid='A33CPE0XR5XI49', iid='B00A66Q1WO', r_ui=5.0, est=4.3295687885010263, details={u'was_impossible': False}),
 Prediction(uid='A

In [13]:
testset

[('A6TXBHX0O7ATF', 'B008VQ1RE8', 5.0),
 ('A3NJR99Q8J2INU', 'B008VQ1RE8', 5.0),
 ('A2JOEQP6PAH22N', 'B008VQ1RE8', 3.0),
 ('A2N9AGR2QT2UXG', 'B008VQ1RE8', 5.0),
 ('A2JQ681HN76SRB', 'B008VQ1RE8', 5.0),
 ('AVY406XUHAN72', 'B00A66Q1WO', 3.0),
 ('A2XXPTN1VN0LDW', 'B00A66Q1WO', 4.0),
 ('A33CPE0XR5XI49', 'B00A66Q1WO', 5.0),
 ('A358HJ8X7XRMGR', 'B00A66Q1WO', 3.0),
 ('A3BE5T6ST5YK9H', 'B00A66Q1WO', 3.0),
 ('AT1TJFX152CK7', 'B00A66Q1WO', 5.0),
 ('A2ATXW55ZHL2OJ', 'B00A66Q1WO', 5.0),
 ('A3P6B2N7Q0FUSN', 'B00A66Q1WO', 5.0),
 ('AGZG6MFF0V5JL', 'B00A66Q1WO', 5.0),
 ('A21I5YXFBT41NQ', 'B00A66Q1WO', 5.0),
 ('A2XVJ24UENBHAA', 'B00A66Q1WO', 3.0),
 ('A1KNM9LA9NKOWR', 'B00AAVK5PY', 5.0),
 ('A16X18QH3TK8GV', 'B00AAVK5PY', 5.0),
 ('A192NXERQ6IPOL', 'B00AAVK5PY', 5.0),
 ('A11UZPZAFGI74U', 'B00AAVK5PY', 4.0),
 ('A1ZCJVGP3LPR5O', 'B00AAVK5PY', 4.0),
 ('A1UC5PPM6RVU86', 'B00AAVK5PY', 5.0),
 ('AFFXVW8Z1X57Q', 'B00AAVK5PY', 5.0),
 ('A2L4JFETZWBQPS', 'B00AAVK5PY', 5.0),
 ('A1FPDYGW2DGG3C', 'B00AAVK5PY', 5.0),
 ('A2

In [15]:
#SVD++
algo3 = SVDpp()
evaluate(algo3, dt, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 1.1974
MAE:  0.9021
------------
Fold 2
RMSE: 1.1500
MAE:  0.8715
------------
Fold 3
RMSE: 1.1613
MAE:  0.8761
------------
------------
Mean RMSE: 1.1696
Mean MAE : 0.8832
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.90211275514508327,
                             0.87147225691180108,
                             0.8760773232306448],
                            'rmse': [1.1974327219679906,
                             1.1499549297580085,
                             1.1612690923310445]})

In [None]:
#Non-negative matrix factorization
algo4 = NMF()
evaluate(algo3, dt, measures=['RMSE', 'MAE'],verbose=2)

# Metrics

In [None]:
#Coverage, Serendipity

#Try different model parameters

#Plots
