In [1]:
import pandas as pd
import numpy as np
import random
%matplotlib inline
import matplotlib.pyplot as plt
from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise import Reader
from surprise import evaluate, print_perf
from surprise import KNNBasic
from surprise import AlgoBase, BaselineOnly
from get_top_n import get_top_n
from surprise.accuracy import rmse, mae

# Model based

In [2]:
#Load data
dt = pd.read_csv('sample_data.csv',index_col=0)
dt.head()

Unnamed: 0,reviewerID,productID,rating,date
0,A1AIWGGQNVTXVA,B0001CXUHW,5.0,2014-01-29
1,A3B81VS0DT31M8,B0001CXUHW,4.0,2014-03-16
2,A2YEHSCW4TFXRU,B0001CXUHW,5.0,2014-04-03
3,AB8Z395DD49YP,B0001CXUHW,5.0,2014-03-15
4,A6S9AKRSORG2G,B0001CXUHW,5.0,2014-02-05


In [3]:
reader = Reader(rating_scale=(1,5))
dt = Dataset.load_from_df(dt[['reviewerID','productID','rating']],reader)

In [4]:
raw_ratings = dt.raw_ratings
#Shuffle ratings
random.seed(42)
random.shuffle(raw_ratings)
#Split to training and test sets
threshold = int(.8 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

In [5]:
dt.raw_ratings = A_raw_ratings  # data is now the set A
dt.split(n_folds=3)

In [6]:
#Baseline model: User's bias & item's bias
class BaselineModel(AlgoBase):

    def __init__(self):
        AlgoBase.__init__(self)

    def compute_bias(self):
        bu = np.zeros(self.trainset.n_users)
        bi = np.zeros(self.trainset.n_items)
        global_mean = self.trainset.global_mean
        
        for i in self.trainset.all_items():
            dev_i = 0
            for (u, r) in self.trainset.ir[i]:
                dev_i += r - global_mean
            bi[i] = dev_i/(len(self.trainset.ir[i]))
        
        for u in self.trainset.all_users():
            dev_u = 0
            for (i, r) in self.trainset.ur[u]:
                dev_u += r - global_mean
            bu[u] = dev_u/(len(self.trainset.ur[u]))
        return bu, bi
    
    def train(self, trainset):
        AlgoBase.train(self, trainset)
        self.bu, self.bi = self.compute_bias()

    def estimate(self, u, i):

        est = self.trainset.global_mean
        if self.trainset.knows_user(u):
            est += self.bu[u]
        if self.trainset.knows_item(i):
            est += self.bi[i]

        return est

algo1 = BaselineModel()

In [7]:
trainset = dt.build_full_trainset()
algo1.train(trainset)

In [8]:
testset = dt.construct_testset(B_raw_ratings)  # testset is now the set B
baseline_predictions = algo1.test(testset)

baseline_predictions = algo1.test(testset)

In [9]:
rmse(baseline_predictions)

RMSE: 1.1511


1.151064116823681

In [10]:
mae(baseline_predictions)

MAE:  0.8493


0.84933024512871458

In [11]:
top_n = get_top_n(baseline_predictions, n=5)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

('A2IDFT2FZKJ5PZ', ['B00GYWSDPM'])
('AQZH7YTWQPOBE', ['B00DRA8HSM'])
('A1CU1FCTFTG7DT', ['B0051SU0OW'])
('A1IOL59SJFX3DO', ['B0001CXUHW'])
('A2PV45NEZ7PDJZ', ['B008VQ1RE8'])
('A1JB2DFRW1H6J9', ['B0051SU0OW'])
('A3H7Y9IW2CP80C', ['B008P0BG7I'])
('A1Z54EM24Y40LL', ['B003PMSTTE'])
('A2S4PMQ1CNZSV2', ['B001CTO0YA'])
('A3EWYHYD4X7PXV', ['B0051SU0OW'])
('A133IM5LCXOX1P', ['B00DRA8HSM'])
('A3GCMC59YFAA3O', ['B003IHO8LE'])
('A1CA6A79GYMEDW', ['B00CISBK4C'])
('AA85CD1MQA98Q', ['B004KL4FTW'])
('A1X0V8Y4MP06JT', ['B003LPKEPC'])
('A3B2NVF1LKISZD', ['B003IHO8LE'])
('ADS99W8WMEXZ2', ['B00FPGL6S4'])
('AKO27KBDBLH3E', ['B0051SU0OW'])
('AJXVSZS690XSU', ['B0051SU0OW'])
('AJ6B83I4YJHYW', ['B0051SU0OW'])
('A27S9FH40J97TT', ['B00DRA8HSM'])
('AJRFNHHGI1RZV', ['B003LPKEPC'])
('A3FS2KNCKXYZYN', ['B008P0BG7I'])
('A25EMCU7BZL2T8', ['B003LPKEPC'])
('A3AD71K9NRX6EM', ['B000NKL6HS'])
('A36MP37DITBU6F', ['B00GLP9JI2'])
('A3JZAGBZDY2T8E', ['B003LPKEPC'])
('AQI5CPEEMWGRL', ['B000XR5MLW'])
('A26MU8FMFO713Y', ['B00FJEJ

In [12]:
#SVD
algo2 = SVD()
algo2.train(trainset)
svd_predictions = algo2.test(testset)
rmse(svd_predictions)
mae(svd_predictions)

RMSE: 1.1412
MAE:  0.8535


0.85346580151504459

In [13]:
svd_predictions

[Prediction(uid='A3GEE8RYZO869A', iid='B004U7QSYQ', r_ui=5.0, est=4.4626873783093552, details={u'was_impossible': False}),
 Prediction(uid='AKO27KBDBLH3E', iid='B0051SU0OW', r_ui=5.0, est=4.2316825142034622, details={u'was_impossible': False}),
 Prediction(uid='A21I5YXFBT41NQ', iid='B00A66Q1WO', r_ui=5.0, est=4.3330559626417964, details={u'was_impossible': False}),
 Prediction(uid='AEGCR5ZPWJCIY', iid='B002ZJ3BTQ', r_ui=5.0, est=4.0480516049072195, details={u'was_impossible': False}),
 Prediction(uid='A3FJBNL8H7IQOR', iid='B0051SU0OW', r_ui=4.0, est=4.2316825142034622, details={u'was_impossible': False}),
 Prediction(uid='A2JQ681HN76SRB', iid='B008VQ1RE8', r_ui=5.0, est=4.4022710126520614, details={u'was_impossible': False}),
 Prediction(uid='A1VPI2IDB4CYX5', iid='B00HFEQ8QY', r_ui=5.0, est=4.4902989427865307, details={u'was_impossible': False}),
 Prediction(uid='A3VQYZ3FT3C1MC', iid='B0051SU0OW', r_ui=5.0, est=4.2316825142034622, details={u'was_impossible': False}),
 Prediction(uid='A

In [14]:
testset

[('A3GEE8RYZO869A', 'B004U7QSYQ', 5.0),
 ('AKO27KBDBLH3E', 'B0051SU0OW', 5.0),
 ('A21I5YXFBT41NQ', 'B00A66Q1WO', 5.0),
 ('AEGCR5ZPWJCIY', 'B002ZJ3BTQ', 5.0),
 ('A3FJBNL8H7IQOR', 'B0051SU0OW', 4.0),
 ('A2JQ681HN76SRB', 'B008VQ1RE8', 5.0),
 ('A1VPI2IDB4CYX5', 'B00HFEQ8QY', 5.0),
 ('A3VQYZ3FT3C1MC', 'B0051SU0OW', 5.0),
 ('A1IUUMMKPEV9ZQ', 'B0051SU0OW', 4.0),
 ('A2N1EQWZ1Z4IMM', 'B00B1HL0H8', 5.0),
 ('AS2DLB6EBN6YK', 'B000F9X40O', 5.0),
 ('AJ26F0ASJEEUI', 'B002DCPR9C', 5.0),
 ('AC7PM6MQ0VWIE', 'B00GYWSDPM', 5.0),
 ('A2B25BGEREHW9X', 'B0048KMJBG', 5.0),
 ('AFWFTVH2IK6NZ', 'B0008IT4OM', 5.0),
 ('AMRJP1UBCX5LL', 'B000ET4SM8', 5.0),
 ('A3IIHZEN5XRKNU', 'B0051SU0OW', 5.0),
 ('A3CA3RWZYJDWXE', 'B00GLP9JI2', 5.0),
 ('A3BIP0OKX6W30O', 'B008P0BG7I', 5.0),
 ('AJXVSZS690XSU', 'B0051SU0OW', 5.0),
 ('A1JB2DFRW1H6J9', 'B0051SU0OW', 1.0),
 ('A6I5PAPDPDG29', 'B000SARJO2', 5.0),
 ('A3TI5J0Q90AKJW', 'B003IHO8LE', 1.0),
 ('A1FO0N69D64YSG', 'B0051SU0OW', 3.0),
 ('A1KYSXFU5CDKA8', 'B0051SU0OW', 5.0),
 ('A3GF4C

In [15]:
#SVD++
algo3 = SVDpp()
evaluate(algo3, dt, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
RMSE: 1.1185
MAE:  0.8653
------------
Fold 2
RMSE: 1.2487
MAE:  0.9324
------------
Fold 3
RMSE: 1.1498
MAE:  0.8786
------------
------------
Mean RMSE: 1.1723
Mean MAE : 0.8921
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.86530419987757345,
                             0.93242272497870693,
                             0.87862079917120539],
                            'rmse': [1.1185081247100885,
                             1.2487108513567307,
                             1.1497893793872747]})

In [16]:
#Non-negative matrix factorization
algo4 = NMF()
evaluate(algo3, dt, measures=['RMSE', 'MAE'],verbose=2)

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
user: AB4MB545B021O item: B0001CXUHW r_ui = 5.00   est = 4.53   {u'was_impossible': False}
user: A1WSFIJRJLA1JL item: B003PMSTTE r_ui = 5.00   est = 4.28   {u'was_impossible': False}
user: A1WWGVW0M5FZB6 item: B001CTO0YA r_ui = 5.00   est = 4.58   {u'was_impossible': False}
user: A2JZGJNLAZ7XSY item: B000EDBQ6A r_ui = 5.00   est = 4.60   {u'was_impossible': False}
user: AJ1KE13D9QMOG item: B0051SU0OW r_ui = 5.00   est = 4.22   {u'was_impossible': False}
user: A10IK1WRIZFV95 item: B0051SU0OW r_ui = 3.00   est = 4.22   {u'was_impossible': False}
user: A2R0GG1DCVSLAM item: B001CTO0YA r_ui = 5.00   est = 4.58   {u'was_impossible': False}
user: A375XJ54FB879D item: B0051SU0OW r_ui = 5.00   est = 4.22   {u'was_impossible': False}
user: ARQK0VWTIQUTG item: B000FK63QA r_ui = 5.00   est = 4.52   {u'was_impossible': False}
user: A1XSTIVDYWWGMD item: B00GYWSDPM r_ui = 5.00   est = 4.03   {u'was_impossible': False}
user: A2ZM11MEGHZ4ZA 

user: A1UFGF8WG87ZQT item: B008VQ1RE8 r_ui = 5.00   est = 4.36   {u'was_impossible': False}
user: A24GRZNYY8V2KP item: B00HFEQ8QY r_ui = 5.00   est = 4.60   {u'was_impossible': False}
user: A2UODGS428U84Z item: B00DRA8HSM r_ui = 5.00   est = 4.35   {u'was_impossible': False}
user: A162ZCWBW1TZIO item: B00B3PJHCS r_ui = 5.00   est = 4.33   {u'was_impossible': False}
user: A35QWFV4WDD4VD item: B0051SU0OW r_ui = 5.00   est = 4.22   {u'was_impossible': False}
user: A1V4V3B9QBXDX1 item: B0048KMJBG r_ui = 5.00   est = 4.30   {u'was_impossible': False}
user: A3GUYKX9WS44VM item: B001I16ENM r_ui = 5.00   est = 4.10   {u'was_impossible': False}
user: ASYDKYSLPJAJ7 item: B00GYWSDPM r_ui = 5.00   est = 4.03   {u'was_impossible': False}
user: A1RIIXAHSANKIQ item: B0051SU0OW r_ui = 4.00   est = 4.22   {u'was_impossible': False}
user: A2TP362JXCM57J item: B0051SU0OW r_ui = 3.00   est = 4.22   {u'was_impossible': False}
user: A3H11VW0L01A1T item: B00DRA8HSM r_ui = 5.00   est = 4.35   {u'was_impossibl

user: A1HKHLRU9EI5QT item: B008VQ1RE8 r_ui = 5.00   est = 4.47   {u'was_impossible': False}
user: A3TRXXVLIE7BCB item: B004U7QSYQ r_ui = 5.00   est = 4.41   {u'was_impossible': False}
user: A31DBRBFNTMQHD item: B0051SU0OW r_ui = 5.00   est = 4.29   {u'was_impossible': False}
user: A1YBWEO6SIWRW4 item: B003LPKEPC r_ui = 5.00   est = 4.70   {u'was_impossible': False}
user: AY95I1L39JACN item: B008VQ1RE8 r_ui = 5.00   est = 4.47   {u'was_impossible': False}
user: A3AN6PPPCVINOD item: B007B9YQ12 r_ui = 5.00   est = 4.17   {u'was_impossible': False}
user: A2LSXHCBIRIZSQ item: B00GYWSDPM r_ui = 2.00   est = 4.21   {u'was_impossible': False}
user: A1NV2MS1AXHBMY item: B00FE6EJRA r_ui = 1.00   est = 4.54   {u'was_impossible': False}
user: A2P3URWE45OZF0 item: B00B1HL0H8 r_ui = 5.00   est = 4.36   {u'was_impossible': False}
user: A3UVV77XLHBZIZ item: B0051SU0OW r_ui = 5.00   est = 4.29   {u'was_impossible': False}
user: A2VUHFYCISV78N item: B0051SU0OW r_ui = 5.00   est = 4.29   {u'was_impossibl

CaseInsensitiveDefaultDict(list,
                           {'mae': [0.8641028884825368,
                             0.93230709065124162,
                             0.88668496734839208],
                            'rmse': [1.1160592302192296,
                             1.2470931632192643,
                             1.1529606145360094]})

# Metrics

In [17]:
#Coverage, Serendipity

#Try different model parameters

#Plots
