In [1]:
import pandas as pd
import numpy as np
import random
%matplotlib inline
import matplotlib.pyplot as plt
from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise import Reader
from surprise import evaluate, print_perf
from surprise import KNNBasic
from surprise import AlgoBase, BaselineOnly
from get_top_n import get_top_n
from surprise.accuracy import rmse, mae

# Model based

In [2]:
#Load data
dt = pd.read_csv('sample_data.csv',index_col=0)
dt.head()

Unnamed: 0,reviewerID,productID,rating,date
0,A1AIWGGQNVTXVA,B0001CXUHW,5.0,2014-01-29
1,A3B81VS0DT31M8,B0001CXUHW,4.0,2014-03-16
2,A2YEHSCW4TFXRU,B0001CXUHW,5.0,2014-04-03
3,AB8Z395DD49YP,B0001CXUHW,5.0,2014-03-15
4,A6S9AKRSORG2G,B0001CXUHW,5.0,2014-02-05


In [3]:
reader = Reader(rating_scale=(1,5))
dt = Dataset.load_from_df(dt[['reviewerID','productID','rating']],reader)

In [4]:
raw_ratings = dt.raw_ratings
#Shuffle ratings
random.seed(42)
random.shuffle(raw_ratings)
#Split to training and test sets
threshold = int(.8 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

In [5]:
dt.raw_ratings = A_raw_ratings  # dt is now the set A - training set
dt.split(n_folds=3)

In [6]:
#Baseline model: User's bias & item's bias
class BaselineModel(AlgoBase):

    def __init__(self):
        AlgoBase.__init__(self)

    def compute_bias(self):
        bu = np.zeros(self.trainset.n_users)
        bi = np.zeros(self.trainset.n_items)
        global_mean = self.trainset.global_mean
        
        for i in self.trainset.all_items():
            dev_i = 0
            for (u, r) in self.trainset.ir[i]:
                dev_i += r - global_mean
            bi[i] = dev_i/(len(self.trainset.ir[i]))
        
        for u in self.trainset.all_users():
            dev_u = 0
            for (i, r) in self.trainset.ur[u]:
                dev_u += r - global_mean
            bu[u] = dev_u/(len(self.trainset.ur[u]))
        return bu, bi
    
    def train(self, trainset):
        AlgoBase.train(self, trainset)
        self.bu, self.bi = self.compute_bias()

    def estimate(self, u, i):
        est = self.trainset.global_mean
        if self.trainset.knows_user(u):
            est += self.bu[u]
        if self.trainset.knows_item(i):
            est += self.bi[i]
        return est

algo1 = BaselineModel()

In [7]:
#Construct train set and test set for use by surprise
trainset = dt.build_full_trainset()
testset = dt.construct_testset(B_raw_ratings)  # testset is now the set B

In [8]:
#Train and predict using baseline model
algo1.train(trainset)
baseline_predictions = algo1.test(testset)

In [9]:
rmse(baseline_predictions)
mae(baseline_predictions)

RMSE: 1.1511
MAE:  0.8493


0.84933024512871458

In [10]:
#Get top n items for each user
top_n = get_top_n(baseline_predictions, n=5)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

('A2IDFT2FZKJ5PZ', ['B00GYWSDPM'])
('AQZH7YTWQPOBE', ['B00DRA8HSM'])
('A1CU1FCTFTG7DT', ['B0051SU0OW'])
('A1IOL59SJFX3DO', ['B0001CXUHW'])
('A2PV45NEZ7PDJZ', ['B008VQ1RE8'])
('A1JB2DFRW1H6J9', ['B0051SU0OW'])
('A3H7Y9IW2CP80C', ['B008P0BG7I'])
('A1Z54EM24Y40LL', ['B003PMSTTE'])
('A2S4PMQ1CNZSV2', ['B001CTO0YA'])
('A3EWYHYD4X7PXV', ['B0051SU0OW'])
('A133IM5LCXOX1P', ['B00DRA8HSM'])
('A3GCMC59YFAA3O', ['B003IHO8LE'])
('A1CA6A79GYMEDW', ['B00CISBK4C'])
('AA85CD1MQA98Q', ['B004KL4FTW'])
('A1X0V8Y4MP06JT', ['B003LPKEPC'])
('A3B2NVF1LKISZD', ['B003IHO8LE'])
('ADS99W8WMEXZ2', ['B00FPGL6S4'])
('AKO27KBDBLH3E', ['B0051SU0OW'])
('AJXVSZS690XSU', ['B0051SU0OW'])
('AJ6B83I4YJHYW', ['B0051SU0OW'])
('A27S9FH40J97TT', ['B00DRA8HSM'])
('AJRFNHHGI1RZV', ['B003LPKEPC'])
('A3FS2KNCKXYZYN', ['B008P0BG7I'])
('A25EMCU7BZL2T8', ['B003LPKEPC'])
('A3AD71K9NRX6EM', ['B000NKL6HS'])
('A36MP37DITBU6F', ['B00GLP9JI2'])
('A3JZAGBZDY2T8E', ['B003LPKEPC'])
('AQI5CPEEMWGRL', ['B000XR5MLW'])
('A26MU8FMFO713Y', ['B00FJEJ

In [11]:
#SVD
algo2 = SVD()
algo2.train(trainset)
svd_predictions = algo2.test(testset)
rmse(svd_predictions)
mae(svd_predictions)

RMSE: 1.1409
MAE:  0.8513


0.85127811206832182

In [12]:
svd_predictions

[Prediction(uid='A3GEE8RYZO869A', iid='B004U7QSYQ', r_ui=5.0, est=4.4831026583235243, details={u'was_impossible': False}),
 Prediction(uid='AKO27KBDBLH3E', iid='B0051SU0OW', r_ui=5.0, est=4.2545023091071261, details={u'was_impossible': False}),
 Prediction(uid='A21I5YXFBT41NQ', iid='B00A66Q1WO', r_ui=5.0, est=4.3306418178071917, details={u'was_impossible': False}),
 Prediction(uid='AEGCR5ZPWJCIY', iid='B002ZJ3BTQ', r_ui=5.0, est=4.0652945749792186, details={u'was_impossible': False}),
 Prediction(uid='A3FJBNL8H7IQOR', iid='B0051SU0OW', r_ui=4.0, est=4.2545023091071261, details={u'was_impossible': False}),
 Prediction(uid='A2JQ681HN76SRB', iid='B008VQ1RE8', r_ui=5.0, est=4.3871247869677745, details={u'was_impossible': False}),
 Prediction(uid='A1VPI2IDB4CYX5', iid='B00HFEQ8QY', r_ui=5.0, est=4.5222615338494405, details={u'was_impossible': False}),
 Prediction(uid='A3VQYZ3FT3C1MC', iid='B0051SU0OW', r_ui=5.0, est=4.2545023091071261, details={u'was_impossible': False}),
 Prediction(uid='A

# Metrics

In [13]:
#Coverage, Serendipity

#Try different model parameters

#Plots
