In [1]:
import pandas as pd
import numpy as np
import itertools
import sklearn.model_selection

# display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# custom helper functions
%load_ext autoreload
%autoreload 2
from helper.general_helper import *

# surprise imports
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, NormalPredictor, SlopeOne, CoClustering
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise import accuracy


# view plotly in jupyter 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# ignore warnings
import warnings
warnings.filterwarnings("ignore")



In [2]:
# Read the merged review/product table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../../merged_df.csv')

In [3]:
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,description,title,imUrl,related,salesRank,categories,price,brand,prod_review_counts
0,ALC5GH8CAMAI7,159985130X,AnnN,"[1, 1]",This is a great little gadget to have around. ...,5.0,Handy little gadget,1294185600,"01 5, 2011","The Pocket Magnifier is so popular, we are hav...","lightwedge-lighted-pocket-magnifier,-plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,Medical Supplies & Equipment Daily Living Aids...,24.95,,medium
1,AHKSURW85PJUE,159985130X,"AZ buyer ""AZ buyer""","[1, 1]",I would recommend this for a travel magnifier ...,4.0,Small & may need to encourage battery,1329523200,"02 18, 2012","The Pocket Magnifier is so popular, we are hav...","lightwedge-lighted-pocket-magnifier,-plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,Medical Supplies & Equipment Daily Living Aids...,24.95,,medium
2,A38RMU1Y5TDP9,159985130X,"Bob Tobias ""Robert Tobias""","[75, 77]",What I liked was the quality of the lens and t...,4.0,Very good but not great,1275955200,"06 8, 2010","The Pocket Magnifier is so popular, we are hav...","lightwedge-lighted-pocket-magnifier,-plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,Medical Supplies & Equipment Daily Living Aids...,24.95,,medium
3,A1XZUG7DFXXOS4,159985130X,Cat lover,"[56, 60]",Love the Great point light pocket magnifier! ...,4.0,great addition to your purse,1202428800,"02 8, 2008","The Pocket Magnifier is so popular, we are hav...","lightwedge-lighted-pocket-magnifier,-plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,Medical Supplies & Equipment Daily Living Aids...,24.95,,medium
4,A1MS3M7M7AM13X,159985130X,Cricketoes,"[1, 1]",This is very nice. You pull out on the magnifi...,5.0,Very nice and convenient.,1313452800,"08 16, 2011","The Pocket Magnifier is so popular, we are hav...","lightwedge-lighted-pocket-magnifier,-plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,Medical Supplies & Equipment Daily Living Aids...,24.95,,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346350,A11J1FHCK5U06J,B00LJBMCKK,Karinna Ball,"[1, 1]",My appetite is definitely suppressed - my ener...,5.0,Great ingredients!,1405555200,"07 17, 2014",Lose Weight Fast and Safely with Premium Natur...,#1-pure-raspberry-ketones---antioxidant-nutrit...,http://ecx.images-amazon.com/images/I/51NaXSF1...,"{'also_bought': ['B00JJOEV9Y', 'B00M04HZIO', '...",110482.0,Vitamins & Dietary Supplements Weight Loss Sup...,19.95,,low
346351,AFVKI7BCS3FSX,B00LJBMCKK,Laura hogan,"[0, 0]",I was drawn to this particular brand because o...,5.0,Great!,1405468800,"07 16, 2014",Lose Weight Fast and Safely with Premium Natur...,#1-pure-raspberry-ketones---antioxidant-nutrit...,http://ecx.images-amazon.com/images/I/51NaXSF1...,"{'also_bought': ['B00JJOEV9Y', 'B00M04HZIO', '...",110482.0,Vitamins & Dietary Supplements Weight Loss Sup...,19.95,,low
346352,A1Y3FEUELH3FTI,B00LJBMCKK,Onlinesalesgirl,"[0, 0]","Great Product, Gave extra energy without the j...",5.0,Energy minus the jitters,1405123200,"07 12, 2014",Lose Weight Fast and Safely with Premium Natur...,#1-pure-raspberry-ketones---antioxidant-nutrit...,http://ecx.images-amazon.com/images/I/51NaXSF1...,"{'also_bought': ['B00JJOEV9Y', 'B00M04HZIO', '...",110482.0,Vitamins & Dietary Supplements Weight Loss Sup...,19.95,,low
346353,A11LC938XF35XN,B00LJBMCKK,"SJ Blouse ""Stacey""","[0, 0]",I get lots of energy from this product. I felt...,5.0,I get lots of energy from this product.,1405641600,"07 18, 2014",Lose Weight Fast and Safely with Premium Natur...,#1-pure-raspberry-ketones---antioxidant-nutrit...,http://ecx.images-amazon.com/images/I/51NaXSF1...,"{'also_bought': ['B00JJOEV9Y', 'B00M04HZIO', '...",110482.0,Vitamins & Dietary Supplements Weight Loss Sup...,19.95,,low


In [8]:
# Reduce sparsity by keeping only reviewers with at least MIN_REVIEWS_PER_USER reviews.
# (The previous comment said "over 20", but the filter has always been >= 10.)
MIN_REVIEWS_PER_USER = 10

# value_counts + map yields the same rows, in the same order, as
# groupby('reviewerID').filter(lambda x: len(x) >= 10), without calling a
# Python function once per reviewer. Rows with a missing reviewerID map to NaN
# and are dropped, just as groupby drops them.
reviews_per_user = df['reviewerID'].map(df['reviewerID'].value_counts())
cut_down = df[reviews_per_user >= MIN_REVIEWS_PER_USER]

In [9]:
cut_down.reviewerID.value_counts()

A3NHUQ33CFH3VM    292
A1UQBFCERIP7VJ    264
A3OXHLG6DIBRW8    250
A34BZM6S9L7QI4    246
ALNFHVS3SC4FV     219
                 ... 
A1CYDKM3TYTB3H     10
A3SYYNCB421RNF     10
A2S0LBS0S21GPG     10
AV3JRG103FWWM      10
A3CVXU4JDLYXJ1     10
Name: reviewerID, Length: 8731, dtype: int64

In [10]:
# Keep only the (user, item, rating) columns, in that order, for loading into Surprise.
cut_down = cut_down.loc[:, ['reviewerID', 'asin', 'overall']]

In [11]:
# Set aside 20% of the filtered ratings as a holdout set for final testing;
# the fixed random_state keeps the split identical between runs.
train_data, holdout_data = sklearn.model_selection.train_test_split(
    cut_down,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Ratings are on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Wrap the training frame and the holdout frame as Surprise datasets.
data, holdout = (
    Dataset.load_from_df(frame, reader)
    for frame in (train_data, holdout_data)
)

#### model selection and testing

In [15]:
# Compare candidate algorithms with 3-fold cross-validation on the training data.
benchmark = []
for model in [SVD(), NormalPredictor(), SlopeOne(), CoClustering()]:
    cv_score = cross_validate(model, data, measures=['RMSE'], cv=3, verbose=False)

    # Average each reported measure (test_rmse, fit_time, test_time) over the folds.
    # Rows are plain dicts because Series.append was removed in pandas 2.0.
    row = {measure: np.mean(values) for measure, values in cv_score.items()}
    row['Model'] = type(model).__name__
    benchmark.append(row)

pd.DataFrame(benchmark).set_index('Model').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.987893,6.75458,0.440871
SlopeOne,1.082729,8.26318,1.018466
CoClustering,1.08327,4.564478,0.263927
NormalPredictor,1.389417,0.186949,0.399694


In [None]:
# Hyper-parameter grid for SVD: 3 * 2 * 3 * 3 = 54 combinations, each scored with 3-fold CV.
param_grid = dict(
    n_factors=[5, 10, 20],
    n_epochs=[100, 200],
    lr_all=[0.002, 0.005, 0.01],
    reg_all=[0.02, 0.05, 0.1],
)

gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
)
gs.fit(data)

In [None]:
gs.best_score

In [None]:
gs.best_params

In [None]:
# Split the training data 75/25; random_state makes the split reproducible.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# The error-analysis cell below reads `predictions`, so it has to be produced here.
# The earlier commented-out version relied on BaselineOnly / bsl_options, neither of
# which is defined in this notebook; use the best SVD found by the grid search instead.
# NOTE: requires the grid-search cell (gs.fit) to have been run first.
algo = gs.best_estimator['rmse']
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

In [None]:
def get_Iu(uid):
    """Count the items a user has rated in the global ``trainset``.

    args:
      uid: the raw id of the user
    returns:
      the number of items rated by the user, or 0 when the user is
      unknown to the trainset
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
        n_items = len(trainset.ur[inner_uid])
    except ValueError:
        # to_inner_uid raises ValueError for users that are not in the trainset
        n_items = 0
    return n_items
    
def get_Ui(iid):
    """Count the users who have rated an item in the global ``trainset``.

    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item, or 0 when the item is
      unknown to the trainset
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
        n_users = len(trainset.ir[inner_iid])
    except ValueError:
        # to_inner_iid raises ValueError for items that are not in the trainset
        n_users = 0
    return n_users
    
# Put the predictions in their own frame. A dedicated name is used so the raw
# review DataFrame `df` loaded at the top of the notebook is not overwritten
# (re-running the earlier filtering cell would otherwise fail).
predictions_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
predictions_df['Iu'] = predictions_df['uid'].apply(get_Iu)  # items rated by the user in trainset
predictions_df['Ui'] = predictions_df['iid'].apply(get_Ui)  # users who rated the item in trainset
predictions_df['err'] = (predictions_df['est'] - predictions_df['rui']).abs()

# Sort by absolute error once, then take both ends of the ranking.
ranked_by_err = predictions_df.sort_values(by='err')
best_predictions = ranked_by_err[:10]
worst_predictions = ranked_by_err[-10:]

In [44]:
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [46]:
# fpr, tpr, thresholds = metrics.roc_curve(y, pred)
# metrics.auc(fpr, tpr)

In [48]:
def graph_roc(true_r, est, threshold=None):
    """Plot the ROC curve for a set of predictions and report its AUC in the title.

    args:
      true_r: true labels, one per prediction. ``roc_curve`` expects binary
        labels, so pass 0/1 values or supply ``threshold``.
      est: predicted scores, in the same order as ``true_r``.
      threshold: optional rating cut-off. When given, a true rating counts as
        positive if it is >= threshold. Defaults to None (``true_r`` is used as is).
    returns:
      None; the figure is shown with ``plt.show()``.
    """
    if threshold is not None:
        true_r = [int(rating >= threshold) for rating in true_r]

    fpr, tpr, _ = roc_curve(true_r, est)
    auc_ = auc(fpr, tpr)

    # fpr is plotted on the x axis and tpr on the y axis, so label them that way
    # (the labels used to be swapped, and FPR was mislabelled as precision).
    plt.plot(fpr, tpr, '-')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate (Recall)")
    plt.title("ROC Scores and Auc of " + str(round(auc_, 2)))
    plt.show()
    

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user.

    args:
      predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``
      k: how many of each user's highest-estimated items form the
        recommendation list
      threshold: an item is relevant when ``true_r >= threshold`` and
        recommended when ``est >= threshold``
    returns:
      ``(precisions, recalls)``: two dicts keyed by uid. A metric that is
      undefined for a user (no recommended / no relevant items) is set to 0.
    """
    # Imported locally: the notebook does not import defaultdict explicitly,
    # so relying on a global name fails with NameError on a fresh kernel.
    from collections import defaultdict

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, user_ratings in user_est_true.items():
        # The k items with the highest estimated rating (stable for ties).
        top_k = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)[:k]

        # Number of relevant items (over all of the user's predictions)
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum(est >= threshold for est, _ in top_k)

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            (true_r >= threshold) and (est >= threshold) for est, true_r in top_k
        )

        # Precision@K: proportion of recommended items that are relevant.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: proportion of relevant items that are recommended.
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [None]:

# KFold is not imported anywhere else in the notebook, so bring it in here.
from surprise.model_selection import KFold

# Cross-validate on the notebook's own `data` (the training ratings built above).
# This cell used to reload the built-in MovieLens 'ml-100k' set into `data`,
# which overwrote the project dataset and reported metrics for unrelated data.
kf = KFold(n_splits=5, random_state=42)
algo = SVD(random_state=42)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(precisions.values()) / len(precisions))
    print(sum(recalls.values()) / len(recalls))