In [58]:
import pandas as pd
import numpy as np
import itertools
import sklearn.model_selection

# display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# custom helper functions
%load_ext autoreload
%autoreload 2
from helper.general_helper import *

# surprise imports
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, NormalPredictor, SlopeOne, CoClustering
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold
from surprise import accuracy


# view plotly in jupyter 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Read the merged review data from a CSV that lives two directories above
# this notebook (relative path, so the working directory matters).
df = pd.read_csv('../../merged_df.csv')

In [49]:
# Bare last expression: render the loaded frame as this cell's output.
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,related,categories,price,prod_review_counts,free,percent_helpful
0,A1N4O8VOJZTDVB,B004A9SDD8,Annette Yancey,"[1, 1]","Loves the song, so he really couldn't wait to ...",3.0,Really cute,1383350400,2013-11-02,"{'also_bought': ['B006M3K874', 'B00F85SMOI', '...",Kids,0.0,medium,1,1.00
1,A2HQWU6HUKIEC7,B004A9SDD8,"Audiobook lover ""Kathy""","[0, 0]","Oh, how my little grandson loves this app. He'...",5.0,2-year-old loves it,1323043200,2011-12-05,"{'also_bought': ['B006M3K874', 'B00F85SMOI', '...",Kids,0.0,medium,1,0.00
2,A1SXASF6GYG96I,B004A9SDD8,Barbara Gibbs,"[0, 0]",I found this at a perfect time since my daught...,5.0,Fun game,1337558400,2012-05-21,"{'also_bought': ['B006M3K874', 'B00F85SMOI', '...",Kids,0.0,medium,1,0.00
3,A2B54P9ZDYH167,B004A9SDD8,"Brooke Greenstreet ""Babylove""","[3, 4]",My 1 year old goes back to this game over and ...,5.0,We love our Monkeys!,1354752000,2012-12-06,"{'also_bought': ['B006M3K874', 'B00F85SMOI', '...",Kids,0.0,medium,1,0.75
4,AFOFZDTX5UC6D,B004A9SDD8,C. Galindo,"[1, 1]",There are three different versions of the song...,5.0,This is my granddaughters favorite app on my K...,1391212800,2014-02-01,"{'also_bought': ['B006M3K874', 'B00F85SMOI', '...",Kids,0.0,medium,1,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
752932,AZJ11YS0E52AI,B00LUEMK44,K. Perna,"[0, 0]",I love it!!!!!!!!!! really keeps your attenti...,5.0,Five Stars,1405814400,2014-07-20,"{'also_bought': ['B00KTDD69A', 'B00LAEGVDK', '...",Games,0.0,medium,1,0.00
752933,A2550XGZEFDH2Y,B00LUEMK44,"Melanie G. Nihart ""Grammy""","[0, 0]",Okay but there are so many free ones that are ...,3.0,... are so many free ones that are so much bet...,1405900800,2014-07-21,"{'also_bought': ['B00KTDD69A', 'B00LAEGVDK', '...",Games,0.0,medium,1,0.00
752934,A1KNDB16TG5QXD,B00LUEMK44,P. O'Reilly,"[0, 0]",Another great jewels game that just keeps you ...,4.0,Enjoyable,1405900800,2014-07-21,"{'also_bought': ['B00KTDD69A', 'B00LAEGVDK', '...",Games,0.0,medium,1,0.00
752935,A1IHFHA5LI9SGI,B00LUEMK44,redhatflusher,"[0, 0]",I find this the best jewels star ever. There s...,5.0,entertaining,1405814400,2014-07-20,"{'also_bought': ['B00KTDD69A', 'B00LAEGVDK', '...",Games,0.0,medium,1,0.00


In [50]:
# Cut down the sparsity of the user/item matrix by keeping only users who have
# written at least MIN_REVIEWS_PER_USER reviews.
MIN_REVIEWS_PER_USER = 10

# transform('size') broadcasts every user's review count back onto that user's
# rows, so the filter is one vectorised boolean mask instead of a Python lambda
# called once per user. Row order of `df` is preserved.
reviews_per_user = df.groupby('reviewerID')['reviewerID'].transform('size')
cut_down = df[reviews_per_user >= MIN_REVIEWS_PER_USER]

In [51]:
# Reviews per remaining user; value_counts() lists the most active users first,
# so the tail of the output shows the smallest count that survived the filter.
cut_down.reviewerID.value_counts()

A1X1CEGHTHMBL1    565
ACX8G3IW95BNE     513
A2C05OHYJERICI    469
A3F3B6HY9RJI04    467
A1VM5Y6BL07X9X    338
                 ... 
A1I5CXNJMDY52L     10
A27APFGL06398L     10
A3W12UN5P6UAB2     10
A2FCX6CDFG392      10
A383N522ZXP835     10
Name: reviewerID, Length: 20104, dtype: int64

In [55]:
# Keep only the user id, item id and rating columns, in that order; this is the
# three-column frame that is later split and passed to Dataset.load_from_df.
cut_down = cut_down[['reviewerID', 'asin', 'overall']]

In [56]:
# Create a holdout set for final testing: a random 80/20 split of the rows,
# seeded with random_state=42 so the same rows are held out on every run.
train_data, holdout_data = sklearn.model_selection.train_test_split(cut_down, test_size=.2, random_state=42)

In [57]:
# Reader declares the rating scale (1 to 5) used when loading the frames below.
reader = Reader(rating_scale=(1, 5))

# Wrap the training split (`data`) and the holdout split (`holdout`) as
# Surprise datasets.
data = Dataset.load_from_df(train_data, reader)
holdout = Dataset.load_from_df(holdout_data, reader)

#### model selection and testing

In [None]:
# Benchmark several Surprise algorithms with 3-fold cross-validation on the
# training data and rank them by mean test RMSE (lower is better).
benchmark = []
for algorithm in [SVD(), NormalPredictor(), SlopeOne(), CoClustering()]:
    cv_score = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average every reported quantity (test_rmse, fit_time, test_time) over the
    # folds and tag the row with the algorithm's class name.
    row = {metric: np.mean(values) for metric, values in cv_score.items()}
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)

# Build the frame once from the collected records instead of appending Series.
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

In [None]:
# Split the training data 75/25 so individual predictions can be inspected.
# Seeded for reproducibility, matching the seed used for the holdout split.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# The following cell needs `predictions`. The earlier draft of this cell used
# BaselineOnly(bsl_options=bsl_options), but neither name is imported/defined
# in this notebook, so SVD (already imported) is fitted instead.
algo = SVD(random_state=42)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

In [None]:
def get_Iu(uid):
    """ count the items a user has rated in the global `trainset`
    args:
      uid: the raw id of the user
    returns:
      the length of the user's entry in trainset.ur, or 0 when the raw id
      cannot be converted to an inner id (user not in the trainset)
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
        user_ratings = trainset.ur[inner_uid]
        return len(user_ratings)
    except ValueError:
        # to_inner_uid raises ValueError for users unknown to the trainset
        return 0
    
def get_Ui(iid):
    """ count the users that have rated an item in the global `trainset`
    args:
      iid: the raw id of the item
    returns:
      the length of the item's entry in trainset.ir, or 0 when the raw id
      cannot be converted to an inner id (item not in the trainset)
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
        item_ratings = trainset.ir[inner_iid]
        return len(item_ratings)
    except ValueError:
        # to_inner_iid raises ValueError for items unknown to the trainset
        return 0
    
# Tabulate every prediction together with how much training data backed it and
# its absolute error.
# NOTE(review): this rebinds `df`, which earlier held the raw reviews read from
# the CSV; re-running the earlier filtering cell after this one will not work.
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df = df.assign(
    Iu=df['uid'].map(get_Iu),            # items rated by the user in the trainset
    Ui=df['iid'].map(get_Ui),            # users who rated the item in the trainset
    err=(df['est'] - df['rui']).abs(),   # absolute error of the estimate
)

# Ten smallest and ten largest absolute errors.
best_predictions = df.sort_values(by='err').head(10)
worst_predictions = df.sort_values(by='err').tail(10)

In [44]:
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [46]:
# fpr, tpr, thresholds = metrics.roc_curve(y, pred)
# metrics.auc(fpr, tpr)

In [48]:
def graph_roc(true_r, est, threshold=None):
    """Plot the ROC curve of `est` against `true_r` and show the AUC in the title.

    args:
      true_r: true labels, passed to roc_curve as y_true. roc_curve expects
        binary labels, so raw star ratings need `threshold` to be set.
      est: predicted scores, passed to roc_curve as y_score
      threshold: optional cut-off; when given, `true_r` is binarised as
        ``true_r >= threshold`` before the curve is computed. The default
        (None) passes `true_r` through unchanged.
    returns:
      None; the figure is displayed with plt.show()
    """
    if threshold is not None:
        true_r = np.asarray(true_r) >= threshold

    fpr, tpr, _ = roc_curve(true_r, est)
    auc_ = auc(fpr, tpr)

    # fpr is plotted on the x axis and tpr on the y axis, so label them that way.
    plt.plot(fpr, tpr, '-')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate (Recall)")
    plt.title("ROC Scores and Auc of " + str(round(auc_, 2)))
    plt.show()
    

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user.

    args:
      predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
        only uid, true_r and est are used.
      k: number of top-estimated items per user that count as recommended
      threshold: a rating (true or estimated) at or above this value counts
        as relevant / recommended
    returns:
      (precisions, recalls): two dicts keyed by uid. A metric whose
      denominator is zero is reported as 0.
    """
    # No explicit import of defaultdict is visible in this notebook, so bring
    # it into scope here to keep the function self-contained.
    from collections import defaultdict

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value, best first
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)
        top_k = user_ratings[:k]

        # Number of relevant items (over all of the user's predictions)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in top_k)

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [None]:

# Precision@5 / recall@5 with 5-fold cross-validation on `data`, the Surprise
# dataset built from the training split earlier in the notebook. The previous
# version of this cell replaced `data` with the built-in 'ml-100k' dataset,
# which scored a different dataset and clobbered the training data for any
# cell run afterwards.
kf = KFold(n_splits=5, random_state=42)
algo = SVD(random_state=42)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(precisions.values()) / len(precisions))
    print(sum(recalls.values()) / len(recalls))