In [1]:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from collections import defaultdict
from surprise.model_selection import train_test_split 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_auc_score


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [35]:
#reading files
df_books = pd.read_csv('goodreads_books_poetry.csv')
df_users = pd.read_csv('goodreads_interactions_poetry.csv')


In [None]:
df_users.columns

In [7]:
df_books = df_books[df_books['language_code'] == 'eng']

In [9]:
df_books.shape

(8393, 29)

In [11]:
# Extract the unique book_id values from df_books
valid_book_ids = df_books['book_id'].unique()

# Filter the rows in df_users
df_users = df_users[df_users['book_id'].isin(valid_book_ids)]

In [5]:
df_users.head()

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
0,8842281e1d1347389f2ab93d60773d4d,1384,1bad0122cebb4aa9213f9fe1aa281f66,True,4,,Wed May 09 09:33:44 -0700 2007,Wed May 09 09:33:44 -0700 2007,,
1,8842281e1d1347389f2ab93d60773d4d,1376,eb6e502d0c04d57b43a5a02c21b64ab4,True,4,,Wed May 09 09:33:18 -0700 2007,Wed May 09 09:33:18 -0700 2007,,
2,8842281e1d1347389f2ab93d60773d4d,30119,787564bef16cb1f43e0f641ab59d25b7,True,5,,Sat Jan 13 13:44:20 -0800 2007,Wed Mar 22 11:45:08 -0700 2017,Tue Mar 01 00:00:00 -0800 1983,
3,72fb0d0087d28c832f15776b0d936598,24769928,8c80ee74743d4b3b123dd1a2e0c0bcac,False,0,,Wed Apr 27 11:05:51 -0700 2016,Wed Apr 27 11:05:52 -0700 2016,,
4,72fb0d0087d28c832f15776b0d936598,30119,2a83589fb597309934ec9b1db5876aaf,True,3,,Mon Jun 04 18:58:08 -0700 2012,Mon Jun 04 18:58:13 -0700 2012,,


In [12]:
df_users=df_users[['user_id', 'book_id', 'rating']]

In [13]:
df_users.head()

Unnamed: 0,user_id,book_id,rating
0,8842281e1d1347389f2ab93d60773d4d,1384,4
1,8842281e1d1347389f2ab93d60773d4d,1376,4
2,8842281e1d1347389f2ab93d60773d4d,30119,5
3,72fb0d0087d28c832f15776b0d936598,24769928,0
4,72fb0d0087d28c832f15776b0d936598,30119,3


In [14]:
#splitting data into train and test sets
train_split, test_split = train_test_split(df_users, test_size = 0.25, random_state = 20)
print("Training data size:", train_split.shape)
print("Test data size:", test_split.shape)


Training data size: (1014663, 3)
Test data size: (338221, 3)


In [15]:
#reader for parsing the ratings file
reader = Reader(rating_scale=(1, 5))
#building the train and test set, loading the data from dataframe 
train_build = Dataset.load_from_df(train_split, reader)
test_build = Dataset.load_from_df(test_split, reader)
trainset = train_build.build_full_trainset()
testset = test_build.build_full_trainset().build_testset() 
print("Test set size:", len(testset))


Test set size: 338221


In [17]:
#model building
#takes in factors, epochs, learning rate and regularization parameter 
model = SVDpp(n_factors=30,n_epochs=20,lr_all=0.01,reg_all=0.05) 
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7ff6bff716a0>

In [18]:
#making predictions
predictions = model.test(testset)
#calculating rmse
accuracy.rmse(predictions, verbose = True)
#Save all the predicted ratings and convert it to a dataframe 



RMSE: 1.8019


1.8018896613803914

In [19]:
all_recommendations_list = defaultdict(list)
all_recommendations_df = pd.DataFrame([])
# Create empty lists to collect data for each column
user_list = []
book_id_list = []
predicted_rating_list = []

for uid, iid, true_r, est, _ in predictions:
    all_recommendations_list[uid].append((iid, est))
    user_list.append(uid)
    book_id_list.append(iid)
    predicted_rating_list.append(est)


#for uid, iid, true_r, est, _ in predictions:
    #all_recommendations_list[uid].append((iid, est))
    
# Create a DataFrame using the collected data
all_recommendations_df = pd.DataFrame({'user': user_list, 'book_id': book_id_list, 'predicted_rating': predicted_rating_list})
    #all_recommendations_df = all_recommendations_df.append(pd.DataFrame({'user': uid, 'movieId': iid, 'predicted_rating' : est}, index=[0]), ignore_index=True); 
print(all_recommendations_df.head(5)) 
print(all_recommendations_df.shape)



                               user   book_id  predicted_rating
0  95c25c4181bd7449f1ce2e2dafe7c21e     30119          2.139364
1  091402d81bef94103c1ef99586c4d365     27822          1.922856
2  091402d81bef94103c1ef99586c4d365      1420          2.014041
3  c958665f697b9b750102b28d61680a0b  18263725          2.905318
4  a8e23ebfd310fc499af43ace51b1c894      1715          1.946904
(338221, 3)


In [20]:
#Merging with movies file to get genre, title information for predictions
all_recommendations_df_details = pd.merge(all_recommendations_df,df_books, on='book_id', how='inner') 
print(all_recommendations_df_details.head(5))


                               user  book_id  predicted_rating        isbn  \
0  95c25c4181bd7449f1ce2e2dafe7c21e    30119          2.139364  0060513039   
1  a8e23ebfd310fc499af43ace51b1c894    30119          3.126805  0060513039   
2  a5b0434e1f3179930816fea2b9193b6e    30119          2.909621  0060513039   
3  238dcb73b558f030afd477ed6be48232    30119          3.330055  0060513039   
4  87070937fd0a8f99c7dd0dfceef7ee37    30119          1.000000  0060513039   

   text_reviews_count series country_code language_code  \
0                8950     []           US           eng   
1                8950     []           US           eng   
2                8950     []           US           eng   
3                8950     []           US           eng   
4                8950     []           US           eng   

                                     popular_shelves asin  ...         isbn13  \
0  [{'count': '20447', 'name': 'to-read'}, {'coun...  NaN  ...  9780060513030   
1  [{'count': 

In [34]:
all_recommendations_df_details.columns

Index(['user', 'book_id', 'predicted_rating', 'isbn', 'text_reviews_count',
       'series', 'country_code', 'language_code', 'popular_shelves', 'asin',
       'is_ebook', 'average_rating', 'kindle_asin', 'similar_books',
       'description', 'format', 'link', 'authors', 'publisher', 'num_pages',
       'publication_day', 'isbn13', 'publication_month', 'edition_information',
       'publication_year', 'url', 'image_url', 'ratings_count', 'work_id',
       'title', 'title_without_series'],
      dtype='object')

In [21]:
#List of top n recommendations list as per SVD++
def get_top_n_recommendation_list_df(all_recommendations_df_details, n=10):
    top_n_recommendations_df = all_recommendations_df_details.sort_values(['user','predicted_rating'] ,ascending=[True, False])
    return top_n_recommendations_df


In [22]:
top_n_recommendations_df = get_top_n_recommendation_list_df(all_recommendations_df_details, 10) 
print(top_n_recommendations_df.head())


                                    user   book_id  predicted_rating  \
257732  0001085188e302fc6b2568de45a5f56b  32334098          1.000000   
191271  00013344a98d0147eacab88d3e1502ef      5932          4.888991   
274369  00013344a98d0147eacab88d3e1502ef    138165          4.674417   
106610  000192962b87d560f00b06fdcbd71681     30118          4.082191   
83978   000243c8e211fb3f359e4ff45ca899ea     15997          1.000000   

              isbn  text_reviews_count       series country_code  \
257732  144948641X                 221  ['1137480']           US   
191271  0143039962                 882           []           US   
274369  0393974979                  19           []           US   
106610  0060513063                2222           []           US   
83978   0140424393                2098   ['465549']           US   

       language_code                                    popular_shelves asin  \
257732           eng  [{'count': '62826', 'name': 'to-read'}, {'coun...  NaN  

In [23]:
metrics=[]
true_positives_array = []
est_array = []
for rating_threshold in np.arange(0,5.5,0.5):
    truePositives = 0
    trueNegatives = 0
    falseNegatives = 0
    falsePositives = 0
    accuracy =0
    precision =0
    recall =0
    f1_score = 0
    for uid,_, true_r, est, _ in predictions:
        if(true_r >= rating_threshold and est >= rating_threshold):
            truePositives = truePositives + 1 
            true_positives_array.append(true_r) 
            est_array.append(est)
#here
        elif(true_r >= rating_threshold and est<= rating_threshold):
            falseNegatives = falseNegatives + 1 
        elif(true_r <= rating_threshold and est >=rating_threshold):
             falsePositives = falsePositives + 1 
        elif(true_r <= rating_threshold and est<=rating_threshold):
            trueNegatives = trueNegatives + 1 
        if(truePositives > 0):
            accuracy = (truePositives + trueNegatives ) / (truePositives + trueNegatives + falsePositives + falseNegatives)
            precision = truePositives / (truePositives + falsePositives)
            recall = truePositives / (truePositives + falseNegatives) 
            f1_score = 2 * (precision * recall) / (precision + recall)



In [24]:
metrics.append([rating_threshold,truePositives,trueNegatives,falsePositives,falseNegatives,accuracy,precision,recall,f1_score])
metrics_df = pd.DataFrame(metrics)
metrics_df.rename(columns={0:'rating_threshold', 1:'truePositives', 2: 'trueNegatives', 3: 'falsePositives', 4:'falseNegatives', 5: 'Accuracy', 6: 'Precision', 7:'Recall', 8:'F1 Score'},inplace=True)
true_bin_array =[]
for x in true_positives_array:
    if x >= rating_threshold: 
        x=1
    else: 
        x=0
    true_bin_array.append(x)
auc_score =roc_auc_score(true_bin_array,est_array,multi_class='raise',average='macro')
print('AUC Score: ',auc_score)


AUC Score:  0.6871795906097623


In [27]:
#Calculate precision and recall at n
def get_precision_recall_at_n(predictions,topn,rating_threshold): 
    all_actual_predicted_list = defaultdict(list)
    precision = dict()
    recall= dict()
    no_of_relevant_items = 0 
    no_of_recommended_items_at_top_n = 0 
    no_of_relevant_recommended_items_at_top_n = 0  
    for uid, iid, true_r, est, _ in predictions:
        all_actual_predicted_list[uid].append((est, true_r)) 
    for uid, user_ratings in all_actual_predicted_list.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        no_of_relevant_items = sum((true_r >= rating_threshold) for (_, true_r) in user_ratings)
        no_of_recommended_items_at_top_n = sum((est >= rating_threshold) for (est, _) in user_ratings[:topn])
        no_of_relevant_recommended_items_at_top_n = sum(((true_r >= rating_threshold) and (est >= rating_threshold)) for (est, true_r) in user_ratings[:topn])
        precision[uid] = no_of_relevant_recommended_items_at_top_n / no_of_recommended_items_at_top_n if no_of_recommended_items_at_top_n != 0 else 1
        recall[uid] = no_of_relevant_recommended_items_at_top_n / no_of_relevant_items if no_of_relevant_items != 0 else 1
    return precision, recall 

rating_threshold=3 
precision_recall_at_n = [] 
for topn in range(2,20):
    precision, recall = get_precision_recall_at_n(predictions,topn,rating_threshold)
    precision_at_n = sum(prec for prec in precision.values()) / len(precision)

    recall_at_n = sum(rec for rec in recall.values()) / len(recall)
    precision_recall_at_n.append({'topN' : topn, 'Precision' : precision_at_n, 'Recall': recall_at_n})
for n in range(3,9):
    print(precision_recall_at_n[n])
#get user high rated and liked movies
all_book_df_details = pd.merge(df_users,df_books, on='book_id', how='inner')
all_book_df_details = all_book_df_details.sort_values(['user_id','rating'],ascending=[True, False])
print(all_book_df_details.loc[all_book_df_details['user_id'] == "0001085188e302fc6b2568de45a5f56b"].head(10)) #user 10 top 10 rated movies
#user 10 top 10 movie recommendations list 
print(top_n_recommendations_df.loc[top_n_recommendations_df['user'] == "0001085188e302fc6b2568de45a5f56b"].head(10))

{'topN': 5, 'Precision': 0.9514744846167148, 'Recall': 0.6194278743750823}
{'topN': 6, 'Precision': 0.9514117043667423, 'Recall': 0.6205048448279459}
{'topN': 7, 'Precision': 0.9513568286518761, 'Recall': 0.6211155210775254}
{'topN': 8, 'Precision': 0.9513252485140271, 'Recall': 0.6215143435700378}
{'topN': 9, 'Precision': 0.9512958978604388, 'Recall': 0.6217608647218136}
{'topN': 10, 'Precision': 0.9512856118307823, 'Recall': 0.6219511636655016}
                                 user_id   book_id  rating        isbn  \
7028    0001085188e302fc6b2568de45a5f56b     30119       5  0060513039   
134334  0001085188e302fc6b2568de45a5f56b     30118       5  0060513063   
183629  0001085188e302fc6b2568de45a5f56b      1420       0  0521618746   
790800  0001085188e302fc6b2568de45a5f56b  32334098       0  144948641X   

        text_reviews_count       series country_code language_code  \
7028                  8950           []           US           eng   
134334                2222           [