In [1]:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from collections import defaultdict
from surprise.model_selection import train_test_split 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_auc_score


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
#reading files
df_books = pd.read_csv('goodreads_books_poetry.csv')
df_users = pd.read_csv('goodreads_interactions_poetry.csv')


In [3]:
df_users.columns

Index(['user_id', 'book_id', 'review_id', 'is_read', 'rating',
       'review_text_incomplete', 'date_added', 'date_updated', 'read_at',
       'started_at'],
      dtype='object')

In [4]:
df_books = df_books[df_books['language_code'] == 'eng']

In [5]:
df_books.shape

(8393, 29)

In [6]:
# Extract the unique book_id values from df_books
valid_book_ids = df_books['book_id'].unique()

# Filter the rows in df_users
df_users = df_users[df_users['book_id'].isin(valid_book_ids)]

In [7]:
df_users.head()

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
0,8842281e1d1347389f2ab93d60773d4d,1384,1bad0122cebb4aa9213f9fe1aa281f66,True,4,,Wed May 09 09:33:44 -0700 2007,Wed May 09 09:33:44 -0700 2007,,
1,8842281e1d1347389f2ab93d60773d4d,1376,eb6e502d0c04d57b43a5a02c21b64ab4,True,4,,Wed May 09 09:33:18 -0700 2007,Wed May 09 09:33:18 -0700 2007,,
2,8842281e1d1347389f2ab93d60773d4d,30119,787564bef16cb1f43e0f641ab59d25b7,True,5,,Sat Jan 13 13:44:20 -0800 2007,Wed Mar 22 11:45:08 -0700 2017,Tue Mar 01 00:00:00 -0800 1983,
3,72fb0d0087d28c832f15776b0d936598,24769928,8c80ee74743d4b3b123dd1a2e0c0bcac,False,0,,Wed Apr 27 11:05:51 -0700 2016,Wed Apr 27 11:05:52 -0700 2016,,
4,72fb0d0087d28c832f15776b0d936598,30119,2a83589fb597309934ec9b1db5876aaf,True,3,,Mon Jun 04 18:58:08 -0700 2012,Mon Jun 04 18:58:13 -0700 2012,,


In [8]:
df_users=df_users[['user_id', 'book_id', 'rating']]

In [9]:
df_users.head()

Unnamed: 0,user_id,book_id,rating
0,8842281e1d1347389f2ab93d60773d4d,1384,4
1,8842281e1d1347389f2ab93d60773d4d,1376,4
2,8842281e1d1347389f2ab93d60773d4d,30119,5
3,72fb0d0087d28c832f15776b0d936598,24769928,0
4,72fb0d0087d28c832f15776b0d936598,30119,3


In [10]:
#splitting data into train and test sets
train_split, test_split = train_test_split(df_users, test_size = 0.25, random_state = 20)
print("Training data size:", train_split.shape)
print("Test data size:", test_split.shape)


Training data size: (1014663, 3)
Test data size: (338221, 3)


In [11]:
#reader for parsing the ratings file
reader = Reader(rating_scale=(1, 5))
#building the train and test set, loading the data from dataframe 
train_build = Dataset.load_from_df(train_split, reader)
test_build = Dataset.load_from_df(test_split, reader)
trainset = train_build.build_full_trainset()
testset = test_build.build_full_trainset().build_testset() 
print("Test set size:", len(testset))


Test set size: 338221


In [12]:
#model building
#takes in factors, epochs, learning rate and regularization parameter 
model = SVDpp(n_factors=30,n_epochs=20,lr_all=0.01,reg_all=0.05) 
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fae62182fa0>

In [13]:
#making predictions
predictions = model.test(testset)
#calculating rmse
accuracy.rmse(predictions, verbose = True)
#Save all the predicted ratings and convert it to a dataframe 



RMSE: 1.8000


1.8000174327805702

In [14]:
all_recommendations_list = defaultdict(list)
all_recommendations_df = pd.DataFrame([])
# Create empty lists to collect data for each column
user_list = []
book_id_list = []
predicted_rating_list = []

for uid, iid, true_r, est, _ in predictions:
    all_recommendations_list[uid].append((iid, est))
    user_list.append(uid)
    book_id_list.append(iid)
    predicted_rating_list.append(est)


#for uid, iid, true_r, est, _ in predictions:
    #all_recommendations_list[uid].append((iid, est))
    
# Create a DataFrame using the collected data
all_recommendations_df = pd.DataFrame({'user': user_list, 'book_id': book_id_list, 'predicted_rating': predicted_rating_list})
    #all_recommendations_df = all_recommendations_df.append(pd.DataFrame({'user': uid, 'movieId': iid, 'predicted_rating' : est}, index=[0]), ignore_index=True); 
print(all_recommendations_df.head(5)) 
print(all_recommendations_df.shape)



                               user   book_id  predicted_rating
0  95c25c4181bd7449f1ce2e2dafe7c21e     30119          2.139957
1  091402d81bef94103c1ef99586c4d365     27822          1.904122
2  091402d81bef94103c1ef99586c4d365      1420          2.035555
3  c958665f697b9b750102b28d61680a0b  18263725          2.550888
4  a8e23ebfd310fc499af43ace51b1c894      1715          2.794470
(338221, 3)


In [15]:
#Merging with movies file to get genre, title information for predictions
all_recommendations_df_details = pd.merge(all_recommendations_df,df_books, on='book_id', how='inner') 
print(all_recommendations_df_details.head(5))


                               user  book_id  predicted_rating        isbn  \
0  95c25c4181bd7449f1ce2e2dafe7c21e    30119          2.139957  0060513039   
1  a8e23ebfd310fc499af43ace51b1c894    30119          2.895295  0060513039   
2  a5b0434e1f3179930816fea2b9193b6e    30119          2.814852  0060513039   
3  238dcb73b558f030afd477ed6be48232    30119          3.612540  0060513039   
4  87070937fd0a8f99c7dd0dfceef7ee37    30119          1.094194  0060513039   

   text_reviews_count series country_code language_code  \
0                8950     []           US           eng   
1                8950     []           US           eng   
2                8950     []           US           eng   
3                8950     []           US           eng   
4                8950     []           US           eng   

                                     popular_shelves asin  ...         isbn13  \
0  [{'count': '20447', 'name': 'to-read'}, {'coun...  NaN  ...  9780060513030   
1  [{'count': 

In [16]:
all_recommendations_df_details.columns

Index(['user', 'book_id', 'predicted_rating', 'isbn', 'text_reviews_count',
       'series', 'country_code', 'language_code', 'popular_shelves', 'asin',
       'is_ebook', 'average_rating', 'kindle_asin', 'similar_books',
       'description', 'format', 'link', 'authors', 'publisher', 'num_pages',
       'publication_day', 'isbn13', 'publication_month', 'edition_information',
       'publication_year', 'url', 'image_url', 'ratings_count', 'work_id',
       'title', 'title_without_series'],
      dtype='object')

In [17]:
#List of top n recommendations list as per SVD++
def get_top_n_recommendation_list_df(all_recommendations_df_details, n=10):
    top_n_recommendations_df = all_recommendations_df_details.sort_values(['user','predicted_rating'] ,ascending=[True, False])
    return top_n_recommendations_df


In [18]:
top_n_recommendations_df = get_top_n_recommendation_list_df(all_recommendations_df_details, 10) 
print(top_n_recommendations_df.head())


                                    user   book_id  predicted_rating  \
257732  0001085188e302fc6b2568de45a5f56b  32334098          1.000000   
274369  00013344a98d0147eacab88d3e1502ef    138165          4.965609   
191271  00013344a98d0147eacab88d3e1502ef      5932          4.689238   
106610  000192962b87d560f00b06fdcbd71681     30118          5.000000   
83978   000243c8e211fb3f359e4ff45ca899ea     15997          1.000000   

              isbn  text_reviews_count       series country_code  \
257732  144948641X                 221  ['1137480']           US   
274369  0393974979                  19           []           US   
191271  0143039962                 882           []           US   
106610  0060513063                2222           []           US   
83978   0140424393                2098   ['465549']           US   

       language_code                                    popular_shelves asin  \
257732           eng  [{'count': '62826', 'name': 'to-read'}, {'coun...  NaN  

In [19]:
metrics=[]
true_positives_array = []
est_array = []
for rating_threshold in np.arange(0,5.5,0.5):
    truePositives = 0
    trueNegatives = 0
    falseNegatives = 0
    falsePositives = 0
    accuracy =0
    precision =0
    recall =0
    f1_score = 0
    for uid,_, true_r, est, _ in predictions:
        if(true_r >= rating_threshold and est >= rating_threshold):
            truePositives = truePositives + 1 
            true_positives_array.append(true_r) 
            est_array.append(est)
#here
        elif(true_r >= rating_threshold and est<= rating_threshold):
            falseNegatives = falseNegatives + 1 
        elif(true_r <= rating_threshold and est >=rating_threshold):
             falsePositives = falsePositives + 1 
        elif(true_r <= rating_threshold and est<=rating_threshold):
            trueNegatives = trueNegatives + 1 
        if(truePositives > 0):
            accuracy = (truePositives + trueNegatives ) / (truePositives + trueNegatives + falsePositives + falseNegatives)
            precision = truePositives / (truePositives + falsePositives)
            recall = truePositives / (truePositives + falseNegatives) 
            f1_score = 2 * (precision * recall) / (precision + recall)



In [20]:
metrics.append([rating_threshold,truePositives,trueNegatives,falsePositives,falseNegatives,accuracy,precision,recall,f1_score])
metrics_df = pd.DataFrame(metrics)
metrics_df.rename(columns={0:'rating_threshold', 1:'truePositives', 2: 'trueNegatives', 3: 'falsePositives', 4:'falseNegatives', 5: 'Accuracy', 6: 'Precision', 7:'Recall', 8:'F1 Score'},inplace=True)
true_bin_array =[]
for x in true_positives_array:
    if x >= rating_threshold: 
        x=1
    else: 
        x=0
    true_bin_array.append(x)
auc_score =roc_auc_score(true_bin_array,est_array,multi_class='raise',average='macro')
print('AUC Score: ',auc_score)


AUC Score:  0.6888907278757329


In [21]:
#Calculate precision and recall at n
def get_precision_recall_at_n(predictions,topn,rating_threshold): 
    all_actual_predicted_list = defaultdict(list)
    precision = dict()
    recall= dict()
    no_of_relevant_items = 0 
    no_of_recommended_items_at_top_n = 0 
    no_of_relevant_recommended_items_at_top_n = 0  
    for uid, iid, true_r, est, _ in predictions:
        all_actual_predicted_list[uid].append((est, true_r)) 
    for uid, user_ratings in all_actual_predicted_list.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        no_of_relevant_items = sum((true_r >= rating_threshold) for (_, true_r) in user_ratings)
        no_of_recommended_items_at_top_n = sum((est >= rating_threshold) for (est, _) in user_ratings[:topn])
        no_of_relevant_recommended_items_at_top_n = sum(((true_r >= rating_threshold) and (est >= rating_threshold)) for (est, true_r) in user_ratings[:topn])
        precision[uid] = no_of_relevant_recommended_items_at_top_n / no_of_recommended_items_at_top_n if no_of_recommended_items_at_top_n != 0 else 1
        recall[uid] = no_of_relevant_recommended_items_at_top_n / no_of_relevant_items if no_of_relevant_items != 0 else 1
    return precision, recall 

rating_threshold=3 
precision_recall_at_n = [] 
for topn in range(2,20):
    precision, recall = get_precision_recall_at_n(predictions,topn,rating_threshold)
    precision_at_n = sum(prec for prec in precision.values()) / len(precision)

    recall_at_n = sum(rec for rec in recall.values()) / len(recall)
    precision_recall_at_n.append({'topN' : topn, 'Precision' : precision_at_n, 'Recall': recall_at_n})
for n in range(3,9):
    print(precision_recall_at_n[n])
#get user high rated and liked movies
all_book_df_details = pd.merge(df_users,df_books, on='book_id', how='inner')
all_book_df_details = all_book_df_details.sort_values(['user_id','rating'],ascending=[True, False])
print(all_book_df_details.loc[all_book_df_details['user_id'] == "c5823767a1a164cd8e9d029f1806f2aa"].head(10)) #user 10 top 10 rated movies
#user 10 top 10 movie recommendations list 
print(top_n_recommendations_df.loc[top_n_recommendations_df['user'] == "c5823767a1a164cd8e9d029f1806f2aa"].head(10))

{'topN': 5, 'Precision': 0.9517699774203956, 'Recall': 0.6212772639304668}
{'topN': 6, 'Precision': 0.9516893207602612, 'Recall': 0.6223076123423367}
{'topN': 7, 'Precision': 0.9516432616422433, 'Recall': 0.6229383976517086}
{'topN': 8, 'Precision': 0.9516038149718615, 'Recall': 0.623335690100741}
{'topN': 9, 'Precision': 0.9515805827324656, 'Recall': 0.6235884657850648}
{'topN': 10, 'Precision': 0.9515682394968779, 'Recall': 0.6237658112812048}
                                 user_id   book_id  rating        isbn  \
44569   c5823767a1a164cd8e9d029f1806f2aa     30119       5  0060513039   
76521   c5823767a1a164cd8e9d029f1806f2aa  24769928       5  193644965X   
147224  c5823767a1a164cd8e9d029f1806f2aa     30118       5  0060513063   
218900  c5823767a1a164cd8e9d029f1806f2aa      1420       5  0521618746   
257964  c5823767a1a164cd8e9d029f1806f2aa  17707772       5  1594204780   
266848  c5823767a1a164cd8e9d029f1806f2aa  20821284       5  0399252517   
337192  c5823767a1a164cd8e9d029f

In [24]:
top_books=all_book_df_details.loc[all_book_df_details['user_id'] == "c5823767a1a164cd8e9d029f1806f2aa"].head(10)
top_books_with_descriptions = top_books.merge(df_books[['book_id', 'description']], on='book_id', how='left')


In [30]:
top_books_with_descriptions.columns

Index(['user_id', 'book_id', 'rating', 'isbn', 'text_reviews_count', 'series',
       'country_code', 'language_code', 'popular_shelves', 'asin', 'is_ebook',
       'average_rating', 'kindle_asin', 'similar_books', 'description_x',
       'format', 'link', 'authors', 'publisher', 'num_pages',
       'publication_day', 'isbn13', 'publication_month', 'edition_information',
       'publication_year', 'url', 'image_url', 'ratings_count', 'work_id',
       'title', 'title_without_series', 'description_y'],
      dtype='object')

In [31]:
top_books_with_descriptions=top_books_with_descriptions[['user_id', 'book_id', 'rating', 'title', 'description_y']]

In [33]:
top_books_with_descriptions.head(10)

Unnamed: 0,user_id,book_id,rating,title,description_y
0,c5823767a1a164cd8e9d029f1806f2aa,30119,5,Where the Sidewalk Ends,Where the Sidewalk Ends turns forty! Celebrate...
1,c5823767a1a164cd8e9d029f1806f2aa,24769928,5,Constructed of Magic: And Other Poems on the I...,What Would Life be Like if You Knew You Were a...
2,c5823767a1a164cd8e9d029f1806f2aa,30118,5,A Light in the Attic,Last night while I lay thinking here\nSome Wha...
3,c5823767a1a164cd8e9d029f1806f2aa,1420,5,Hamlet,"One of the greatest plays of all time, the com..."
4,c5823767a1a164cd8e9d029f1806f2aa,17707772,5,Dog Songs,Mary Oliver's Dog Songsis a celebration of the...
5,c5823767a1a164cd8e9d029f1806f2aa,20821284,5,Brown Girl Dreaming,"Jacqueline Woodson, one of today's finest writ..."
6,c5823767a1a164cd8e9d029f1806f2aa,1381,5,The Odyssey,Literature's grandest evocation of life's jour...
7,c5823767a1a164cd8e9d029f1806f2aa,16180680,5,Utterly Loved,Foreword by New York Times and USA Today Bests...
8,c5823767a1a164cd8e9d029f1806f2aa,22151696,5,Lullabies,"A sequel to the hugely popular, best-selling L..."
9,c5823767a1a164cd8e9d029f1806f2aa,95819,5,The Poetry of Robert Frost,The only comprehensive gathering of Frost's pu...


In [23]:
filtered_df = top_n_recommendations_df[top_n_recommendations_df['user'] == "c5823767a1a164cd8e9d029f1806f2aa"].head(10)
filtered_df=filtered_df[['user', 'book_id', 'predicted_rating', 'title', 'description']]
filtered_df.head(10)

Unnamed: 0,user,book_id,predicted_rating,title,description
105529,c5823767a1a164cd8e9d029f1806f2aa,30118,5.0,A Light in the Attic,Last night while I lay thinking here\nSome Wha...
111584,c5823767a1a164cd8e9d029f1806f2aa,2547,5.0,The Prophet,"Kahlil Gibran's masterpiece, The Prophet, is o..."
136464,c5823767a1a164cd8e9d029f1806f2aa,30325231,5.0,The Chaos of Longing,The Chaos of Longingis a prose and poetry coll...
192516,c5823767a1a164cd8e9d029f1806f2aa,13324301,5.0,The Tiny Book of Tiny Stories - 2010,
236185,c5823767a1a164cd8e9d029f1806f2aa,26596,5.0,"Complete Poems, 1904-1962","At the time of his death in 1962, E. E. Cummin..."
249878,c5823767a1a164cd8e9d029f1806f2aa,294399,5.0,In Memoriam,In Memoriam is Tennyson's tribute to his frien...
251091,c5823767a1a164cd8e9d029f1806f2aa,321538,5.0,Collected Poems,"Between 1927 and his death in 1973, W. H. Aude..."
266327,c5823767a1a164cd8e9d029f1806f2aa,26114416,5.0,The Hatred of Poetry,No art has been denounced as often as poetry. ...
271224,c5823767a1a164cd8e9d029f1806f2aa,151717,5.0,Crossing the Unknown Sea: Work as a Pilgrimage...,Crossing the Unknown Seais about reuniting the...
286197,c5823767a1a164cd8e9d029f1806f2aa,22253712,5.0,Deep Lane: Poems,Deep Lane is a book of descents: into the eart...
