# LIU Post Class Presentation 
* May 7, 2021

In [155]:

import time
import datetime
import random

import numpy as np
from collections import defaultdict

from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold
from surprise.model_selection.split import train_test_split
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import accuracy


In [156]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [157]:
import pandas as pd
rating = pd.read_csv('data/rating.csv', encoding="latin-1")

In [158]:
# rename columns to fit the model 
all_ratings = pd.DataFrame(rating.rename(columns={"User-ID": "userID", "ISBN": "itemID", "Book-Rating": "rating"}))

In [159]:
all_ratings.head()

Unnamed: 0,userID,itemID,rating
0,276725,034545104X,0.0
1,276726,155061224,5.0
2,276727,446520802,0.0
3,276729,052165615X,3.0
4,276729,521795028,6.0


# Explecit and implicit rating 

In [160]:
# Rating without 0 
ratings = all_ratings[all_ratings['rating'] != 0]
implicit_ratings = all_ratings[all_ratings['rating'] == 0]


In [161]:
ratings.shape
ratings.columns
implicit_ratings.shape
implicit_ratings.columns

(397248, 3)

Index(['userID', 'itemID', 'rating'], dtype='object')

(651327, 3)

Index(['userID', 'itemID', 'rating'], dtype='object')

In [162]:
ratings.head()
implicit_ratings.head()

Unnamed: 0,userID,itemID,rating
1,276726,155061224,5.0
3,276729,052165615X,3.0
4,276729,521795028,6.0
6,276736,3257224281,8.0
7,276737,600570967,6.0


Unnamed: 0,userID,itemID,rating
0,276725,034545104X,0.0
2,276727,446520802,0.0
5,276733,2080674722,0.0
10,276746,425115801,0.0
11,276746,449006522,0.0


In [163]:
# users who rated most books 

ratings.groupby('userID').count().sort_values('rating', ascending = False).head()


Unnamed: 0_level_0,itemID,rating
userID,Unnamed: 1_level_1,Unnamed: 2_level_1
11676,8524,8524
98391,5802,5802
153662,1969,1969
189835,1906,1906
23902,1395,1395


In [164]:
books_df = pd.read_csv('data/books.csv', encoding="latin-1")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [165]:
# Drop imageURL features, don't need them 
features = ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']
books = books_df[features]

In [166]:
books.shape
books.head(2)

(271379, 5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada


In [167]:
ratings.shape
ratings.head()
ratings.describe()
ratings.info()

(397248, 3)

Unnamed: 0,userID,itemID,rating
1,276726,155061224,5.0
3,276729,052165615X,3.0
4,276729,521795028,6.0
6,276736,3257224281,8.0
7,276737,600570967,6.0


Unnamed: 0,userID,rating
count,397248.0,397243.0
mean,123705.417034,7.601851
std,73864.403076,1.841274
min,8.0,1.0
25%,60028.75,7.0
50%,120892.0,8.0
75%,187065.0,9.0
max,278854.0,10.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 397248 entries, 1 to 1048571
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   userID  397248 non-null  int64  
 1   itemID  397247 non-null  object 
 2   rating  397243 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 12.1+ MB


In [168]:
# check range of rating 
ratings.rating.min()
ratings.rating.max()

1.0

10.0

##  Merge rating and books data

In [169]:
rating_books_merged = pd.merge(rating, books, on='ISBN')
rating_books_merged.shape
rating_books_merged.head()

(941148, 7)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,034545104X,5.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
2,6543,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
3,8680,034545104X,5.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
4,10314,034545104X,9.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books


In [170]:
rating_books_merged[rating_books_merged['ISBN'] == '034545104X']['Book-Title'].unique()

array(['Flesh Tones: A Novel'], dtype=object)

# Reduce the dimension
* Inlcude threshold

In [171]:
# Update threshold for ratings to include 

book_threshold = 50
user_threshold = 20

selected_books = ratings['itemID'].value_counts() > book_threshold
selected_books = selected_books[selected_books].index.tolist()

selected_users = ratings['userID'].value_counts() > user_threshold
selected_users = selected_users[selected_users].index.tolist()

In [172]:
updated_df = ratings[(ratings['itemID'].isin(selected_books)) & (ratings['userID'].isin(selected_users))]
print('The original data frame shape:\t{}'.format(ratings.shape))
print('The new data frame shape:\t{}'.format(updated_df.shape))

The original data frame shape:	(397248, 3)
The new data frame shape:	(18475, 3)


In [173]:
# features to save for flask app : 
# updated_df.to_csv('for_flask_app/model/ratings.csv')



In [174]:
updated_df.columns

Index(['userID', 'itemID', 'rating'], dtype='object')

In [175]:
# Specify range of reviews using Reader class 
# put 1 probably

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(updated_df[['userID', 'itemID', 'rating']], reader)

In [176]:
# Run later 
benchmark = []

for algorithm in [NormalPredictor(), BaselineOnly(), KNNBasic(), SVD(), SVDpp(), KNNWithMeans(), NMF()]:
# [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
   
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
df_models = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [177]:
df_models.round(3)

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,1.583,0.658,0.034
SVDpp,1.588,2.956,0.139
BaselineOnly,1.595,0.02,0.023
KNNWithMeans,1.766,0.176,0.272
KNNBasic,1.832,0.178,0.277
NormalPredictor,2.357,0.017,0.038
NMF,2.65,0.806,0.027


# ALS model : 
* ALS method 

In [178]:
# # Split data into train-set test-set

train_data, test_data = train_test_split(data, test_size=0.2)

In [179]:
model_bsl = BaselineOnly(bsl_options={'method': 'als', 'n_epochs': 5,
                                     'reg_u': 12, 'reg_i': 5})
predictions_bsl = model_bsl.fit(train_data).test(test_data)
accuracy.rmse(predictions_bsl)

Estimating biases using als...
RMSE: 1.5949


1.5949384620632525

In [180]:
# build a pandas dataframe with all the predictions

def get_Iu(uid):
    """Return the number of items rated by given user
    
    Args:
        uid: The raw id of the user.
    Returns:
        The number of items rated by the user.
    """
    
    try:
        return len(train_data.ur[train_data.to_inner_uid(uid)])
    except ValueError:  # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """Return the number of users that have rated given item
    
    Args:
        iid: The raw id of the item.
    Returns:
        The number of users that have rated the item.
    """
    
    try:
        return len(train_data.ir[train_data.to_inner_iid(iid)])
    except ValueError:  # item was not part of the trainset
        return 0

In [181]:
df = pd.DataFrame(predictions_bsl, columns=['uid', 'iid', 'rui', 'est', 'details'])    
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)

In [182]:
df.head()

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,135360,380789035,9.0,8.320091,{'was_impossible': False},7,52,0.679909
1,74630,345387651,10.0,7.191574,{'was_impossible': False},3,36,2.808426
2,75819,440206154,9.0,8.01614,{'was_impossible': False},17,46,0.98386
3,30994,067976402X,10.0,8.447384,{'was_impossible': False},10,63,1.552616
4,122793,375506039,10.0,7.437053,{'was_impossible': False},13,18,2.562947


In [183]:
# Note : 0.0 rating 
best_predictions_bsl = df.sort_values(by='err')[:10]
worst_predictions_bsl = df.sort_values(by='err')[-10:]
best_predictions_bsl.drop('details', axis = 1)
worst_predictions_bsl.drop('details', axis = 1)


Unnamed: 0,uid,iid,rui,est,Iu,Ui,err
1540,168816,385722206,8.0,7.999384,9,48,0.000616
3116,243879,670892963,8.0,8.00114,8,27,0.00114
1452,133706,425180638,7.0,6.997952,11,18,0.002048
1998,201526,452269571,8.0,8.002089,7,30,0.002089
167,143909,380012863,8.0,7.997362,3,32,0.002638
676,23699,786817070,8.0,8.003326,5,27,0.003326
2403,196053,067088300X,7.0,7.00389,6,17,0.00389
2296,138543,684801523,8.0,7.995948,19,37,0.004052
324,169699,61097101,8.0,7.995139,3,19,0.004861
470,114444,034538475X,7.0,6.99348,7,39,0.00652


Unnamed: 0,uid,iid,rui,est,Iu,Ui,err
604,22252,312278586,1.0,7.252675,5,57,6.252675
3144,110361,312995423,1.0,7.534374,5,19,6.534374
1665,30985,553574574,1.0,7.739237,2,15,6.739237
3033,90049,345361792,1.0,8.025626,5,55,7.025626
1722,238889,014028009X,1.0,8.037682,10,57,7.037682
1654,6563,684872153,1.0,8.318046,14,43,7.318046
3548,157811,60392452,1.0,8.684037,3,57,7.684037
3282,114007,385484518,1.0,8.716875,0,68,7.716875
1475,98297,440998050,1.0,8.877401,0,30,7.877401
2215,97874,385504209,1.0,8.976222,11,155,7.976222


# GridSearchCV: Paramater tunning 

# Get top N-book recommendations for each user

In [184]:
# First train an SVD algorithm

trainset_bsl = data.build_full_trainset()

model_bsl.fit(trainset_bsl)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7fe34bb87160>

In [185]:
# predict ratings for all pairs (u, i) that are NOT in the training set

testset_bsl = trainset_bsl.build_anti_testset()
predictions_bsl = model_bsl.test(testset_bsl)


In [186]:
accuracy.rmse(predictions_bsl)

RMSE: 0.6094


0.609362633311807

# Enter User info : User-ID to get recommended books

In [187]:
# 228998, 144255, 124487, 228311, 244657, 277965, 222296, 224764,22074, 210485, 107244, 135149, 133747, 197364, 219683, 177432, 
# 67840, 98263, 54898, 158295, 48025, 244286, 51450, 157811, 69232,

In [188]:
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [189]:
# Note: find an effecient way to input user info 
# Enter use info 

user = 224430
top_n = get_top_n(predictions_bsl)
user_book_isbn = top_n.get(user)


print('\n\n Next book recommendations for User', str(user),':\n')
for isbn, pred_rating in user_book_isbn:
    book_name = rating_books_merged[rating_books_merged['ISBN'] == isbn]['Book-Title'].unique()
    print('Book name: \t', str(book_name)[1:-1])
    print('Possible rating:\t ', round(pred_rating, 2),'/10.0' '\n')



 Next book recommendations for User 224430 :

Book name: 	 'The Return of the King (The Lord of the Rings'
Possible rating:	  8.95 /10.0

Book name: 	 "Charlotte's Web (Trophy Newbery)"
Possible rating:	  8.87 /10.0

Book name: 	 'The Two Towers (The Lord of the Rings'
Possible rating:	  8.86 /10.0

Book name: 	 'Harry Potter and the Goblet of Fire (Book 4)'
Possible rating:	  8.86 /10.0

Book name: 	 'Seabiscuit: An American Legend'
Possible rating:	  8.85 /10.0

Book name: 	 'Harry Potter and the Prisoner of Azkaban (Book 3)'
Possible rating:	  8.8 /10.0

Book name: 	 'Harry Potter and the Prisoner of Azkaban (Book 3)'
Possible rating:	  8.8 /10.0

Book name: 	 "The Princess Bride: S Morgenstern's Classic Tale of True Love and High Adventure"
Possible rating:	  8.78 /10.0

Book name: 	 'Where the Red Fern Grows'
Possible rating:	  8.76 /10.0

Book name: 	 'To Kill a Mockingbird'
Possible rating:	  8.76 /10.0



In [190]:
rating_books_merged.head()
rating_books_merged.shape

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,034545104X,5.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
2,6543,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
3,8680,034545104X,5.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
4,10314,034545104X,9.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books


(941148, 7)

In [191]:
for_flask_features = ['User-ID', 'ISBN', 'Book-Title']
for_flask = rating_books_merged[for_flask_features]
for_flask.shape
for_flask.head()

(941148, 3)

Unnamed: 0,User-ID,ISBN,Book-Title
0,276725,034545104X,Flesh Tones: A Novel
1,2313,034545104X,Flesh Tones: A Novel
2,6543,034545104X,Flesh Tones: A Novel
3,8680,034545104X,Flesh Tones: A Novel
4,10314,034545104X,Flesh Tones: A Novel


# Matrix Factorization: SVD, SVDpp
* Hyper parameter tunning using GridSearchCV 

In [192]:
model_svd = SVD()
fit_model_svd = model_svd.fit(train_data)
predictions_svd = fit_model_svd.test(test_data)

In [193]:
accuracy.rmse(predictions_svd)

RMSE: 1.5909


1.5909028039474458

In [194]:
# predict one uid and one iid
pred1 = model_svd.predict(uid ='6', iid = '2' )
pred1.est

7.977807848443843

# GridSearchCV: Paramater tunning 

In [195]:
# Train model with updated hyper-parameters 

param_grid_svd = {'n_epochs': 10, 'lr_all': 0.05, 'n_factors': 201, 'reg_all': 0.3}


In [196]:
model_svd = SVD(n_epochs = 10, lr_all = 0.05, n_factors= 201, reg_all = 0.3)
fit_model_svd = model_svd.fit(train_data)
predictions_svd = fit_model_svd.test(test_data)

In [197]:
accuracy.rmse(predictions_svd)

RMSE: 1.5713


1.5712926355850891

In [198]:
# predict one uid and one iid
pred1 = model_svd.predict(uid ='6', iid = '2' )
pred1.est

7.977807848443843

In [199]:
# 135149, 107244, 219683, 177432
#, , 124487, 228311, 224764,22074, 210485, 107244, 135149, 133747, 197364, 219683, 177432, 
# 98263, 158295, 48025, 228998

In [200]:
# Note: find an effecient way to input user info 
# Enter use info 

user = 177432
top_n_svd = get_top_n(predictions_svd)
user_book_isbn_svd = top_n_svd.get(user)


print('\n\n Next book recommendations for User', str(user),':\n')
for isbn, pred_rating in user_book_isbn_svd:
    book_name = rating_books_merged[rating_books_merged['ISBN'] == isbn]['Book-Title'].unique()
    print('Book name: \t', str(book_name)[1:-1])
    print('Possible rating:\t ', round(pred_rating, 2),'/10.0' '\n')
else:
    print("The user id is not found on the system.")



 Next book recommendations for User 177432 :

Book name: 	 'To Kill a Mockingbird'
Possible rating:	  8.92 /10.0

Book name: 	 'Stranger in a Strange Land (Remembering Tomorrow)'
Possible rating:	  8.5 /10.0

Book name: 	 'Timeline'
Possible rating:	  8.23 /10.0

Book name: 	 'The Dark Half'
Possible rating:	  8.05 /10.0

Book name: 	 'The Catcher in the Rye'
Possible rating:	  8.04 /10.0

The user id is not found on the system.


# Get top N-book recommendations for each user

In [201]:
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [202]:
accuracy.rmse(predictions_svd)

RMSE: 1.5713


1.5712926355850891

In [203]:
# print already studied books and new recommended books 
# books[books['Book-Title'] == 'Harry Potter and the Goblet of Fire (Book 4)']
rating_books_merged

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,034545104X,5.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
2,6543,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
3,8680,034545104X,5.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
4,10314,034545104X,9.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
...,...,...,...,...,...,...,...
941143,250764,440106575,0.0,HIDDEN FIRES,JANETTE RADCLIFFE,1978,Dell
941144,250764,451157516,0.0,Cheyenne (Fortunes West,,,
941145,250764,048623715X,0.0,Glamorous Movie Stars of the Thirties: Paper D...,Tom Tierney,1982,Dover Publications
941146,250764,486256588,0.0,Schiaparelli Fashion Review: Paper Dolls in Fu...,Tom Tierney,1988,Dover Publications


# Enter User info : User-ID to get recommended books

In [204]:
# top_n

In [205]:
# Ask user's information (user-id) and how many books to recommend 

# how many similar books 
number_of_books_to_recommend = 5

# Enter user info 
user_info = 36907

    # OR OR ROROR
# random entry from users 
# user_info = np.random.choice(ratings.shape[0])

# Implement function
top_n = get_top_n(predictions_svd, n = number_of_books_to_recommend)
user_book_isbn = top_n.get(user_info)

user_info
user_book_isbn


print('\n\n The following books are remmended for you:\n\n ')

for isbn, pred_rating in user_book_isbn:
#     if isbn in top_n.keys():
        book_name = rating_books_merged[rating_books_merged['ISBN'] == isbn]['Book-Title'].unique()
        print(str(book_name)[2:-2] )   
        print('\n\t\t You will like this book with (predictive) rating of: ', round(pred_rating, 2),'/10.0' '\n')
#     else:
#         continue 
# #         print('Sorry, No references found to recommend books for you. Please, rate some books, and come back.')


36907

[('446532231', 9.533886465514202),
 ('038549081X', 9.43136751818322),
 ('804114986', 9.287643058547419),
 ('439136350', 9.221763114105917),
 ('804106304', 9.140586127112615)]



 The following books are remmended for you:

 
Dude

		 You will like this book with (predictive) rating of:  9.53 /10.0

The Handmaid's Tale : A Novel

		 You will like this book with (predictive) rating of:  9.43 /10.0

The Bonesetter's Daughter

		 You will like this book with (predictive) rating of:  9.29 /10.0

Harry Potter and the Prisoner of Azkaban (Book 3)

		 You will like this book with (predictive) rating of:  9.22 /10.0

The Joy Luck Club

		 You will like this book with (predictive) rating of:  9.14 /10.0



In [206]:
updated_df.shape
updated_df.head()

(18475, 3)

Unnamed: 0,userID,itemID,rating
1456,277427,002542730X,10.0
1474,277427,61009059,9.0
1522,277427,316776963,8.0
1543,277427,345413903,10.0
1581,277427,385486804,9.0


In [207]:
param_grid = {'n_epochs': [15, 20, 25, 30, 35, 40, 45, 50],
                'n_factors': [100, 150, 200, 300],
                'lr_all': [0.002, 0.003, 0.004, 0.005],
                'reg_all': [0.02,0.01,0.05,0.1, 0.4, 0.6]}
gs_svd = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs = -1)

# Use SVD++  for Matrix Factorization
* permorms regularization and optimizes known terms 
* Takes into account of implicit ratings 
* extends vanilla SVD 
* used in Netflix challange 

The method SVD++, as well as most other matrix factorisation algorithms, will depend on a number of main tuning constants: 
* the dimension DD affecting the size of UU and VV
* the learning rate affects the performance of the optimisation step
* the regularisation term affecting the overfitting of the model 
* the number of epochs, which determines how many iterations of optimisation are used

 In surprise, tuning is performed using a function called GridSearchCV, which picks the constants which perform the best at predicting a held out testset. This means constant values to try need to be predefined.

In [208]:
model_svdpp = SVDpp()
fit_model_svdpp = model_svdpp.fit(train_data)
predictions_svdpp = fit_model_svdpp.test(test_data)

In [209]:
accuracy.rmse(predictions_svdpp)

RMSE: 1.6005


1.6004606938987602

In [210]:
# predict one uid and one iid
predpp = model_svdpp.predict(uid ='6', iid = '2' )
predpp.est

7.977807848443843

# GridSearchCV for SVDpp

In [211]:
model_svdpp = SVDpp(n_factors = 10, n_epochs = 40, lr_all = 0.005, reg_all = 0.3)

fit_model_svdpp = model_svdpp.fit(train_data)

predictions_svdpp = fit_model_svdpp.test(test_data)

In [212]:

for algorithm in [BaselineOnly(bsl_options={'method': 'als', 'n_epochs': 5,
                                     'reg_u': 12, 'reg_i': 5}), SVD(n_epochs = 20, lr_all = 0.05, n_factors= 201, reg_all = 0.3), SVDpp(n_factors = 10, n_epochs = 40, lr_all = 0.005, reg_all = 0.3)]:

    # Perform cross validation
    resultss = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmps = pd.DataFrame.from_dict(resultss).mean(axis=0)
    tmps = tmps.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
df_mod_tunned = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [213]:
df_models

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,1.582872,0.658082,0.033857
SVDpp,1.587716,2.956479,0.139175
BaselineOnly,1.595144,0.020377,0.022545
KNNWithMeans,1.765838,0.176035,0.272306
KNNBasic,1.831799,0.177764,0.277175
NormalPredictor,2.356548,0.016529,0.038113
NMF,2.650214,0.806377,0.027025


In [214]:
accuracy.rmse(predictions_svdpp)

RMSE: 1.5696


1.5695764036986126

In [215]:
# 210485, 135149, 133747, 197364, 219683, 177432, 48025

In [216]:
# Note: find an effecient way to input user info 
# Enter use info 

user = 135149
top_n_svdpp = get_top_n(predictions_svdpp)
user_book_isbn_svdpp = top_n_svdpp.get(user)


print('\n\n Next book recommendations for User', str(user),':\n')
for isbn, pred_rating in user_book_isbn_svdpp:
    book_name = rating_books_merged[rating_books_merged['ISBN'] == isbn]['Book-Title'].unique()
    print('Book name: \t\t', str(book_name)[1:-1])
    print('Possible rating:\t ', round(pred_rating, 2),'/10.0' '\n')



 Next book recommendations for User 135149 :

Book name: 		 'The Poisonwood Bible: A Novel'
Possible rating:	  8.1 /10.0

Book name: 		 'All Around the Town'
Possible rating:	  8.01 /10.0

Book name: 		 'One for the Money (Stephanie Plum Novels (Paperback))'
Possible rating:	  7.92 /10.0

Book name: 		 'The Horse Whisperer'
Possible rating:	  7.8 /10.0

Book name: 		 "The Sweet Potato Queens' Book of Love"
Possible rating:	  7.76 /10.0

Book name: 		 'Prey: A Novel'
Possible rating:	  7.76 /10.0

Book name: 		 'The Brethren'
Possible rating:	  7.73 /10.0

Book name: 		 'A Painted House'
Possible rating:	  7.73 /10.0

Book name: 		 'The Chamber'
Possible rating:	  7.52 /10.0

Book name: 		 'The Brethren'
Possible rating:	  7.49 /10.0

