### Simple Recommender & Collaborative Filtering Based

#### 1. Importing Libraries and Loading Our Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

import warnings
warnings.filterwarnings('ignore')

In [2]:
books = pd.read_csv('../data/processed/processed_books.csv')
reviews = pd.read_csv('../data/processed/processed_reviews.csv')
ratings_dist = pd.read_csv('../data/processed/processed_ratings.csv')

In [3]:
reviews

Unnamed: 0.1,Unnamed: 0,review_id,book_id,user_id,text,rating
0,0,8947952,77203.The_Kite_Runner,613434,Finished this book about a month ago but it's ...,1.0
1,1,1305882067,77203.The_Kite_Runner,31207039,"In 2012, when I was Mathematics teacher at a p...",5.0
2,2,22703379,77203.The_Kite_Runner,84023,This is the sort of book White America reads t...,2.0
3,3,9020638,77203.The_Kite_Runner,616569,"""For you, a thousand times over.""""Children are...",5.0
4,4,1338106,77203.The_Kite_Runner,91373,\nDue to the large number of negative comments...,1.0
...,...,...,...,...,...,...
114456,114456,2063019524,7890698-toward-a-zero-energy-home,1395652,Gives alternatives to start exploring what you...,5.0
114457,114457,481531638,7890698-toward-a-zero-energy-home,966963,"It was interesting, though it was less practic...",4.0
114458,114458,125166556,7890698-toward-a-zero-energy-home,1895489,"If I was building a house from the ground, I t...",1.0
114459,114459,1780100027,7890698-toward-a-zero-energy-home,27453018,Informative . Some content does not apply in o...,


#### Dropping nulls & duplicates

#### Books

In [4]:
# count null values in each column
print('\nColumn null count: ', '\n', books.isnull().sum(axis=0))

# count null values in each row
print('\nRow null count: ', '\n', books.isnull().sum(axis=1))


Column null count:  
 Unnamed: 0         0
book_id            0
title              0
author             0
price              0
genres             0
series             0
publisher          0
year_published     0
current_readers    0
wanted_to_read     0
num_reviews        0
num_ratings        0
rating             0
awards             0
primary_lists      0
book_score         0
author_score       0
dtype: int64

Row null count:  
 0       0
1       0
2       0
3       0
4       0
       ..
4982    0
4983    0
4984    0
4985    0
4986    0
Length: 4987, dtype: int64


In [5]:
# count null values in each column
print('\nColumn dup count: ', '\n', books.duplicated().sum(axis=0))


Column dup count:  
 0


#### Reviews

In [6]:
# count null values in each column
print('\nColumn null count: ', '\n', reviews.isnull().sum(axis=0))

# count null values in each row
print('\nRow null count: ', '\n', reviews.isnull().sum(axis=1))


Column null count:  
 Unnamed: 0       0
review_id        0
book_id          0
user_id          0
text            35
rating        3431
dtype: int64

Row null count:  
 0         0
1         0
2         0
3         0
4         0
         ..
114456    0
114457    0
114458    0
114459    1
114460    0
Length: 114461, dtype: int64


In [7]:
print('Total review dataset size: ', len(reviews))
print('Relative size of nulls (text): ', reviews['text'].isnull().sum(axis=0)/len(reviews))
print('Relative size of nulls (rating): ', reviews['rating'].isnull().sum(axis=0)/len(reviews))

Total review dataset size:  114461
Relative size of nulls (text):  0.0003057810083783996
Relative size of nulls (rating):  0.029975275421322545


There's not much percentage nulls, hence we can aggregate & delete new data easily

In [8]:
reviews['rating'] = reviews['rating'].fillna(reviews['rating'].mean()).round(0)

In [9]:
reviews['text'] = reviews['text'].dropna()

In [10]:
print('Total review dataset size: ', len(reviews))
print('Relative size of nulls (text): ', reviews['text'].isnull().sum(axis=0)/len(reviews))
print('Relative size of nulls (rating): ', reviews['rating'].isnull().sum(axis=0)/len(reviews))

Total review dataset size:  114461
Relative size of nulls (text):  0.0003057810083783996
Relative size of nulls (rating):  0.0


In [11]:
# count null values in each column
print('\nColumn dup count: ', '\n', reviews.duplicated().sum(axis=0))


Column dup count:  
 0


#### Ratings

In [12]:
# count null values in each column
print('\nColumn null count: ', '\n', ratings_dist.isnull().sum(axis=0))

# count null values in each row
print('\nRow null count: ', '\n', ratings_dist.isnull().sum(axis=1))


Column null count:  
 Unnamed: 0            0
book_id               0
5 star                0
4 star                0
3 star                0
2 star                0
1 star                0
total_review_count    0
ratings_count         0
dtype: int64

Row null count:  
 0       0
1       0
2       0
3       0
4       0
       ..
4315    0
4316    0
4317    0
4318    0
4319    0
Length: 4320, dtype: int64


In [13]:
# count null values in each column
print('\nColumn dup count: ', '\n', ratings_dist.duplicated().sum(axis=0))


Column dup count:  
 0


#### Quick clean & one-hot enconding

In [14]:
books['genres'] = books['genres'].apply(lambda x: eval(x))
books['genres']

0       [Fiction, Historical Fiction, Classics, Contem...
1       [Fiction, Historical Fiction, Romance, Histori...
2       [Fiction, Historical Fiction, Contemporary, Hi...
3       [Historical Fiction, Fiction, Young Adult, His...
4       [Fiction, Fantasy, Classics, Adventure, Contem...
                              ...                        
4982    [Mystery, Historical Fiction, Fiction, Histori...
4983    [Mystery, Historical Fiction, Historical, Fict...
4984    [Mystery, Historical Fiction, Historical Myste...
4985    [Historical Fiction, Mystery, Fiction, Histori...
4986    [Historical Fiction, Mystery, Fiction, Histori...
Name: genres, Length: 4987, dtype: object

In [15]:
genres = ["Art", "Biography", "Business", "Chick Lit", "Children's", "Christian", "Classics",
          "Comics", "Contemporary", "Cookbooks", "Crime", "Ebooks", "Fantasy", "Fiction",
          "Gay and Lesbian", "Graphic Novels", "Historical Fiction", "History", "Horror",
          "Humor and Comedy", "Manga", "Memoir", "Music", "Mystery", "Nonfiction", "Paranormal",
          "Philosophy", "Poetry", "Psychology", "Religion", "Romance", "Science", "Science Fiction", 
          "Self Help", "Suspense", "Spirituality", "Sports", "Thriller", "Travel", "Young Adult"]

# Create an empty dictionary to store the one-hot encoding
genrkct = {}

# Loop through each row in the book dataset
for index, row in books.iterrows():
    # Loop through each genre in the row's list of genres
    for genre in row["genres"]:
        # Check if the genre already exists in the dictionary
        if genre in genrkct:
            # If it does, set its value to 1
            genrkct[genre] = 1
        else:
            # If it doesn't, add it to the dictionary with a value of 1
            genrkct[genre] = 1

# Create a new dataframe with the one-hot encoded genre columns
one_hok = pd.DataFrame(columns=genres)

# Loop through each row in the book dataset
for index, row in books.iterrows():
    # Create an empty list to store the one-hot encoded genre values for this row
    one_hot_row = []
    # Loop through each possible genre
    for genre in genres:
        # If the current genre is in the row's list of genres, append a 1 to the one-hot encoded row
        if genre in row["genres"]:
            one_hot_row.append(1)
        # Otherwise, append a 0
        else:
            one_hot_row.append(0)
    # Add the one-hot encoded row to the new dataframe
    one_hok.loc[index] = one_hot_row

# Concatenate the original book dataframe with the one-hot encoded genre dataframe
books = pd.concat([books, one_hok], axis=1)

In [16]:
books

Unnamed: 0.1,Unnamed: 0,book_id,title,author,price,genres,series,publisher,year_published,current_readers,...,Romance,Science,Science Fiction,Self Help,Suspense,Spirituality,Sports,Thriller,Travel,Young Adult
0,0,77203.The_Kite_Runner,The Kite Runner,Khaled Hosseini,8.717848,"[Fiction, Historical Fiction, Classics, Contem...",0,Riverhead Books,2004-05-01,42900.0,...,0,0,0,0,0,0,0,0,0,0
1,1,929.Memoirs_of_a_Geisha,Memoirs of a Geisha,Arthur Golden,12.990000,"[Fiction, Historical Fiction, Romance, Histori...",0,Vintage Books USA,2005-11-22,12300.0,...,1,0,0,0,0,0,0,0,0,0
2,2,128029.A_Thousand_Splendid_Suns,A Thousand Splendid Suns,Khaled Hosseini,12.990000,"[Fiction, Historical Fiction, Contemporary, Hi...",0,Riverhead Books,2007-06-01,32700.0,...,0,0,0,0,0,0,0,0,0,0
3,3,19063.The_Book_Thief,The Book Thief,Markus Zusak,10.990000,"[Historical Fiction, Fiction, Young Adult, His...",0,Alfred A. Knopf,2006-03-14,86000.0,...,0,0,0,0,0,0,0,0,0,1
4,4,4214.Life_of_Pi,Life of Pi,Yann Martel,8.717848,"[Fiction, Fantasy, Classics, Adventure, Contem...",0,Seal Books,2006-08-29,24900.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4982,6257,25489259-death-of-an-alchemist,Death of an Alchemist,Mary Lawrence,5.990000,"[Mystery, Historical Fiction, Fiction, Histori...",1,Kensington Books,2016-01-26,-1.0,...,0,0,0,0,0,0,0,0,0,0
4983,6259,52185047-the-lost-boys-of-london,The Lost Boys of London,Mary Lawrence,8.717848,"[Mystery, Historical Fiction, Historical, Fict...",1,Red Puddle Print,2020-04-28,-1.0,...,0,0,0,0,0,0,0,0,0,0
4984,6262,36445482-no-cure-for-the-dead,No Cure for the Dead,Christine Trent,12.990000,"[Mystery, Historical Fiction, Historical Myste...",1,Crooked Lane Books,2018-05-08,-1.0,...,0,0,0,0,0,0,0,0,0,0
4985,6263,15793166-the-midwife-s-tale,The Midwife's Tale,Sam Thomas,5.990000,"[Historical Fiction, Mystery, Fiction, Histori...",1,Minotaur Books,2013-01-08,-1.0,...,0,0,0,0,0,0,0,0,0,0


#### Looking at Data

In [17]:
books = books.drop('Unnamed: 0', axis=1)
books.head(3)

Unnamed: 0,book_id,title,author,price,genres,series,publisher,year_published,current_readers,wanted_to_read,...,Romance,Science,Science Fiction,Self Help,Suspense,Spirituality,Sports,Thriller,Travel,Young Adult
0,77203.The_Kite_Runner,The Kite Runner,Khaled Hosseini,8.717848,"[Fiction, Historical Fiction, Classics, Contem...",0,Riverhead Books,2004-05-01,42900.0,1000000.0,...,0,0,0,0,0,0,0,0,0,0
1,929.Memoirs_of_a_Geisha,Memoirs of a Geisha,Arthur Golden,12.99,"[Fiction, Historical Fiction, Romance, Histori...",0,Vintage Books USA,2005-11-22,12300.0,793000.0,...,1,0,0,0,0,0,0,0,0,0
2,128029.A_Thousand_Splendid_Suns,A Thousand Splendid Suns,Khaled Hosseini,12.99,"[Fiction, Historical Fiction, Contemporary, Hi...",0,Riverhead Books,2007-06-01,32700.0,760000.0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
reviews = reviews.drop('Unnamed: 0', axis=1)
reviews.head(3)

Unnamed: 0,review_id,book_id,user_id,text,rating
0,8947952,77203.The_Kite_Runner,613434,Finished this book about a month ago but it's ...,1.0
1,1305882067,77203.The_Kite_Runner,31207039,"In 2012, when I was Mathematics teacher at a p...",5.0
2,22703379,77203.The_Kite_Runner,84023,This is the sort of book White America reads t...,2.0


In [19]:
ratings_dist = ratings_dist.drop('Unnamed: 0', axis = 1)
ratings_dist.head(3)

Unnamed: 0,book_id,5 star,4 star,3 star,2 star,1 star,total_review_count,ratings_count
0,77203.The_Kite_Runner,1582498,918930,308702,79972,45210,90233,2935312
1,929.Memoirs_of_a_Geisha,822393,680546,318849,71382,29326,34099,1922496
2,128029.A_Thousand_Splendid_Suns,804612,444222,134258,24384,9736,69429,1417212


In [20]:
ratings_rmv_duplicates = reviews.drop_duplicates()
unwanted_users = ratings_rmv_duplicates.groupby('user_id')['user_id'].count()
unwanted_users = unwanted_users[unwanted_users < 3]
unwanted_ratings = ratings_rmv_duplicates[ratings_rmv_duplicates.user_id.isin(unwanted_users.index)]
new_reviews = ratings_rmv_duplicates.drop(unwanted_ratings.index)

In [21]:
new_reviews

Unnamed: 0,review_id,book_id,user_id,text,rating
1,1305882067,77203.The_Kite_Runner,31207039,"In 2012, when I was Mathematics teacher at a p...",5.0
2,22703379,77203.The_Kite_Runner,84023,This is the sort of book White America reads t...,2.0
4,1338106,77203.The_Kite_Runner,91373,\nDue to the large number of negative comments...,1.0
5,3694040479,77203.The_Kite_Runner,124132123,"WOW! This book was beautiful, exquisite.This b...",5.0
6,20046758,77203.The_Kite_Runner,614778,"The Kite Runner, 2003, Khaled HosseiniThe Kite...",5.0
...,...,...,...,...,...
114344,2754379556,32920234-significant-zero,22904453,Está bien esta biografía de auge y caída de un...,3.0
114383,2039844012,33295525-size-zero,1237196,"Finished the book, so this is a review, rather...",4.0
114384,2906659840,33295525-size-zero,16054419,"Мне нравится, когда книжки оправдывают ожидани...",4.0
114386,2126844590,33295525-size-zero,50761363,"Books, movies, reality shows etc. about fashio...",3.0


In [22]:
new_reviews = pd.merge(books[['book_id', 'title']], new_reviews, on='book_id', how='inner')
new_reviews

Unnamed: 0,book_id,title,review_id,user_id,text,rating
0,77203.The_Kite_Runner,The Kite Runner,1305882067,31207039,"In 2012, when I was Mathematics teacher at a p...",5.0
1,77203.The_Kite_Runner,The Kite Runner,22703379,84023,This is the sort of book White America reads t...,2.0
2,77203.The_Kite_Runner,The Kite Runner,1338106,91373,\nDue to the large number of negative comments...,1.0
3,77203.The_Kite_Runner,The Kite Runner,3694040479,124132123,"WOW! This book was beautiful, exquisite.This b...",5.0
4,77203.The_Kite_Runner,The Kite Runner,20046758,614778,"The Kite Runner, 2003, Khaled HosseiniThe Kite...",5.0
...,...,...,...,...,...,...
58905,13573383-the-zero-waste-lifestyle,The Zero-Waste Lifestyle: Live Well by Throwin...,522779263,2401727,The author of this book and her husband challe...,3.0
58906,13573383-the-zero-waste-lifestyle,The Zero-Waste Lifestyle: Live Well by Throwin...,3229052729,215964,Not as useful as other books. It’s a bit older...,2.0
58907,13573383-the-zero-waste-lifestyle,The Zero-Waste Lifestyle: Live Well by Throwin...,893840720,14011226,As usual I received this book free in exchange...,5.0
58908,18616732-the-zero-waste-solution,The Zero Waste Solution: Untrashing the Planet...,2998995862,26234267,Finally finished this. It took me about 6 week...,4.0


#### Introduction 

The Simple Recommender offers generalized recommendations to every user based on book popularity and (sometimes) genre. The basic idea behind this recommender is that books that are more popular and more critically acclaimed will have a higher probability of being liked by the average audience. This model does not give personalized recommendations based on the user.

The implementation of this model is extremely trivial. All we have to do is sort our books based on ratings and popularity and display the top books of our list. As an added step, we can pass in a genre argument to get the top books of a particular genre.

I will use IMDB's weighted rating formula to construct my chart. Mathematically, it is represented as follows:

Weighted Rating 

$(WR) = (v/m*v)*R+(m/v*m)*C $

where,

- v is the number of ratings for the book
- m is the minimum ratings required to be listed in the chart
- R is the average rating of the book
- C is the mean rating across the whole report
The next step is to determine an appropriate value for m, the minimum ratings required to be listed in the chart. We will use 95th percentile as our cutoff. In other words, for a book to feature in the charts, it must have more ratings than at least 95% of the books in the list.

In [23]:
v = books['num_ratings']
m = books['num_ratings'].quantile(0.95)
R = books['rating']
C = books['rating'].mean()
W = (R*v + C*m) / (v + m)

In [24]:
books['weighted_rating'] = W

In [25]:
qualified  = books.sort_values('weighted_rating', ascending=False).head(250)

#### Coolest books :)

In [26]:
qualified[['title', 'author', 'rating', 'weighted_rating']].head(15)

Unnamed: 0,title,author,rating,weighted_rating
114,Harry Potter and the Sorcerer's Stone,J.K. Rowling,4.0,3.973832
98,The Hunger Games,Suzanne Collins,4.0,3.969764
1049,To Kill a Mockingbird,Harper Lee,4.0,3.958772
889,To Kill a Mockingbird,Harper Lee,4.0,3.958772
803,The Fault in Our Stars,John Green,4.0,3.951507
2473,1984,George Orwell,4.0,3.94569
656,1984,George Orwell,4.0,3.945677
804,Pride and Prejudice,Jane Austen,4.0,3.942608
135,Harry Potter and the Prisoner of Azkaban,J.K. Rowling,4.0,3.940748
105,Divergent,Veronica Roth,4.0,3.939477


We see that J.K. Rowling's, Harper Lee, George Orwell Books occur at the very top of our chart. The chart also indicates a strong bias of Goodreads Users towards particular genres and authors.

Let us now construct our function that builds charts for particular genres. For this, we will use relax our default conditions to the 85th percentile instead of 95.

In [27]:
m = books['num_ratings'].quantile(0.85)
W = (R*v + C*m) / (v + m)

qualified[['title', 'author', 'rating', 'weighted_rating']].head(15)

Unnamed: 0,title,author,rating,weighted_rating
114,Harry Potter and the Sorcerer's Stone,J.K. Rowling,4.0,3.973832
98,The Hunger Games,Suzanne Collins,4.0,3.969764
1049,To Kill a Mockingbird,Harper Lee,4.0,3.958772
889,To Kill a Mockingbird,Harper Lee,4.0,3.958772
803,The Fault in Our Stars,John Green,4.0,3.951507
2473,1984,George Orwell,4.0,3.94569
656,1984,George Orwell,4.0,3.945677
804,Pride and Prejudice,Jane Austen,4.0,3.942608
135,Harry Potter and the Prisoner of Azkaban,J.K. Rowling,4.0,3.940748
105,Divergent,Veronica Roth,4.0,3.939477


#### Top "Genres" Books

In [28]:
def build_chart(genre, percentile=0.85):
    df = books[books[genre] == 1]
    qualified = books.set_index('book_id').loc[df.book_id]

    v = qualified['num_ratings']
    m = qualified['num_ratings'].quantile(percentile)
    R = qualified['rating']
    C = qualified['rating'].mean()
    qualified['weighted_rating'] = (R*v + C*m) / (v + m)

    qualified.sort_values('weighted_rating', ascending=False, inplace=True)
    return qualified

In [29]:
genre = 'Fiction'
build_chart(genre)[['title', 'author', 'publisher', 'year_published']].head(10)

Unnamed: 0_level_0,title,author,publisher,year_published
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3.Harry_Potter_and_the_Sorcerer_s_Stone,Harry Potter and the Sorcerer's Stone,J.K. Rowling,Scholastic Inc,2003-11-01
2767052-the-hunger-games,The Hunger Games,Suzanne Collins,Scholastic Press,2008-10-14
6682611-to-kill-a-mockingbird,To Kill a Mockingbird,Harper Lee,Popular Library,1962-03-01
2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,Harper Lee,Harper Perennial Modern Classics,2006-05-23
11870085-the-fault-in-our-stars,The Fault in Our Stars,John Green,Dutton Books,2012-01-10
40961427-1984,1984,George Orwell,Houghton Mifflin Harcourt,2013-09-03
5470.1984,1984,George Orwell,New American Library,1950-07-01
1885.Pride_and_Prejudice,Pride and Prejudice,Jane Austen,Modern Library,2000-10-10
5.Harry_Potter_and_the_Prisoner_of_Azkaban,Harry Potter and the Prisoner of Azkaban,J.K. Rowling,Scholastic Inc.,2004-05-01
13335037-divergent,Divergent,Veronica Roth,Katherine Tegen Books,2012-01-01


## Collaborative Based Filtering

### SVD's

In [30]:
import sys
import os
import surprise
import scrapbook as sb

from recommenders.utils.timer import Timer
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions

print("System version: {}".format(sys.version))
print("Surprise version: {}".format(surprise.__version__))

System version: 3.9.13 (tags/v3.9.13:6de2ca5, May 17 2022, 16:36:42) [MSC v.1929 64 bit (AMD64)]
Surprise version: 1.1.3


Get user_id & book_id Label_Encoded for training

In [31]:
new_reviews

Unnamed: 0,book_id,title,review_id,user_id,text,rating
0,77203.The_Kite_Runner,The Kite Runner,1305882067,31207039,"In 2012, when I was Mathematics teacher at a p...",5.0
1,77203.The_Kite_Runner,The Kite Runner,22703379,84023,This is the sort of book White America reads t...,2.0
2,77203.The_Kite_Runner,The Kite Runner,1338106,91373,\nDue to the large number of negative comments...,1.0
3,77203.The_Kite_Runner,The Kite Runner,3694040479,124132123,"WOW! This book was beautiful, exquisite.This b...",5.0
4,77203.The_Kite_Runner,The Kite Runner,20046758,614778,"The Kite Runner, 2003, Khaled HosseiniThe Kite...",5.0
...,...,...,...,...,...,...
58905,13573383-the-zero-waste-lifestyle,The Zero-Waste Lifestyle: Live Well by Throwin...,522779263,2401727,The author of this book and her husband challe...,3.0
58906,13573383-the-zero-waste-lifestyle,The Zero-Waste Lifestyle: Live Well by Throwin...,3229052729,215964,Not as useful as other books. It’s a bit older...,2.0
58907,13573383-the-zero-waste-lifestyle,The Zero-Waste Lifestyle: Live Well by Throwin...,893840720,14011226,As usual I received this book free in exchange...,5.0
58908,18616732-the-zero-waste-solution,The Zero Waste Solution: Untrashing the Planet...,2998995862,26234267,Finally finished this. It took me about 6 week...,4.0


In [52]:
training_reviews = new_reviews

In [53]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

column_to_encode = 'user_id'
training_reviews[column_to_encode] = le.fit_transform(training_reviews[column_to_encode])

In [54]:
column_to_encode = 'book_id'
training_reviews[column_to_encode] = le.fit_transform(training_reviews[column_to_encode])

In [55]:
training_reviews_1 = training_reviews.drop(columns= ['text', 'review_id', 'title'], axis = 1)
training_reviews_1 = training_reviews_1.rename(columns={'user_id': 'userID'})
training_reviews_1 = training_reviews_1.rename(columns={'book_id': 'itemID'})
training_reviews_1

Unnamed: 0,itemID,userID,rating
0,3250,5020,5.0
1,3250,89,2.0
2,3250,99,1.0
3,3250,6726,5.0
4,3250,423,5.0
...,...,...,...
58905,529,1333,3.0
58906,529,229,2.0
58907,529,3926,5.0
58908,1084,4773,4.0


In [56]:
train, test = python_random_split(training_reviews_1, 0.75)

In [57]:
train_set = surprise.Dataset.load_from_df(train, reader=surprise.Reader('ml-100k')).build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x22b1da77dc0>

In [58]:
svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=30, verbose=True)

with Timer() as train_time:
    svd.fit(train_set)

print("Took {} seconds for training.".format(train_time.interval))

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Took 1.3259460999997827 seconds for training.


In [59]:
predictions = predict(svd, test, usercol='userID', itemcol='itemID')
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,6842,1673,3.839569
1,2895,2596,4.097121
2,504,584,4.449095
3,3231,801,3.870614
4,2182,3377,4.241434


In [61]:

with Timer() as test_time:
    all_predictions = compute_ranking_predictions(svd, test, usercol='userID', itemcol='itemID', remove_seen=True)
    
print("Took {} seconds for prediction.".format(test_time.interval))


Took 114.43916189999982 seconds for prediction.


In [62]:
eval_rmse = rmse(test, predictions)
eval_mae = mae(test, predictions)
eval_rsquared = rsquared(test, predictions)
eval_exp_var = exp_var(test, predictions)

k = 10
eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=k)


print("RMSE:\t\t%f" % eval_rmse,
      "MAE:\t\t%f" % eval_mae,
      "rsquared:\t%f" % eval_rsquared,
      "exp var:\t%f" % eval_exp_var, sep='\n')

print('----')

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

RMSE:		1.097448
MAE:		0.859866
rsquared:	-0.108721
exp var:	-0.108077
----
MAP:	0.000000
NDCG:	0.000000
Precision@K:	0.000000
Recall@K:	0.000000


#### Item Based

Here we will build a table for users with their corresponding ratings for each book.

In [None]:
bookmat = new_reviews.pivot_table(index='user_id', columns='title', values='rating')
bookmat.head()

title,...And Ladies of the Club,1 Dead in Attic: After Katrina,1 Hunter,1 Is One,"1, 2, 3 to the Zoo",1-2-3 Peas,10 Books That Screwed Up the World And 5 Others That Didn't Help,10 Fat Turkeys,10 Minutes till Bedtime,"10 Sure Signs a Movie Character Is Doomed, and Other Surprising Movie Lists",...,Zero's Return,Zero: The Biography of a Dangerous Idea,Zeroes,Zeros and Ones,Zeroville,Zin! Zin! Zin! A Violin,Zoia's Gold,Zorgamazoo,Zoroastrians' Fight for Survival,t zero
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [None]:
def get_similar(title, mat):
    title_user_ratings = mat[title]
    similar_to_title = mat.corrwith(title_user_ratings)
    corr_title = pd.DataFrame(similar_to_title, columns=['correlation'])
    corr_title.dropna(inplace=True)
    corr_title.sort_values('correlation', ascending=False, inplace=True)
    return corr_title

In [None]:
title = "The Kite Runner"
smlr = get_similar(title, bookmat)

In [None]:
smlr.head(10)

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
Circe,1.0
Room,1.0
The Forever War,1.0
Sense and Sensibility,1.0
Darkness at Noon,1.0
The House of the Seven Gables,1.0
The Kite Runner,1.0
"The Lion, the Witch and the Wardrobe",1.0
Running with Scissors,1.0
Artemis Fowl,1.0


Ok, we got similar books, but we need to filter them by their ratings_count. To get better a better view.

In [None]:
smlr = smlr.join(books.set_index('title')['num_ratings'])
smlr.head()

Unnamed: 0_level_0,correlation,num_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
1984,0.870388,4161994
1984,0.870388,4163051
"20,000 Leagues Under the Sea",0.866025,238762
A Clash of Kings,0.333333,885857
A Game of Thrones,0.968496,2342918


Get similar books with at least 500k ratings.

In [None]:
smlr[smlr.num_ratings > 5e5].sort_values('correlation', ascending=False).head(10)

Unnamed: 0_level_0,correlation,num_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Circe,1.0,845101
Room,1.0,780775
Room,1.0,780828
The Return of the King,1.0,823334
The Return of the King,1.0,823318
The Odyssey,1.0,1011445
Sense and Sensibility,1.0,1124041
"The Lion, the Witch and the Wardrobe",1.0,2604453
Artemis Fowl,1.0,535416
The Kite Runner,1.0,2935385


That's more interesting and reasonable result, since we could get Twilight book series in our top results.

### BiVAE

In [None]:
import torch
import cornac
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.constants import SEED

FM model is only supported on Linux.
Windows executable can be found at http://www.libfm.org.


In [None]:
print("System version: {}".format(sys.version))
print("PyTorch version: {}".format(torch.__version__))
print("Cornac version: {}".format(cornac.__version__))

System version: 3.9.13 (tags/v3.9.13:6de2ca5, May 17 2022, 16:36:42) [MSC v.1929 64 bit (AMD64)]
PyTorch version: 1.13.1+cu117
Cornac version: 1.15.0


In [None]:
# top k items to recommend
TOP_K = 5

# Model parameters
LATENT_DIM = 50
ENCODER_DIMS = [100]
ACT_FUNC = "tanh"
LIKELIHOOD = "pois"
NUM_EPOCHS = 500
BATCH_SIZE = 128
LEARNING_RATE = 0.001

In [None]:
train, test = python_random_split(training_reviews_1, 0.75)

In [None]:
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

Number of users: 4279
Number of items: 41807


- k: dimension of the latent space (i.e. the size of and ).
- encoder_structure: dimension(s) of hidden layer(s) of the user and item encoders.
- act_fn: non-linear activation function used in the encoders.
- likelihood: choice of the likelihood function being optimized.
- n_epochs: number of passes through training data.
- batch_size: size of mini-batches of data during training.
- learning_rate: step size in the gradient update rules.

In [None]:
bivae = cornac.models.BiVAECF(
    k=LATENT_DIM,
    encoder_structure=ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    n_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    seed=SEED,
    use_gpu=torch.cuda.is_available(),
    verbose=True
)

with Timer() as t:
    bivae.fit(train_set)
print("Took {} seconds for training.".format(t))

  0%|          | 0/500 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
with Timer() as t:
    all_predictions = predict_ranking(bivae, train, usercol='userID', itemcol='itemID', remove_seen=True)
print("Took {} seconds for prediction.".format(t))

In [None]:
eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')