# 1. Preparation

In [1]:
# Importing libraries
import numpy as np
import pandas as pd

In [2]:
# Setting option
pd.set_option('display.max_columns', 30)

In [3]:
# Importing data
books = pd.read_csv('datasets/books_ml.csv')
ratings = pd.read_csv('datasets/ratings_ml.csv')
book_tag = pd.read_csv('datasets/book_tag_ml.csv')

In [4]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,cleaned_authors,cleaned_title,top_5_tags
0,1,2767052,2767052,2792775,272,439023483,9780439023480,Suzanne Collins,2008,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,suzanne collins,the hunger games the hunger games 1,"favorites, currently-reading, young-adult, fic..."
1,2,3,3,4640799,491,439554934,9780439554930,"J.K. Rowling, Mary GrandPré",1997,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,jk rowling mary grandpr,harry potter and the sorcerers stone harry pot...,"to-read, favorites, fantasy, currently-reading..."
2,3,41865,41865,3212258,226,316015849,9780316015840,Stephenie Meyer,2005,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,stephenie meyer,twilight twilight 1,"young-adult, fantasy, favorites, vampires, ya"
3,4,2657,2657,3275794,487,61120081,9780061120080,Harper Lee,1960,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,harper lee,to kill a mockingbird,"classics, favorites, to-read, classic, histori..."
4,5,4671,4671,245494,1356,743273567,9780743273560,F. Scott Fitzgerald,1925,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,f scott fitzgerald,the great gatsby,"classics, favorites, fiction, classic, books-i..."


In [5]:
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,75,11,5
1,143,11,5
2,230,11,5
3,247,11,5
4,256,11,4


In [6]:
book_tag.head()

Unnamed: 0,book_id,tag_id,count,tag_name
0,1,11557,50755,favorites
1,2,11557,48220,favorites
2,3,11557,11782,favorites
3,4,11557,30903,favorites
4,5,11557,20851,favorites


In [7]:
# Counting number of users and items
n_users = ratings['user_id'].nunique()
n_items = ratings['book_id'].nunique()

print(f'Total number of users is {n_users}')
print(f'Total number of items (books) is {n_items}')

Total number of users is 4011
Total number of items (books) is 762


# 2. Recommender System

For this project, I will try two approaches for the recommender system which are **Content-Based Recommendation** and **Collaborative Filtering (CF) Recommendation**.

## 2.1. Content-based Recommendation

For content-based recommendation, I will use three features to recommend a book to a user which are 'title', 'author', and 'tag'. For each feature, I will try two approaches using TfidfVectorizer and CountVectorizer and pick the best approach based on my interpretation of the recommendation result. First, I will make function to get top-N recommendation.

In [8]:
# Importing libraries
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Making function to get top-N recommendation for content-based algorithm
def top_n_recom_content(iid, similarity, n):
    # Looking for index of the book_id (iid) in the dataset
    idx = books[books['book_id'] == iid].index[0]
    # Making list of index of the top-N recommended books
    recom = pd.Series(similarity[idx]).sort_values(ascending = False)[1 : n + 1].index
    return recom.to_list()

### 2.1.1. Title

In [10]:
# Making similarity matrix using TfidfVectorizer based on title
tfidf = TfidfVectorizer(ngram_range = (1, 2), min_df = 0)
tfidf_matrix = tfidf.fit_transform(books['cleaned_title'])
sim_title_tfidf = cosine_similarity(tfidf_matrix)

In [11]:
# Making similarity matrix using CountVectorizer based on title
count = CountVectorizer(ngram_range = (1, 2), min_df = 0)
count_matrix = count.fit_transform(books['cleaned_title'])
sim_title_count = cosine_similarity(count_matrix)

In [12]:
# Recommendation result based on TfidfVectorizer (title)
iid = 2
recom = top_n_recom_content(iid, sim_title_tfidf, 10)
print(f"Book:\n{books[books['book_id'] == iid]['title'].values[0]}\n\nTop-10 Recommended Books:", 
      '\n'.join([f'- {books.loc[i]["title"]}'for i in recom]), sep = '\n')

Book:
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)

Top-10 Recommended Books:
- Harry Potter and the Half-Blood Prince (Harry Potter, #6)
- Harry Potter and the Deathly Hallows (Harry Potter, #7)
- Harry Potter and the Goblet of Fire (Harry Potter, #4)
- Harry Potter and the Chamber of Secrets (Harry Potter, #2)
- Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)
- Harry Potter and the Order of the Phoenix (Harry Potter, #5)
- Harry Potter Boxset (Harry Potter, #1-7)
- Harry Potter and the Cursed Child - Parts One and Two (Harry Potter, #8)
- Cutting for Stone
- And the Mountains Echoed


In [13]:
# Recommendation result based on CountVectorizer (title)
recom = top_n_recom_content(iid, sim_title_count, 10)
print(f"Book:\n{books[books['book_id'] == iid]['title'].values[0]}\n\nTop-10 Recommended Books:", 
      '\n'.join([f'- {books.loc[i]["title"]}'for i in recom]), sep = '\n')

Book:
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)

Top-10 Recommended Books:
- Harry Potter and the Deathly Hallows (Harry Potter, #7)
- Harry Potter and the Half-Blood Prince (Harry Potter, #6)
- Harry Potter and the Goblet of Fire (Harry Potter, #4)
- Harry Potter and the Chamber of Secrets (Harry Potter, #2)
- Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)
- Harry Potter and the Order of the Phoenix (Harry Potter, #5)
- Harry Potter and the Cursed Child - Parts One and Two (Harry Potter, #8)
- Harry Potter Boxset (Harry Potter, #1-7)
- The Sound and the Fury
- And the Mountains Echoed


### 2.1.2. Author

In [14]:
# Making similarity matrix using TfidfVectorizer based on author
tfidf = TfidfVectorizer(ngram_range = (1, 2), min_df = 0)
tfidf_matrix = tfidf.fit_transform(books['cleaned_authors'])
sim_author_tfidf = cosine_similarity(tfidf_matrix)

In [15]:
# Making similarity matrix using CountVectorizer based on author
count = CountVectorizer(ngram_range = (1, 2), min_df = 0)
count_matrix = count.fit_transform(books['cleaned_authors'])
sim_author_count = cosine_similarity(count_matrix)

In [16]:
# Recommendation result based on TfidfVectorizer (author)
recom = top_n_recom_content(iid, sim_author_tfidf, 10)
print(f"Book:\n{books[books['book_id'] == iid]['title'].values[0]}\n\nTop-10 Recommended Books:", 
      '\n'.join([f'- {books.loc[i]["title"]}'for i in recom]), sep = '\n')

Book:
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)

Top-10 Recommended Books:
- Harry Potter and the Order of the Phoenix (Harry Potter, #5)
- Harry Potter and the Sorcerer's Stone (Harry Potter, #1)
- Harry Potter and the Chamber of Secrets (Harry Potter, #2)
- Harry Potter and the Goblet of Fire (Harry Potter, #4)
- Harry Potter and the Deathly Hallows (Harry Potter, #7)
- Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)
- The Casual Vacancy
- Harry Potter Boxset (Harry Potter, #1-7)
- The Tales of Beedle the Bard
- The Silkworm (Cormoran Strike, #2)


In [17]:
# Recommendation result based on CountVectorizer (author)
recom = top_n_recom_content(2, sim_author_count, 10)
print(f"Book:\n{books[books['book_id'] == iid]['title'].values[0]}\n\nTop-10 Recommended Books:", '\n'.join([f'- {books.loc[i]["title"]}'for i in recom]), sep = '\n')

Book:
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)

Top-10 Recommended Books:
- Harry Potter and the Order of the Phoenix (Harry Potter, #5)
- Harry Potter and the Sorcerer's Stone (Harry Potter, #1)
- Harry Potter and the Chamber of Secrets (Harry Potter, #2)
- Harry Potter and the Goblet of Fire (Harry Potter, #4)
- Harry Potter and the Deathly Hallows (Harry Potter, #7)
- Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)
- The Casual Vacancy
- Harry Potter Boxset (Harry Potter, #1-7)
- The Tales of Beedle the Bard
- The Silkworm (Cormoran Strike, #2)


### 2.1.3. Tag

Since the tag data's format is slightly different with other data, I will make count matrix manually and then transform it into tf-idf matrix

In [18]:
# Making pivot table consists of zeros
idx_row = book_tag['book_id'].unique() 
n_row = len(idx_row)
idx_col = book_tag['tag_id'].unique() 
n_col = len(idx_col)

count_matrix = pd.DataFrame(np.zeros((n_row, n_col)), index = idx_row, columns = idx_col)
count_matrix.head()

Unnamed: 0,11557,8717,33114,11743,10064,30574,10059,11305,32989,26837,5207,26771,27199,22743,11590,...,20006,11109,1729,14820,28680,19911,6185,6187,30779,2758,23407,29241,28121,24744,11051
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Filling dataframes with tag counts
for data in book_tag.itertuples():
    count_matrix.at[data[1], data[2]] = data[3]

# Changing dataframe into 2D array
count_matrix = count_matrix.to_numpy()
count_matrix

array([[50755., 35418., 25968., ...,     0.,     0.,     0.],
       [48220., 44640., 14984., ...,     0.,     0.,     0.],
       [11782.,   240., 16446., ...,     0.,     0.,     0.],
       ...,
       [    0.,     0.,     0., ...,     0.,     0.,     0.],
       [    0.,     0.,     0., ...,     0.,     0.,     0.],
       [    0.,     0.,     0., ...,     0.,     0.,     0.]])

Now, we have our matrix (like using CountVectorizer). I will use TfidfTransformer to transform it into Tf-Idf matrix.

In [20]:
# Making Tf-Idf matrix
tfidf = TfidfTransformer()
tfidf_matrix = tfidf.fit_transform(count_matrix).toarray()
tfidf_matrix

array([[0.48755726, 0.34653371, 0.4056331 , ..., 0.        , 0.        ,
        0.        ],
       [0.09721417, 0.09166451, 0.04912225, ..., 0.        , 0.        ,
        0.        ],
       [0.21126071, 0.00438314, 0.47952151, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

After this step, I calculate cosine similarity just like other attributes (title and author)

In [21]:
# Calculating cosine similarity
sim_tag_tfidf = cosine_similarity(tfidf_matrix)
sim_tag_count = cosine_similarity(count_matrix)

In [22]:
# Recommendation result based on TfidfVectorizer (tag)
recom = top_n_recom_content(2, sim_tag_tfidf, 10)
print(f"Book:\n{books[books['book_id'] == iid]['title'].values[0]}\n\nTop-10 Recommended Books:", '\n'.join([f'- {books.loc[i]["title"]}'for i in recom]), sep = '\n')

Book:
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)

Top-10 Recommended Books:
- Where the Heart Is
- Clockwork Angel (The Infernal Devices, #1)
- The Other Boleyn Girl (The Plantagenet and Tudor Novels, #9)
- Midnight in the Garden of Good and Evil
- The Phantom of the Opera
- The Adventures of Huckleberry Finn
- Beautiful Creatures (Caster Chronicles, #1)
- Cujo
- Walden
- Do Androids Dream of Electric Sheep?


In [23]:
# Recommendation result based on CountVectorizer (tag)
recom = top_n_recom_content(2, sim_tag_count, 10)
print(f"Book:\n{books[books['book_id'] == iid]['title'].values[0]}\n\nTop-10 Recommended Books:", '\n'.join([f'- {books.loc[i]["title"]}'for i in recom]), sep = '\n')

Book:
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)

Top-10 Recommended Books:
- Clockwork Angel (The Infernal Devices, #1)
- Fried Green Tomatoes at the Whistle Stop Cafe
- Coraline
- City of Ashes (The Mortal Instruments, #2)
- Definitely Dead (Sookie Stackhouse, #6)
- Thirteen Reasons Why
- Deception Point
- The Other Boleyn Girl (The Plantagenet and Tudor Novels, #9)
- Midnight in the Garden of Good and Evil
- The Son of Neptune (The Heroes of Olympus, #2)


Comparing the results between Tf-Ifd and Count vectorizer for each feature (title, author, tag), I decided to use TfifdVectorizer based on my interpretation regarding which approach gives the best result.

## 2.2. Collaborative Filtering

For CF recommendation, I will use several methods such as user-based CF, item-based CF, SVD, SlopeOne, and Co-clustering. I use RMSE as the evaluation metric. I will look into base model that gives the lowest RMSE and after that I will conduct hyperparameter tuning and cross validation.

In [24]:
# Import libraries
from surprise import Reader, Dataset
from surprise import KNNBasic, KNNWithMeans, SVD, SlopeOne, CoClustering
from surprise.accuracy import rmse

In [25]:
# Loading dataset
reader = Reader()
df = Dataset.load_from_df(ratings, reader)

In [26]:
# Splitting dataset into training and testing data
from surprise.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size = 0.3, random_state = 101)

In [27]:
# Training and testing the algorithms
data = list()

for algorithm in [KNNBasic(), KNNBasic(sim_options = {'user_based' : False}), 
                  KNNWithMeans(), KNNWithMeans(sim_options = {'user_based' : False}), 
                  SVD(), SlopeOne(), CoClustering()]:
    data.append([rmse(algorithm.fit(df_train).test(df_test))])

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8970
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8964
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8729
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8685
RMSE: 0.8723
RMSE: 0.8776
RMSE: 0.8904


In [28]:
# Making summary of the evaluation metric for each algorithm
pd.DataFrame(data, columns = ['RMSE'],
             index = ['KNNBasic (User)', 'KNNBasic (Item)', 'KNNWithMeans (User)', 'KNNWithMeans (Item)', 
                      'SVD', 'SlopeOne', 'CoClustering']).sort_values(by = 'RMSE')

Unnamed: 0,RMSE
KNNWithMeans (Item),0.868518
SVD,0.872308
KNNWithMeans (User),0.872853
SlopeOne,0.877565
CoClustering,0.890414
KNNBasic (Item),0.896359
KNNBasic (User),0.897008


Looking at the table above, it seems that the most optimum model is Item-Based CF. I will conduct hyperparameter tuning for this chosen model searching for the best hyperparameter setting.

In [29]:
# Setting parameter for hyperparameter tuning
k = np.arange(10, 55, 5)
min_k = np.arange(1, 11, 1)
name = ['msd', 'pearson', 'cosine']

param = {'k' : k, 
         'min_k' : min_k,
        'sim_options' : 
         {'name' : name,
          'user_based' : [False]},
        'verbose' : [False]} 

In [30]:
# Hyperparameter tuning for the selected model (KNNWithMeans)
from surprise.model_selection import RandomizedSearchCV
grid = RandomizedSearchCV(algo_class = KNNWithMeans, 
                          param_distributions = param,
                          n_iter = 20,
                          measures = ['rmse'],
                          random_state = 101)
grid.fit(df)

In [31]:
# Best hyperparameters setting and best evaluation metric score
print(f'Best hyperparameters: {grid.best_params}')
print(f'Best score: {grid.best_score}')

Best hyperparameters: {'rmse': {'k': 25, 'min_k': 4, 'sim_options': {'name': 'pearson', 'user_based': False}, 'verbose': False}}
Best score: {'rmse': 0.8566233058242118}


We got our best parameters for this model. Next, I will conduct cross validation to see if our result is consistent across different training and test dataset.

In [32]:
# Cross validation for selected model
from surprise.model_selection.validation import cross_validate
algo = KNNWithMeans(k = 25, min_k = 4, 
                    sim_options = {'name' : 'pearson', 'user_based' : False, 'verbose' : False})
results = cross_validate(algo = algo, 
                         data = df, 
                         measures = ['RMSE'],
                        return_train_measures = True, verbose = False)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [33]:
# Making summary of the evaluation metric for each iteration
result_1 = pd.DataFrame(results, index = (f'Iterasi ke-{i}' for i in range(1,6)))
result_mean = pd.DataFrame((result_1[i].mean() for i in result_1), columns = ['Rata-rata']).transpose()
result_mean.columns = result_1.columns

pd.concat([result_1, result_mean])

Unnamed: 0,test_rmse,train_rmse,fit_time,test_time
Iterasi ke-1,0.856423,0.668343,5.172673,12.228359
Iterasi ke-2,0.857046,0.66677,4.793096,12.387994
Iterasi ke-3,0.854914,0.667576,4.812164,12.689004
Iterasi ke-4,0.858838,0.666737,4.780924,12.273637
Iterasi ke-5,0.855949,0.667774,4.812153,12.549448
Rata-rata,0.856634,0.66744,4.874202,12.425688


Looking at the above table, RMSE for both the training and testing data has quite consistent value. I will continue with this setting to make a CF-based recommender system.

In [34]:
# Making function to get top-N recommendation for collaborative filtering (CF) algorithm
from collections import defaultdict

def top_n_recom_cf(predictions, n):
    # Mapping the predictions for the desired user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    
    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [35]:
# Training KNNWithMeans algorithm on the goodreads' books dataset.
ratings = pd.read_csv('datasets/ratings_ml.csv')
df = Dataset.load_from_df(ratings, Reader())
df_train = df.build_full_trainset()
algo = KNNWithMeans(k = 25, min_k = 4, 
                    sim_options = {'name' : 'pearson', 'user_based' : False, 'verbose' : False})
algo.fit(df_train)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x273285a79b0>

In [36]:
# Predict ratings for all pairs (u, i) that are Nnot in the training set.
df_test = df_train.build_anti_testset()
predictions = algo.test(df_test)
recom = top_n_recom_cf(predictions, 10)

In [37]:
# Recommendation result based on KNNWithMeans
uid = 75
print(f'User: {uid}\n\nTop-10 Recommended Books:', '\n'.join([f'- {books[books["book_id"] == i[0]]["title"].values[0]}'for i in recom[uid]]), sep = '\n') 

User: 75

Top-10 Recommended Books:
- Harry Potter Boxset (Harry Potter, #1-7)
- The Complete Sherlock Holmes
- Nine Stories
- Where the Wild Things Are
- The Giving Tree
- A Light in the Attic
- How the Grinch Stole Christmas!
- The Two Towers (The Lord of the Rings, #2)
- The Velveteen Rabbit
- The Tale of Peter Rabbit


# 3. Exporting Data

In [38]:
# Export the predictions
import pickle
pickle.dump(sim_title_tfidf, open('predictions/predictions_title.sav', 'wb'))
pickle.dump(sim_author_tfidf, open('predictions/predictions_author.sav', 'wb'))
pickle.dump(sim_tag_tfidf, open('predictions/predictions_tag.sav', 'wb'))
pickle.dump(recom, open('predictions/predictions_cf.sav', 'wb'))