In [1]:
!pip install pandas-profiling==2.7.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# import dependent libraries
!pip install LightFM
!pip install scikit_optimize
!pip install "scikit_optimize==0.7.4"
import pandas as pd
import os
from scipy.sparse import csr_matrix
import numpy as np
from IPython.display import display_html
import warnings
#import MaskedArray

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
%matplotlib inline

from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm import LightFM
#from skopt import forest_minimize

def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(html_str.replace(
        'table', 'table style="display:inline"'), raw=True)


# update the working directory to the root of the project
os.chdir('..')
warnings.filterwarnings("ignore")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Goodreads Data
The datasets were collected in late 2017 from goodreads.com, where we only scraped users' public shelves, i.e. everyone can see it on web without login. User IDs and review IDs are anonymized.

We collected these datasets for academic use only. Please do not redistribute them or use for commercial purposes.

There are three groups of datasets: (1) meta-data of the books, (2) user-book interactions (users' public shelves) and (3) users' detailed book reviews. These datasets can be merged together by matching book/user/review ids. For the purposes of this exam, we'll be using only the former two.

You can download the dataset using the links provided in the code

In [3]:
books_metadata = pd.read_json('/content/goodreads_books_poetry.json.gz', lines=True)
interactions = pd.read_json('/content/goodreads_interactions_poetry.json.gz', lines=True)

In [4]:
books_metadata.columns.values

array(['isbn', 'text_reviews_count', 'series', 'country_code',
       'language_code', 'popular_shelves', 'asin', 'is_ebook',
       'average_rating', 'kindle_asin', 'similar_books', 'description',
       'format', 'link', 'authors', 'publisher', 'num_pages',
       'publication_day', 'isbn13', 'publication_month',
       'edition_information', 'publication_year', 'url', 'image_url',
       'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'], dtype=object)

In [5]:
books_metadata.sample(2)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
32735,9799100143,18,[],US,ind,"[{'count': '156', 'name': 'to-read'}, {'count'...",,False,3.73,,...,,,2004,https://www.goodreads.com/book/show/1408082.Ke...,https://images.gr-assets.com/books/1392722972m...,1408082,142,1398356,Kekasihku,Kekasihku
36182,1484008065,1,[],US,,"[{'count': '1337', 'name': 'to-read'}, {'count...",,False,4.25,B00CF0OITC,...,4.0,,2013,https://www.goodreads.com/book/show/19553088-e...,https://images.gr-assets.com/books/1420792026m...,19553088,8,24944198,Empty Roads & Broken Bottles: In Search for th...,Empty Roads & Broken Bottles: In Search for th...


In [6]:
books_metadata.shape

(36514, 29)

While all the available information is vital to extract contextual information to be able to train a better recommendation system, for this example, we'll only focus on the selected fields that require minimal manipulation.

In [7]:
# Limit the books metadata to selected fields
books_metadata_selected = books_metadata[['book_id', 'average_rating', 'is_ebook', 'num_pages', 
                                          'publication_year', 'ratings_count', 'language_code']]
books_metadata_selected.sample(5)

Unnamed: 0,book_id,average_rating,is_ebook,num_pages,publication_year,ratings_count,language_code
1409,2410624,4.73,False,79.0,1996.0,2,
10153,599721,4.52,False,,,259,spa
26608,13542785,3.57,False,95.0,2012.0,23,
35055,23249297,4.18,False,126.0,1985.0,13,ara
35232,12363517,3.67,False,,,4,eng


Now that we have the data with selected fields, next, we'll run it through pandas profiler to perform preliminary exploratory data analysis to help us better understand the available data

In [8]:
import pandas_profiling
import numpy as np

# replace blank cells with NaN
books_metadata_selected.replace('', np.nan, inplace=True)


# not taking book_id into the profiler report
profile = pandas_profiling.ProfileReport(books_metadata_selected[['average_rating', 'is_ebook', 'num_pages', 
                                                                  'publication_year', 'ratings_count']])
# profile.to_file('/content/profiler_books_metadata_1.html')


Considering the results from the profiler, we'll perform following transformations to the dataset:

Replace the missing value of categorical values with another value to create a new category
Convert bin values for numeric variables into discrete intervals

In [9]:
# using pandas cut method to convert fields into discrete intervals
books_metadata_selected['num_pages'].replace(np.nan, -1, inplace=True)
books_metadata_selected['num_pages'] = pd.to_numeric(books_metadata_selected['num_pages'])
books_metadata_selected['num_pages'] = pd.cut(books_metadata_selected['num_pages'], bins=25)

# rounding ratings to neares .5 score
books_metadata_selected['average_rating'] = books_metadata_selected['average_rating'].apply(lambda x: round(x*2)/2)

# using pandas qcut method to convert fields into quantile-based discrete intervals
books_metadata_selected['ratings_count'] = pd.qcut(books_metadata_selected['ratings_count'], 25)

# replacing missing values to year 2100
books_metadata_selected['publication_year'].replace(np.nan, 2100, inplace=True)

# replacing missing values to 'unknown'
books_metadata_selected['language_code'].replace(np.nan, 'unknown', inplace=True)


# convert is_ebook column into 1/0 where true=1 and false=0
books_metadata_selected['is_ebook'] = books_metadata_selected.is_ebook.map(
    lambda x: 1.0*(x == 'true'))

In [10]:
profile = pandas_profiling.ProfileReport(books_metadata_selected[['average_rating', 'is_ebook', 'num_pages', 
                                                         'publication_year', 'ratings_count']])
# profile.to_file('./results/profiler_books_metadata_2.html')

In [11]:
books_metadata_selected.sample(5)

Unnamed: 0,book_id,average_rating,is_ebook,num_pages,publication_year,ratings_count,language_code
20959,23966631,3.5,0.0,"(-11.961, 437.44]",2014,"(-0.001, 2.0]",eng
4832,3325955,4.0,0.0,"(-11.961, 437.44]",2100,"(-0.001, 2.0]",unknown
34710,10307146,4.5,0.0,"(-11.961, 437.44]",2011,"(25.0, 29.0]",eng
17869,24575346,4.0,0.0,"(-11.961, 437.44]",2100,"(125.0, 179.0]",unknown
29907,1057737,3.5,0.0,"(-11.961, 437.44]",2003,"(73.0, 94.0]",unknown


Data Inspection & Preparation: Interactions data
As the first step, let's take a look at all the available fields, and sample data

In [12]:
interactions.columns.values

array(['user_id', 'book_id', 'review_id', 'is_read', 'rating',
       'review_text_incomplete', 'date_added', 'date_updated', 'read_at',
       'started_at'], dtype=object)

In [13]:
interactions.sample(5)

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
2391374,3f3ff453067066f54e9e7227049916a1,25502493,9c43dc44cc8ebe179213043626fec144,False,0,,Sat Oct 31 21:28:18 -0700 2015,Sat Oct 31 21:28:19 -0700 2015,,
2404172,7a605852c7bc9e1494b33a01a71cd692,14288864,ab6d2ec355889168df47d9584d8a95a4,True,4,,Mon May 27 19:42:57 -0700 2013,Mon May 27 19:43:09 -0700 2013,,
375016,590b63a6a132f3e850c02bd329ed2bc8,5924914,eaa592d23252c127b11403091ec8fb1a,True,5,Another brilliant and sexy edition of MiPO wit...,Wed Nov 26 10:51:36 -0800 2008,Wed Nov 26 10:52:48 -0800 2008,,
1320207,15541502d449e21365115002f6b53f2c,23282006,1af232171b392faea62eda26d4ed533a,False,0,,Sun Jan 01 04:18:37 -0800 2017,Sun Jan 01 04:18:37 -0800 2017,,
275516,f9947dc92c517bfd1d77c86eba862e0c,18661581,16132e9eff6a84eefaad4e3b27cf6041,True,4,,Wed Dec 17 10:46:26 -0800 2014,Wed Dec 17 10:46:35 -0800 2014,Wed Dec 17 10:46:34 -0800 2014,Wed Dec 17 10:46:26 -0800 2014


In [14]:
interactions.shape

(2734350, 10)

While all the available information is vital to extract contextual information to be able to train a better recommendation system, for this example, we'll only focus on the selected fields that require minimal manipulation.

In [15]:
# Limit the books metadata to selected fields
interactions_selected = interactions[['user_id', 'book_id', 'is_read', 'rating']]

# mapping boolean to string
booleanDictionary = {True: 'true', False: 'false'}
interactions_selected['is_read'] = interactions_selected['is_read'].replace(booleanDictionary)

interactions_selected.sample(5)

Unnamed: 0,user_id,book_id,is_read,rating
1487332,6c17bff82c59257a345398ab5106d4da,2696,False,0
1165583,8859d0e9c64b0e49faee20b8bd6241c4,3864107,False,0
1100099,60fc589692cf8666d5e5ee339242ebe8,2168850,True,3
266791,76ab69ce6088287ab97c0d6e81b4f7fe,206633,True,5
646239,008a4080503bd3888c849820eb40a360,47730,False,0


In [16]:
profile = pandas_profiling.ProfileReport(interactions_selected[['is_read', 'rating']])
# profile.to_file('./results/profiler_interactions.html')

Considering the results from the profiler, we'll perform following transformations to the dataset:

Convert is_read column to 1/0

In [17]:
# convert is_read column into 1/0 where true=1 and false=0
interactions_selected['is_read'] = interactions_selected.is_read.map(
    lambda x: 1.0*(x == 'true'))

In [18]:
interactions_selected.sample(10)

Unnamed: 0,user_id,book_id,is_read,rating
1808228,1265f9a1093dd1dcf03f87cd940850bd,6780686,0.0,0
236672,07e43114328799afba4111870ebc7547,112166,0.0,0
2022907,4dc586ff5f20e618bc840fd1434a3231,6656,0.0,0
1198909,3ae0030b8a5b7308fa147945f02abbb7,6604600,0.0,0
751826,39b79e93103f5262d7ae3a6ba4dd2251,30118,1.0,4
1446248,2d0a26ba0ff2b964a3f9cf9152d869eb,4607636,1.0,3
2277002,084fb44abaabe412992ef746073fa7fc,125209,1.0,4
2521367,86f71f12fd94f0002f24d4f571504360,23919,1.0,5
25858,9cce8b52d6071445732e7d415de97670,915458,0.0,0
2672064,c67829df1eb62344515a5908f2c98b05,9444,0.0,0


Since we have two fields denoting interaction between a user and a book, is_read and rating - let's see how many data points we have where the user hasn't read the book but have given the ratings.

In [19]:
interactions_selected.groupby(['rating', 'is_read']).size().reset_index().pivot(columns='rating', index='is_read', values=0)

rating,0,1,2,3,4,5
is_read,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,1420740.0,,,,,
1.0,84551.0,20497.0,64084.0,237942.0,405565.0,500971.0


From the above results, we can conclusively infer that users with ratings >= 1 have all read the book. Therefore, we'll use the ratings as the final score, drop interactions where is_read is false, and limit interactions from random 500 users to limit the data size for further analysis

In [20]:
import random

interactions_selected = interactions_selected.loc[interactions_selected['is_read']==1, ['user_id', 'book_id', 'rating']]

interactions_selected = interactions_selected[interactions_selected['user_id'].isin(random.sample(list(interactions_selected['user_id'].unique()), 
                                                                                                  k=5000))]

interactions_selected.sample(10)

Unnamed: 0,user_id,book_id,rating
459254,7814eed7aa67785a9ff323a3a8709a48,5333076,5
590199,fa67dada969a27c8af1eeda80298ccd0,441444,5
2194780,eb009b4794d61d7dd4d159a93c148f5d,525138,5
1434023,f92981e29c415ac0ce8f326881179eec,19155,4
2731013,2c45d3f648e86bc9951294f76e57e0db,159087,5
2617442,ae52418b641271f8b7607d317a77adda,134025,4
1757023,dded4d179b87eba86f808f163aba6c4a,13235,4
719973,179cd42e7082c82a906a2801f810ee77,831071,0
954645,062b303bf95322b190ad579851518133,9682918,5
2131303,4b1b2577df7c6316dcb6a697b86541f1,1420,3


In [21]:
interactions_selected.shape

(22666, 3)

Data Preprocessing
Now, let's transform the available data into CSR sparse matrix that can be used for matrix operations. We will start by the process by creating books_metadata matrix which is np.float64 csr_matrix of shape ([n_books, n_books_features]) – Each row contains that book's weights over features. However, before we create a sparse matrix, we'll first create a item dictionar for future references

In [22]:
item_dict ={}
df = books_metadata[['book_id', 'title']].sort_values('book_id').reset_index()

for i in range(df.shape[0]):
    item_dict[(df.loc[i,'book_id'])] = df.loc[i,'title']

In [23]:
# dummify categorical features
books_metadata_selected_transformed = pd.get_dummies(books_metadata_selected, columns = ['average_rating', 'is_ebook', 'num_pages', 
                                                                                         'publication_year', 'ratings_count', 
                                                                                         'language_code'])

books_metadata_selected_transformed = books_metadata_selected_transformed.sort_values('book_id').reset_index().drop('index', axis=1)
books_metadata_selected_transformed.head(5)

Unnamed: 0,book_id,average_rating_0.0,average_rating_1.0,average_rating_1.5,average_rating_2.0,average_rating_2.5,average_rating_3.0,average_rating_3.5,average_rating_4.0,average_rating_4.5,...,language_code_tel,language_code_tgl,language_code_tha,language_code_tlh,language_code_tur,language_code_ukr,language_code_unknown,language_code_urd,language_code_vie,language_code_zho
0,234,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,236,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,241,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,244,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,254,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [24]:
# convert to csr matrix
books_metadata_csr = csr_matrix(books_metadata_selected_transformed.drop('book_id', axis=1).values)
books_metadata_csr

<36514x357 sparse matrix of type '<class 'numpy.uint8'>'
	with 219084 stored elements in Compressed Sparse Row format>

Next we'll create a iteractions matrix which is np.float64 csr_matrix of shape ([n_users, n_books]). We'll also create a user dictionary for future use cases

In [25]:
user_book_interaction = pd.pivot_table(interactions_selected, index='user_id', columns='book_id', values='rating')

# fill missing values with 0
user_book_interaction = user_book_interaction.fillna(0)

user_book_interaction.head(10)

book_id,234,241,254,286,289,290,291,292,448,484,...,35832423,35894556,35896040,35910593,35919520,36010801,36070215,36122873,36262212,36262245
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0022b4d5697b0e2a72b5eef1de80c3e9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0033f6da70a111e142abffc59e486e94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00355d3fe92a6c0db46e20f152195fa7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
004d435847c22f0325f5f4700b21b00d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0070948b59cc193cd2288fc59318accb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
007d2747d2477db5027c18244d4cc6e3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
007ef105cea5bf0c3edfe6c57859f74c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0099c79ed03588585a04e1bc05a029d1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00b46da4f0fb73bf00f09d8a655d209a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00b97603657f122bdb6c9caf90689cca,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
user_id = list(user_book_interaction.index)
user_dict = {}
counter = 0 
for i in user_id:
    user_dict[i] = counter
    counter += 1

In [27]:
# convert to csr matrix
user_book_interaction_csr = csr_matrix(user_book_interaction.values)
user_book_interaction_csr

<5000x6690 sparse matrix of type '<class 'numpy.float64'>'
	with 21120 stored elements in Compressed Sparse Row format>

Model Training

Ideally, we would build, train, and evaluate several models for our recommender system to determine which model holds the most promise for further optimization (hyper-parameter tuning).

However, for this exam, we'll train the base model, with randomly selected input parameters for demonstrations.

In [28]:
model = LightFM(loss='warp',
                random_state=2016,
                learning_rate=0.90,
                no_components=150,
                user_alpha=0.000005)

model = model.fit(user_book_interaction_csr,
                  epochs=100,
                  num_threads=16, verbose=False)

# **Top n Recommendations**

In [29]:
def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 10, show = True):
    
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items), item_features=books_metadata_csr))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    known_items = list(pd.Series(interactions.loc[user_id,:] \
                                 [interactions.loc[user_id,:] > threshold].index).sort_values(ascending=False))
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show == True:
        print ("User: " + str(user_id))
        print("Known Likes:")
        counter = 1
        for i in known_items:
            print(str(counter) + '- ' + i)
            counter+=1

        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter+=1

In [38]:
sample_recommendation_user(model, user_book_interaction, '007ef105cea5bf0c3edfe6c57859f74c', user_dict, item_dict)

User: 007ef105cea5bf0c3edfe6c57859f74c
Known Likes:
1- The Odyssey
2- The Iliad

 Recommended Items:
1- Romantically Disturbed: Love Poems to Rip Your Heart Out
2- Lamia
3- Over the Anvil We Stretch
4- A Book of Luminous Things: An International Anthology of Poetry
5- Stanyan Street & Other Sorrows
6- Ascending Into Euphoria: A Collection of Poetry
7- A Season in Hell/The Drunken Boat
8- Rooms for Rent in the Outer Planets: Selected Poems 1962-1996
9- Folding Ruler Star
10- Before Morning
