# Book Recommendation System - Collaborative Filtering

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the books I liked (the first CSV column is the row index) and keep the
# book ids as strings so they compare equal to ids parsed from the text files.
my_books = pd.read_csv("my_liked_books.csv", index_col=0)
my_books = my_books.assign(book_id=my_books["book_id"].astype(str))

In [3]:
my_books.head()

Unnamed: 0,user_id,book_id,rating,title
0,-1,91201,5,Doctors
1,-1,18143977,5,All the Light We Cannot See
2,-1,467994,5,Sense and Sensibility
3,-1,8045416,5,To Kill a Mockingbird
4,-1,341288,5,Little Women


In [4]:
my_books.shape

(20, 4)

In [5]:
# Map the ids used inside the interactions CSV to real book ids.
# Each line of the map file is "<csv_id>,<book_id>".
csv_mapping = {}

with open("book_id_map.csv", "r") as map_file:
    for row in map_file:
        csv_id, book_id = row.strip().split(",")
        csv_mapping[csv_id] = book_id

In [6]:
# Set of my liked book ids, for fast membership checks while streaming the interactions file.
book_set = {liked_id for liked_id in my_books["book_id"]}

In [7]:
# Count, per user, how many of my liked books appear in their interactions.
overlap_users = {}

with open("goodreads_interactions.csv", "r") as interactions_file:
    for row in interactions_file:
        user_id, csv_id, _, rating, _ = row.split(",")

        # Interaction rows carry the CSV-local id; translate it to the real
        # book id (None when the id is not in the map).
        book_id = csv_mapping.get(csv_id)
        if book_id not in book_set:
            continue

        overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

In [8]:
len(overlap_users)

165329

In [9]:
# Keep only the users who share more than one fifth of my liked books.
min_shared_books = my_books.shape[0] / 5
filtered_overlap_users = {
    user for user, shared in overlap_users.items() if shared > min_shared_books
}

In [10]:
len(filtered_overlap_users)

87

In [11]:
# Second pass over the interactions file: collect every interaction
# (not just the overlapping books) of the users selected above.
interactions_list = []

with open("goodreads_interactions.csv", 'r') as interactions_file:
    for row in interactions_file:
        user_id, csv_id, _, rating, _ = row.split(",")

        if user_id not in filtered_overlap_users:
            continue

        book_id = csv_mapping[csv_id]
        interactions_list.append([user_id, book_id, rating])

In [12]:
len(interactions_list)

394091

In [13]:
# Turn the collected [user_id, book_id, rating] rows into a DataFrame.
# All three values are still the strings read from the CSV at this point.
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])

In [14]:
# Put my own ratings (user_id -1) in front of the similar users' ratings.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both inputs contribute labels starting at 0, leaving duplicate index labels
# that make any later label-based alignment ambiguous or raise.
interactions = pd.concat(
    [my_books[["user_id", "book_id", "rating"]], interactions],
    ignore_index=True,
)

In [15]:
# After the concat the id columns mix my values with strings parsed from the
# CSV; make both id columns plain strings and the ratings numeric.
for id_column in ("book_id", "user_id"):
    interactions[id_column] = interactions[id_column].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [16]:
# Give every distinct user_id an integer code (via a categorical); the code is
# used as the row position of that user in the ratings matrix.
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [17]:
# Give every distinct book_id an integer code (via a categorical); the code is
# used as the column position of that book in the ratings matrix.
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [18]:
interactions["user_index"].nunique()

88

In [19]:
interactions["book_index"].nunique()

209408

In [20]:
from scipy.sparse import coo_matrix

In [21]:
# Sparse user x book matrix: rows are user codes, columns are book codes,
# stored values are the ratings.
row_positions = interactions["user_index"]
col_positions = interactions["book_index"]
ratings_mat_coo = coo_matrix((interactions["rating"], (row_positions, col_positions)))

In [22]:
ratings_mat_coo

<88x209408 sparse matrix of type '<class 'numpy.int64'>'
	with 394111 stored elements in COOrdinate format>

In [23]:
# Convert to CSR so a single user's row can be sliced out
# (the similarity computation below indexes ratings_mat by row).
ratings_mat = ratings_mat_coo.tocsr()

In [24]:
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,91201,5,0,201490
1,-1,18143977,5,0,60539
2,-1,467994,5,0,150855
3,-1,8045416,5,0,190287
4,-1,341288,5,0,134028
5,-1,303481,5,0,120074
6,-1,348914,5,0,136950
7,-1,4397,5,0,148325
8,-1,140303,5,0,33890
9,-1,9723667,5,0,206991


In [25]:
# Look up my row in the ratings matrix from the data instead of hard-coding 0:
# the literal is only correct while user id "-1" happens to receive the first
# category code.
my_index = int(interactions.loc[interactions["user_id"] == "-1", "user_index"].iloc[0])

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
# Cosine similarity between my ratings row and every user's row,
# flattened to a 1-D array with one entry per user.
my_ratings_row = ratings_mat[my_index, :]
similarity = cosine_similarity(my_ratings_row, ratings_mat).flatten()

In [28]:
# Row positions of the (up to) 15 highest similarity scores. My own row is
# normally among them; it is removed from the result in a later cell.
# np.argpartition raises when asked for more elements than exist, so the
# neighbourhood size is capped at the number of users in the matrix.
top_n = min(15, similarity.size)
indices = np.argpartition(similarity, -top_n)[-top_n:]

In [29]:
# All interactions of the most similar users, copied so later edits do not
# touch the interactions frame.
in_top_similar = interactions["user_index"].isin(indices)
similar_users = interactions.loc[in_top_similar].copy()

In [30]:
# Drop my own rows (user id "-1"): I am trivially my own nearest neighbour.
similar_users = similar_users.loc[similar_users["user_id"].ne("-1")]

In [31]:
similar_users.shape

(12634, 5)

In [32]:
# Per book: how many similar users have an interaction with it, and the mean
# rating across those interactions.
ratings_by_book = similar_users.groupby("book_id")["rating"]
book_recs = ratings_by_book.agg(["count", "mean"])

In [33]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7,3.714286
10,1,5.000000
10000191,2,2.000000
1000032,1,0.000000
10001383,1,0.000000
...,...,...
998,1,0.000000
998133,2,2.000000
9992246,1,0.000000
99944,1,0.000000


In [34]:
# Load the book metadata and keep its ids as strings so they match the
# string book ids used by book_recs.
book_titles = pd.read_json("book_titles.json")
book_titles = book_titles.assign(book_id=book_titles["book_id"].astype(str))

In [35]:
# Attach the metadata columns; the inner join drops books without a metadata row.
book_recs = pd.merge(book_recs, book_titles, how="inner", on="book_id")

In [36]:
book_recs.shape

(8742, 8)

In [37]:
# Scale each book's count among similar users by count / ratings, which
# down-weights books with a large "ratings" value.
# NOTE(review): "ratings" is presumably the book's total number of Goodreads
# ratings -- confirm against book_titles.json.
similar_user_count = book_recs["count"]
share_of_all_ratings = similar_user_count / book_recs["ratings"]
book_recs["adjusted_count"] = similar_user_count * share_of_all_ratings

In [38]:
# Final score: mean rating among similar users weighted by the adjusted count.
book_recs["score"] = book_recs["mean"].mul(book_recs["adjusted_count"])

In [39]:
# Remove the books that are already in my liked list (matched by id).
already_liked = book_recs["book_id"].isin(my_books["book_id"])
book_recs = book_recs.loc[~already_liked]

In [40]:
# Normalise my titles: strip everything except letters, digits and spaces,
# then lowercase.
stripped_titles = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
my_books["mod_title"] = stripped_titles.str.lower()

In [41]:
# Collapse every run of whitespace in the normalised titles into one space.
# The pattern is a raw string: "\s" inside a normal literal is an invalid
# escape sequence, which Python reports with a warning and plans to reject.
my_books["mod_title"] = my_books["mod_title"].str.replace(r"\s+", " ", regex=True)

In [42]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,title_clean,adjusted_count,score
0,1,7,3.714286,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...,0.000029,0.000106
1,10,1,5.000000,"Harry Potter Collection (Harry Potter, #1-6)",25245,https://www.goodreads.com/book/show/10.Harry_P...,https://images.gr-assets.com/books/1328867351m...,harry potter collection harry potter 16,0.000040,0.000198
2,10000191,2,2.000000,Yellow Crocus,17787,https://www.goodreads.com/book/show/10000191-y...,https://s.gr-assets.com/assets/nophoto/book/11...,yellow crocus,0.000225,0.000450
3,1000032,1,0.000000,Getting Home Alive,47,https://www.goodreads.com/book/show/1000032.Ge...,https://s.gr-assets.com/assets/nophoto/book/11...,getting home alive,0.021277,0.000000
4,10001383,1,0.000000,33 Men,817,https://www.goodreads.com/book/show/10001383-3...,https://s.gr-assets.com/assets/nophoto/book/11...,33 men,0.001224,0.000000
...,...,...,...,...,...,...,...,...,...,...
8737,998,1,0.000000,The Millionaire Next Door: The Surprising Secr...,46032,https://www.goodreads.com/book/show/998.The_Mi...,https://s.gr-assets.com/assets/nophoto/book/11...,the millionaire next door the surprising secre...,0.000022,0.000000
8738,998133,2,2.000000,The Gathering,14548,https://www.goodreads.com/book/show/998133.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the gathering,0.000275,0.000550
8739,9992246,1,0.000000,Eating Smoke: One Man's Descent Into Crystal M...,329,https://www.goodreads.com/book/show/9992246-ea...,https://images.gr-assets.com/books/1329579969m...,eating smoke one mans descent into crystal met...,0.003040,0.000000
8740,99944,1,0.000000,The Bhagavad Gita,33855,https://www.goodreads.com/book/show/99944.The_...,https://images.gr-assets.com/books/1383059639m...,the bhagavad gita,0.000030,0.000000


In [43]:
# Also remove books whose cleaned title matches one of my liked titles; this
# catches entries that share a title but have a different book id.
title_already_liked = book_recs["title_clean"].isin(my_books["mod_title"])
book_recs = book_recs.loc[~title_already_liked]

In [44]:
# Keep only books whose mean rating among similar users is at least 4.
book_recs = book_recs.loc[book_recs["mean"].ge(4)]

In [45]:
# Keep only books that more than two similar users interacted with.
book_recs = book_recs.loc[book_recs["count"].gt(2)]

In [46]:
# Rank the remaining candidates by mean rating, best first.
# NOTE(review): the "score" column is computed earlier in this notebook but is
# not used for the ordering -- confirm that ranking by "mean" is intended.
top_recs = book_recs.sort_values(by="mean", ascending=False)

In [47]:
top_recs.shape

(38, 10)

In [48]:
def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that opens ``val`` in a new tab.

    ``val`` is converted to text and HTML-escaped (quotes included) before it
    is placed inside the ``href`` attribute, so a value containing ``"``,
    ``<`` or ``&`` cannot terminate the attribute or inject markup into the
    rendered table.
    """
    import html  # local import keeps this cell runnable on its own

    return '<a target="_blank" href="{}">Goodreads</a>'.format(html.escape(str(val), quote=True))

def show_image(val):
    """Return an HTML ``<img>`` tag, 50 pixels wide, that displays ``val``.

    ``val`` is converted to text and HTML-escaped (quotes included) before it
    is placed inside the ``src`` attribute, so a value containing ``"``,
    ``<`` or ``&`` cannot terminate the attribute or inject markup into the
    rendered table.
    """
    import html  # local import keeps this cell runnable on its own

    return '<img src="{}" width=50></img>'.format(html.escape(str(val), quote=True))

In [49]:
# Show the ten best recommendations, rendering the url column as a link and
# the cover_image column as a thumbnail.
column_formatters = {'url': make_clickable, 'cover_image': show_image}
top_recs.head(10).style.format(column_formatters)

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,title_clean,adjusted_count,score
4015,25147754,3,5.0,Between the World and Me,18448,Goodreads,,between the world and me,0.000488,0.002439
4095,2547,4,4.75,The Prophet,145579,Goodreads,,the prophet,0.00011,0.000522
8222,8664353,7,4.714286,"Unbroken: A World War II Story of Survival, Resilience, and Redemption",504961,Goodreads,,unbroken a world war ii story of survival resilience and redemption,9.7e-05,0.000457
1410,144974,3,4.666667,The Velveteen Rabbit,175779,Goodreads,,the velveteen rabbit,5.1e-05,0.000239
8090,83674,3,4.666667,The Bridges of Madison County,57812,Goodreads,,the bridges of madison county,0.000156,0.000726
7422,703292,3,4.666667,The Witch of Blackbird Pond,107549,Goodreads,,the witch of blackbird pond,8.4e-05,0.000391
6354,5,8,4.625,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",1876252,Goodreads,,harry potter and the prisoner of azkaban harry potter 3,3.4e-05,0.000158
972,12996,5,4.6,Othello,242511,Goodreads,,othello,0.000103,0.000474
6807,6,7,4.571429,"Harry Potter and the Goblet of Fire (Harry Potter, #4)",1792561,Goodreads,,harry potter and the goblet of fire harry potter 4,2.7e-05,0.000125
1250,136251,7,4.571429,"Harry Potter and the Deathly Hallows (Harry Potter, #7)",1784684,Goodreads,,harry potter and the deathly hallows harry potter 7,2.7e-05,0.000126
