In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('./goodreads_reviews_comics_graphic.csv')[['user_id', 'book_id', 'rating']]
map_title = pd.read_csv('./goodreads_books_comics_graphic.csv')[['book_id', 'title', 'url']]

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 542338 entries, 0 to 542337
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  542338 non-null  object
 1   book_id  542338 non-null  int64 
 2   rating   542338 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 12.4+ MB


In [4]:
ratings = df[0: 200000]

In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  200000 non-null  object
 1   book_id  200000 non-null  int64 
 2   rating   200000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.6+ MB


In [6]:
ratings_rmv_duplicates = ratings.drop_duplicates()

In [7]:
unwanted_users = ratings_rmv_duplicates.groupby('user_id')['user_id'].count()

In [8]:
unwanted_users.shape

(18963,)

In [9]:
unwanted_users = unwanted_users[unwanted_users < 3]
unwanted_users.shape

(10563,)

In [10]:
unwanted_ratings = ratings_rmv_duplicates[ratings_rmv_duplicates.user_id.isin(unwanted_users.index)]
unwanted_ratings.shape

(13416, 3)

In [11]:
new_ratings = ratings_rmv_duplicates.drop(unwanted_ratings.index)
new_ratings.shape

(186584, 3)

In [12]:
new_ratings['title'] = map_title.set_index('book_id').title.loc[new_ratings.book_id].values

In [13]:
new_ratings.head()

Unnamed: 0,user_id,book_id,rating,title
1,bafc2d50014200cda7cb2b6acd60cd73,6315584,4,Spider-Man: The Darkest Hours
2,bafc2d50014200cda7cb2b6acd60cd73,29847729,4,Jim Butcher's Dresden Files: Wild Card #1
3,bafc2d50014200cda7cb2b6acd60cd73,18454118,5,"Deadlock, Vol. 1"
4,bafc2d50014200cda7cb2b6acd60cd73,2239435,4,"All-Star Batman and Robin, the Boy Wonder, Vol. 1"
5,bafc2d50014200cda7cb2b6acd60cd73,13094398,3,"Black Butler, Vol. 12 (Black Butler, #12)"


In [14]:
rating_book_count = pd.DataFrame(new_ratings.groupby('title')['rating'].count())
rating_book_count.head()

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
"""Any Grooming Hints for Your Fans, Rollie?""",1
"""Bob's"" Favorite Comics",1
"""Fringe""",1
"""How Come Boys Get to Keep Their Noses?"": Women and Jewish American Identity in Contemporary Graphic Memoirs",1
"""Suki"" to Ienai",1


In [15]:
bookmat = new_ratings.pivot_table(index='user_id', columns='title', values='rating')

In [16]:
bookmat.head()

title,"""Any Grooming Hints for Your Fans, Rollie?""","""Bob's"" Favorite Comics","""Fringe""","""How Come Boys Get to Keep Their Noses?"": Women and Jewish American Identity in Contemporary Graphic Memoirs","""Suki"" to Ienai",#01 Sherlock Holmes and a Scandal in Bohemia,#01 The Drained Brains Caper,#02 The Planet of the Firebird,#1 Believe Your Eyes,#1 Freedom!,...,"피노키오 1 (Pinocchio, #1)","하백의 신부 [Bride of the Water God], Volume 10","하백의 신부 [Bride of the Water God], Volume 11","하백의 신부 [Bride of the Water God], Volume 12","하백의 신부 [Bride of the Water God], Volume 14","하백의 신부 [Bride of the Water God], Volume 8","하백의 신부 [Bride of the Water God], Volume 9",Ｌ・ＤＫ（１０） (講談社コミックス別冊フレンド),Ｌ・ＤＫ（１２）,ＳＡＭＵＲＡＩ　ＤＥＥＰＥＲ　ＫＹＯ（１）
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0006260f85929db85eddee3a0bd0e504,,,,,,,,,,,...,,,,,,,,,,
0008931c0cde961e9c802c5a58196d23,,,,,,,,,,,...,,,,,,,,,,
000b9da55af4420915d408f205919d29,,,,,,,,,,,...,,,,,,,,,,
001010815d3b2692435dfc70285d06e4,,,,,,,,,,,...,,,,,,,,,,
002a023d3de233b4bd3ec4fc3e9c581a,,,,,,,,,,,...,,,,,,,,,,


In [19]:
def get_similar(title, mat):
    title_user_ratings = mat[title]
    similar_to_title = mat.corrwith(title_user_ratings, method='pearson')
    corr_title = pd.DataFrame(similar_to_title, columns=['correlation'])
    corr_title['rating_count'] = rating_book_count['rating']
    corr_title.dropna(inplace=True)
    corr_title[corr_title['rating_count']>50].sort_values('correlation', ascending=False, inplace=True)
    return corr_title

In [20]:
smlr = get_similar('12 Terrors of Christmas', bookmat)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corr_title[corr_title['rating_count']>50].sort_values('correlation', ascending=False, inplace=True)


In [123]:
smlr.head(5)

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
12 Terrors of Christmas,1.0
Blankets,1.0
"Buddha, Vol. 1: Kapilavastu (Buddha #1)",1.0
The Sculptor,1.0
"Blood Work (The Hollows Graphic Novel, #1)",1.0
