In [28]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
df = pd.read_csv('50_genre_wr_gt_4.csv')
df.head(3)

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings,weighted_rating
0,J.K. Rowling,Hardcover,"Six years of magic, adventure, and mystery mak...","Adventure,Classics,Magic,Young Adult,Fantasy,F...",https://i.gr-assets.com/images/S/compressed.ph...,439827604.0,9780000000000.0,https://goodreads.com/book/show/10.Harry_Potte...,3342,4.73,923,Harry Potter Collection,31332,4.651838
1,Brandon Sanderson,Paperback,According to mythology mankind used to live in...,"War,Magic,Novels,Adult,Fantasy,Fiction,Science...",https://i.gr-assets.com/images/S/compressed.ph...,575102489.0,9780000000000.0,https://goodreads.com/book/show/11221285-the-w...,530,4.79,741,"The Way of Kings, Part 2",14458,4.625966
2,J.K. Rowling,Hardcover,Harry Potter is leaving Privet Drive for the l...,"Adventure,Classics,Magic,Middle Grade,Fantasy,...",https://i.gr-assets.com/images/S/compressed.ph...,,,https://goodreads.com/book/show/136251.Harry_P...,759,4.62,68974,Harry Potter and the Deathly Hallows,2984351,4.619212


In [3]:
df.shape

(5015, 14)

In [4]:
df['genre'].unique()

array(['Adventure,Classics,Magic,Young Adult,Fantasy,Fiction,Science Fiction Fantasy,Paranormal,Childrens',
       'War,Magic,Novels,Adult,Fantasy,Fiction,Science Fiction Fantasy,Audiobook',
       'Adventure,Classics,Magic,Middle Grade,Fantasy,Fiction,Audiobook,Young Adult,Science Fiction Fantasy,Childrens',
       ...,
       'Urban Fantasy,Magic,Mystery,Romance,Fantasy,Fiction,Paranormal',
       'Classics,European Literature,Nonfiction,History',
       'Adventure,Romance,Fantasy,Fiction,Middle Grade,Young Adult,Paranormal,Supernatural,Childrens'],
      dtype=object)

In [5]:
# Dùng genre là features vector, dùng weighted rating làm giá trị rating ban đầu của user, bất kể khi nào user vote một item nào thì train lại đối với user đấy

In [6]:
books = pd.DataFrame({
    'book_id': np.arange(len(df)) + 1,
    'title': df['title'].values,
    'genre': df['genre'].values,
    'base_rate': df['weighted_rating'].values
})
books.head()

Unnamed: 0,book_id,title,genre,base_rate
0,1,Harry Potter Collection,"Adventure,Classics,Magic,Young Adult,Fantasy,F...",4.651838
1,2,"The Way of Kings, Part 2","War,Magic,Novels,Adult,Fantasy,Fiction,Science...",4.625966
2,3,Harry Potter and the Deathly Hallows,"Adventure,Classics,Magic,Middle Grade,Fantasy,...",4.619212
3,4,Harry Potter and the Order of the Phoenix (Har...,"Adventure,Classics,Magic,Young Adult,Middle Gr...",4.60279
4,5,The Lord of the Rings: The Art of the Fellowsh...,"Classics,Adventure,Fantasy,Fiction,Science Fic...",4.583587


# Top 10 những quyển sách khi user chưa vote

In [7]:
books.sort_values(by='base_rate', ascending=False).head(10)

Unnamed: 0,book_id,title,genre,base_rate
0,1,Harry Potter Collection,"Adventure,Classics,Magic,Young Adult,Fantasy,F...",4.651838
1,2,"The Way of Kings, Part 2","War,Magic,Novels,Adult,Fantasy,Fiction,Science...",4.625966
2,3,Harry Potter and the Deathly Hallows,"Adventure,Classics,Magic,Middle Grade,Fantasy,...",4.619212
3,4,Harry Potter and the Order of the Phoenix (Har...,"Adventure,Classics,Magic,Young Adult,Middle Gr...",4.60279
4,5,The Lord of the Rings: The Art of the Fellowsh...,"Classics,Adventure,Fantasy,Fiction,Science Fic...",4.583587
5,6,The Jesus Storybook Bible: Every Story Whisper...,"Religion,Nonfiction,Childrens,Christian",4.542604
6,7,The Wise Man's Fear,"Adventure,Magic,Adult,Fantasy,Fiction,Science ...",4.535398
7,8,Clockwork Angel; Clockwork Prince; Clockwork P...,"Historical,Urban Fantasy,Romance,Science Ficti...",4.531528
8,9,The Revenge of the Baby-Sat,"Sequential Art,Comics,Graphic Novels,Fiction,H...",4.518708
9,10,The Complete Maus,"Autobiography,War,Sequential Art,Memoir,Histor...",4.516693


In [8]:
def get_book(book_id):
    return books.loc[np.array(book_id) - 1]

In [9]:
def features_vector(corpus):
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(','))
    return vectorizer.fit_transform(corpus).toarray()

In [10]:
def refit(X, y):
    model = SVR(kernel='rbf',degree=50, C=0.5, epsilon=0.01, tol=0.001)
    return model.fit(X, y)

In [11]:
user = books.copy()
user.rename(columns={'base_rate': 'rate'}, inplace=True)
user.head()

Unnamed: 0,book_id,title,genre,rate
0,1,Harry Potter Collection,"Adventure,Classics,Magic,Young Adult,Fantasy,F...",4.651838
1,2,"The Way of Kings, Part 2","War,Magic,Novels,Adult,Fantasy,Fiction,Science...",4.625966
2,3,Harry Potter and the Deathly Hallows,"Adventure,Classics,Magic,Middle Grade,Fantasy,...",4.619212
3,4,Harry Potter and the Order of the Phoenix (Har...,"Adventure,Classics,Magic,Young Adult,Middle Gr...",4.60279
4,5,The Lord of the Rings: The Art of the Fellowsh...,"Classics,Adventure,Fantasy,Fiction,Science Fic...",4.583587


In [12]:
# tạo mảng đánh dấu những quyển sách mà user 1 đã vote
fav_genre = 'Adventure'
mark = np.zeros(user.shape[0], dtype='bool')
genre = user['genre'].apply(lambda s: s.split(','))
isin = lambda x: fav_genre in x

In [13]:
mark[user[list(map(isin, genre))].index] = True
user.loc[mark, 'rate'] = 5

In [14]:
sum(mark)

949

In [15]:
X = features_vector(list(user['genre']))
y = user['rate'].values
X.shape, y.shape

((5015, 50), (5015,))

In [24]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=42)

In [30]:
model = Ridge(alpha=5)
model.fit(xtrain, ytrain)

Ridge(alpha=5)

In [25]:
model = SVR(kernel='rbf',degree=50, C=0.5, epsilon=0.01, tol=0.001)
model.fit(xtrain, ytrain)

SVR(C=0.5, degree=50, epsilon=0.01)

In [31]:
model.score(xtrain, ytrain)

0.9228298102859124

In [32]:
model.score(xtest, ytest)

0.9071030157011197

In [18]:
data_new = user.copy()

In [19]:
data_new.loc[~mark, 'rate'] = model.predict(X[~mark])

In [20]:
data_new[~mark].sort_values(by='rate', ascending=False).head(10)

Unnamed: 0,book_id,title,genre,rate
45,46,Rock Chick Revenge,"Suspense,Romance,Erotica,Action,Adult Fiction,...",4.396919
88,89,Rock Chick Revolution,"Suspense,Romance,Erotica,Action,Adult Fiction,...",4.396919
23,24,Rock Chick Regret,"Suspense,Romance,Action,Adult,Chick Lit,Contem...",4.361766
77,78,Rock Chick Renegade,"Suspense,Romance,Action,Adult,Chick Lit,Fictio...",4.35517
120,121,Rock Chick Reckoning,"Suspense,Romance,Action,Adult,Chick Lit,Fictio...",4.35517
76,77,Kings Rising,"Historical,Romance,Adult,Fantasy,Fiction",4.355031
124,125,Captive Prince: Volume Two,"Historical,Romance,Adult,Fantasy,Fiction",4.355031
84,85,A Charlie Brown Christmas,"Classics,Sequential Art,Comics,Christian,Young...",4.335308
52,53,The Coldest Winter Ever,"Classics,Novels,Young Adult,Cultural,Adult Fic...",4.327491
12,13,Harry Potter Page to Screen: The Complete Film...,"Magic,Adult,Fantasy,Nonfiction,Paranormal,Chil...",4.312906


In [21]:
get_book(46)

book_id                                                     46
title                                       Rock Chick Revenge
genre        Suspense,Romance,Erotica,Action,Adult Fiction,...
base_rate                                             4.425164
Name: 45, dtype: object