In [1]:
import numpy as np
import pandas as pd

# Load the dataset
ratings_file = 'goodbooks-10k-data/ratings.csv'
ratings = pd.read_csv(ratings_file)

books_file = 'goodbooks-10k-data/book_data2.csv'
books = pd.read_csv(books_file)

display(ratings.head())
display(books.head())

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


Unnamed: 0,book_id,book_authors,book_desc,book_edition,book_format,book_isbn,book_ pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,book_ price
0,1,Suzanne Collins,Winning will make you famous. Losing means cer...,,Hardcover,9780000000000.0,374.0,4.33,5519135.0,160706.0,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...,374.0
1,2,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,US Edition,Paperback,9780000000000.0,870.0,4.48,2041594.0,33264.0,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...,870.0
2,3,Harper Lee,The unforgettable novel of a childhood in a sl...,50th Anniversary,Paperback,9780000000000.0,324.0,4.27,3745197.0,79450.0,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,https://images.gr-assets.com/books/1361975680l...,324.0
3,4,Stephenie Meyer,About three things I was absolutely positive.F...,,Paperback,9780000000000.0,498.0,3.58,4281268.0,97991.0,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...,https://images.gr-assets.com/books/1361039443l...,498.0
4,5,Markus Zusak,Trying to make sense of the horrors of World W...,First American Edition (US / CAN),Hardcover,9780000000000.0,552.0,4.36,1485632.0,100821.0,The Book Thief,Historical|Historical Fiction|Fiction|Young Adult,https://images.gr-assets.com/books/1522157426l...,552.0


In [4]:
# Merge the two tables then pivot so we have Users X Books dataframe. 
ratings_title = pd.merge(ratings, books[['book_id', 'book_title']], on='book_id' )
user_book_ratings = pd.pivot_table(ratings_title, index='user_id', columns= 'book_title', values='rating')

print('dataset dimensions: ', user_book_ratings.shape, '\n\nSubset example:')
user_book_ratings.iloc[:25, :10]

dataset dimensions:  (53424, 2945) 

Subset example:


book_title,'Salem's Lot,'Tis A Memoir,....Și la sfârșit a mai rămas coșmarul (...And Then The Nightmare Came At Last),09-Nov,11/22/1963,13 Little Blue Envelopes,1776,1984,1Q84,2001: A Space Odyssey
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,,,,3.0,,
2,,,,,,,,,4.0,
3,,,,,,,,,,
4,,,,,,,,,,
5,,,,,,,,,,
6,,,,,,,,,,
7,,,,,,,,,,
8,,,,,,,,,,
9,,,,,,,,3.0,4.0,
10,,,,,,,,3.0,,


In [5]:
# Drop users that have given fewer than 100 ratings of these most-rated books
user_book_ratings = user_book_ratings.dropna(thresh=100)

print('dataset dimensions: ', user_book_ratings.shape, '\n\nSubset example:')
user_book_ratings.iloc[:25, :10]

dataset dimensions:  (14421, 2945) 

Subset example:


book_title,'Salem's Lot,'Tis A Memoir,....Și la sfârșit a mai rămas coșmarul (...And Then The Nightmare Came At Last),09-Nov,11/22/1963,13 Little Blue Envelopes,1776,1984,1Q84,2001: A Space Odyssey
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,,,,3.0,,
4,,,,,,,,,,
7,,,,,,,,,,
9,,,,,,,,3.0,4.0,
10,,,,,,,,3.0,,
24,,,,,,,,,,
28,,,,,,,,,,
31,,,,,,,,,,
32,,,,,,,,,,
35,,,,,,,,,,3.0


In [6]:
from sklearn.decomposition import TruncatedSVD

# replace NaN's with zeroes for Truncated SVD
user_book_ratings_without_nan = user_book_ratings.fillna(0)

tsvd = TruncatedSVD(n_components=200, random_state=42)
user_book_ratings_tsvd = tsvd.fit(user_book_ratings_without_nan).transform(user_book_ratings_without_nan)

print('Original number of features:', user_book_ratings_without_nan.shape[1])
print('Reduced number of features:', user_book_ratings_tsvd.shape[1])
print('Explained variance ratio:', tsvd.explained_variance_ratio_[0:200].sum())

Original number of features: 2945
Reduced number of features: 200
Explained variance ratio: 0.5008407754455055


In [7]:
# view result in a Pandas dataframe, applying the original indices
indices = user_book_ratings.index

book_ratings_for_clustering = pd.DataFrame(data=user_book_ratings_tsvd).set_index(indices)
print('dataset dimensions: ', book_ratings_for_clustering.shape, '\n\nSubset example:')

book_ratings_for_clustering.iloc[:25, :10]

dataset dimensions:  (14421, 200) 

Subset example:


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,16.220778,11.799651,3.655209,0.887518,-3.960578,-1.298969,1.46147,-1.333716,0.81826,-2.898317
4,23.82954,-4.433101,5.951869,-2.052134,-2.49098,-7.452374,-4.116685,-1.158452,-6.119198,2.850747
7,10.437163,-7.601986,5.937097,4.952678,-4.411809,2.314064,-1.11937,-4.075988,-3.723585,1.040645
9,17.894757,3.503459,-2.711384,1.022007,-6.492036,-7.177919,-2.468837,3.882659,-3.876755,-2.720349
10,15.729847,7.662916,2.122101,1.877979,-6.43286,-0.634575,0.633293,-0.888231,1.672112,-2.364288
24,16.208157,7.021813,0.777877,0.012294,-4.894536,-8.450227,0.40181,1.357305,-0.926761,-4.210418
28,16.091915,8.501099,1.392591,0.471256,-2.258694,1.002249,-5.696135,-0.431419,-6.791429,1.739545
31,11.582038,-1.956634,-1.90103,7.052071,-3.59764,-1.498217,-2.32652,-1.643519,-2.578855,-3.107729
32,18.88227,6.149676,3.276511,-4.874912,4.077347,-2.013265,-1.890023,-3.411589,-5.127413,-0.687245
35,13.556431,4.710415,6.17327,-4.082089,-0.40932,1.171046,0.1392,-1.643576,6.593858,-8.851333


In [58]:
from sklearn.model_selection import train_test_split
book_ratings_training, book_ratings_testing = train_test_split(book_ratings_for_clustering, test_size=0.20, random_state=42)

print('Training data shape: ', book_ratings_training.shape)
print('Testing data shape: ', book_ratings_testing.shape)
book_ratings_testing.head()

Training data shape:  (11536, 200)
Testing data shape:  (2885, 200)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27229,19.547949,17.713445,6.787664,2.086802,1.823835,-2.212598,4.337423,-4.190022,-1.638654,-7.385726,...,1.511495,0.152081,2.101824,-1.636865,1.303433,0.505455,-1.815816,-2.472362,3.005418,-2.164375
8667,16.883877,-10.912198,9.340424,-0.597333,-5.001667,-1.777423,-2.356401,1.080788,1.395323,3.372327,...,0.345107,0.139171,1.174737,-1.852491,1.309904,-1.179369,0.87857,0.579809,0.578394,-0.67139
25466,12.866662,0.9581,3.471084,5.479239,-4.585485,1.942287,4.640046,-2.106725,-5.221919,-2.219497,...,0.533507,0.851783,-0.051794,1.630426,1.303638,-1.247364,0.101246,-1.022655,-1.752877,1.250331
18077,20.493929,-2.368544,-0.062209,9.490465,1.394649,-2.756778,-3.534943,3.209413,-6.207719,4.445236,...,3.532888,1.089592,0.366332,1.891617,-1.762708,1.668843,0.005135,1.20695,-3.031102,-0.287719
9285,13.937307,-11.412479,-2.153913,14.224522,8.761112,-3.748852,2.576691,1.122271,-0.351791,2.152324,...,-0.253796,0.744679,0.355653,0.608796,-0.090372,0.592161,-2.213528,0.022517,-1.593686,-0.502372


In [11]:
# find the per-book ratings of the test set
indices = book_ratings_testing.index
test_set_ratings = user_book_ratings.ix[indices]
test_set_ratings.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


book_title,'Salem's Lot,'Tis A Memoir,....Și la sfârșit a mai rămas coșmarul (...And Then The Nightmare Came At Last),09-Nov,11/22/1963,13 Little Blue Envelopes,1776,1984,1Q84,2001: A Space Odyssey,...,زندگی مه آلود پریا,ساق البامبو,سیمای شکسته پدر سالار,سینوهه,شيكاجو,عزازيل,لا تحزن,يوتوبيا,گم شده ای در مه,பொன்னியின் செல்வன் [Ponniyin Selvan]
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27229,,,,,,,,,,,...,,,,,,,,,,4.0
8667,,,,,,,,,,,...,,,,,,,,,,
25466,,,,,,,,,,,...,5.0,4.0,,,,,,,,
18077,,,,,,,,,,,...,4.0,,4.0,,,,,,,
9285,,,,,,,,,,,...,,,,,,,,,,


In [12]:
mean_ratings_for_random_10 = []

# for each user, pick 10 books at random that the reader has rated and get the reader's average score for those books
for index, row in test_set_ratings.iterrows():
    ratings_without_nas = row.dropna()
    random_10 = ratings_without_nas.sample(n=10)
    random_10_mean = random_10.mean()
    mean_ratings_for_random_10.append(random_10_mean)

# get the mean of the users' mean ratings for 10 random books each    
mean_benchmark_rating = sum(mean_ratings_for_random_10) / len(mean_ratings_for_random_10)

print('Mean rating for 10 random books per test user: ', mean_benchmark_rating)

Mean rating for 10 random books per test user:  3.8876949740034736


In [19]:
# trying with the training data after preprocessing 
from sklearn.cluster import KMeans

clusterer_KMeans = KMeans(n_clusters=7).fit(book_ratings_training)
preds_KMeans = clusterer_KMeans.predict(book_ratings_training)

from sklearn.metrics import silhouette_score
kmeans_score = silhouette_score(book_ratings_training, preds_KMeans)
print(kmeans_score)

0.0433968332584411


In [18]:
# trying with the training data after preprocessing 
from sklearn.mixture import GaussianMixture

clusterer_GMM = GaussianMixture(n_components=7).fit(book_ratings_training)
preds_GMM = clusterer_GMM.predict(book_ratings_training)

GMM_score = silhouette_score(book_ratings_training, preds_GMM)
print(GMM_score)

0.016778872313688933


In [20]:
indices = book_ratings_training.index
preds = pd.DataFrame(data=preds_KMeans, columns=['cluster']).set_index(indices)
preds.head()

Unnamed: 0_level_0,cluster
user_id,Unnamed: 1_level_1
375,0
45824,5
37649,4
27560,1
53327,5


In [21]:
# get a list of the highest-rated books for each cluster
def get_cluster_favorites(cluster_number):
    # create a list of cluster members
    cluster_membership = preds.index[preds['cluster'] == cluster_number].tolist()
    # build a dataframe of that cluster's book ratings
    cluster_ratings = user_book_ratings.ix[cluster_membership]
    # drop books that have fewer than 10 ratings by cluster members
    cluster_ratings = cluster_ratings.dropna(axis='columns', thresh=10)
    # find the cluster's mean rating overal and for each book
    means = cluster_ratings.mean(axis=0)
    # sort books by mean rating
    favorites = means.sort_values(ascending=False)
    return favorites

# for each cluster, determine the overall mean rating cluster members have given books
def get_cluster_mean(cluster_number):
    # create a list of cluster members
    cluster_membership = preds.index[preds['cluster'] == cluster_number].tolist()
    # create a version of the original ratings dataset that only includes cluster members
    cluster_ratings = ratings[ratings['user_id'].isin(cluster_membership)]
    # get the mean rating
    return cluster_ratings['rating'].mean()

In [25]:
cluster0_books_storted = get_cluster_favorites(0)
cluster0_mean = get_cluster_mean(0)

print('The cluster 0 mean is:', cluster0_mean)
cluster0_books_storted[0:10]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


The cluster 0 mean is: 3.8818989751461417


book_title
The Complete Poetry and Prose    4.888889
Collide                          4.857143
American Pastoral                4.800000
Hopeless                         4.790123
The Evolution of Mara Dyer       4.734694
Another Roadside Attraction      4.730769
Incarceron                       4.727273
Blood & Spirits                  4.727273
Vampire Kisses: The Beginning    4.714286
Olive Kitteridge                 4.714286
dtype: float64

In [23]:
cluster1_books_storted = get_cluster_favorites(1)
cluster1_mean = get_cluster_mean(1)

print('The cluster 1 mean is:', cluster1_mean)
cluster1_books_storted[0:10]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


The cluster 1 mean is: 3.862520635436118


book_title
The Battle for Skandia           4.838710
The Chronicle of Sapta Sindhu    4.791667
Opal                             4.770270
The Evolution of Mara Dyer       4.754386
The Outlaw Demon Wails           4.750000
Turtles All the Way Down         4.708333
Master of the Game               4.700000
A Touch of Dead                  4.692308
Touch the Dark                   4.681818
Zorba the Greek                  4.666667
dtype: float64

In [24]:
cluster2_books_storted = get_cluster_favorites(2)
cluster2_mean = get_cluster_mean(2)

print('The cluster 2 mean is:', cluster2_mean)
cluster2_books_storted[0:10]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


The cluster 2 mean is: 3.8900369328174462


book_title
Station Eleven                 4.800000
City of Lost Souls             4.793103
Mary Poppins                   4.750000
Tatiana and Alexander          4.727273
Another Roadside Attraction    4.705882
Last Chance to See             4.666667
Out                            4.666667
Hopeless                       4.655556
The Evolution of Mara Dyer     4.640351
The Hobbit: Graphic Novel      4.640000
dtype: float64

In [26]:
cluster3_books_storted = get_cluster_favorites(3)
cluster3_mean = get_cluster_mean(3)

print('The cluster 3 mean is:', cluster3_mean)
cluster3_books_storted[0:10]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


The cluster 3 mean is: 3.815326063809456


book_title
Ascend                           4.857143
Fire and Ice                     4.833333
Temple of the Winds              4.666667
A Touch of Dead                  4.666667
Infinity                         4.636364
The Cat in the Hat Comes Back    4.636364
The Andromeda Strain             4.625000
Hopeless                         4.619718
A Kiss of Shadows                4.619048
The Lords of Discipline          4.619048
dtype: float64

In [27]:
cluster4_books_storted = get_cluster_favorites(4)
cluster4_mean = get_cluster_mean(4)

print('The cluster 4 mean is:', cluster4_mean)
cluster4_books_storted[0:10]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


The cluster 4 mean is: 3.8679799820648557


book_title
House of Leaves          5.000000
All the Pretty Horses    5.000000
Hopeless                 4.852632
Temple of the Winds      4.772727
City of Bones            4.768944
LISEY'S STORY.           4.733333
The Aeneid               4.727273
Bag of Bones             4.700000
The Great Gatsby         4.695062
Airhead                  4.692308
dtype: float64

In [28]:
cluster5_books_storted = get_cluster_favorites(5)
cluster5_mean = get_cluster_mean(5)

print('The cluster 5 mean is:', cluster5_mean)
cluster5_books_storted[0:10]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


The cluster 5 mean is: 3.8507374449820935


book_title
The Battle for Skandia                     4.814815
Opal                                       4.782609
Fantastic Beasts and Where to Find Them    4.758621
All the Pretty Horses                      4.750000
Portnoy's Complaint                        4.733333
The Evolution of Mara Dyer                 4.714286
Night Embrace                              4.700000
Hearts in Atlantis                         4.692308
About a Boy                                4.666667
The Sight                                  4.636364
dtype: float64

In [29]:
cluster6_books_storted = get_cluster_favorites(6)
cluster6_mean = get_cluster_mean(6)

print('The cluster 6 mean is:', cluster6_mean)
cluster6_books_storted[0:10]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


The cluster 6 mean is: 3.9555913932860842


book_title
The Sword of Summer                                     4.900000
Infinity                                                4.833333
Waiting for Godot                                       4.833333
Magic Strikes                                           4.818182
The Battle for Skandia                                  4.764706
Exotic Neurotic                                         4.764706
The Canterbury Tales                                    4.759259
The Power of Now: A Guide to Spiritual Enlightenment    4.727273
Ocean Sea                                               4.714286
The Light Fantastic                                     4.714286
dtype: float64

In [74]:
# associate each test user with a cluster
test_set_preds = clusterer_KMeans.predict(book_ratings_testing)
test_set_indices = book_ratings_testing.index
test_set_clusters = pd.DataFrame(data=test_set_preds, columns=['cluster']).set_index(test_set_indices)

test_set_clusters.head()


Unnamed: 0_level_0,cluster
user_id,Unnamed: 1_level_1
27229,3
8667,5
25466,5
18077,6
9285,6


In [32]:
mean_ratings_for_cluster_favorites = []

# put each cluster's sorted book list in an array to reference
cluster_favorites = [cluster0_books_storted, cluster1_books_storted, cluster2_books_storted, cluster3_books_storted, cluster4_books_storted, cluster5_books_storted, cluster6_books_storted]

# for each user, find the 10 books the reader has rated that are the top-rated books of the cluster. 
# get the reader's average score for those books
for index, row in test_set_ratings.iterrows():
    user_cluster = test_set_clusters.loc[index, 'cluster']
    favorites = cluster_favorites[user_cluster].index
    user_ratings_of_favorites = []
    # proceed in order down the cluster's list of favorite books
    for book in favorites:
        # if the user has given the book a rating, save the rating to a list
        if np.isnan(row[book]) == False:
            user_ratings_of_favorites.append(row[book])
        # stop when there are 10 ratings for the user
        if len(user_ratings_of_favorites) >= 10:
            break
    # get the mean for the user's rating of the cluster's 10 favorite books
    mean_rating_for_favorites = sum(user_ratings_of_favorites) / len(user_ratings_of_favorites)
    mean_ratings_for_cluster_favorites.append(mean_rating_for_favorites)
    
mean_favorites_rating = sum(mean_ratings_for_cluster_favorites) / len(mean_ratings_for_cluster_favorites)

print('Mean rating for 10 random books per test user: ', mean_benchmark_rating)
print('Mean rarting for 10 books that are the cluster\'s favorites: ', mean_favorites_rating)
print('Difference between ratings: ', mean_favorites_rating-mean_benchmark_rating)

Mean rating for 10 random books per test user:  3.8876949740034736
Mean rarting for 10 books that are the cluster's favorites:  4.3735008665511135
Difference between ratings:  0.48580589254763984


In [79]:
import random
def recommend(cluster_assignments, user_id):
    user_cluster = cluster_assignments
    favorites = get_cluster_favorites(user_cluster).index
    favorites = random.choices(favorites, k=9)     
    return favorites
    

# recommendation27229 = recommend(test_set_clusters, user_book_ratings, 27229)
# recommendation31159 = recommend(test_set_clusters, user_book_ratings, 31159)
# recommendation10579 = recommend(test_set_clusters, user_book_ratings, 10579)
# recommendation8667 = recommend(test_set_clusters, user_book_ratings, 8667)

# print('Recommendation for user 27229: ', recommendation27229)
# print('Recommendation for user 31159: ', recommendation31159)
# print('Recommendation for user 10579: ', recommendation10579)
# print('Recommendation for user 8667: ', recommendation8667)

recommendation8667 = recommend(5, 8667)
print(recommendation8667)

['Perfume: The Story of a Murderer', 'Vain', 'Struck By Lightning: The Carson Phillips Journal', 'The Magicians', 'The Republic', 'The Kite Runner', 'We Were the Mulvaneys', 'Tuesdays with Morrie', 'Midnight in the Garden of Good and Evil']


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
