## Bookify: The Ultimate Book Recommendation Application With Data-Driven Intelligence

### Importing the libraries

In [60]:
import pandas as pd 
import numpy as np
import pickle

In [61]:
import warnings
warnings.filterwarnings('ignore')

In [62]:
books = pd.read_csv('Cleaned Data/cleaned_books.csv')
users = pd.read_csv('Cleaned Data/cleaned_users.csv')
ratings = pd.read_csv('Cleaned Data/cleaned_ratings.csv')

## Collaborative filtering system

### 1. cosine similarity based approach

In [63]:
ratings_with_name = ratings.merge(books,on='ISBN')

In [64]:
# Keep only users with more than 200 non-null 'Book-Rating' rows in ratings_with_name

x = ratings_with_name.groupby('User-ID')['Book-Rating'].count() > 200
users_to_consider = x.index[x.values]

In [65]:
filtered_rating = ratings_with_name[ratings_with_name['User-ID'].isin(users_to_consider)]

In [66]:
# Keep only titles with at least 50 non-null 'Book-Rating' rows among the retained users

y = filtered_rating.groupby('Book-Title')['Book-Rating'].count() >= 50
famous_books = y.index[y.values]

In [67]:
final_ratings = filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]

In [68]:
final_ratings

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
63,278418,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
65,3363,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
66,7158,0446520802,10,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
69,11676,0446520802,10,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
74,23768,0446520802,6,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
...,...,...,...,...,...,...,...,...,...,...
1026724,266865,0531001725,10,The Catcher in the Rye,Jerome David Salinger,1973,Scholastic Library Pub,http://images.amazon.com/images/P/0531001725.0...,http://images.amazon.com/images/P/0531001725.0...,http://images.amazon.com/images/P/0531001725.0...
1027923,269566,0670809381,0,Echoes,Maeve Binchy,1986,Penguin USA,http://images.amazon.com/images/P/0670809381.0...,http://images.amazon.com/images/P/0670809381.0...,http://images.amazon.com/images/P/0670809381.0...
1028777,271284,0440910927,0,The Rainmaker,John Grisham,1995,Island,http://images.amazon.com/images/P/0440910927.0...,http://images.amazon.com/images/P/0440910927.0...,http://images.amazon.com/images/P/0440910927.0...
1029070,271705,B0001PIOX4,0,Fahrenheit 451,Ray Bradbury,1993,Simon &amp; Schuster,http://images.amazon.com/images/P/B0001PIOX4.0...,http://images.amazon.com/images/P/B0001PIOX4.0...,http://images.amazon.com/images/P/B0001PIOX4.0...


In [69]:
pt = final_ratings.pivot_table(index='Book-Title',columns='User-ID',values='Book-Rating')

In [70]:
# Missing (book, user) pairs become 0; rebind instead of mutating in place
pt = pt.fillna(0)

In [71]:
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
from sklearn.metrics.pairwise import cosine_similarity

In [73]:
# Computing the cosine similarity of each book (row of pt) with every other book

similarity_scores = cosine_similarity(pt)

In [74]:
similarity_scores.shape

(706, 706)

In [75]:
# Checking for the book "A Bend in the Road"

In [76]:
index = np.where(pt.index=="A Bend in the Road")[0][0]
index

4

In [77]:
# NOTE(review): row 231 is inspected here, but the lookup above returned 4 for
# "A Bend in the Road", and entry 4 of this row is ~0.06 rather than a
# self-similarity of 1.0 — this looks like a different book's row; confirm
# whether `similarity_scores[index]` was intended.
similarity_scores[231]

array([0.07574414, 0.02712118, 0.06380993, 0.0523528 , 0.060984  ,
       0.09178516, 0.01830553, 0.03574664, 0.06670845, 0.04938934,
       0.09499683, 0.        , 0.06156482, 0.04424727, 0.10927927,
       0.17378867, 0.07029547, 0.03878884, 0.05149406, 0.09160219,
       0.        , 0.0331026 , 0.06669328, 0.03494163, 0.06657921,
       0.09148399, 0.1559416 , 0.09809101, 0.10905172, 0.06721713,
       0.07854089, 0.09708723, 0.        , 0.02885384, 0.08809253,
       0.06763676, 0.1320828 , 0.03728567, 0.05338959, 0.03140194,
       0.04190882, 0.0739393 , 0.06367465, 0.02475263, 0.0395123 ,
       0.07518124, 0.        , 0.10207517, 0.06245333, 0.11748705,
       0.0748842 , 0.0840645 , 0.06603795, 0.        , 0.12658799,
       0.08328517, 0.11262774, 0.04184236, 0.03682347, 0.08302738,
       0.03636795, 0.06665491, 0.02537406, 0.06104088, 0.01620832,
       0.04251403, 0.03986177, 0.0144452 , 0.02230074, 0.07641461,
       0.        , 0.02867282, 0.07179908, 0.04860278, 0.08965

In [78]:
# Pair every score in row 231 with its column position.
# NOTE(review): 231 is not the position found above for "A Bend in the Road"
# (that was 4) — confirm which book this row is meant to show.
list(enumerate(similarity_scores[231]))

[(0, 0.0757441443938586),
 (1, 0.027121182330812378),
 (2, 0.06380992673357845),
 (3, 0.05235279711652995),
 (4, 0.06098399656223388),
 (5, 0.09178515589337805),
 (6, 0.018305531448118514),
 (7, 0.03574664291571256),
 (8, 0.06670844674022552),
 (9, 0.04938933853022259),
 (10, 0.09499682631961476),
 (11, 0.0),
 (12, 0.06156482483855114),
 (13, 0.044247274030800865),
 (14, 0.10927927490861332),
 (15, 0.17378866641782403),
 (16, 0.07029547062308994),
 (17, 0.03878884245409791),
 (18, 0.0514940594254751),
 (19, 0.09160219022843828),
 (20, 0.0),
 (21, 0.03310260312720459),
 (22, 0.0666932785457436),
 (23, 0.03494162660409004),
 (24, 0.06657920855240111),
 (25, 0.09148399166565378),
 (26, 0.1559415990017713),
 (27, 0.09809100629826661),
 (28, 0.10905172494902898),
 (29, 0.0672171336931463),
 (30, 0.0785408899781545),
 (31, 0.09708722697898134),
 (32, 0.0),
 (33, 0.02885384176253614),
 (34, 0.08809253408014917),
 (35, 0.06763675565091232),
 (36, 0.13208279924559113),
 (37, 0.03728566565186062

In [79]:
# Sorting the (position, score) pairs as-is would order them by position, so
# the key selects the score instead. After the descending sort, slot 0 is
# presumably the query book itself, hence the [1:6] slice for the next five.
# The row is selected with the `index` looked up above instead of repeating
# its value as a literal.

similar_items = sorted(enumerate(similarity_scores[index]), key=lambda pair: pair[1], reverse=True)[1:6]
similar_items

[(24, 0.29562611831779734),
 (564, 0.29142141649839076),
 (418, 0.26043350404519483),
 (43, 0.25099279007642467),
 (186, 0.24631687323985016)]

In [80]:
# Translate each (position, score) pair back into its book title
for book_pos, _score in similar_items:
    print(pt.index[book_pos])

A Walk to Remember
The Last Time They Met : A Novel
Sea Glass: A Novel
Angels
Family Album


In [81]:
# Function for recommending books

def recommend(book_name, top_n=5, pivot=None, scores=None):
    """Print the titles whose similarity rows rank highest for ``book_name``.

    Parameters
    ----------
    book_name : str
        Title to look up in the index of the pivot table.
    top_n : int, default 5
        Number of titles to print.
    pivot : DataFrame, optional
        Table whose index holds the book titles. Defaults to the
        notebook-level ``pt``.
    scores : 2-D array, optional
        Square similarity matrix aligned with ``pivot.index``. Defaults to the
        notebook-level ``similarity_scores``.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``pivot.index``.
    """
    # Fall back to the notebook-level objects when no override is supplied
    pivot = pt if pivot is None else pivot
    scores = similarity_scores if scores is None else scores

    # index fetch, with an explicit message instead of a bare [0][0] failure
    matches = np.where(pivot.index == book_name)[0]
    if len(matches) == 0:
        raise IndexError(f"{book_name!r} is not a title in the pivot table")
    index = matches[0]

    # enumerate pairs every score with its row position; rank by score, descending
    ranked = sorted(enumerate(scores[index]), key=lambda pair: pair[1], reverse=True)

    # Drop the query book by position rather than assuming it sorted first:
    # a tied or identical row could otherwise push the query into the results.
    similar_items = [pair for pair in ranked if pair[0] != index][:top_n]

    for i in similar_items:
        print(pivot.index[i[0]])


In [82]:
 recommend("A Bend in the Road")

A Walk to Remember
The Last Time They Met : A Novel
Sea Glass: A Novel
Angels
Family Album


### 2. Matrix Factorization

<!-- To ensure statistical significance, users with fewer than 200 ratings and books with fewer than 50 ratings are excluded. -->

In [83]:
# Consider only users who have rated at least 200 books. The filtered rows go
# into a new frame so the `ratings` table loaded at the top is left intact and
# earlier cells see the same data if they are re-run.

MIN_USER_RATINGS = 200
counts = ratings['User-ID'].value_counts()
active_ratings = ratings[ratings['User-ID'].isin(counts[counts >= MIN_USER_RATINGS].index)]

# Attach book metadata by ISBN and drop the columns not used further down
combine_book_rating = pd.merge(active_ratings, books, on='ISBN')
columns = ['Year-Of-Publication', 'Publisher', 'Image-URL-S','Image-URL-L']
combine_book_rating = combine_book_rating.drop(columns=columns)
combine_book_rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Image-URL-M
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...


In [84]:
# Discard rows without a title, then count the non-null ratings per title
combine_book_rating = combine_book_rating.dropna(subset=['Book-Title'])

ratings_by_title = combine_book_rating.groupby('Book-Title')['Book-Rating']
book_ratingCount = ratings_by_title.count().reset_index(name='totalRatingCount')
book_ratingCount.head()

Unnamed: 0,Book-Title,totalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [85]:
# Left-join the per-title counts back onto every rating row
rating_with_totalRatingCount = pd.merge(
    combine_book_rating,
    book_ratingCount,
    how='left',
    on='Book-Title',
)
rating_with_totalRatingCount.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Image-URL-M,totalRatingCount
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82


In [86]:
# Keep only the rows of books that have at least 50 ratings

popularity_threshold = 50
is_popular = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_book = rating_with_totalRatingCount[is_popular]
rating_popular_book.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Image-URL-M,totalRatingCount
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82


In [87]:
# Left-join the columns of `users` onto each rating row, then discard 'Age'
user_rating = pd.merge(rating_popular_book, users, how='left', on='User-ID')
user_rating = user_rating.drop(columns='Age')
user_rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Image-URL-M,totalRatingCount,Location
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82,"gilbert, arizona, usa"
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82,"knoxville, tennessee, usa"
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82,"n/a, n/a, n/a"
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82,"byron, minnesota, usa"
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,http://images.amazon.com/images/P/002542730X.0...,82,"cordova, tennessee, usa"


In [88]:
user_rating = user_rating.drop_duplicates(['User-ID', 'Book-Title'])

In [89]:
# Creating pivot table

In [90]:
user_rating_pivot1 = user_rating.pivot(index = 'User-ID', columns = 'Book-Title', values = 'Book-Rating').fillna(0)
user_rating_pivot1.head()

Book-Title,1984,1st to Die: A Novel,2nd Chance,4 Blondes,84 Charing Cross Road,A Bend in the Road,A Case of Need,"A Child Called \It\"": One Child's Courage to Survive""",A Civil Action,A Cry In The Night,...,Winter Solstice,Wish You Well,Without Remorse,"Wizard and Glass (The Dark Tower, Book 4)",Wuthering Heights,Year of Wonders,You Belong To Me,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw"""
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2276,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2766,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0
3363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
user_rating_pivot1.shape

(894, 746)

In [92]:
X = user_rating_pivot1.values.T
X.shape

(746, 894)

In [93]:
import sklearn
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components=12, random_state=17)
matrix = SVD.fit_transform(X)
matrix.shape

(746, 12)

In [94]:
import warnings
warnings.filterwarnings("ignore",category =RuntimeWarning)

# Forming a correlation matrix

corr = np.corrcoef(matrix)
corr.shape

(746, 746)

In [95]:
# Find the column position of "A Bend in the Road" among the pivot's titles.
# NOTE(review): the name `coffey_hands` does not describe this title; it is
# left as-is here because later cells reference it — consider renaming it
# everywhere in one pass.
book_title = user_rating_pivot1.columns
book_list = list(book_title)
coffey_hands = book_list.index("A Bend in the Road")
print(coffey_hands)

5


In [96]:
corr_coffey_hands  = corr[coffey_hands]

In [97]:
book_title[corr_coffey_hands>0.90]

Index(['A Bend in the Road', 'A Walk to Remember', 'Sea Glass: A Novel',
       'The Five People You Meet in Heaven',
       'The Last Time They Met : A Novel'],
      dtype='object', name='Book-Title')

### 3. k-Nearest Neighbors (kNN)

In [98]:
# Implementing kNN

In [99]:
# Finding the Nearest Neighbors

In [100]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Books as rows, users as columns. A missing (book, user) pair becomes 0 so
# every row is a complete vector when distances between books are computed.
user_rating_pivot2 = (
    user_rating
    .pivot(index='Book-Title', columns='User-ID', values='Book-Rating')
    .fillna(0)
)

# Sparse (CSR) copy of the pivot values for more efficient calculations
user_rating_matrix = csr_matrix(user_rating_pivot2.values)

# Brute-force nearest-neighbour search using the cosine metric
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(user_rating_matrix)

In [101]:
# Test our model and make some recommendations:

In [102]:
user_rating_pivot2.shape

(746, 894)

In [103]:
np.where(user_rating_pivot2.index=="A Bend in the Road")[0][0]

5

In [104]:
# Function to recommend books

def book_recommend(bk_name, top_n=5, pivot=None, model=None):
    """Print the titles of the nearest neighbours of ``bk_name``.

    Parameters
    ----------
    bk_name : str
        Title to look up in the index of the pivot table.
    top_n : int, default 5
        Maximum number of neighbouring titles to print.
    pivot : DataFrame, optional
        Table with one row per book title. Defaults to the notebook-level
        ``user_rating_pivot2``.
    model : object, optional
        Fitted neighbour model exposing ``kneighbors(X, n_neighbors=...)``.
        Defaults to the notebook-level ``model_knn``.

    Raises
    ------
    IndexError
        If ``bk_name`` is not present in ``pivot.index``.
    """
    # Fall back to the notebook-level objects when no override is supplied
    pivot = user_rating_pivot2 if pivot is None else pivot
    model = model_knn if model is None else model

    # index fetch, with an explicit message instead of a bare [0][0] failure
    matches = np.where(pivot.index == bk_name)[0]
    if len(matches) == 0:
        raise IndexError(f"{bk_name!r} is not a title in the pivot table")
    book_id = matches[0]

    # Ask for one extra neighbour because the query row is normally returned
    # as its own nearest neighbour; never ask for more rows than the table has.
    n_neighbors = min(top_n + 1, len(pivot.index))
    query = pivot.iloc[book_id, :].values.reshape(1, -1)
    _, suggestion = model.kneighbors(query, n_neighbors=n_neighbors)

    # Remove the query by position instead of blindly skipping the first hit,
    # so an identical row ranked ahead of it cannot make the query print itself.
    neighbour_ids = [pos for pos in suggestion[0] if pos != book_id][:top_n]
    for pos in neighbour_ids:
        print(pivot.index[pos])

In [105]:
book_recommend("A Bend in the Road")

Nights in Rodanthe
A Walk to Remember
Sea Glass: A Novel
The Last Time They Met : A Novel
Family Album


In [106]:
books_name = user_rating_pivot2.index

In [107]:
books_name=books_name.insert(0," ")

In [108]:
# Persist the fitted model and its lookup tables. Each file is opened in a
# `with` block so the handle is flushed and closed once the dump finishes.
artifacts = {
    'model_knn.pkl': model_knn,
    'books_name.pkl': books_name,
    # NOTE(review): 'user_rating.pkl' receives the pivot table, not the
    # `user_rating` frame — kept exactly as before; confirm what the consumer
    # of this file expects.
    'user_rating.pkl': user_rating_pivot2,
    'user_rating_pivot2.pkl': user_rating_pivot2,
}
for file_name, artifact in artifacts.items():
    with open(file_name, 'wb') as handle:
        pickle.dump(artifact, handle)