## Bookify: The Ultimate Book Recommendation Application With Data-Driven Intelligence

### Importing the libraries

In [9]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [10]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
books = pd.read_csv('Cleaned Data/cleaned_books.csv')
users = pd.read_csv('Cleaned Data/cleaned_users.csv')
ratings = pd.read_csv('Cleaned Data/cleaned_ratings.csv')

In [12]:
pd.set_option('display.max_colwidth', -1)

# Recommendation systems

## A] Popularity based filtering 

### 1. Using average rating - Top 50 books in whole collection

In [13]:
# merging rating and books

ratings_with_name = ratings.merge(books,on='ISBN')

In [14]:
num_rating_df = ratings_with_name.groupby('Book-Title').count()['Book-Rating'].reset_index()

In [15]:
num_rating_df.rename(columns={'Book-Rating':'num_ratings'},inplace=True)
num_rating_df

Unnamed: 0,Book-Title,num_ratings
0,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, Book 5)",1
4,Beyond IBM: Leadership Marketing and Finance for the 1990s,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [16]:
avg_rating_df = ratings_with_name.groupby('Book-Title').mean()['Book-Rating'].reset_index()
avg_rating_df.rename(columns={'Book-Rating':'avg_ratings'},inplace=True)
avg_rating_df

Unnamed: 0,Book-Title,avg_ratings
0,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, Book 5)",8.000000
4,Beyond IBM: Leadership Marketing and Finance for the 1990s,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667


In [17]:
popular_df = num_rating_df.merge(avg_rating_df,on='Book-Title')

In [18]:
popular_df = popular_df[popular_df['num_ratings']>=250 ].sort_values('avg_ratings',ascending=False).head(50)

In [19]:
popular_df=popular_df.merge(books,on='Book-Title').drop_duplicates('Book-Title')[['Book-Title','Book-Author','Image-URL-M','num_ratings','avg_ratings']]
popular_df.reset_index(inplace=True)
popular_df

Unnamed: 0,index,Book-Title,Book-Author,Image-URL-M,num_ratings,avg_ratings
0,0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.01.MZZZZZZZ.jpg,428,5.852804
1,3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.01.MZZZZZZZ.jpg,387,5.824289
2,5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.01.MZZZZZZZ.jpg,278,5.73741
3,9,Harry Potter and the Order of the Phoenix (Book 5),J. K. Rowling,http://images.amazon.com/images/P/043935806X.01.MZZZZZZZ.jpg,347,5.501441
4,13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064872.01.MZZZZZZZ.jpg,556,5.183453
5,16,The Hobbit : The Enchanting Prelude to The Lord of the Rings,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339681.01.MZZZZZZZ.jpg,281,5.007117
6,17,"The Fellowship of the Ring (The Lord of the Rings, Part 1)",J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339703.01.MZZZZZZZ.jpg,368,4.94837
7,26,Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)),J. K. Rowling,http://images.amazon.com/images/P/059035342X.01.MZZZZZZZ.jpg,575,4.895652
8,28,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339711.01.MZZZZZZZ.jpg,260,4.880769
9,39,To Kill a Mockingbird,Harper Lee,http://images.amazon.com/images/P/0446310786.01.MZZZZZZZ.jpg,510,4.7


In [20]:
popular_df.shape

(50, 6)

### 2. Books popular yearly 

In [21]:
# merging rating and books

ratings_with_name = ratings.merge(books,on='ISBN')

In [22]:
num_rating_df = ratings_with_name.groupby('ISBN').count()['Book-Rating'].reset_index()

In [23]:
num_rating_df.rename(columns={'Book-Rating':'num_ratings'},inplace=True)

In [24]:
avg_rating_df = ratings_with_name.groupby('ISBN').mean()['Book-Rating'].reset_index()
avg_rating_df.rename(columns={'Book-Rating':'avg_ratings'},inplace=True)

In [25]:
popular_df_y = num_rating_df.merge(avg_rating_df,on='ISBN')
popular_df_y = pd.merge(popular_df_y, books, on='ISBN')

In [26]:
popular_df_y=popular_df_y.sort_values('avg_ratings', ascending=False)

In [27]:
years = set()
indices = []
for ind, row in popular_df_y.iterrows():
    if row['Year-Of-Publication'] in years:
        indices.append(ind)
    else:
        years.add(row['Year-Of-Publication'])

popular_df_y = popular_df_y.drop(indices)
popular_df_y = popular_df_y.drop(['num_ratings','avg_ratings'], axis = 1)
popular_df_y = popular_df_y.sort_values('Year-Of-Publication')

In [28]:
popular_df_y

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
166739,0781228956,"Complete Works 10 Volumes [2,6,7,8,9] (Notable American Authors)",Benjamin Franklin,1806,Reprint Services Corp,http://images.amazon.com/images/P/0781228956.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0781228956.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0781228956.01.LZZZZZZZ.jpg
166743,0781268001,"Hugh Wynne, Free Quaker (2 Volumes (BCL1-PS American Literature)",Silas Weir Mitchell,1897,Reprint Services Corp,http://images.amazon.com/images/P/0781268001.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0781268001.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0781268001.01.LZZZZZZZ.jpg
217769,1551103982,The Cycling Adventures of Coconut Head: A North American Odyssey,Ted Schredd,1900,Graphic Arts Center Pub Co,http://images.amazon.com/images/P/1551103982.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/1551103982.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/1551103982.01.LZZZZZZZ.jpg
140778,0671825356,W D HSE PLANTS,Jd Hersey,1901,Simon &amp; Schuster,http://images.amazon.com/images/P/0671825356.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0671825356.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0671825356.01.LZZZZZZZ.jpg
190035,0841499306,"Charlotte Bronte, George Eliot and Jane Austin: Studies in Their Works",Henry H. Bonnell,1902,Folcroft Library Editions,http://images.amazon.com/images/P/0841499306.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0841499306.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0841499306.01.LZZZZZZZ.jpg
...,...,...,...,...,...,...,...,...
269406,9643112136,Dalan-i bihisht (Dastan-i Irani),Nazi Safavi,2010,Intisharat-i Quqnus,http://images.amazon.com/images/P/9643112136.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/9643112136.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/9643112136.01.LZZZZZZZ.jpg
72582,0394172116,"Monkey (An Evergreen Book, E-112)",Cheng-En Wu,2011,Grove Press,http://images.amazon.com/images/P/0394172116.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0394172116.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0394172116.01.LZZZZZZZ.jpg
29005,0307124533,Owl's Amazing but True No. 2,Owl Magazine,2012,Golden Books,http://images.amazon.com/images/P/0307124533.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0307124533.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0307124533.01.LZZZZZZZ.jpg
146139,068107468X,Edgar Allen Poe Collected Poems,Edgar Allan Poe,2020,Bausch &amp; Lombard,http://images.amazon.com/images/P/068107468X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/068107468X.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/068107468X.01.LZZZZZZZ.jpg


## B] Recommendations based on correlations

We use Pearsons’R correlation coefficient to measure the linear correlation between two variables, in our case, the ratings for two books.

First, we need to find out the average rating, and the number of ratings each book received.

In [29]:
average_rating = pd.DataFrame(ratings.groupby('ISBN')['Book-Rating'].mean())
average_rating['ratingCount'] = pd.DataFrame(ratings.groupby('ISBN')['Book-Rating'].count())
average_rating.sort_values('ratingCount', ascending=False).head()

Unnamed: 0_level_0,Book-Rating,ratingCount
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
971880107,1.019584,2502
316666343,4.468726,1295
385504209,4.652322,883
60928336,3.448087,732
312195516,4.334716,723


In this data set, the book that received the most rating counts was not highly rated at all. As a result, if we were to use recommendations based on rating counts, we would definitely make mistakes here. So, we need to have a better system.

To ensure statistical significance, users with less than 200 ratings, and books with less than 100 ratings are excluded.

In [30]:
# Considering only those users which have rated atleast 200 books

counts1 = ratings['User-ID'].value_counts()

ratings_cor = ratings[ratings['User-ID'].isin(counts1[counts1 >= 200].index)]



# Considering only those books which have atleast 100 ratings on them

counts = ratings['Book-Rating'].value_counts()

ratings_cor = ratings_cor[ratings_cor['Book-Rating'].isin(counts[counts >= 100].index)]

In [31]:
# We convert the ratings table to a 2D matrix. The matrix will be sparse because not every user rated every book.

ratings_pivot = ratings_cor.pivot(index='User-ID', columns='ISBN')['Book-Rating']
userID = ratings_pivot.index
ISBN = ratings_pivot.columns
print(ratings_pivot.shape)
ratings_pivot.head()

(905, 207699)


ISBN,0330299891,0375404120,0586045007,9022906116,9032803328,9044922564,9044922572,9044922718,9044923161,904492401X,...,UNGRANDHOMMED,X000000000,"YOUTELLEM,AND",ZR903CX0003,"\0432534220\""""","\2842053052\""""",b00005wz75,cn108465,cn113107,Ô½crosoft
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,,,,,,,,,,,...,,,,,,,,,,
2276,,,,,,,,,,,...,,,,,,,,,,
2766,,,,,,,,,,,...,,,,,,,,,,
2977,,,,,,,,,,,...,,,,,,,,,,
3363,,,,,,,,,,,...,,,,,,,,,,


In [32]:
# Testing 1

In [33]:
# Let’s find out which books are correlated with the 2nd most rated book “The Lovely Bones: A Novel”, '0316666343'.

bones_ratings = ratings_pivot['0316666343']
similar_to_bones = ratings_pivot.corrwith(bones_ratings)
corr_bones = pd.DataFrame(similar_to_bones, columns=['pearsonR'])
corr_bones.dropna(inplace=True)
corr_summary = corr_bones.join(average_rating['ratingCount'])
l=corr_summary[corr_summary['ratingCount']>=300].sort_values('pearsonR', ascending=False).head(10)

In [34]:
books_corr_to_bones = pd.DataFrame(list(l.index), 
                                  index=np.arange(10), columns=['ISBN'])
corr_books = pd.merge(books_corr_to_bones, books, on='ISBN')
corr_books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,316666343,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown",http://images.amazon.com/images/P/0316666343.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0316666343.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0316666343.01.LZZZZZZZ.jpg
1,312291639,The Nanny Diaries: A Novel,Emma McLaughlin,2003,St. Martin's Griffin,http://images.amazon.com/images/P/0312291639.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0312291639.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0312291639.01.LZZZZZZZ.jpg
2,316601950,The Pilot's Wife : A Novel,Anita Shreve,1999,Back Bay Books,http://images.amazon.com/images/P/0316601950.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0316601950.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0316601950.01.LZZZZZZZ.jpg
3,446610038,1st to Die: A Novel,James Patterson,2002,Warner Vision,http://images.amazon.com/images/P/0446610038.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0446610038.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0446610038.01.LZZZZZZZ.jpg
4,446672211,Where the Heart Is (Oprah's Book Club (Paperback)),Billie Letts,1998,Warner Books,http://images.amazon.com/images/P/0446672211.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0446672211.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0446672211.01.LZZZZZZZ.jpg
5,385265700,The Book of Ruth (Oprah's Book Club (Paperback)),Jane Hamilton,1990,Anchor,http://images.amazon.com/images/P/0385265700.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0385265700.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0385265700.01.LZZZZZZZ.jpg
6,345342968,Fahrenheit 451,RAY BRADBURY,1987,Del Rey,http://images.amazon.com/images/P/0345342968.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0345342968.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0345342968.01.LZZZZZZZ.jpg
7,60930535,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0060930535.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0060930535.01.LZZZZZZZ.jpg
8,375707972,The Reader,Bernhard Schlink,1999,Vintage Books USA,http://images.amazon.com/images/P/0375707972.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0375707972.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0375707972.01.LZZZZZZZ.jpg
9,684872153,Angela's Ashes (MMP) : A Memoir,Frank McCourt,1999,Scribner,http://images.amazon.com/images/P/0684872153.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0684872153.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0684872153.01.LZZZZZZZ.jpg


In [35]:
# Testing 2

In [36]:
# Let’s find out which books are correlated with “The Green Mile: Coffey's Hands (Green Mile Series)”, '0451190548'.

bones_ratings = ratings_pivot['0451190548']
similar_to_bones = ratings_pivot.corrwith(bones_ratings)
corr_bones = pd.DataFrame(similar_to_bones, columns=['pearsonR'])
corr_bones.dropna(inplace=True)
corr_summary = corr_bones.join(average_rating['ratingCount'])
l1=corr_summary[corr_summary['ratingCount']>=300].sort_values('pearsonR', ascending=False).head(10)

In [37]:
books_corr_to_bones = pd.DataFrame(list(l1.index), 
                                  index=np.arange(10), columns=['ISBN'])
corr_books = pd.merge(books_corr_to_bones, books, on='ISBN')
corr_books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,345342968,Fahrenheit 451,RAY BRADBURY,1987,Del Rey,http://images.amazon.com/images/P/0345342968.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0345342968.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0345342968.01.LZZZZZZZ.jpg
1,440224764,The Partner,John Grisham,1998,Dell Publishing Company,http://images.amazon.com/images/P/0440224764.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0440224764.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0440224764.01.LZZZZZZZ.jpg
2,375725784,A Heartbreaking Work of Staggering Genius,Dave Eggers,2001,Vintage Books USA,http://images.amazon.com/images/P/0375725784.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0375725784.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0375725784.01.LZZZZZZZ.jpg
3,671003755,She's Come Undone (Oprah's Book Club (Paperback)),Wally Lamb,1996,Washington Square Press,http://images.amazon.com/images/P/0671003755.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0671003755.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0671003755.01.LZZZZZZZ.jpg
4,385720106,A Map of the World,Jane Hamilton,1999,Anchor Books/Doubleday,http://images.amazon.com/images/P/0385720106.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0385720106.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0385720106.01.LZZZZZZZ.jpg
5,385484518,"Tuesdays with Morrie: An Old Man, a Young Man, and Life's Greatest Lesson",MITCH ALBOM,1997,Doubleday,http://images.amazon.com/images/P/0385484518.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0385484518.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0385484518.01.LZZZZZZZ.jpg
6,316776963,Me Talk Pretty One Day,David Sedaris,2001,Back Bay Books,http://images.amazon.com/images/P/0316776963.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0316776963.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0316776963.01.LZZZZZZZ.jpg
7,312195516,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA,http://images.amazon.com/images/P/0312195516.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0312195516.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0312195516.01.LZZZZZZZ.jpg
8,345353145,Sphere,MICHAEL CRICHTON,1988,Ballantine Books,http://images.amazon.com/images/P/0345353145.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0345353145.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0345353145.01.LZZZZZZZ.jpg
9,440241073,The Summons,John Grisham,2002,Dell Publishing Company,http://images.amazon.com/images/P/0440241073.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0440241073.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0440241073.01.LZZZZZZZ.jpg


From above we can see that the recommendation system is not working fine


In [38]:
pickle.dump(popular_df,open('popular_df.pkl','wb'))
pickle.dump(popular_df_y,open('popular_df_y.pkl','wb'))