# Book Recommender System

This notebook contains the code for EDA and initial modelling for a book recommender system.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three raw tables.
# "Year-Of-Publication" holds a few non-numeric entries, so letting pandas infer
# its type chunk by chunk leaves a mix of ints and strings in one column (and
# presumably triggers the mixed-types DtypeWarning). Reading it as text keeps
# the column uniformly typed.
books = pd.read_csv("data/Books.csv", dtype={"Year-Of-Publication": str})
ratings = pd.read_csv("data/Ratings.csv")
users = pd.read_csv("data/Users.csv")

  books = pd.read_csv("data/Books.csv")


In [3]:
# Preview the first five rows of the ratings table.
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
# Print the (rows, columns) shape of each table, in the order books, ratings, users.
for frame in (books, ratings, users):
    print(frame.shape)

(271360, 8)
(1149780, 3)
(278858, 3)


In [5]:
# Count the missing values in each column of the books table.
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [6]:
# Count the missing values in each column of the users table.
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [7]:
# Count the missing values in each column of the ratings table.
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [8]:
# Count the rows of the books table that exactly repeat an earlier row.
books.duplicated().sum()

0

In [9]:
# Count the rows of the ratings table that exactly repeat an earlier row.
ratings.duplicated().sum()

0

In [10]:
# Count the rows of the users table that exactly repeat an earlier row.
users.duplicated().sum()

0

## Exploratory Data Analysis

In [11]:
# Preview the first five rows of the books DataFrame.
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [12]:
# Inspect the dtype pandas assigned to each column of the books table.
books.dtypes

ISBN                   object
Book-Title             object
Book-Author            object
Year-Of-Publication    object
Publisher              object
Image-URL-S            object
Image-URL-M            object
Image-URL-L            object
dtype: object

In [13]:
# Validate the "Year-Of-Publication" column.
#
# The column should hold a publication year, so any entry that cannot be parsed
# as a number is treated as invalid.
wrong_entries_to_year_of_publication = pd.to_numeric(
    books["Year-Of-Publication"], errors="coerce"
).isna()
print(books[wrong_entries_to_year_of_publication].index)

# Remove the invalid rows using the mask rather than hard-coded index labels:
# the cell then adapts to changes in the data and can be re-run safely
# (dropping labels that are already gone raises KeyError).
books = books[~wrong_entries_to_year_of_publication]

Index([209538, 220731, 221678], dtype='int64')


#### From the above cell, it can be noted that the Year-Of-Publication column should be an integer indicating the year in which the book was published. However, there are 3 rows where the value is incorrect. The last line in the above cell removes those rows.

## Popularity Based Recommender System

In [14]:
# Attach the book metadata to every rating by joining both tables on "ISBN".
# This is an inner join, so ratings whose ISBN has no row in `books` are dropped.
ratings_with_name = pd.merge(ratings, books, how="inner", on="ISBN")

In [15]:
# Number of ratings each title received.
# "Book-Rating" is selected before aggregating so only that column is counted,
# instead of counting every column and then discarding all but one.
num_ratings_df = (
    ratings_with_name.groupby("Book-Title")["Book-Rating"]
    .count()
    .reset_index(name="num_ratings")
)
num_ratings_df

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241063,Ã?Â?lpiraten.,2
241064,Ã?Â?rger mit Produkt X. Roman.,4
241065,Ã?Â?sterlich leben.,1
241066,Ã?Â?stlich der Berge.,3


In [16]:
# Mean rating per title.
# NOTE(review): ratings of 0 are included in the mean; confirm whether 0 is a
# genuine score or a "not rated" placeholder in this dataset.
avg_rating_df = (
    ratings_with_name.groupby("Book-Title")["Book-Rating"]
    .mean()
    .reset_index(name="avg_ratings")
)
avg_rating_df

Unnamed: 0,Book-Title,avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241063,Ã?Â?lpiraten.,0.000000
241064,Ã?Â?rger mit Produkt X. Roman.,5.250000
241065,Ã?Â?sterlich leben.,7.000000
241066,Ã?Â?stlich der Berge.,2.666667


In [17]:
# Join the per-title rating count and mean, keep the titles with at least 250
# ratings, and take the 50 with the highest average.
popular_df = (
    num_ratings_df.merge(avg_rating_df, on="Book-Title")
    .loc[lambda stats: stats["num_ratings"] >= 250]
    .sort_values("avg_ratings", ascending=False)
    .head(50)
)

In [18]:
# List the columns available on the books table.
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [19]:
# Enrich the popular titles with author and cover-image URL from `books`.
#
# Only the statistics columns are taken from `popular_df` before merging, so
# re-running this cell does not collide with the "Book-Author"/"Image-URL-M"
# columns added by a previous run (the merge would suffix them with _x/_y and
# the final column selection would raise KeyError).
popular_df = (
    popular_df[["Book-Title", "num_ratings", "avg_ratings"]]
    .merge(books, on="Book-Title")
    # The merge yields several rows for a title that occurs more than once in
    # `books`; keep only the first of them.
    .drop_duplicates("Book-Title")[
        ["Book-Title", "Book-Author", "Image-URL-M", "num_ratings", "avg_ratings"]
    ]
)

In [20]:
# Round every average rating to one decimal place (Python's built-in round).
popular_df["avg_ratings"] = popular_df["avg_ratings"].map(
    lambda average: round(average, 1)
)

In [21]:
# Switch the cover-image URLs from HTTP to HTTPS.
# Only a leading "http://" scheme is rewritten: a plain replace("http", "https")
# also matches URLs that are already HTTPS (for example when this cell is
# re-run) and turns them into "httpss://...".
popular_df["Image-URL-M"] = popular_df["Image-URL-M"].str.replace(
    r"^http://", "https://", regex=True
)
popular_df.head()

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,num_ratings,avg_ratings
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,https://images.amazon.com/images/P/0439136350....,428,5.9
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,https://images.amazon.com/images/P/0439139597....,387,5.8
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,https://images.amazon.com/images/P/0590353403....,278,5.7
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,https://images.amazon.com/images/P/043935806X....,347,5.5
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,https://images.amazon.com/images/P/0439064872....,556,5.2


In [22]:
import pickle

# Persist the popularity table to disk. The context manager ensures the file
# handle is flushed and closed once the dump finishes.
with open("popular.pkl", "wb") as popular_file:
    pickle.dump(popular_df, popular_file)

## Collaborative filtering based recommender system

In [23]:
# Users with more than 200 ratings in the merged table.
# "Book-Rating" is selected before counting so only that column is aggregated.
ratings_per_user = ratings_with_name.groupby("User-ID")["Book-Rating"].count()
users_with_high_ratings = ratings_per_user[ratings_per_user > 200].index

In [24]:
# Keep only the ratings whose user is listed in `users_with_high_ratings`.
is_selected_user = ratings_with_name["User-ID"].isin(users_with_high_ratings)
filtered_rating = ratings_with_name.loc[is_selected_user]

In [25]:
# Titles with at least 50 ratings within `filtered_rating`.
# "Book-Rating" is selected before counting so only that column is aggregated.
ratings_per_title = filtered_rating.groupby("Book-Title")["Book-Rating"].count()
famous_books = ratings_per_title[ratings_per_title >= 50].index
famous_books

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=706)

In [26]:
# Keep only the ratings whose title is listed in `famous_books`.
is_famous_title = filtered_rating["Book-Title"].isin(famous_books)
final_ratings = filtered_rating.loc[is_famous_title]

In [27]:
# Build the title x user rating matrix: one row per "Book-Title", one column
# per "User-ID", holding the "Book-Rating" values (pivot_table averages any
# repeated title/user pair by default). Title/user pairs without a rating are
# filled with 0.
pt = final_ratings.pivot_table(
    values="Book-Rating", index="Book-Title", columns="User-ID"
).fillna(0)
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of `pt` (one row per book title);
# entry [i][j] compares the rating vectors of titles i and j.
similarity_scores = cosine_similarity(pt)

In [71]:
def recommend(book_name: str, top_n: int = 5) -> None:
    """Print the books most similar to ``book_name``.

    Similarities are read from the module-level ``similarity_scores`` matrix,
    whose rows are addressed by the position of a title in ``pt.index``. The
    title, author and medium cover-image URL of each suggestion are looked up
    in ``books``.

    The result is printed as a list of ``[title, author, image_url]`` lists,
    ordered from most to least similar.

    Args:
        book_name: Title to look up; must match an entry of ``pt.index`` exactly.
        top_n: Maximum number of suggestions to print. Defaults to 5.

    Raises:
        IndexError: If ``book_name`` is not present in ``pt.index``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    matches = np.where(pt.index == book_name)[0]
    if matches.size == 0:
        # Same exception type the bare positional lookup used to raise, but
        # with a message that names the missing title.
        raise IndexError(f"Book {book_name!r} was not found in the pivot table index")
    index_of_book = matches[0]

    # Exclude the queried book by position instead of assuming it sorts first.
    # If another title ties with it for the highest score, skipping "the first
    # hit" would drop that title and recommend the queried book itself.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_scores[index_of_book])
        if position != index_of_book
    ]
    suggestions = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    data = []
    for position, _score in suggestions:
        # A title may occur several times in `books`; keep its first row only.
        temp_df = books[books["Book-Title"] == pt.index[position]].drop_duplicates(
            "Book-Title"
        )
        item = []
        item.extend(temp_df["Book-Title"].tolist())
        item.extend(temp_df["Book-Author"].tolist())
        item.extend(temp_df["Image-URL-M"].tolist())
        data.append(item)

    print(data)


In [72]:
# Example: print the recommendations for the title '1984'.
recommend('1984')

[['Animal Farm', 'George Orwell', 'http://images.amazon.com/images/P/0451526341.01.MZZZZZZZ.jpg'], ["The Handmaid's Tale", 'Margaret Atwood', 'http://images.amazon.com/images/P/0449212602.01.MZZZZZZZ.jpg'], ['Brave New World', 'Aldous Huxley', 'http://images.amazon.com/images/P/0060809833.01.MZZZZZZZ.jpg'], ['The Vampire Lestat (Vampire Chronicles, Book II)', 'ANNE RICE', 'http://images.amazon.com/images/P/0345313860.01.MZZZZZZZ.jpg'], ['The Hours : A Novel', 'Michael Cunningham', 'http://images.amazon.com/images/P/0312243022.01.MZZZZZZZ.jpg']]


In [73]:
# Export the dependencies used by the recommend function.
# Each file is opened in a context manager so its handle is flushed and closed
# as soon as the corresponding dump finishes.
for filename, artifact in (
    ("pt.pkl", pt),
    ("similarity_scores.pkl", similarity_scores),
    ("books.pkl", books),
):
    with open(filename, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)