# Preprocessing

In [2]:
import pandas as pd

In [3]:
#reading csv file into dataset
books = pd.read_csv("books.csv")

In [4]:
#printing first 5 rows
books.head()

Unnamed: 0.1,Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,title_lower,categories,thumbnail,description,published_year,num_pages.1
0,0,21,A Short History of Nearly Everything,Bill Bryson,4.21,076790818X,9780767908184,eng,544,248558,9396,9/14/2004,Broadway Books,a short history of nearly everything,Science,http://books.google.com/books/content?id=YjAnf...,The author of A Walk in the Woods traces the B...,2004.0,544.0
1,1,21,A Short History of Nearly Everything,Bill Bryson,4.21,076790818X,9780767908184,eng,544,248558,9396,9/14/2004,Broadway Books,a short history of nearly everything,Science,,In this book Bill Bryson explores the most int...,2003.0,545.0
2,2,3870,A Short History of Nearly Everything,Bill Bryson,4.21,0767923227,9780767923224,eng,624,352,37,11/1/2005,Broadway Books,a short history of nearly everything,Science,http://books.google.com/books/content?id=YjAnf...,The author of A Walk in the Woods traces the B...,2004.0,544.0
3,3,3870,A Short History of Nearly Everything,Bill Bryson,4.21,0767923227,9780767923224,eng,624,352,37,11/1/2005,Broadway Books,a short history of nearly everything,Science,,In this book Bill Bryson explores the most int...,2003.0,545.0
4,4,22,Bill Bryson's African Diary,Bill Bryson,3.44,0767915062,9780767915069,eng,55,7270,499,12/3/2002,Broadway Books,bill bryson's african diary,Biography & Autobiography,http://books.google.com/books/content?id=3QiTm...,A travel writer with little background knowled...,2002.0,55.0


In [5]:
#number of rows and columns
books.shape

(3572, 19)

Books dataset has 3572 rows and 19 columns.

In [6]:
#printing the name of columns
books.columns

Index(['Unnamed: 0', 'bookID', 'title', 'authors', 'average_rating', 'isbn',
       'isbn13', 'language_code', '  num_pages', 'ratings_count',
       'text_reviews_count', 'publication_date', 'publisher', 'title_lower',
       'categories', 'thumbnail', 'description', 'published_year',
       'num_pages'],
      dtype='object')

In [7]:
#missing values
books.isna().sum()

Unnamed: 0              0
bookID                  0
title                   0
authors                 0
average_rating          0
isbn                    0
isbn13                  0
language_code           0
  num_pages             0
ratings_count           0
text_reviews_count      0
publication_date        0
publisher               0
title_lower             0
categories             33
thumbnail             159
description            81
published_year          0
num_pages              23
dtype: int64

In [9]:
#value counts of book title
books["title"].value_counts()

title
The Odyssey                      32
One Hundred Years of Solitude    30
A Midsummer Night's Dream        28
The Brothers Karamazov           27
Macbeth                          24
                                 ..
Geek Love                         1
Lord of Light                     1
Manna from Heaven                 1
The Interpreter                   1
You Bright and Risen Angels       1
Name: count, Length: 2287, dtype: int64

Many book titles are repeated therefore duplicates should be removed.

In [10]:
#dropping duplicates 
books = books.drop_duplicates(subset = "title_lower")

In [11]:
#checking if the drop worked
books["title"].value_counts()

title
A Short History of Nearly Everything                1
The Death of Virgil                                 1
Buddha                                              1
Hiroshima                                           1
The Wall                                            1
                                                   ..
Arthur Edward Waite's Quest of the Golden Stairs    1
Main Street                                         1
Babbitt                                             1
The Guns of August                                  1
You Bright and Risen Angels                         1
Name: count, Length: 2287, dtype: int64

In [12]:
#checking how the shape changed after dropping duplicates
books.shape

(2287, 19)

In [13]:
#indexing the dataset
books1 = books.reset_index(drop = True)

In [14]:
#checking if it worked
books1.head()

Unnamed: 0.1,Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,title_lower,categories,thumbnail,description,published_year,num_pages.1
0,0,21,A Short History of Nearly Everything,Bill Bryson,4.21,076790818X,9780767908184,eng,544,248558,9396,9/14/2004,Broadway Books,a short history of nearly everything,Science,http://books.google.com/books/content?id=YjAnf...,The author of A Walk in the Woods traces the B...,2004.0,544.0
1,4,22,Bill Bryson's African Diary,Bill Bryson,3.44,0767915062,9780767915069,eng,55,7270,499,12/3/2002,Broadway Books,bill bryson's african diary,Biography & Autobiography,http://books.google.com/books/content?id=3QiTm...,A travel writer with little background knowled...,2002.0,55.0
2,6,24,In a Sunburned Country,Bill Bryson,4.07,0767903862,9780767903868,eng,335,72451,4245,5/15/2001,Broadway Books,in a sunburned country,True Crime,http://books.google.com/books/content?id=7ZELq...,The author takes readers on a tour of the land...,2001.0,335.0
3,7,28,Notes from a Small Island,Bill Bryson,3.91,0380727501,9780380727506,eng,324,80609,3301,5/28/1997,William Morrow Paperbacks,notes from a small island,Travel,http://books.google.com/books/content?id=rTXB6...,"""Suddenly, in the space of a moment, I realize...",1997.0,324.0
4,8,61,The Changeling,Zilpha Keatley Snyder,4.17,0595321801,9780595321803,eng,228,1176,96,6/8/2004,iUniverse,the changeling,Fiction,http://books.google.com/books/content?id=5jE31...,FICTION-ROMANCE/GOTHIC,1990.0,369.0


In [15]:
# selecting only the relevant columns
books1 = books1[["title", "authors", "average_rating", "title_lower", "categories", "description", "thumbnail", "published_year"]]

# Combined Features column

In [16]:
def combine_features(data): 
    data['combined_features'] = data[['title_lower', 'authors', 'categories', 'description']].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)
    return data

In [18]:
combine_features(books1)

Unnamed: 0,title,authors,average_rating,title_lower,categories,description,thumbnail,published_year,combined_features
0,A Short History of Nearly Everything,Bill Bryson,4.21,a short history of nearly everything,Science,The author of A Walk in the Woods traces the B...,http://books.google.com/books/content?id=YjAnf...,2004.0,"a short history of nearly everything,Bill Brys..."
1,Bill Bryson's African Diary,Bill Bryson,3.44,bill bryson's african diary,Biography & Autobiography,A travel writer with little background knowled...,http://books.google.com/books/content?id=3QiTm...,2002.0,"bill bryson's african diary,Bill Bryson,Biogra..."
2,In a Sunburned Country,Bill Bryson,4.07,in a sunburned country,True Crime,The author takes readers on a tour of the land...,http://books.google.com/books/content?id=7ZELq...,2001.0,"in a sunburned country,Bill Bryson,True Crime,..."
3,Notes from a Small Island,Bill Bryson,3.91,notes from a small island,Travel,"""Suddenly, in the space of a moment, I realize...",http://books.google.com/books/content?id=rTXB6...,1997.0,"notes from a small island,Bill Bryson,Travel,""..."
4,The Changeling,Zilpha Keatley Snyder,4.17,the changeling,Fiction,FICTION-ROMANCE/GOTHIC,http://books.google.com/books/content?id=5jE31...,1990.0,"the changeling,Zilpha Keatley Snyder,Fiction,F..."
...,...,...,...,...,...,...,...,...,...
2282,The Passion of Jesus Christ,John Piper,4.23,the passion of jesus christ,Religion,Fifty Reasons Why Jesus Came to Die The most i...,http://books.google.com/books/content?id=5eLRw...,2004.0,"the passion of jesus christ,John Piper,Religio..."
2283,Wartime Lies,Louis Begley,3.71,wartime lies,Fiction,"A haunting, unforgettable novel about an orpha...",http://books.google.com/books/content?id=V-aLD...,1997.0,"wartime lies,Louis Begley,Fiction,A haunting, ..."
2284,Undaunted Courage,Stephen E. Ambrose/Barrett Whitener,4.21,undaunted courage,Explorers,'This was much more than a bunch of guys out o...,http://books.google.com/books/content?id=1egIO...,2003.0,"undaunted courage,Stephen E. Ambrose/Barrett W..."
2285,Whores for Gloria,William T. Vollmann,3.69,whores for gloria,Fiction,From the acclaimed author of The Rainbow Stori...,http://books.google.com/books/content?id=mQA-P...,1994.0,"whores for gloria,William T. Vollmann,Fiction,..."


In [19]:
#checking if the combined_feature column was added to the dataset
books1.head()

Unnamed: 0,title,authors,average_rating,title_lower,categories,description,thumbnail,published_year,combined_features
0,A Short History of Nearly Everything,Bill Bryson,4.21,a short history of nearly everything,Science,The author of A Walk in the Woods traces the B...,http://books.google.com/books/content?id=YjAnf...,2004.0,"a short history of nearly everything,Bill Brys..."
1,Bill Bryson's African Diary,Bill Bryson,3.44,bill bryson's african diary,Biography & Autobiography,A travel writer with little background knowled...,http://books.google.com/books/content?id=3QiTm...,2002.0,"bill bryson's african diary,Bill Bryson,Biogra..."
2,In a Sunburned Country,Bill Bryson,4.07,in a sunburned country,True Crime,The author takes readers on a tour of the land...,http://books.google.com/books/content?id=7ZELq...,2001.0,"in a sunburned country,Bill Bryson,True Crime,..."
3,Notes from a Small Island,Bill Bryson,3.91,notes from a small island,Travel,"""Suddenly, in the space of a moment, I realize...",http://books.google.com/books/content?id=rTXB6...,1997.0,"notes from a small island,Bill Bryson,Travel,""..."
4,The Changeling,Zilpha Keatley Snyder,4.17,the changeling,Fiction,FICTION-ROMANCE/GOTHIC,http://books.google.com/books/content?id=5jE31...,1990.0,"the changeling,Zilpha Keatley Snyder,Fiction,F..."


In [20]:
#checking how the missing values changed
books1.isna().sum()

title                 0
authors               0
average_rating        0
title_lower           0
categories           21
description          47
thumbnail            77
published_year        0
combined_features     0
dtype: int64

In [22]:
#saving dataset as pickle file
# import pickle

# with open('combined_features_books_final.pkl', 'wb') as f:
#     pickle.dump(books1, f)

books1.to_csv("books_finaldata.csv")

#  Cosine Similarity 

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
#vectorizing and removing stopwords
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books1['combined_features'])

In [26]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [27]:
indices = pd.Series(books1.index, index = books1["title_lower"])
index = indices["the passion of jesus christ"]
print(index)

2282


In [28]:
#mapping the book index to the cosine similarity value
sim_scores = list(enumerate(cosine_sim[index])) 
sim_scores

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.004083679458708559),
 (4, 0.0),
 (5, 0.0),
 (6, 0.010172357198051693),
 (7, 0.017559973367271765),
 (8, 0.01344636028094678),
 (9, 0.012930258841035033),
 (10, 0.012354811007525121),
 (11, 0.0),
 (12, 0.019623210551578535),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.005429641195306451),
 (17, 0.0038352140862484797),
 (18, 0.0),
 (19, 0.0),
 (20, 0.017064964833990746),
 (21, 0.03229207925687702),
 (22, 0.004043882852390592),
 (23, 0.0),
 (24, 0.0),
 (25, 0.001724119621121611),
 (26, 0.00257009844409099),
 (27, 0.0),
 (28, 0.0048918638530524),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.03334156686016867),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.014040598140430021),
 (44, 0.0),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.0),
 (49, 0.0),
 (50, 0.008158530918834294),
 (51, 0.0),
 (52, 0.006338989762296107),
 (53, 0.0),
 (54, 0.0),
 (55, 0.026135068489777574),
 (5

In [29]:
#putting the list in descending order
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores

[(2282, 0.9999999999999999),
 (614, 0.2024591195636038),
 (661, 0.1999369430758783),
 (2281, 0.1918383635562585),
 (1975, 0.1662082966334203),
 (596, 0.12466696313081481),
 (1083, 0.12357371892780582),
 (63, 0.12209572940097062),
 (1710, 0.11131627373020074),
 (1781, 0.10506746489176018),
 (924, 0.0982408991052817),
 (2170, 0.09704002802905111),
 (839, 0.08875474425581317),
 (1796, 0.0884223334378055),
 (1749, 0.08752170730138285),
 (469, 0.0804666862616215),
 (1459, 0.080248940810352),
 (1128, 0.07618743192634073),
 (256, 0.0711966205716764),
 (1840, 0.06905011058702201),
 (598, 0.06802416766966515),
 (1297, 0.06732857812427505),
 (104, 0.06448453476993043),
 (2228, 0.0632284896283246),
 (1418, 0.0630561736256421),
 (1748, 0.06296296965808619),
 (629, 0.06252887740576883),
 (1300, 0.06235636530847847),
 (972, 0.062019037136198864),
 (783, 0.061348049688016236),
 (2246, 0.060699224961933106),
 (1237, 0.0593019327434799),
 (994, 0.05776341211573716),
 (1348, 0.05717982350874313),
 (1525

In [30]:
sim_scores = sim_scores[1:11]

In [31]:
#mapping the index values back to the dataset to see the book title instead of the index
books_indices = [i[0] for i in sim_scores]
books_indices

[614, 661, 2281, 1975, 596, 1083, 63, 1710, 1781, 924]

In [32]:
#printing book titles based on book index
books.iloc[books_indices]['title']

1408                            Jesus Freaks
1465                              Jesus' Son
3566    Suffering and the Sovereignty of God
3228                  Discovering God's Will
1344           The Last Temptation of Christ
2101                            Kingdom Come
158         The Kingdom of God Is Within You
2936                 What Christians Believe
3013             The Gospel According to Job
1866            The Legend of the Poinsettia
Name: title, dtype: object

In [6]:
import pandas as pd

#function to print most similar books based on cosine similarity
def results(insert_book_name, num_recs):
    insert_book_name =insert_book_name.lower()

    # with open("combined_features_books_final.pkl", "rb") as f:
    #     books = pickle.load(f)
    # New way with using CSV
    books = pd.read_csv("books_finaldata.csv")
    
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(books['combined_features'])
    cosine_sim = cosine_similarity(tfidf_matrix)

    if insert_book_name not in books["title_lower"].unique():
        return 'Book not in Database'

    else:
        indices = pd.Series(books.index, index = books["title_lower"])
        index = indices[insert_book_name]
        sim_scores = list(enumerate(cosine_sim[index])) 
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:num_recs+1]
        books_indices = [i[0] for i in sim_scores]
        return books.iloc[books_indices]['title']


In [7]:
#9 most similar books to the "The Passion of Jesus Christ

results("The Passion of Jesus Christ", 9)

614                             Jesus Freaks
661                               Jesus' Son
2281    Suffering and the Sovereignty of God
1975                  Discovering God's Will
596            The Last Temptation of Christ
1083                            Kingdom Come
63          The Kingdom of God Is Within You
1710                 What Christians Believe
1781             The Gospel According to Job
Name: title, dtype: object

By using the results function users can insert the title of the book they like and how many recommendations they want. (In this format results(insert_book_name, num_recs). This function was tested with many other book titles and it was decided that the recommendations are relevant to the given book title.