In [1]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the book descriptions into a DataFrame.
# Adjust the path below so that it points to description.csv on your machine.
book_description = pd.read_csv("description.csv", encoding="latin-1")

In [4]:
# Preview the first rows to confirm the file was read with the expected columns
book_description.head()

Unnamed: 0,book_id,Name,Description
0,4833.0,The Glass Castle,"A Tender, Moving Tale Of Unconditional Love In..."
1,590.0,"Night (The Night Trilogy, #1)","Born Into A Jewish Ghetto In Hungary, As A Chi..."
2,4264.0,"Angela'S Ashes (Frank Mccourt, #1)",Imbued On Every Page With Frank Mccourt'S Asto...
3,3361.0,"Eat, Pray, Love","A Celebrated Writer'S Irresistible, Candid, An..."
4,4535.0,Into Thin Air: A Personal Account Of The Mount...,A Bank Of Clouds Was Assembling On The Not-So-...


In [5]:
# Replace missing descriptions with '' so the vectorizer only receives strings
book_description['Description'] = book_description['Description'].fillna('')
# Vectorizer that ignores common English stop words
books_tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and build the TF-IDF matrix (one row per description)
# that the cosine-similarity step works on
book_description_matrix = books_tfidf.fit_transform(book_description['Description'])

In [6]:
# Shape of the TF-IDF matrix: (number of descriptions, number of distinct
# terms kept after stop-word removal)
book_description_matrix.shape

(143, 4186)

In [7]:
# Shape of the TF-IDF matrix: (number of descriptions, number of distinct terms)
# NOTE(review): this cell repeats the previous cell's shape check verbatim
# and can be deleted.
book_description_matrix.shape

(143, 4186)

In [8]:
# Pairwise cosine similarity between every pair of descriptions.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# returned by sklearn's linear_kernel is already the cosine similarity.
# With a single argument, linear_kernel compares the matrix with itself.
cosine_similarity = linear_kernel(book_description_matrix)

In [9]:
# Inspect the similarity matrix; entry [i, j] is the similarity between
# description i and description j (NumPy elides the middle of large arrays).
print(cosine_similarity)

[[1.         0.03240804 0.021228   ... 0.005362   0.01071911 0.00532216]
 [0.03240804 1.         0.02661665 ... 0.         0.00510551 0.        ]
 [0.021228   0.02661665 1.         ... 0.01079403 0.         0.00563089]
 ...
 [0.005362   0.         0.01079403 ... 1.         0.         0.01875445]
 [0.01071911 0.00510551 0.         ... 0.         1.         0.05737646]
 [0.00532216 0.         0.00563089 ... 0.01875445 0.05737646 1.        ]]


In [10]:
# Print the sparse TF-IDF matrix: each line reads
# (book row, vocabulary term column)  TF-IDF weight of that term in that description
print(book_description_matrix) 

  (0, 3755)	0.13974290015697077
  (0, 2497)	0.16597222927200309
  (0, 3720)	0.1154766075072239
  (0, 3922)	0.1290048945978735
  (0, 2268)	0.08155340553903333
  (0, 1438)	0.19690781001870242
  (0, 1055)	0.12138615840709932
  (0, 2959)	0.13974290015697077
  (0, 1510)	0.1290048945978735
  (0, 1622)	0.1154766075072239
  (0, 315)	0.046145705549065295
  (0, 1476)	0.13974290015697077
  (0, 1063)	0.1290048945978735
  (0, 584)	0.13974290015697077
  (0, 3647)	0.11064815284800204
  (0, 2211)	0.12215375666198071
  (0, 3758)	0.1154766075072239
  (0, 2075)	0.27948580031394155
  (0, 4044)	0.38701468379362053
  (0, 1709)	0.06069307920354966
  (0, 2710)	0.15454412498584177
  (0, 1884)	0.06069307920354966
  (0, 3631)	0.06450244729893675
  (0, 2581)	0.06987145007848539
  (0, 952)	0.06987145007848539
  :	:
  (142, 2262)	0.1688824115538687
  (142, 1462)	0.1688824115538687
  (142, 3942)	0.12383683795518675
  (142, 759)	0.14485130153549752
  (142, 4071)	0.14485130153549752
  (142, 1093)	0.14485130153549752
 

In [11]:
# Keep only the book titles, as a one-column DataFrame, for the title lookup
df = book_description
df1 = df.loc[:, ["Name"]]

print(df1)

                                                  Name
0                                    The Glass Castle 
1                       Night (The Night Trilogy, #1) 
2                  Angela'S Ashes (Frank Mccourt, #1) 
3                                     Eat, Pray, Love 
4    Into Thin Air: A Personal Account Of The Mount...
5                                Tuesdays With Morrie 
6                               Running With Scissors 
7                                       Into The Wild 
8                      I Know Why The Caged Bird Sings
9               A Child Called "It" (Dave Pelzer, #1) 
10   Chickens, Mules And Two Old Fools: Tuck Into A...
11   Persepolis: The Story Of A Childhood (Persepol...
12                     The Autobiography Of Malcolm X 
13   The Hiding Place: The Triumphant True Story Of...
14                      All Creatures Great And Small 
15                                        Confessions 
16                             Me Talk Pretty One Day 
17   Marle

In [12]:
# Function to get the most similar books
def recommend(index, cosine_sim=None, top_n=5, names=None):
    """Return the titles of the books most similar to the book at ``index``.

    Parameters
    ----------
    index : int
        Positional row of the query book in ``cosine_sim``. Negative values
        count from the end.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix. When omitted, the notebook-level
        ``cosine_similarity`` is used. It is looked up at call time so that
        re-running the similarity cell takes effect without re-defining
        this function.
    top_n : int, default 5
        Maximum number of recommendations to return.
    names : pandas.Series, optional
        Titles aligned positionally with the rows of ``cosine_sim``.
        Defaults to ``book_description['Name']``.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered from most to least similar. Ties keep
        their original row order. The query book itself is never included.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    IndexError
        If ``index`` is outside the similarity matrix.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    if cosine_sim is None:
        cosine_sim = cosine_similarity
    if names is None:
        names = book_description['Name']

    # Similarity of every book to the query book
    scores = np.asarray(cosine_sim[index], dtype=float)
    self_position = index if index >= 0 else index + len(scores)

    # Stable descending sort: equal scores stay in ascending row order
    ranked = np.argsort(-scores, kind="stable")

    # Drop the query book explicitly instead of assuming it sorts first.
    # With a duplicate description (tie at 1.0) or an empty description
    # (all-zero row) it is not guaranteed to be the first entry.
    ranked = ranked[ranked != self_position]

    # Integer-location based lookup of the best matches
    return names.iloc[ranked[:top_n]]

In [14]:
# Ask for the title of a book and look it up in the dataset
# (the input has to belong to the dataset).
# The lookup is a case-insensitive *literal* prefix match, so titles that
# contain regex metacharacters such as "(" or "#" are found as typed.
query = input().strip()
if not query:
    raise ValueError("Please type the name of a book from the dataset.")
df2 = df1[df1['Name'].str.lower().str.startswith(query.lower(), na=False)]
print(df2)
if df2.empty:
    raise ValueError(f"No book in the dataset starts with {query!r}.")
# Index of the first matching book. recommend() expects a row position;
# labels and positions coincide as long as the frame keeps its default
# 0..n-1 index.
df3 = df2.index.astype(int)
# The recommendations are the last expression of the cell, so the notebook
# displays them as the cell output.
print('you may also like to read: ')
recommend(int(df3.values[0]))

the diary of a young girl
                          Name
29  The Diary Of A Young Girl 
you may also like to read: 


2                   Angela'S Ashes (Frank Mccourt, #1) 
13    The Hiding Place: The Triumphant True Story Of...
8                       I Know Why The Caged Bird Sings
21                Wild Swans: Three Daughters Of China 
0                                     The Glass Castle 
Name: Name, dtype: object