In [26]:
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [87]:
# Load the book catalogue; the path is relative to the notebook's working directory.
df = pd.read_csv("books.csv")

In [88]:
# Preview the first rows to check which columns were parsed.
df.head()

Unnamed: 0,Title,Author,Genre,Height,Publisher,Index
0,Fundamentals of Wavelets,"Goswami, Jaideva",signal_processing,228,Wiley,0
1,Data Smart,"Foreman, John",data_science,235,Wiley,1
2,God Created the Integers,"Hawking, Stephen",mathematics,197,Penguin,2
3,Superfreakonomics,"Dubner, Stephen",economics,179,HarperCollins,3
4,Orientalism,"Said, Edward",history,197,Penguin,4


In [89]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(211, 6)

In [90]:
# Text columns that describe a book; these are the columns NaN-filled
# before the per-book text is assembled.
selected_features = [
    'Title',
    'Author',
    'Genre',
    'Publisher',
]
print(selected_features)

['Title', 'Author', 'Genre', 'Publisher']


In [91]:
# NOTE: fillna returns a new DataFrame and the result is not assigned here,
# so this cell only *displays* a filled copy -- `df` itself is left unchanged
# and still contains its missing values after this cell runs.
df.fillna('unknown')

Unnamed: 0,Title,Author,Genre,Height,Publisher,Index
0,Fundamentals of Wavelets,"Goswami, Jaideva",signal_processing,228,Wiley,0
1,Data Smart,"Foreman, John",data_science,235,Wiley,1
2,God Created the Integers,"Hawking, Stephen",mathematics,197,Penguin,2
3,Superfreakonomics,"Dubner, Stephen",economics,179,HarperCollins,3
4,Orientalism,"Said, Edward",history,197,Penguin,4
...,...,...,...,...,...,...
206,Structure and Randomness,"Tao, Terence",mathematics,252,unknown,206
207,Image Processing with MATLAB,"Eddins, Steve",signal_processing,241,unknown,207
208,Animal Farm,"Orwell, George",fiction,180,unknown,208
209,"Idiot, The","Dostoevsky, Fyodor",fiction,197,unknown,209


In [92]:

# Replace missing values in every selected text column with an empty string so
# the string concatenation below never yields NaN. A column that is listed in
# `selected_features` but absent from `df` raises KeyError right here instead
# of being skipped silently and failing later.
for feature in selected_features:
    df[feature] = df[feature].fillna('')

# Build one space-separated text blob per book from the selected columns, in
# the order given by `selected_features`, so the list is the single source of
# truth for what goes into the vectorizer.
combined_features = df[selected_features[0]]
for feature in selected_features[1:]:
    combined_features = combined_features + ' ' + df[feature]

In [93]:
# Inspect the per-book text that will be fed to the vectorizer.
print(combined_features)

0      Fundamentals of Wavelets Goswami, Jaideva sign...
1            Data Smart Foreman, John data_science Wiley
2      God Created the Integers Hawking, Stephen math...
3      Superfreakonomics Dubner, Stephen economics Ha...
4               Orientalism Said, Edward history Penguin
                             ...                        
206    Structure and Randomness Tao, Terence mathemat...
207    Image Processing with MATLAB Eddins, Steve sig...
208                  Animal Farm Orwell, George fiction 
209               Idiot, The Dostoevsky, Fyodor fiction 
210         Christmas Carol, A Dickens, Charles fiction 
Length: 211, dtype: object


In [94]:
# Safeguard: the source columns were already NaN-filled before concatenation,
# so this is expected to be a no-op; it guarantees the vectorizer gets strings.
combined_features = combined_features.fillna('')

In [95]:
# TF-IDF vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [96]:
# Learn the vocabulary from the combined text and encode every book as one
# row of a sparse TF-IDF matrix.
feature_vectors = vectorizer.fit_transform(combined_features)

In [97]:
# Printing the sparse matrix lists its stored entries as (row, column) weight.
print(feature_vectors)

  (0, 639)	0.38508442233932055
  (0, 530)	0.3640151121863299
  (0, 294)	0.4147799471460624
  (0, 233)	0.4147799471460624
  (0, 629)	0.4147799471460624
  (0, 417)	0.17616290061645812
  (0, 217)	0.4147799471460624
  (1, 130)	0.2947926202309474
  (1, 302)	0.32927638372940826
  (1, 204)	0.48166091336569306
  (1, 538)	0.48166091336569306
  (1, 129)	0.3751166483689138
  (1, 639)	0.4471771498672322
  (2, 443)	0.20954510755616926
  (2, 365)	0.341469397147513
  (2, 552)	0.3551078624691722
  (2, 252)	0.3933198966227683
  (2, 286)	0.4236504944594545
  (2, 582)	0.14229595664317748
  (2, 120)	0.4236504944594545
  (2, 231)	0.4236504944594545
  (3, 249)	0.40710388908710715
  (3, 174)	0.3653858176520641
  (3, 164)	0.48530919012371093
  (3, 564)	0.5227334801189049
  :	:
  (206, 574)	0.4086956854974872
  (206, 559)	0.4323511214056754
  (206, 365)	0.37535522115740577
  (207, 553)	0.4048681294814102
  (207, 175)	0.4048681294814102
  (207, 366)	0.4048681294814102
  (207, 644)	0.31531056713989813
  (207, 46

In [98]:
# Pairwise cosine similarity between all book vectors; entry [i, j] compares
# row i with row j of `feature_vectors`, so the result is a square matrix.
similarity = cosine_similarity(feature_vectors)

In [99]:
# Quick look at the similarity matrix (the diagonal is each book vs. itself).
print(similarity)

[[1.         0.17220095 0.         ... 0.         0.         0.        ]
 [0.17220095 1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.02766264 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.04366742 0.03704226]
 [0.         0.         0.02766264 ... 0.04366742 1.         0.04218914]
 [0.         0.         0.         ... 0.03704226 0.04218914 1.        ]]


In [100]:
# Sanity check: the matrix should be square, one row and column per book.
print(similarity.shape)

(211, 211)


In [102]:
# Ask for a title; surrounding whitespace is stripped so stray spaces do not
# lower the fuzzy-match score when the name is compared against the catalogue.
book_name = input(' Enter your favourite book name : ').strip()

 Enter your favourite book name : Birth of a Theorem


In [103]:
# Candidate titles for fuzzy-matching the user's input.
list_of_all_titles = df['Title'].tolist()

# get_close_matches returns the best candidates first and an EMPTY list when
# nothing clears its similarity cutoff, so the result must be checked before
# it is indexed.
find_close_match = difflib.get_close_matches(book_name, list_of_all_titles)

if not find_close_match:
    # Previously this case crashed with IndexError on find_close_match[0].
    print('No book title close to', repr(book_name), 'was found.')
else:
    close_match = find_close_match[0]

    # The value in the 'Index' column is used as the row position in
    # `similarity` (assumes 'Index' equals the row position -- TODO confirm
    # this still holds if the CSV is ever filtered or re-ordered).
    index_of_the_book = df[df['Title'] == close_match]['Index'].values[0]

    # (position, score) pairs for the chosen book against every book.
    similarity_score = list(enumerate(similarity[index_of_the_book]))

    # Highest similarity first; the chosen book itself normally comes out on
    # top because its self-similarity is 1.0.
    sorted_similar_books = sorted(similarity_score, key=lambda x: x[1], reverse=True)

    print('books suggested for you:\n')

    # Only the first 29 entries are ever printed, so slice before looping
    # instead of doing a DataFrame lookup for every book in the catalogue.
    for i, book in enumerate(sorted_similar_books[:29], start=1):
        Index = book[0]
        title_from_index = df[df['Index'] == Index]['Title'].values[0]
        print(i, '.', title_from_index)


books suggested for you:

1 . Birth of a Theorem
2 . Men of Mathematics
3 . Prisoner of Birth, A
4 . Analysis, Vol I
5 . Structure and Randomness
6 . God Created the Integers
7 . Empire of the Mughal - Ruler of the World
8 . Clash of Civilizations and Remaking of the World Order
9 . Death of Superman, The
10 . History of the DC Universe
11 . Journal of a Novel
12 . Grapes of Wrath, The
13 . Short History of the World, A
14 . City of Djinns
15 . Trembling of a Leaf, The
16 . Idea of Justice, The
17 . Phantom of Manhattan, The
18 . Justice League: Throne of Atlantis
19 . Tao of Physics, The
20 . Age of the Warrior, The
21 . Tales of Beedle the Bard
22 . Return of the Primitive
23 . Discovery of India, The
24 . Sea of Poppies
25 . Age of Wrath, The
26 . City of Joy, The
27 . Theory of Everything, The
28 . Elements of Information Theory
29 . Winter of Our Discontent, The
