In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

In [2]:
# Load the scraped book data from a CSV file located in the notebook's working directory.
# NOTE(review): the 'Unnamed: 0' column visible in the preview is presumably the index that was
# saved when the CSV was written; passing index_col=0 would avoid it — TODO confirm.
df = pd.read_csv('scrapped_data_version_1_comp_business_mind.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Book_title,book_id,book_price,author_name(s),publisher_name,publication_year,sub_title_book,book_edition,description,book_availablity,date_of_sale,year_month,author_count,common_width,common_heighy,category_verification
0,Software Engineering at Google,209970024,71.99,Titus Winters,O'Reilly Media,2020,Lessons Learned from Programming Over Time,,"Today, software engineers need to know not onl...",True,2020-02-28T00:00:00,Feb 2020,3,97,150,arch
1,The Software Architect Elevator,209997033,71.99,Gregor Hohpe,O'Reilly Media,2020,Redefining the Architect's Role in the Digital...,,As the digital economy changes the rules of th...,True,2020-04-08T00:00:00,Apr 2020,1,97,150,arch
2,Software Architecture Metrics,210567692,71.99,Christian Ciceri,O'Reilly Media,2022,,,Software architecture metrics are key to the m...,True,2022-05-18T00:00:00,May 2022,10,97,150,arch
3,Designing Hexagonal Architecture with Java,211106723,46.99,Davi Vieira,Packt Publishing,2023,Build maintainable and long-lasting applicatio...,,,True,2023-09-29T00:00:00,Sep 2023,1,97,150,arch
4,OpenStack for Architects,96237030,44.99,Michael Solberg,Packt Publishing,2018,Design production-ready private cloud infrastr...,,,True,2018-05-31T00:00:00,May 2018,2,97,150,arch


In [5]:
# Build one searchable text field per book: the description followed by the
# subtitle, joined by a single space. Missing values are treated as empty
# strings, so a book with neither field ends up with just ' '.
df['combined_description'] = (
    df['description']
    .fillna('')
    .str.cat(df['sub_title_book'].fillna(''), sep=' ')
)

In [6]:
# Re-display the first rows to confirm the new combined_description column was added.
df.head()

Unnamed: 0,Book_title,book_id,book_price,author_name(s),publisher_name,publication_year,sub_title_book,book_edition,description,book_availablity,date_of_sale,year_month,author_count,common_width,common_heighy,category_verification,combined_description
0,Software Engineering at Google,209970024,71.99,Titus Winters,O'Reilly Media,2020,Lessons Learned from Programming Over Time,,"Today, software engineers need to know not onl...",True,2020-02-28T00:00:00,Feb 2020,3,97,150,arch,"Today, software engineers need to know not onl..."
1,The Software Architect Elevator,209997033,71.99,Gregor Hohpe,O'Reilly Media,2020,Redefining the Architect's Role in the Digital...,,As the digital economy changes the rules of th...,True,2020-04-08T00:00:00,Apr 2020,1,97,150,arch,As the digital economy changes the rules of th...
2,Software Architecture Metrics,210567692,71.99,Christian Ciceri,O'Reilly Media,2022,,,Software architecture metrics are key to the m...,True,2022-05-18T00:00:00,May 2022,10,97,150,arch,Software architecture metrics are key to the m...
3,Designing Hexagonal Architecture with Java,211106723,46.99,Davi Vieira,Packt Publishing,2023,Build maintainable and long-lasting applicatio...,,,True,2023-09-29T00:00:00,Sep 2023,1,97,150,arch,Build maintainable and long-lasting applicati...
4,OpenStack for Architects,96237030,44.99,Michael Solberg,Packt Publishing,2018,Design production-ready private cloud infrastr...,,,True,2018-05-31T00:00:00,May 2018,2,97,150,arch,Design production-ready private cloud infrast...


In [7]:
# Keep only the books that have some text to index, i.e. whose
# combined_description is not blank once surrounding whitespace is removed.
# The original index labels of df are kept (no reset_index), so positions in
# df_filtered and labels no longer coincide after rows are dropped.
df_filtered = df.loc[df['combined_description'].str.strip().ne('')]

In [13]:
# Preview the rows that survived the empty-text filter.
df_filtered.head()

Unnamed: 0,Book_title,book_id,book_price,author_name(s),publisher_name,publication_year,sub_title_book,book_edition,description,book_availablity,date_of_sale,year_month,author_count,common_width,common_heighy,category_verification,combined_description
0,Software Engineering at Google,209970024,71.99,Titus Winters,O'Reilly Media,2020,Lessons Learned from Programming Over Time,,"Today, software engineers need to know not onl...",True,2020-02-28T00:00:00,Feb 2020,3,97,150,arch,"Today, software engineers need to know not onl..."
1,The Software Architect Elevator,209997033,71.99,Gregor Hohpe,O'Reilly Media,2020,Redefining the Architect's Role in the Digital...,,As the digital economy changes the rules of th...,True,2020-04-08T00:00:00,Apr 2020,1,97,150,arch,As the digital economy changes the rules of th...
2,Software Architecture Metrics,210567692,71.99,Christian Ciceri,O'Reilly Media,2022,,,Software architecture metrics are key to the m...,True,2022-05-18T00:00:00,May 2022,10,97,150,arch,Software architecture metrics are key to the m...
3,Designing Hexagonal Architecture with Java,211106723,46.99,Davi Vieira,Packt Publishing,2023,Build maintainable and long-lasting applicatio...,,,True,2023-09-29T00:00:00,Sep 2023,1,97,150,arch,Build maintainable and long-lasting applicati...
4,OpenStack for Architects,96237030,44.99,Michael Solberg,Packt Publishing,2018,Design production-ready private cloud infrastr...,,,True,2018-05-31T00:00:00,May 2018,2,97,150,arch,Design production-ready private cloud infrast...


In [8]:
# Compare (rows, columns) before and after filtering to see how many books
# were dropped for having no description or subtitle text.
print(df.shape, df_filtered.shape, sep='\n')

(43055, 17)
(42927, 17)


In [9]:
# Create the TF-IDF vectorizer. Only stop_words is overridden: scikit-learn's built-in
# English stop-word list is removed during tokenization; every other setting keeps its default.
tfidf_vec = TfidfVectorizer(stop_words='english')

In [10]:
# Fit the vectorizer on the non-empty combined descriptions and transform them into the
# TF-IDF matrix. The matrix is built from df_filtered['combined_description'], so its rows
# follow df_filtered's row order; search_book depends on this when it maps matrix row
# positions back to books with df_filtered.iloc.
tfidf_matrix = tfidf_vec.fit_transform(df_filtered['combined_description'])

In [22]:
def search_book(query, n=10):
    """Return the books whose combined description best matches a free-text query.

    The query is projected into the TF-IDF space learned by ``tfidf_vec`` and
    scored against every row of ``tfidf_matrix``. ``linear_kernel`` computes
    plain dot products; with ``TfidfVectorizer``'s default L2 row normalisation
    these equal cosine similarities.

    Parameters
    ----------
    query : str
        Free-text search request.
    n : int or None, default 10
        Maximum number of books to return. ``None`` returns every matching
        book; zero or a negative value returns an empty frame.

    Returns
    -------
    pandas.DataFrame
        Up to ``n`` rows of ``df_filtered`` (columns ``Book_title``,
        ``author_name(s)``, ``sub_title_book``) ordered from most to least
        similar. Books with zero similarity are never returned, so the frame
        may hold fewer than ``n`` rows, or none when no query term is in the
        fitted vocabulary.
    """
    result_columns = ['Book_title', 'author_name(s)', 'sub_title_book']

    if n is not None and n <= 0:
        # Without this guard a negative n would make `[:n]` mean
        # "everything except the last |n| rows" and return almost the whole catalogue.
        return df_filtered.iloc[:0][result_columns]

    query_tfidf = tfidf_vec.transform([query])
    cosine_similarities = linear_kernel(query_tfidf, tfidf_matrix).flatten()

    # Stable sort keeps tied scores in catalogue order, so results are reproducible.
    ranked_idx = np.argsort(-cosine_similarities, kind='stable')

    # Drop books that share no term with the query; otherwise an out-of-vocabulary
    # query would "recommend" n arbitrary books that all scored 0.
    matching_idx = ranked_idx[cosine_similarities[ranked_idx] > 0]
    top_idx = matching_idx[:n]

    # tfidf_matrix rows are positional with respect to df_filtered, hence iloc (not loc).
    return df_filtered.iloc[top_idx][result_columns]

In [23]:
# Example usage: run two closely related queries and print the top 10 matches
# for each, one result table after the other.
search_query1 = 'I want a book to learn python.'
search_query2 = 'I want a book to learn advanced python.'
for example_query in (search_query1, search_query2):
    print(search_book(example_query, 10))

                             Book_title        author_name(s)  \
19155              Adventures in Python      Craig Richardson   
19479      Learning Professional Python  Usharani Bhimavarapu   
17099    Python 3 Using ChatGPT / GPT-4      Oswald Campesato   
21600    Python 3 Using ChatGPT / GPT-4      Oswald Campesato   
19482      Learning Professional Python  Usharani Bhimavarapu   
17355                   Python Cookbook         David Beazley   
42303  Rising from the Ashes of Divorce            Beth Tiger   
29489                     PYTHON CODING         Pierce Weaver   
21926  Python Programming for Beginners            Joe Benton   
18837               Python Without Fear        Brian Overland   

                                          sub_title_book  
19155                                                NaN  
19479                               Volume 1: The Basics  
17099                                                NaN  
21600                                           