In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [5]:
# Load the book catalogue from CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('scrapped_data_version_1_comp_business_mind.csv')

In [6]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,Book_title,book_id,book_price,author_name(s),publisher_name,publication_year,sub_title_book,book_edition,description,book_availablity,date_of_sale,year_month,author_count,common_width,common_heighy,category_verification
0,Software Engineering at Google,209970024,71.99,Titus Winters,O'Reilly Media,2020,Lessons Learned from Programming Over Time,,"Today, software engineers need to know not onl...",True,2020-02-28T00:00:00,Feb 2020,3,97,150,arch
1,The Software Architect Elevator,209997033,71.99,Gregor Hohpe,O'Reilly Media,2020,Redefining the Architect's Role in the Digital...,,As the digital economy changes the rules of th...,True,2020-04-08T00:00:00,Apr 2020,1,97,150,arch
2,Software Architecture Metrics,210567692,71.99,Christian Ciceri,O'Reilly Media,2022,,,Software architecture metrics are key to the m...,True,2022-05-18T00:00:00,May 2022,10,97,150,arch
3,Designing Hexagonal Architecture with Java,211106723,46.99,Davi Vieira,Packt Publishing,2023,Build maintainable and long-lasting applicatio...,,,True,2023-09-29T00:00:00,Sep 2023,1,97,150,arch
4,OpenStack for Architects,96237030,44.99,Michael Solberg,Packt Publishing,2018,Design production-ready private cloud infrastr...,,,True,2018-05-31T00:00:00,May 2018,2,97,150,arch


In [7]:
# List the available columns before choosing which ones feed the text features.
df.columns

Index(['Book_title', 'book_id', 'book_price', 'author_name(s)',
       'publisher_name', 'publication_year', 'sub_title_book', 'book_edition',
       'description', 'book_availablity', 'date_of_sale', 'year_month',
       'author_count', 'common_width', 'common_heighy',
       'category_verification'],
      dtype='object')

Data Preprocessing

In [8]:
# Build one free-text field per book: title, description, author(s), publication year
# and category, joined by single spaces. Missing text values become empty strings;
# the publication year is stringified as-is.
df['combined_features'] = (
    df['Book_title'].fillna('')
    + ' ' + df['description'].fillna('')
    + ' ' + df['author_name(s)'].fillna('')
    + ' ' + df['publication_year'].map(str)
    + ' ' + df['category_verification'].fillna('')
)

In [9]:
# Clean the catalogue before vectorisation:
#  - dropna is kept as a safeguard only: combined_features is assembled from parts that
#    were already filled/stringified, so it is not expected to contain NaN.
#  - drop_duplicates keeps the first row for each book_id.
#  - reset_index makes the index labels equal to the row positions. The TF-IDF matrix and
#    the similarity matrix built from this frame are addressed by row position, so any
#    lookup that goes through df.index must see labels 0..n-1 or it will hit the wrong row.
# Rebinding df (instead of inplace=True) keeps the cell safe to re-run.
df = (
    df.dropna(subset=['combined_features'])
      .drop_duplicates(subset=['book_id'])
      .reset_index(drop=True)
)

Text vectorization

In [10]:
# Turn each book's combined text into a TF-IDF vector. English stop words are removed
# so that common filler words do not contribute to similarity.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and IDF weights, then vectorise: one matrix row per row of df,
# in df's current row order.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])

Calculating cosine similarity

In [11]:
# TfidfVectorizer L2-normalises each row by default, so the plain dot product computed
# by linear_kernel is the cosine similarity. The result is a dense n_rows x n_rows array,
# so memory use grows quadratically with the number of books.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

Recommendations

In [16]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the books most similar to the book called ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['Book_title']``. If several rows share the
        title, the first one (in row order) is used as the query.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row/column ``i`` corresponds to the ``i``-th
        row of ``df``. The default is bound when this cell is executed, so re-run the
        cell after recomputing the module-level ``cosine_sim``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``Book_title`` values of the most similar books, most similar first. The
        queried row itself is never included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    # Work with row *positions*, not index labels: cosine_sim is addressed by position,
    # and df's labels need not be 0..n-1 (e.g. after rows were dropped).
    is_match = (df['Book_title'] == title).to_numpy(dtype=bool, na_value=False)
    matching_positions = is_match.nonzero()[0]
    if len(matching_positions) == 0:
        raise IndexError(f"No book titled {title!r} found in df['Book_title']")
    book_position = int(matching_positions[0])

    # Exclude the queried row explicitly instead of assuming it sorts first; with tied
    # scores (e.g. another row with identical text) it may not be at position 0.
    scored = [
        (position, score)
        for position, score in enumerate(cosine_sim[book_position])
        if position != book_position
    ]
    # list.sort is stable, so equally scored books keep their row order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    top_positions = [position for position, _ in scored[:max(top_n, 0)]]
    return df['Book_title'].iloc[top_positions]

    

In [14]:
# Re-inspect the frame: it now carries the derived combined_features column.
df.head()

Unnamed: 0,Book_title,book_id,book_price,author_name(s),publisher_name,publication_year,sub_title_book,book_edition,description,book_availablity,date_of_sale,year_month,author_count,common_width,common_heighy,category_verification,combined_features
0,Software Engineering at Google,209970024,71.99,Titus Winters,O'Reilly Media,2020,Lessons Learned from Programming Over Time,,"Today, software engineers need to know not onl...",True,2020-02-28T00:00:00,Feb 2020,3,97,150,arch,"Software Engineering at Google Today, software..."
1,The Software Architect Elevator,209997033,71.99,Gregor Hohpe,O'Reilly Media,2020,Redefining the Architect's Role in the Digital...,,As the digital economy changes the rules of th...,True,2020-04-08T00:00:00,Apr 2020,1,97,150,arch,The Software Architect Elevator As the digital...
2,Software Architecture Metrics,210567692,71.99,Christian Ciceri,O'Reilly Media,2022,,,Software architecture metrics are key to the m...,True,2022-05-18T00:00:00,May 2022,10,97,150,arch,Software Architecture Metrics Software archite...
3,Designing Hexagonal Architecture with Java,211106723,46.99,Davi Vieira,Packt Publishing,2023,Build maintainable and long-lasting applicatio...,,,True,2023-09-29T00:00:00,Sep 2023,1,97,150,arch,Designing Hexagonal Architecture with Java Da...
4,OpenStack for Architects,96237030,44.99,Michael Solberg,Packt Publishing,2018,Design production-ready private cloud infrastr...,,,True,2018-05-31T00:00:00,May 2018,2,97,150,arch,OpenStack for Architects Michael Solberg 2018...


In [17]:
# Example query: print the titles judged most similar to one book from the catalogue.
print(get_recommendations('The Software Architect Elevator'))

423      97 Things Every Software Architect Should Know
2                         Software Architecture Metrics
244                       Hands-On Linux for Architects
15245                The Technical–Business Equilibrium
4                              OpenStack for Architects
241                                Azure for Architects
2030                         Architects of Intelligence
19437                              Software Engineering
111                         Hybrid Cloud for Architects
18703                              Azure for Architects
Name: Book_title, dtype: object
