In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import plotly.express as px

In [2]:
# Load the raw book catalogue from the CSV file next to the notebook
DATA_PATH = "books_data.csv"
data = pd.read_csv(DATA_PATH)

In [3]:
# Check the initial structure.
# Only the last expression of a cell is rendered automatically, so the row
# preview is shown explicitly with display(); info() prints its own summary.
display(data.head())
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   bookID          11127 non-null  int64 
 1   title           11127 non-null  object
 2   authors         11127 non-null  object
 3   average_rating  11127 non-null  object
dtypes: int64(1), object(3)
memory usage: 347.8+ KB


In [4]:
# Parse 'average_rating' as numbers; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
ratings_numeric = pd.to_numeric(data['average_rating'], errors='coerce')
data['average_rating'] = ratings_numeric

In [5]:
# Drop rows with a missing 'title' or 'authors', then renumber rows 0..n-1.
# The similarity matrix built later is addressed by row position, so index
# labels must stay equal to positions even when rows are removed here.
# Rebinding (instead of inplace=True) also keeps this cell safe to re-run.
data = data.dropna(subset=['title', 'authors']).reset_index(drop=True)

In [6]:
# Data Visualization: histogram of the per-book average rating (30 bins)
fig = (
    px.histogram(
        data,
        x='average_rating',
        nbins=30,
        title='Distribution of Average Ratings',
    )
    .update_xaxes(title_text='Average Rating')
    .update_yaxes(title_text='Frequency')
)
fig.show()

In [7]:
# Data Visualization: the ten most frequent values of the 'authors' column.
# Each distinct 'authors' string is counted as-is (co-author combinations such
# as "A/B" are separate entries).
top_authors = (
    data['authors']
    .value_counts()
    .head(10)
    .rename_axis('Author')
    .reset_index(name='Number of Books')
)

fig = px.bar(
    top_authors,
    x='Number of Books',
    y='Author',
    orientation='h',
    labels={'Number of Books': 'Number of Books', 'Author': 'Author'},
    title='Top Authors by Number of Books',
)
fig.show()

In [8]:
# Build the text used for similarity: "<title> <authors>" for every row
data['book_content'] = data['title'].str.cat(data['authors'], sep=' ')

In [9]:
# Preview the first rows to confirm the new 'book_content' column is present
data.head()

Unnamed: 0,bookID,title,authors,average_rating,book_content
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,Harry Potter and the Half-Blood Prince (Harry ...
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,Harry Potter and the Order of the Phoenix (Har...
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,Harry Potter and the Chamber of Secrets (Harry...
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,Harry Potter and the Prisoner of Azkaban (Harr...
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,Harry Potter Boxed Set Books 1-5 (Harry Potte...


In [10]:
# Apply TF-IDF Vectorizer to the combined title/author text.
# English stop words are removed; the result has one row per book, in the
# same row order as `data`.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['book_content'])

In [11]:
# Compute the pairwise similarity between all books.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# from linear_kernel is the cosine similarity. With the second argument
# omitted, the matrix is compared against itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [12]:
# Book Recommendation Function with Error Handling
def recommend_books(book_title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles most similar to ``book_title``.

    Parameters
    ----------
    book_title : str
        Title to look up; it must match a value in ``data['title']`` exactly.
        If several rows share the title, the first one is used as the query.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``data``. The default is bound when this
        cell is executed, so re-run this cell after recomputing the matrix.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    list of str
        Up to ``top_n`` titles ordered from most to least similar, never
        including the query row itself. An empty list (plus a printed
        message) when the title is not in the dataset.
    """
    # Work with row *positions*, not index labels: the similarity matrix and
    # .iloc are both positional, and labels differ from positions whenever
    # rows were dropped without resetting the index.
    match_positions = np.flatnonzero((data['title'] == book_title).to_numpy())
    if match_positions.size == 0:
        print(f"'{book_title}' not found in the dataset.")
        return []
    idx = int(match_positions[0])

    # Similarity of the query book to every book
    scores = np.asarray(cosine_sim[idx])

    # Stable descending sort: ties keep their original row order
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query row explicitly instead of assuming it sorts first;
    # a duplicate entry with an equal score could otherwise displace it and
    # the query book would be recommended to itself.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    return data['title'].iloc[ranked].tolist()

In [13]:
# Example: Get Recommendations for a single title.
# The lookup is an exact string match on the 'title' column, so the spelling
# must match the dataset.
book_title = "Seven Plays"
recommended_books = recommend_books(book_title)

In [14]:
# Print Recommendations: a header followed by one title per line.
# Nothing is printed when the list is empty.
if recommended_books:
    print("Recommended Books:", *recommended_books, sep="\n")

Recommended Books:
Buried Child
See You Around  Sam! (Sam Krupnik  #3)
Sam Walton: Made In America
The Secret Seven (The Secret Seven  #1)
Seven Novels
The Atlantis Dialogue
Early Candlelight
Medea and Other Plays
The Bacchae and Other Plays
Waterworks
