In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the dataset
books = pd.read_csv("Books.csv", encoding='latin-1', on_bad_lines='skip')

# Preview the dataset
books.head()


  books = pd.read_csv("Books.csv", encoding='latin-1', on_bad_lines='skip')


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [3]:
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with one
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    text = text.lower()  # Convert to lowercase
    return text

# Clean the book titles in the dataset
books['cleaned_title'] = books['Book-Title'].apply(clean_text)

# Create a mapping from cleaned titles to original indices
indices = pd.Series(books.index, index=books['cleaned_title']).to_dict()

# Preview the cleaned titles
books[['Book-Title', 'cleaned_title']].head()


Unnamed: 0,Book-Title,cleaned_title
0,Classical Mythology,classical mythology
1,Clara Callan,clara callan
2,Decision in Normandy,decision in normandy
3,Flu: The Story of the Great Influenza Pandemic...,flu the story of the great influenza pandemic ...
4,The Mummies of Urumchi,the mummies of urumchi


In [4]:
# Use TF-IDF to transform the book descriptions into vectors
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(books['Book-Title'])

# Preview the shape of the TF-IDF matrix
tfidf_matrix.shape


(271360, 78387)

In [5]:
# Train the nearest neighbors model
nn_model = NearestNeighbors(n_neighbors=30, metric='cosine', n_jobs=-1)
nn_model.fit(tfidf_matrix)

# Test the model
print("Model trained successfully!")


Model trained successfully!


In [6]:
import re

def normalize_title(title):
    """Lowercase and remove punctuation and series info from title."""
    title = re.sub(r'\(.*?\)', '', title)  # remove anything in parentheses
    title = re.sub(r'[^a-zA-Z0-9 ]', '', title.lower())  # remove punctuation
    return title.strip()

def recommend(book_title, num_recommendations=5):
    book_title_clean = clean_text(book_title)

    if book_title_clean not in indices:
        return f"Book '{book_title}' not found in the dataset."

    idx = indices[book_title_clean]

    distances, indices_nn = nn_model.kneighbors(tfidf_matrix[idx], n_neighbors=100)

    original_title = books.iloc[idx]['Book-Title']
    original_norm = normalize_title(original_title)

    recommended = []
    seen_norms = set()

    for i in indices_nn[0]:
        candidate_title = books.iloc[i]['Book-Title']
        norm_title = normalize_title(candidate_title)

        if norm_title != original_norm and norm_title not in seen_norms:
            recommended.append(candidate_title.title())
            seen_norms.add(norm_title)

        if len(recommended) == num_recommendations:
            break

    return recommended


In [7]:
# Manually set the input book title (change this to any title from your dataset)
query = "Harry Potter and the Sorcerer's Stone"
print(f"📚 Input Book: {query}")
print("🔎 Recommended Books:")

# Call recommend() and display the output
results = recommend(query)

if isinstance(results, list):
    for i, book in enumerate(results, 1):
        print(f"{i}. {book}")
else:
    print(results)


📚 Input Book: Harry Potter and the Sorcerer's Stone
🔎 Recommended Books:
1. Harry Potter And The  Sorcerer'S Stone
2. Harry Potter And The Sorcerer'S Stone: A Deluxe Pop-Up Book
3. Harry Potter And The Sorcerer'S Stone Movie Poster Book
4. The Sorcerer'S Companion: A Guide To The Magical World Of Harry Potter
5. Harry Potter And The Philosopher'S Stone
