In [1]:
import numpy as np
import pandas as pd
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas' chained-assignment / SettingWithCopy
# warnings raised by the cleaning cells further down.
warnings.filterwarnings(action='ignore')

# Title banner: a 70-character rule above and below the heading.
print("="*70, "BOOK RECOMMENDER SYSTEM - COMPLETE WITH DATA CLEANING", "="*70, sep="\n")

BOOK RECOMMENDER SYSTEM - COMPLETE WITH DATA CLEANING


In [2]:
# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================
# Reads the three CSV files by relative path, i.e. from the kernel's current
# working directory, and reports the shape of each raw frame.
print("\n[STEP 1] Loading datasets...")
books, users, rating = (
    pd.read_csv(csv_name)
    for csv_name in ("Books.csv", "Users.csv", "Ratings.csv")
)

print(
    "Initial data loaded:",
    f"  Books: {books.shape}",
    f"  Users: {users.shape}",
    f"  Ratings: {rating.shape}",
    sep="\n",
)


[STEP 1] Loading datasets...
Initial data loaded:
  Books: (271360, 8)
  Users: (278858, 3)
  Ratings: (1149780, 3)


In [3]:
# ============================================================================
# STEP 2: CLEAN & PREPARE BOOKS DATA
# ============================================================================
# Progress marker only; the cleaning itself happens in the following cells.
print("", "[STEP 2] Cleaning Books dataset...", sep="\n")


[STEP 2] Cleaning Books dataset...


In [4]:

# Select relevant columns
# Keep only the five columns the pipeline uses. The explicit .copy() makes the
# result an independent frame, so the in-place rename/dropna/assignments in the
# following cells act on `books` itself rather than on a selection of the raw
# frame (which is what triggers pandas' SettingWithCopy warnings).
books = books[['ISBN', 'Book-Title', 'Year-Of-Publication', 'Publisher', 'Book-Author']].copy()


In [5]:
# Give the book columns short names.
# NOTE: the author column is spelled "Auther"; later cells (cleaning, the
# recommender, the popularity table) look it up under exactly that name.
books = books.rename(columns={
    "Book-Title": "title",
    "Year-Of-Publication": "year",
    "Book-Author": "Auther",
})

In [6]:
# Rename the ratings columns to snake_case; the ISBN column keeps its name.
rating = rating.rename(columns={
    "User-ID": "user_id",
    "Book-Rating": "rating",
})


In [7]:

# Data cleaning for books.
# The steps are chained and reassigned instead of using inplace=True: the
# previous `books['Publisher'].fillna(..., inplace=True)` filled a temporary
# Series selected from the frame, which pandas does not guarantee to write back
# (and which is a no-op under copy-on-write).
print(f"  Missing values before cleaning: {books.isnull().sum().sum()}")
books = (
    books
    .dropna(subset=['title', 'Auther'])  # Remove books without title/author
    .assign(Publisher=lambda frame: frame['Publisher'].fillna('Unknown'))  # Placeholder for missing publisher
    .drop_duplicates(subset=['ISBN'])  # Remove duplicate ISBNs (first occurrence is kept)
)


  Missing values before cleaning: 4


In [8]:
# Trim leading/trailing whitespace from the three text columns.
books = books.assign(**{
    text_col: books[text_col].str.strip()
    for text_col in ('title', 'Auther', 'Publisher')
})

print(f"  Books after cleaning: {books.shape}")


  Books after cleaning: (271358, 5)


In [9]:
# ============================================================================
# STEP 3: CLEAN & PREPARE USERS DATA
# ============================================================================
# Progress marker only; the cleaning itself happens in the following cells.
print("", "[STEP 3] Cleaning Users dataset...", sep="\n")



[STEP 3] Cleaning Users dataset...


In [10]:
# Switch the user columns to lower-case snake_case names.
users = users.rename(columns={
    "User-ID": "user_id",
    "Location": "location",
    "Age": "age",
})

In [11]:
# Clean users data.
# Written as one reassigned chain rather than in-place edits: the previous
# version filtered `users` and then called fillna(inplace=True) on a column
# selected from that filtered slice, which pandas does not guarantee to write
# back (and which is a no-op under copy-on-write).
users = (
    users
    # Non-numeric ages become NaN instead of raising.
    .assign(age=lambda frame: pd.to_numeric(frame['age'], errors='coerce'))
    # Keep users with an unknown age or an age within 5..100 (inclusive).
    .loc[lambda frame: frame['age'].isna() | frame['age'].between(5, 100)]
    .assign(location=lambda frame: frame['location'].fillna('Unknown'))
    # One row per user_id (first occurrence is kept).
    .drop_duplicates(subset=['user_id'])
)

print(f"  Users after cleaning: {users.shape}")


  Users after cleaning: (277610, 3)


In [12]:
# ============================================================================
# STEP 4: CLEAN & PREPARE RATINGS DATA
# ============================================================================
# Progress marker only; the cleaning itself happens in the following cells.
print("", "[STEP 4] Cleaning Ratings dataset...", sep="\n")



[STEP 4] Cleaning Ratings dataset...


In [13]:
# Rename the ratings columns to snake_case.
# NOTE(review): an earlier cell already applies this exact mapping to `rating`,
# so when the notebook runs top to bottom this finds no matching labels and
# changes nothing; one of the two renames is redundant.
rating = rating.rename(columns={
    "User-ID": "user_id",
    "Book-Rating": "rating",
})


In [14]:
# Clean ratings: keep rows whose rating lies in the closed range 0..10, then
# keep the first row for every (user_id, ISBN) pair.
rating = (
    rating[rating['rating'].between(0, 10)]
    .drop_duplicates(subset=['user_id', 'ISBN'])
)
print(f"  Ratings after cleaning: {rating.shape}")

  Ratings after cleaning: (1149780, 3)


In [15]:
# ============================================================================
# STEP 5: FILTER ACTIVE USERS (>200 ratings)
# ============================================================================
print("\n[STEP 5] Filtering active users (>200 ratings)...")

# Number of rating rows per user_id.
user_rating_counts = rating["user_id"].value_counts()
# x: boolean mask (indexed by user_id) of users with strictly more than 200 rows.
x = user_rating_counts > 200
# y: the user_ids that pass the threshold.
y = user_rating_counts[x].index
rating = rating.loc[rating["user_id"].isin(y)]

print(f"  Active users: {len(y)}")
print(f"  Filtered ratings shape: {rating.shape}")


[STEP 5] Filtering active users (>200 ratings)...
  Active users: 899
  Filtered ratings shape: (526356, 3)


In [16]:
# ============================================================================
# STEP 6: MERGE RATINGS WITH BOOKS
# ============================================================================
print("\n[STEP 6] Merging ratings with books...")

# Inner join on ISBN: ratings whose ISBN is absent from `books` are dropped.
rating_with_books = pd.merge(rating, books, how='inner', on="ISBN")
print(f"  Rating with books shape: {rating_with_books.shape}")



[STEP 6] Merging ratings with books...
  Rating with books shape: (487670, 7)


In [17]:
# Keep only the merged rows that have a title.
# NOTE(review): the recorded row counts before and after this step are equal,
# so it looks like a safety net rather than an active filter - confirm.
rating_with_books = rating_with_books.dropna(subset=['title'])


In [18]:
# ============================================================================
# STEP 7: COUNT RATINGS PER BOOK
# ============================================================================
print("\n[STEP 7] Counting ratings per book...")

# One row per title with the number of non-null rating rows for that title.
# Grouping is by title, so rows that share a title are counted together
# regardless of ISBN.
number_rating = (
    rating_with_books
    .groupby("title")["rating"]
    .count()
    .rename("num_of_ratings")
    .reset_index()
)

print(f"  Unique books with ratings: {len(number_rating)}")



[STEP 7] Counting ratings per book...
  Unique books with ratings: 160264


In [19]:
# ============================================================================
# STEP 8: CREATE FINAL DATASET WITH RATING COUNTS
# ============================================================================
# Progress marker only; the dataset is assembled in the following cells.
print("", "[STEP 8] Creating final dataset...", sep="\n")



[STEP 8] Creating final dataset...


In [20]:
# Attach each title's rating count to every rating row (default inner join
# on the shared "title" column).
final_rating = pd.merge(rating_with_books, number_rating, on="title")
print(f"  Final rating shape: {final_rating.shape}")

  Final rating shape: (487670, 8)


In [21]:
# Keep only rows of popular books, i.e. titles with at least 50 rating rows.
final_rating = final_rating.loc[final_rating["num_of_ratings"].ge(50)]
print(f"  After filtering popular books (>=50 ratings): {final_rating.shape}")


  After filtering popular books (>=50 ratings): (61853, 8)


In [22]:
# Keep one row per (user_id, title) pair - the first occurrence wins.
final_rating = final_rating.drop_duplicates(subset=['user_id', 'title'])
print(f"  After removing duplicates: {final_rating.shape}")


  After removing duplicates: (59850, 8)


In [23]:
# ============================================================================
# STEP 9: CREATE PIVOT TABLE (User-Item Matrix)
# ============================================================================
print("\n[STEP 9] Creating user-item matrix...")

# Rows are titles, columns are user_ids, cells hold the rating; pairs with no
# rating are filled with 0.
book_pivot = final_rating.pivot_table(
    index='title',
    columns='user_id',
    values='rating',
).fillna(0)

# Percentage of cells equal to 0. This counts filled-in gaps and ratings that
# are genuinely 0 alike.
sparsity = (book_pivot == 0).to_numpy().sum() / book_pivot.size * 100

pivot_rows, pivot_cols = book_pivot.shape
print(f"  Pivot table shape: {book_pivot.shape}")
print(f"    - Books: {pivot_rows}")
print(f"    - Users: {pivot_cols}")
print(f"    - Sparsity: {sparsity:.2f}%")


[STEP 9] Creating user-item matrix...
  Pivot table shape: (742, 888)
    - Books: 742
    - Users: 888
    - Sparsity: 97.73%


In [24]:
# ============================================================================
# STEP 10: BUILD RECOMMENDATION MODEL
# ============================================================================
print("\n[STEP 10] Building recommendation model...")

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Compressed sparse row copy of the title x user matrix (one row per book).
book_sparse = csr_matrix(book_pivot.to_numpy())

# Brute-force nearest-neighbour index over book rows using cosine distance;
# n_jobs=-1 asks scikit-learn to use all available cores. fit() returns the
# fitted estimator itself.
model = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1).fit(book_sparse)

print("  ‚úì Model trained successfully!")


[STEP 10] Building recommendation model...
  ‚úì Model trained successfully!


In [25]:
# ============================================================================
# RECOMMENDATION FUNCTION
# ============================================================================
def recommend_book(book_name, n_recommendations=5):
    """
    Print the books whose rating vectors are closest to ``book_name``.

    Relies on three notebook-level objects: ``book_pivot`` (title x user
    matrix), ``model`` (the fitted nearest-neighbour estimator) and
    ``final_rating`` (source of author / year / rating-count details).

    Parameters:
    -----------
    book_name : str
        Title of the book, exactly as it appears in ``book_pivot.index``.
        If it is not found, titles containing ``book_name``
        (case-insensitive, at most 10) are printed as suggestions instead.
    n_recommendations : int
        Maximum number of recommendations to print (default: 5). Values
        below 0 are treated as 0, and the request is capped at the number
        of books the model knows about.

    Returns:
    --------
    None
        All results are written to stdout.
    """
    if book_name not in book_pivot.index:
        print(f"\n‚ùå '{book_name}' not found!")
        print("\nüí° Did you mean:")
        similar = [b for b in book_pivot.index if book_name.lower() in b.lower()][:10]
        if similar:
            for i, b in enumerate(similar, 1):
                print(f"  {i}. {b}")
        else:
            print("  No similar titles found.")
        return

    book_idx = book_pivot.index.get_loc(book_name)

    # Ask for one extra neighbour because the query book normally comes back
    # as its own closest match. Never request more neighbours than there are
    # books, because kneighbors() raises in that case.
    wanted = max(n_recommendations, 0)
    n_neighbors = min(wanted + 1, book_pivot.shape[0])
    distances, indices = model.kneighbors(
        book_pivot.iloc[book_idx, :].values.reshape(1, -1),
        n_neighbors=n_neighbors
    )

    # Remove the query book by index rather than by position: when distances
    # tie (e.g. two books with identical rating vectors) it is not guaranteed
    # to be the first neighbour, and skipping position 0 blindly would then
    # recommend the book to itself.
    neighbours = [
        (neighbour_idx, distance)
        for neighbour_idx, distance in zip(indices[0], distances[0])
        if neighbour_idx != book_idx
    ][:wanted]

    print(f"\n{'='*70}")
    print(f"üìö TOP {n_recommendations} RECOMMENDATIONS FOR:")
    print(f"   '{book_name}'")
    print(f"{'='*70}\n")

    for i, (neighbour_idx, distance) in enumerate(neighbours, 1):
        rec_book = book_pivot.index[neighbour_idx]
        # Cosine distance -> cosine similarity, expressed as a percentage.
        similarity = (1 - distance) * 100

        # Get book details from the first rating row carrying this title
        book_info = final_rating[final_rating['title'] == rec_book].iloc[0]

        print(f"{i}. {rec_book}")
        print(f"   Author: {book_info['Auther']}")
        print(f"   Year: {book_info['year']}")
        print(f"   Similarity: {similarity:.1f}%")
        print(f"   Total Ratings: {book_info['num_of_ratings']}\n")


In [26]:
# ============================================================================
# DATA QUALITY SUMMARY
# ============================================================================
print("\n" + "="*70, "üìä DATA QUALITY SUMMARY", "="*70, sep="\n")

# Rating rows per title / per user in the final (filtered) dataset.
per_book_counts = final_rating.groupby('title').size()
per_user_counts = final_rating.groupby('user_id').size()

print(
    f"Final Books in System: {book_pivot.shape[0]}",
    f"Active Users: {book_pivot.shape[1]}",
    f"Total Ratings: {final_rating.shape[0]}",
    f"Average Ratings per Book: {per_book_counts.mean():.1f}",
    f"Average Ratings per User: {per_user_counts.mean():.1f}",
    f"Matrix Sparsity: {sparsity:.2f}%",
    sep="\n",
)


üìä DATA QUALITY SUMMARY
Final Books in System: 742
Active Users: 888
Total Ratings: 59850
Average Ratings per Book: 80.7
Average Ratings per User: 67.4
Matrix Sparsity: 97.73%


In [27]:
# ============================================================================
# TOP POPULAR BOOKS
# ============================================================================
print("\n" + "="*70, "üìñ TOP 15 MOST POPULAR BOOKS", "="*70, sep="\n")

# One row per title (first value of each detail column within the title),
# ordered by rating count, top 15 kept. `popular` is reused by the example
# recommendation cell.
popular = (
    final_rating
    .groupby('title')[['num_of_ratings', 'Auther', 'year']]
    .first()
    .sort_values('num_of_ratings', ascending=False)
    .head(15)
)

ranked_rows = zip(popular.index, popular['Auther'], popular['year'], popular['num_of_ratings'])
for rank, (book_title, author, pub_year, rating_count) in enumerate(ranked_rows, 1):
    print(f"\n{rank}. {book_title}")
    print(f"   Author: {author}")
    print(f"   Year: {pub_year}")
    print(f"   Ratings: {rating_count}")


üìñ TOP 15 MOST POPULAR BOOKS

1. Wild Animus
   Author: Rich Shapero
   Year: 2004
   Ratings: 363

2. Bridget Jones's Diary
   Author: Helen Fielding
   Year: 1999
   Ratings: 277

3. The Lovely Bones: A Novel
   Author: Alice Sebold
   Year: 2002
   Ratings: 270

4. The Notebook
   Author: Nicholas Sparks
   Year: 1996
   Ratings: 241

5. The Pelican Brief
   Author: John Grisham
   Year: 1993
   Ratings: 236

6. The Nanny Diaries: A Novel
   Author: Emma McLaughlin
   Year: 2002
   Ratings: 230

7. A Painted House
   Author: JOHN GRISHAM
   Year: 2001
   Ratings: 228

8. Divine Secrets of the Ya-Ya Sisterhood: A Novel
   Author: Rebecca Wells
   Year: 1997
   Ratings: 228

9. The Firm
   Author: John Grisham
   Year: 1992
   Ratings: 227

10. The Da Vinci Code
   Author: Dan Brown
   Year: 2003
   Ratings: 224

11. The Horse Whisperer
   Author: Nicholas Evans
   Year: 1996
   Ratings: 213

12. Message in a Bottle
   Author: Nicholas Sparks
   Year: 1998
   Ratings: 212

13. A Ti

In [28]:
# ============================================================================
# EXAMPLE RECOMMENDATIONS
# ============================================================================
print("\n" + "="*70, "üéØ EXAMPLE RECOMMENDATION", "="*70, sep="\n")

# Demo: recommendations for the most-rated title, provided the matrix has
# at least one book row.
if len(book_pivot.index):
    example_book = popular.index[0]
    recommend_book(book_name=example_book, n_recommendations=5)


üéØ EXAMPLE RECOMMENDATION

üìö TOP 5 RECOMMENDATIONS FOR:
   'Wild Animus'

1. Pay It Forward
   Author: Catherine Ryan Hyde
   Year: 2000
   Similarity: 22.5%
   Total Ratings: 70

2. At Home in Mitford (The Mitford Years)
   Author: Jan Karon
   Year: 1996
   Similarity: 22.3%
   Total Ratings: 86

3. The Andromeda Strain
   Author: MICHAEL CRICHTON
   Year: 1992
   Similarity: 20.6%
   Total Ratings: 59

4. The First Counsel
   Author: Brad Meltzer
   Year: 2001
   Similarity: 20.1%
   Total Ratings: 53

5. The Tao of Pooh
   Author: Benjamin Hoff
   Year: 1983
   Similarity: 18.4%
   Total Ratings: 77



In [29]:
# Manual check (one cell per title): exact title, default of 5 recommendations.
recommend_book(book_name='The Lovely Bones: A Novel')


üìö TOP 5 RECOMMENDATIONS FOR:
   'The Lovely Bones: A Novel'

1. Where the Heart Is (Oprah's Book Club (Paperback))
   Author: Billie Letts
   Year: 1998
   Similarity: 27.7%
   Total Ratings: 183

2. Life of Pi
   Author: Yann Martel
   Year: 2003
   Similarity: 24.4%
   Total Ratings: 185

3. Good in Bed
   Author: Jennifer Weiner
   Year: 2002
   Similarity: 24.3%
   Total Ratings: 174

4. The Book of Ruth (Oprah's Book Club (Paperback))
   Author: Jane Hamilton
   Year: 1990
   Similarity: 23.7%
   Total Ratings: 116

5. The Pilot's Wife : A Novel
   Author: Anita Shreve
   Year: 1999
   Similarity: 23.3%
   Total Ratings: 171



In [30]:
# Manual check: same title as the example cell, asking for 10 recommendations.
recommend_book(book_name='Wild Animus', n_recommendations=10)


üìö TOP 10 RECOMMENDATIONS FOR:
   'Wild Animus'

1. Pay It Forward
   Author: Catherine Ryan Hyde
   Year: 2000
   Similarity: 22.5%
   Total Ratings: 70

2. At Home in Mitford (The Mitford Years)
   Author: Jan Karon
   Year: 1996
   Similarity: 22.3%
   Total Ratings: 86

3. The Andromeda Strain
   Author: MICHAEL CRICHTON
   Year: 1992
   Similarity: 20.6%
   Total Ratings: 59

4. The First Counsel
   Author: Brad Meltzer
   Year: 2001
   Similarity: 20.1%
   Total Ratings: 53

5. The Tao of Pooh
   Author: Benjamin Hoff
   Year: 1983
   Similarity: 18.4%
   Total Ratings: 77

6. The Reader
   Author: Bernhard Schlink
   Year: 1999
   Similarity: 18.1%
   Total Ratings: 140

7. A Year in Provence
   Author: Peter Mayle
   Year: 1991
   Similarity: 17.0%
   Total Ratings: 51

8. A Lesson Before Dying (Vintage Contemporaries (Paperback))
   Author: Ernest J. Gaines
   Year: 1997
   Similarity: 16.1%
   Total Ratings: 61

9. Touching Evil
   Author: Kay Hooper
   Year: 2001
   Simil

In [31]:
# Manual check: a partial title is not an exact match, so this exercises the
# "not found" branch that lists titles containing the search text.
recommend_book(book_name='Harry Potter')


‚ùå 'Harry Potter' not found!

üí° Did you mean:
  1. Harry Potter and the Chamber of Secrets (Book 2)
  2. Harry Potter and the Goblet of Fire (Book 4)
  3. Harry Potter and the Order of the Phoenix (Book 5)
  4. Harry Potter and the Prisoner of Azkaban (Book 3)
  5. Harry Potter and the Sorcerer's Stone (Book 1)
  6. Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
