In [2]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [3]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2025-08-22 18:20:14--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 104.26.3.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2025-08-22 18:20:16 (19.7 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [4]:
# import csv data into dataframes
# Both files are ';'-separated and ISO-8859-1 encoded. header=0 treats the
# first line of each file as a header row, and `names` supplies the column
# labels used from here on.

# Books: ISBN, title and author, all kept as strings.
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype=dict(isbn='str', title='str', author='str'),
)

# Ratings: one row per (user, isbn) pair with the rating value.
df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype=dict(user='int32', isbn='str', rating='float32'),
)

In [9]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# --- Step 1: Filter dataset ---
# Minimum number of ratings a user / a book needs to stay in the data.
MIN_RATINGS_PER_USER = 200
MIN_RATINGS_PER_BOOK = 100

# Keep users with >= MIN_RATINGS_PER_USER ratings
ratings_per_user = df_ratings['user'].value_counts()
active_users = ratings_per_user[ratings_per_user >= MIN_RATINGS_PER_USER].index
df_ratings_filtered = df_ratings[df_ratings['user'].isin(active_users)]

# Keep books with >= MIN_RATINGS_PER_BOOK ratings
# NOTE(review): the per-book counts are taken AFTER the user filter, so only
# ratings from the retained users count towards the book threshold. If the
# intent is "books with >= 100 ratings in the full dataset", count on
# df_ratings['isbn'] instead -- confirm against the expected results.
ratings_per_book = df_ratings_filtered['isbn'].value_counts()
popular_books = ratings_per_book[ratings_per_book >= MIN_RATINGS_PER_BOOK].index
df_ratings_filtered = df_ratings_filtered[df_ratings_filtered['isbn'].isin(popular_books)]

# --- Step 2: Create book-user matrix ---
# One row per ISBN, one column per user; (book, user) pairs without a rating
# become 0 so every row is a dense numeric vector.
book_user_matrix = df_ratings_filtered.pivot_table(index='isbn', columns='user', values='rating').fillna(0)

# Mappings (isbn <-> title)
isbn_to_title = df_books.set_index('isbn')['title'].to_dict()
title_to_isbn = {title: isbn for isbn, title in isbn_to_title.items() if pd.notna(title)}
# Several ISBNs can carry the same title; inverting the mapping keeps only the
# last one seen, which may be an ISBN that was filtered out of the matrix.
# Prefer an ISBN that is actually a row of book_user_matrix so a title lookup
# does not fail while another ISBN with that title is available.
title_to_isbn.update(
    (isbn_to_title[isbn], isbn)
    for isbn in book_user_matrix.index
    if pd.notna(isbn_to_title.get(isbn))
)

# --- Step 3: Fit NearestNeighbors ---
# Brute-force search with cosine distance over the book rating vectors.
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(book_user_matrix.values)

# --- Step 4: Define recommender ---
def get_recommends(book_title, n_recommendations=5):
    """Return the nearest-neighbour recommendations for a book title.

    NOTE: a later notebook cell defines ``get_recommends`` again (rounding the
    distances to two decimals); whichever cell ran last is the one in effect.

    Relies on the notebook globals ``title_to_isbn``, ``isbn_to_title``,
    ``book_user_matrix`` and ``model`` (fitted on ``book_user_matrix.values``).

    Args:
        book_title: Exact title to look up in ``title_to_isbn``.
        n_recommendations: Maximum number of neighbours to return (default 5).

    Returns:
        ``[book_title, [[neighbor_title, distance], ...]]`` with the neighbours
        in the order ``model.kneighbors`` returns them; each distance is a
        plain ``float``.

    Raises:
        ValueError: if the title is unknown, if its ISBN is not a row of
            ``book_user_matrix``, or if ``n_recommendations`` is below 1.
    """
    if n_recommendations < 1:
        raise ValueError("n_recommendations must be at least 1.")

    if book_title not in title_to_isbn:
        raise ValueError(f"Book '{book_title}' not found in dataset.")

    isbn = title_to_isbn[book_title]

    # Ensure ISBN exists in filtered matrix
    if isbn not in book_user_matrix.index:
        raise ValueError(f"Book '{book_title}' exists but was filtered out (too few ratings).")

    # Row position of the query book (index lookup instead of building a list).
    idx = book_user_matrix.index.get_loc(isbn)

    # Ask for one extra neighbour because the query book is part of the fitted
    # data; never ask for more neighbours than there are rows.
    n_neighbors = min(n_recommendations + 1, book_user_matrix.shape[0])
    distances, indices = model.kneighbors(
        book_user_matrix.iloc[[idx]].values, n_neighbors=n_neighbors
    )

    recommendations = []
    for dist, neighbor_idx in zip(distances[0], indices[0]):
        # Drop the query book by position rather than assuming it is the first
        # hit: a book with an identical rating vector ties at distance 0 and
        # may be returned ahead of it.
        if neighbor_idx == idx:
            continue
        neighbor_isbn = book_user_matrix.index[neighbor_idx]
        neighbor_title = isbn_to_title.get(neighbor_isbn, "Unknown Title")
        recommendations.append([neighbor_title, float(dist)])  # cast to float for clean output

    return [book_title, recommendations[:n_recommendations]]


In [10]:
def get_recommends(book_title, n_recommendations=5):
    """Return the nearest-neighbour recommendations for a book title.

    Relies on the notebook globals ``title_to_isbn``, ``isbn_to_title``,
    ``book_user_matrix`` and ``model`` (fitted on ``book_user_matrix.values``).

    Args:
        book_title: Exact title to look up in ``title_to_isbn``.
        n_recommendations: Maximum number of neighbours to return (default 5).

    Returns:
        ``[book_title, [[neighbor_title, distance], ...]]`` with the neighbours
        in the order ``model.kneighbors`` returns them; each distance is
        rounded to two decimals.

    Raises:
        ValueError: if the title is unknown, if its ISBN is not a row of
            ``book_user_matrix``, or if ``n_recommendations`` is below 1.
    """
    if n_recommendations < 1:
        raise ValueError("n_recommendations must be at least 1.")

    if book_title not in title_to_isbn:
        raise ValueError(f"Book '{book_title}' not found in dataset.")

    isbn = title_to_isbn[book_title]
    if isbn not in book_user_matrix.index:
        raise ValueError(f"Book '{book_title}' exists but was filtered out.")

    # Row position of the query book (index lookup instead of building a list).
    idx = book_user_matrix.index.get_loc(isbn)

    # Ask for one extra neighbour because the query book is part of the fitted
    # data; never ask for more neighbours than there are rows.
    n_neighbors = min(n_recommendations + 1, book_user_matrix.shape[0])
    distances, indices = model.kneighbors(
        book_user_matrix.iloc[[idx]].values, n_neighbors=n_neighbors
    )

    recommendations = []
    for dist, neighbor_idx in zip(distances[0], indices[0]):
        # Drop the query book by position rather than assuming it is the first
        # hit: a book with an identical rating vector ties at distance 0 and
        # may be returned ahead of it.
        if neighbor_idx == idx:
            continue
        neighbor_isbn = book_user_matrix.index[neighbor_idx]
        neighbor_title = isbn_to_title.get(neighbor_isbn, "Unknown Title")
        recommendations.append([neighbor_title, round(float(dist), 2)])

    return [book_title, recommendations[:n_recommendations]]


In [11]:
# Example query: fetch the recommendations for one title and print the raw
# [title, [[neighbor_title, distance], ...]] structure.
books = get_recommends(
    "Where the Heart Is (Oprah's Book Club (Paperback))"
)
print(books)


["Where the Heart Is (Oprah's Book Club (Paperback))", [['The Lovely Bones: A Novel', 0.72], ["The Pilot's Wife : A Novel", 0.82], ['The Joy Luck Club', 0.82], ['The Notebook', 0.82], ['Bel Canto: A Novel', 0.82]]]
