In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import requests
import zipfile
import os

In [8]:
# Load the three Book-Crossing CSV files from the local data directory.
# NOTE: nothing is downloaded or unzipped in this cell; the files must already
# exist under `data_path`.
data_path = "./data/"

# Reader options shared by all three files: comma-separated with a header row,
# first column used as the index, Latin-1 decoding, malformed rows skipped, and
# low_memory disabled to avoid DtypeWarning.
_bx_read_options = dict(
    sep=",",
    header=0,
    index_col=0,
    on_bad_lines="skip",
    encoding="latin-1",
    low_memory=False,
)

books = pd.read_csv(data_path + "BX-Books.csv", **_bx_read_options)
users = pd.read_csv(data_path + "BX-Users.csv", **_bx_read_options)
ratings = pd.read_csv(data_path + "BX-Book-Ratings.csv", **_bx_read_options)

# Show each table's column names to confirm 'ISBN' is available for the join.
print("Books columns:", books.columns)
print("Users columns:", users.columns)
print("Ratings columns:", ratings.columns)

Books columns: Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')
Users columns: Index(['User-ID', 'Location', 'Age'], dtype='object')
Ratings columns: Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')


In [9]:
# Attach each rating's book title by joining on ISBN. The join is an inner
# join, so ratings whose ISBN has no match in `books` are dropped.
ratings = ratings.merge(
    books.loc[:, ["ISBN", "Book-Title"]],
    how="inner",
    on="ISBN",
)
ratings

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,276725,034545104X,0,Flesh Tones: A Novel
1,2313,034545104X,5,Flesh Tones: A Novel
2,6543,034545104X,0,Flesh Tones: A Novel
3,8680,034545104X,5,Flesh Tones: A Novel
4,10314,034545104X,9,Flesh Tones: A Novel
...,...,...,...,...
1031131,276688,0517145553,0,Mostly Harmless
1031132,276688,1575660792,7,Gray Matter
1031133,276690,0590907301,0,Triplet Trouble and the Class Trip (Triplet Tr...
1031134,276704,0679752714,0,A Desert of Pure Feeling (Vintage Contemporaries)


In [10]:
# ISBN was only needed as the join key; later cells identify books by title.
ratings = ratings.drop(columns=["ISBN"])
ratings

Unnamed: 0,User-ID,Book-Rating,Book-Title
0,276725,0,Flesh Tones: A Novel
1,2313,5,Flesh Tones: A Novel
2,6543,0,Flesh Tones: A Novel
3,8680,5,Flesh Tones: A Novel
4,10314,9,Flesh Tones: A Novel
...,...,...,...
1031131,276688,0,Mostly Harmless
1031132,276688,7,Gray Matter
1031133,276690,0,Triplet Trouble and the Class Trip (Triplet Tr...
1031134,276704,0,A Desert of Pure Feeling (Vintage Contemporaries)


In [11]:
# Keep only titles that have at least `min_book_ratings` rating rows.
min_book_ratings = 10
book_rating_counts = ratings["Book-Title"].value_counts()
filtered_books = book_rating_counts.loc[
    book_rating_counts.ge(min_book_ratings)
].index
ratings = ratings.loc[ratings["Book-Title"].isin(filtered_books)]

In [12]:
# Keep only users with at least `min_user_ratings` rows among the remaining
# (already book-filtered) ratings.
min_user_ratings = 5
user_rating_counts = ratings["User-ID"].value_counts()
filtered_users = user_rating_counts.loc[
    user_rating_counts.ge(min_user_ratings)
].index
ratings = ratings.loc[ratings["User-ID"].isin(filtered_users)]

In [13]:
# Build the user x book rating matrix (rows: User-ID, columns: Book-Title).
# pivot_table's default aggregation (mean) applies when a user has several
# rows for the same title.
user_item_matrix = ratings.pivot_table(
    values="Book-Rating",
    index="User-ID",
    columns="Book-Title",
)
# Cells with no rating are filled with 0, so downstream code cannot tell
# "not rated" apart from a stored rating of 0.
user_item_matrix = user_item_matrix.fillna(0)

In [14]:
# Hold out 20% of users for evaluation. Users are identified by their row
# position in `user_item_matrix`, not by User-ID.
user_indices = np.arange(len(user_item_matrix))
train_user_indices, test_user_indices = train_test_split(
    user_indices,
    random_state=42,
    test_size=0.2,
)

In [16]:
def random_recommendation(ratings, n=10, rng=None):
    """Pick up to ``n`` distinct book titles uniformly at random.

    Args:
        ratings: DataFrame with a "Book-Title" column.
        n: Number of titles requested. If fewer distinct titles exist, all of
            them are returned (in random order) instead of raising, which
            matches how ``popularity_recommendation`` truncates via ``head(n)``.
        rng: Optional ``numpy.random.Generator`` used for the draw, for
            reproducible results. When omitted, NumPy's global random state is
            used.

    Returns:
        A NumPy array of unique book titles.
    """
    unique_books = ratings["Book-Title"].unique()
    # Sampling without replacement raises ValueError when the requested size
    # exceeds the population, so clamp to what is available.
    sample_size = min(n, len(unique_books))
    sampler = np.random if rng is None else rng
    return sampler.choice(unique_books, size=sample_size, replace=False)


# Illustrative draw of ten random titles; no seed is set in this cell, so the
# printed list can differ between runs.
random_books = random_recommendation(ratings, n=10)
print("Random Score-Based Recommendations:")
for i, book in enumerate(random_books, start=1):
    print(f"{i}. {book}")

Random Score-Based Recommendations:
1. Almost Adam: A Novel
2. How the Irish Saved Civilization: The Untold Story of Ireland's Heroic Role from the Fall of Rome to the Rise of Medieval Europe (Hinges of History)
3. Deadly Exposure
4. Me: by Jimmy (Big Boy) Valente
5. Get to the Heart: My Story
6. When the Emperor Was Divine
7. The Noonday Demon: An Atlas of Depression
8. The Timothy Files
9. The Night Drifter : A Novel
10. The Seducer (Get Connected Romances)


In [17]:
def popularity_recommendation(ratings, n=10):
    """Return the ``n`` titles with the most rating rows.

    Popularity is the count of non-null "Book-Rating" values per "Book-Title";
    the rating values themselves are not taken into account.

    Args:
        ratings: DataFrame with "Book-Title" and "Book-Rating" columns.
        n: Maximum number of titles to return.

    Returns:
        A pandas Index of titles, most-rated first (fewer than ``n`` entries if
        there are not enough distinct titles).
    """
    rating_counts = ratings.groupby("Book-Title")["Book-Rating"].count()
    ranked_counts = rating_counts.sort_values(ascending=False)
    return ranked_counts.head(n).index


# Show the ten most-rated titles in the filtered ratings table.
popular_books = popularity_recommendation(ratings, n=10)
print("\nPopularity-Based Recommendations:")
for i, book in enumerate(popular_books, start=1):
    print(f"{i}. {book}")


Popularity-Based Recommendations:
1. Wild Animus
2. The Lovely Bones: A Novel
3. The Da Vinci Code
4. The Nanny Diaries: A Novel
5. Bridget Jones's Diary
6. A Painted House
7. The Secret Life of Bees
8. Divine Secrets of the Ya-Ya Sisterhood: A Novel
9. Angels &amp; Demons
10. Life of Pi


In [23]:
def recommend_books(user_index, strategy, k=10):
    """Return ``k`` recommended titles using the named baseline strategy.

    Both supported strategies are non-personalised: ``user_index`` is accepted
    but not used, and recommendations are drawn from the notebook-level
    ``ratings`` table.

    Args:
        user_index: Position of the user (currently ignored).
        strategy: Either "random" or "popularity".
        k: Number of titles to request.

    Returns:
        The titles produced by the selected recommender.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    if strategy == "random":
        return random_recommendation(ratings, n=k)
    if strategy == "popularity":
        return popularity_recommendation(ratings, n=k)
    raise ValueError("Invalid recommendation strategy")

In [24]:
def evaluate_model(strategy, k=10, matrix=None, test_indices=None, recommender=None):
    """Compute micro-averaged precision and recall of a recommender.

    For every test user, the books with a rating greater than 0 in the
    user-item matrix are treated as relevant and compared with the top-``k``
    recommendations. True/false positives and false negatives are summed over
    all users before the ratios are taken.

    Args:
        strategy: Strategy name forwarded to the recommender.
        k: Number of recommendations requested per user.
        matrix: User-item DataFrame (rows: users, columns: book titles).
            Defaults to the notebook-level ``user_item_matrix``.
        test_indices: Iterable of row positions into ``matrix`` to evaluate.
            Defaults to the notebook-level ``test_user_indices``.
        recommender: Callable ``(user_index, strategy, k) -> iterable of
            titles``. Defaults to the notebook-level ``recommend_books``.

    Returns:
        A ``(precision, recall)`` tuple. A ratio whose denominator is zero
        (no test users, nothing recommended, or nothing relevant) is reported
        as 0.0 instead of raising ZeroDivisionError.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if matrix is None:
        matrix = user_item_matrix
    if test_indices is None:
        test_indices = test_user_indices
    if recommender is None:
        recommender = recommend_books

    true_positive = 0
    false_positive = 0
    false_negative = 0

    for user_index in test_indices:
        # Look the row up once; it is used both as the mask and the source.
        user_row = matrix.iloc[user_index]
        true_books = set(user_row[user_row > 0].index)
        recommended_books = set(recommender(user_index, strategy, k))

        tp = len(true_books & recommended_books)
        true_positive += tp
        false_positive += len(recommended_books) - tp
        false_negative += len(true_books) - tp

    recommended_total = true_positive + false_positive
    relevant_total = true_positive + false_negative

    precision = true_positive / recommended_total if recommended_total else 0.0
    recall = true_positive / relevant_total if relevant_total else 0.0

    return precision, recall

In [25]:
# Evaluate the random score-based recommendation model.
# Seed NumPy's global RNG first: random_recommendation draws from it, so
# without a seed the reported precision/recall change on every run. The value
# matches the random_state used for the train/test user split.
np.random.seed(42)
random_precision, random_recall = evaluate_model(strategy="random")
print(
    f"Random Score-Based: Precision = {random_precision:.4f}, Recall = {random_recall:.4f}"
)

Random Score-Based: Precision = 0.0005, Recall = 0.0004


In [26]:
# Score the popularity baseline on the held-out users and report both metrics
# to four decimal places.
popularity_precision, popularity_recall = evaluate_model("popularity")
print(
    f"Popularity-Based: Precision = {popularity_precision:.4f}, Recall = {popularity_recall:.4f}"
)

Popularity-Based: Precision = 0.0200, Recall = 0.0167
