In [3]:
# Book Recommendation System: Unified Pipeline (Refactored)

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
import ast

# ------------------------------
# Step 1: Data Loading & Cleaning
# ------------------------------
print("[INFO] Loading data...")
df = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/combined_books.csv")
print(f"Initial shape: {df.shape}")

# Every later step reads these columns, so verify the schema right after
# loading. Without this check a missing column only surfaces as a bare
# ``KeyError: ['genres']`` raised from ``dropna``, with no hint about which
# columns the file actually provides.
required_columns = ['title', 'author', 'desc', 'genres']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Input CSV is missing required column(s) {missing_columns}; "
        f"available columns: {list(df.columns)}"
    )

# Drop rows with missing critical fields
df = df.dropna(subset=required_columns)
# Keep only the first row for each (title, author) pair
df = df.drop_duplicates(subset=['title', 'author'])

# Filter garbage or uninformative entries
def is_valid_description(desc):
    """Return True when *desc* looks like a real description.

    A description is rejected when it is not a string, when it is empty or
    whitespace-only, or when (ignoring surrounding whitespace and letter
    case) it is one of the known placeholder values: ".", "no", "pb",
    "p.b.".

    Args:
        desc: The raw description value to check.

    Returns:
        bool: False for placeholder, empty or non-string values, True otherwise.
    """
    if not isinstance(desc, str):
        return False
    # Compare case-insensitively so that variants such as "NO" or "Pb" are
    # caught without having to enumerate every capitalisation by hand.
    placeholders = {'.', 'no', 'pb', 'p.b.'}
    cleaned = desc.strip().casefold()
    return bool(cleaned) and cleaned not in placeholders

def _parse_genres(raw):
    """Parse a stringified list such as "['fantasy', 'fiction']" into a list of strings.

    Returns an empty list when *raw* is not a string, does not start with
    ``[``, cannot be parsed as a Python literal, or does not parse to a list.
    """
    if not isinstance(raw, str) or not raw.startswith('['):
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError):
        # A single malformed cell should not abort the whole pipeline.
        return []
    if not isinstance(parsed, list):
        return []
    # Coerce every entry to text so the genres can later be joined with
    # ``' '.join(...)`` even if a cell held numbers or None.
    return [str(genre) for genre in parsed]


# Keep rows whose description is a string and not a known placeholder.
df = df[df['desc'].apply(lambda x: isinstance(x, str) and is_valid_description(x))]
# Keep rows whose title is a string longer than two characters and not purely digits.
df = df[df['title'].apply(lambda x: isinstance(x, str) and len(x) > 2 and not x.isdigit())]
# Keep rows whose author is a non-blank string.  Take an explicit copy so the
# column assignments below write to an independent frame rather than to a
# filtered slice of the previous one.
df = df[df['author'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)].copy()

# Normalize text columns
df['author'] = df['author'].str.strip().str.lower()
df['desc'] = df['desc'].str.strip().str.lower()
df['genres'] = df['genres'].str.strip().str.lower()

# Parse genres to lists
df['genres_list'] = df['genres'].apply(_parse_genres)

# ------------------------------
# Step 2: Feature Engineering (Combined Text)
# ------------------------------
def create_combined_text(row):
    """Build the single text blob used for vectorization from one book row.

    The result is the row's title, author and description followed by its
    genres (the entries of ``genres_list`` joined by spaces), all separated
    by single spaces.

    Args:
        row: Mapping-like object with ``title``, ``author``, ``desc`` and
            ``genres_list`` entries.

    Returns:
        str: The concatenated text.
    """
    title, author, desc = row['title'], row['author'], row['desc']
    genre_blob = ' '.join(row['genres_list'])
    pieces = (title, author, desc, genre_blob)
    return ' '.join(f"{piece}" for piece in pieces)

print("[INFO] Creating combined_text feature...")
# Build one text blob per book (row-wise apply) and store it as a new column.
df['combined_text'] = df.apply(create_combined_text, axis=1)
# Persist the cleaned frame, without the row index, to a path relative to the
# current working directory.
df.to_csv("cleaned_books.csv", index=False)

# ------------------------------
# Step 3: TF-IDF Vectorization
# ------------------------------
print("[INFO] Applying TF-IDF vectorization...")
# English stop words are dropped and the vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
# One TF-IDF row per row of df, in the same order as df.
tfidf_matrix = tfidf.fit_transform(df['combined_text'])
# Pairwise similarity of every book against every other book; entry [i, j]
# refers to the i-th and j-th rows of df by position.
# NOTE(review): this is an n_books x n_books result. The input had ~150k rows
# before cleaning, so this may not fit in memory -- confirm the row count at
# this point, or compute similarities per query instead.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# ------------------------------
# Step 4: Genre-Based Filtering
# ------------------------------
def get_genre_recommendations(book_title, top_n=5):
    """Return the books most similar to *book_title*.

    Similarity scores are read from the precomputed module-level
    ``cosine_sim`` matrix, which is built from each book's combined text
    (title, author, description and genres).

    Args:
        book_title: Title to look up. It must match a whole title in ``df``;
            the comparison ignores letter case.
        top_n: Maximum number of recommendations to return. Values below
            zero are treated as zero.

    Returns:
        A DataFrame with the ``title``, ``author`` and ``genres`` columns of
        up to *top_n* books, most similar first, or an empty list when no
        book has the requested title.
    """
    title_hits = (df['title'].str.lower() == book_title.lower()).tolist()
    if True not in title_hits:
        return []
    # ``cosine_sim`` is indexed by row *position*, while ``df.index`` still
    # carries the original CSV labels after the cleaning filters. Locate the
    # book by position so the correct similarity row is read.
    query_pos = title_hits.index(True)

    # Exclude the query book explicitly instead of assuming it sorts first:
    # another book with identical text ties at the same score and could
    # otherwise push the query book itself into the results.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[query_pos])
        if pos != query_pos
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    best_positions = [pos for pos, _ in candidates[:max(top_n, 0)]]
    return df.iloc[best_positions][['title', 'author', 'genres']]

# ------------------------------
# Step 5: Collaborative Filtering (SVD)
# ------------------------------
print("[INFO] Running Collaborative Filtering with Surprise SVD...")

# NOTE: these ratings are randomly generated placeholders (user ids 1-99,
# ratings 1-5, titles sampled from df), not real user feedback, so the
# reported error metrics only exercise the pipeline.
ratings = pd.DataFrame({
    'user_id': np.random.randint(1, 100, 1000),
    'title': np.random.choice(df['title'], 1000),
    'rating': np.random.randint(1, 6, 1000)
})

# Keep only ratings whose title exists in the catalogue. The title list is
# de-duplicated first: books are unique per (title, author), so a title
# shared by several authors would otherwise match several rows and multiply
# each of its ratings in the merged result.
known_titles = df[['title']].drop_duplicates()
ratings = ratings.merge(known_titles, on='title')
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'title', 'rating']], reader)

# 3-fold cross-validation of an SVD model, printing RMSE and MAE per fold.
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

# ------------------------------
# Example Usage
# ------------------------------
# Print a header, then the lookup result for a sample title. The lookup
# compares against whole titles (ignoring case), so this prints an empty list
# unless some book is titled exactly "Harry Potter".
print("\n[INFO] Sample Recommendations for 'Harry Potter'")
print(get_genre_recommendations("Harry Potter"))


[INFO] Loading data...
Initial shape: (151774, 11)


  df = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/combined_books.csv")


KeyError: ['genres']