In [None]:
# loading -> preprocessing -> feature_engineering -> genre_based_filtering -> collaborative_filtering_2

In [None]:
import pandas as pd
import string
import re

# from helper_functions.google_books_api import fetch_genre_from_google_books

In [None]:
# Location of the raw CSV exports, defined once so the notebook can be pointed at
# another checkout by editing a single line.
RAW_DATA_DIR = "/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/raw"

# All three raw files are ';'-separated and parsed with the Python engine.
df_books = pd.read_csv(f"{RAW_DATA_DIR}/Books.csv", sep=';', engine='python')
df_ratings = pd.read_csv(f"{RAW_DATA_DIR}/Ratings.csv", sep=';', engine='python')
df_users = pd.read_csv(f"{RAW_DATA_DIR}/Users.csv", sep=';', engine='python')


In [None]:
df_books.shape

In [None]:
df_ratings.shape

In [None]:
df_users.shape

In [None]:
# Replace the raw headers of each frame with short snake_case names.
# The assignment is positional: each list must match the column order of its file.
for _frame, _column_names in (
    (df_books, ['isbn', 'title', 'author', 'year', 'publisher']),
    (df_ratings, ['user_id', 'isbn', 'rating']),
    (df_users, ['user_id', 'age']),
):
    _frame.columns = _column_names


In [None]:
# Keep only the rows whose user_id (as text, surrounding whitespace ignored) is purely numeric.
# .copy() makes each filtered frame independent of the frame it was sliced from, so the
# column assignments below write to a real frame and do not raise SettingWithCopyWarning.
ratings_id_is_numeric = df_ratings['user_id'].astype(str).str.strip().str.isnumeric()
users_id_is_numeric = df_users['user_id'].astype(str).str.strip().str.isnumeric()
df_ratings = df_ratings[ratings_id_is_numeric].copy()
df_users = df_users[users_id_is_numeric].copy()

# Now convert to integer so both frames share the same key dtype for the later merge
df_ratings['user_id'] = df_ratings['user_id'].astype(int)
df_users['user_id'] = df_users['user_id'].astype(int)


### Matching Data Types

In [None]:
# Attach book metadata to each rating; ratings whose isbn is absent from df_books are dropped
df_merged = df_ratings.merge(df_books, on='isbn', how='inner')

# Attach the user columns; ratings whose user_id is absent from df_users are dropped
df_final = df_merged.merge(df_users, on='user_id', how='inner')


In [None]:
df_final.to_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/books_dataset.csv")

In [None]:
df_final.describe(include='all')

In [None]:
df_final.columns

In [None]:
# All df_books rows whose isbn occurs more than once (every occurrence is kept)
isbn_is_repeated = df_books.duplicated('isbn', keep=False)
duplicate_isbns = df_books[isbn_is_repeated]

# How many rows each repeated isbn has, most frequent first
duplicate_isbn_counts = (
    duplicate_isbns
    .groupby('isbn')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Display top duplicates
print(duplicate_isbn_counts.head(10))


In [None]:

# Normalize titles: lowercase, strip, remove punctuation.
# Series.str.translate leaves a missing title as NaN; calling x.translate(...) through
# .apply raised AttributeError on a NaN (float) title. Building the translation table
# once also avoids recreating it for every row.
_punctuation_table = str.maketrans('', '', string.punctuation)
df_final['normalized_title'] = (
    df_final['title']
    .str.lower()
    .str.strip()
    .str.translate(_punctuation_table)
)

# Count unique normalized titles (nunique ignores NaN)
unique_title_count = df_final['normalized_title'].nunique()
print(f"Unique normalized titles: {unique_title_count}")


In [None]:
# Extract unique normalized titles
unique_titles = df_final['normalized_title'].dropna().unique()

# Save to CSV in data/processed, which is where a later cell reads unique_titles.csv from.
# A bare 'unique_titles.csv' was written to the current working directory instead.
import pandas as pd
pd.DataFrame(unique_titles, columns=['normalized_title']).to_csv(
    '/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/unique_titles.csv',
    index=False
)


In [None]:
df_final.shape

In [None]:
best_books = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/raw/best_books.csv")

In [None]:
best_books.head()

In [None]:
# Extract title and genres
title_genres_df = best_books[['title', 'genres']]

# Save to CSV file
title_genres_df.to_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/title_genres.csv', index=False)


In [None]:
best_books = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/unique_titles.csv")
unique_title_with_genres = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/title_genres.csv")

In [None]:
best_books.columns

In [None]:
unique_title_with_genres.columns

In [None]:

# # Normalize function
# def normalize(text):
#     return re.sub(r'\W+', '', str(text).lower().strip())

# # Use 'title' instead of 'normalized_title'
# unique_title_with_genres['normalized_title_clean'] = unique_title_with_genres['title'].apply(normalize)
# best_books['normalized_title_clean'] = best_books['title'].apply(normalize)

# # Mapping
# genre_map = dict(zip(best_books['normalized_title_clean'], best_books['genres']))

# # Add matched genres
# unique_title_with_genres['matched_genres'] = None

# for i, row in unique_title_with_genres.iterrows():
#     current_title = row['normalized_title_clean']
#     for norm_title, genres in genre_map.items():
#         if norm_title in current_title or current_title in norm_title:
#             unique_title_with_genres.at[i, 'matched_genres'] = genres
#             break

# # Save
# unique_title_with_genres.to_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/enriched_titles_with_genres.csv', index=False)

# print("✅ Enriched file saved as 'enriched_titles_with_genres.csv'")


In [None]:
enriched_titles_with_genres = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/enriched_titles_with_genres.csv")

In [None]:
enriched_titles_with_genres.head(5)

In [None]:
enriched_titles_with_genres.shape

In [None]:
enriched_titles_with_genres.isna().sum()

In [None]:
df_final.head(5)

In [None]:

# Merge on normalized_title.
# The lookup is reduced to one row per non-missing key first: a left merge repeats a
# df_final row once per matching right-hand row, so repeated keys would multiply ratings,
# and pandas also matches NaN keys to each other.
# NOTE(review): df_final['normalized_title'] keeps the spaces between words; confirm that
# 'normalized_title_clean' in the enriched file was built the same way, otherwise
# multi-word titles cannot match.
genre_lookup = (
    enriched_titles_with_genres[['normalized_title_clean', 'matched_genres']]
    .dropna(subset=['normalized_title_clean'])
    .drop_duplicates(subset='normalized_title_clean')
)

df_final = (
    df_final
    .merge(
        genre_lookup,
        how='left',
        left_on='normalized_title',
        right_on='normalized_title_clean'
    )
    # Drop helper column
    .drop(columns='normalized_title_clean')
)

# Save result
df_final.to_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/df_final_with_genres.csv', index=False)

In [None]:
data_with_no_null_genre = df_final[~df_final['matched_genres'].isna()]

In [None]:
# Divide rating by 2.
# .assign builds a new frame, so the filtered slice of df_final is never written to
# (a direct column assignment here raises SettingWithCopyWarning).
# Re-running this cell halves the ratings again.
data_with_no_null_genre = data_with_no_null_genre.assign(
    rating=data_with_no_null_genre['rating'] / 2
)

# Remove rows where rating is NaN or 0
data_with_no_null_genre = data_with_no_null_genre[
    data_with_no_null_genre['rating'].notna() & (data_with_no_null_genre['rating'] != 0)
].copy()

In [None]:
data_with_no_null_genre.to_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/data_with_no_null_genre.csv")

In [None]:
good_reads = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/raw/good_read_books_100k.csv")

In [None]:
good_reads.columns

In [None]:
good_reads.head()

In [None]:
data_with_no_null_genre.head()

In [None]:
data_with_no_null_genre.columns

In [None]:
good_reads.columns

In [None]:

# Convert genre from string to list if necessary
def parse_genre(val):
    """Return the genres in ``val`` as a list of strings.

    Parameters
    ----------
    val : list, str or missing value
        A list is returned unchanged. Any other non-missing value is converted to
        text and split on commas.

    Returns
    -------
    list
        ``[]`` for a missing value, otherwise the stripped, non-empty genre names.
    """
    # The list check must come first: pd.isna(list) returns an array, and using
    # that array as an ``if`` condition raises ValueError for multi-element lists.
    if isinstance(val, list):
        return val
    if pd.isna(val):
        return []
    return [g.strip() for g in str(val).split(',') if g.strip()]

good_reads['matched_genres'] = good_reads['genre'].apply(parse_genre)

# Create a unified format for good_reads, using good_reads columns only.
# Mixing Series from good_reads and data_with_no_null_genre in one DataFrame makes pandas
# align them on index labels: each book was paired with the user_id/year/publisher/age of
# an unrelated ratings row, and labels present only in data_with_no_null_genre produced
# extra rows with no book data. Columns good_reads does not supply are left out here;
# pd.concat below fills them with NaN.
good_reads_standardized = pd.DataFrame({
    'isbn': good_reads['isbn'],
    'rating': good_reads['rating'],
    'title': good_reads['title'],
    'author': good_reads['author'],
    'desc': good_reads['desc'],
    # Same normalization as df_final['normalized_title'] (lowercase, strip, remove
    # punctuation, keep the spaces between words) so the column is comparable across sources.
    'normalized_title': good_reads['title'].str.lower().str.strip().str.translate(
        str.maketrans('', '', string.punctuation)
    ),
    'matched_genres': good_reads['matched_genres']
})

# Select and align columns from data_with_no_null_genre
data_with_no_null_genre_aligned = data_with_no_null_genre[[
    'user_id', 'isbn', 'rating', 'title', 'author', 'year', 'publisher',
    'age', 'normalized_title', 'matched_genres'
]]

# Combine the two datasets (rows stacked, index renumbered)
combined_df = pd.concat([data_with_no_null_genre_aligned, good_reads_standardized], ignore_index=True)

# Save the result
combined_df.to_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/combined_books.csv", index=False)

print("✅ Combined and standardized dataset saved as combined_books.csv")

In [None]:
combined_df = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/combined_books.csv")


In [None]:
combined_df.shape

In [None]:
import pandas as pd
from ydata_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
df = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/combined_books.csv")

## Pre-processing: find the missing and duplicate values in the dataset

In [None]:
df.isnull().sum()

### Columns that should NOT be missing (ideally):
rating - Drop rows where rating is missing

title -  Drop rows where title is missing

author - Drop rows where author is missing

matched_genres - Drop rows where matched_genres is missing


In [None]:
required_cols = ['rating', 'title', 'author', 'matched_genres']
df = df.dropna(subset=required_cols).copy()


In [None]:
df.shape

In [None]:
df['normalized_title'] = df['normalized_title'].fillna(df['title'].str.lower())


In [None]:
df.shape

In [None]:
df = df.drop(columns=['user_id', 'age', 'publisher', 'year'])


In [None]:
df.columns

In [None]:
def inspect_all_dirty_strings(df, cols_to_check=None):
    """Print a data-quality report for string-like columns of ``df``.

    For each inspected column the report shows the non-null count, the number of
    unique values, values that are pure digits or shorter than 3 characters,
    values that spell a null ('nan', 'none', 'null', 'n/a', 'na'), and the five
    most frequent values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    cols_to_check : iterable of str, optional
        Columns to inspect. Defaults to the book columns used in this notebook.
        Columns that are absent from ``df`` are reported and skipped instead of
        raising KeyError.

    Returns
    -------
    None
        The report is printed.
    """
    if cols_to_check is None:
        cols_to_check = [
            'isbn', 'rating', 'title', 'author',
            'normalized_title', 'matched_genres', 'desc'
        ]

    for col in cols_to_check:
        if col not in df.columns:
            print(f"\n--- Skipping column: {col} (not in DataFrame) ---")
            continue

        print(f"\n--- Inspecting column: {col} ---")

        # Convert to string and drop missing
        df_col = df[col].dropna().astype(str)

        print("Non-null count:", df_col.shape[0])
        print("Unique values:", df_col.nunique())

        # Check for only-numeric strings or very short strings
        print("\nSuspicious values (pure numbers or < 3 chars):")
        suspicious = df_col[df_col.str.match(r'^\d+$') | df_col.str.len().lt(3)]
        print(suspicious.value_counts().head(10))

        # Check for null-like values stored as strings
        print("\nString values that look like nulls ('nan', 'none', etc.):")
        null_like = df_col[df_col.str.lower().isin(['nan', 'none', 'null', 'n/a', 'na'])].value_counts()
        print(null_like)

        # Top 5 most common values
        print("\nTop 5 most frequent entries:")
        print(df_col.value_counts().head(5))

        print("-" * 60)


In [None]:
inspect_all_dirty_strings(df)


### Cleaning Summary & Recommendations
isbn
Mostly clean. Looks like valid ISBNs (all numeric, fixed length).
Optional: Remove rows where isbn is missing if you plan to use it as a unique book key.

rating
We'll drop rows where rating is missing for collaborative filtering.

title / normalized_title
Some entries are very short (like "It", "V", "14"), which may be ambiguous or junk.
Keep short ones like “1984”, “It” (real books).
Remove/flag single-character or number-only titles like "1", "911", "Q".

author
A few numeric or junk entries ("19", "J.", "Ai") so replace them with empty author
Keep known short names if verified (Ai might be real).

matched_genres
Drop rows where matched_genres == '[]' (these can't be used in genre filtering).

desc (description)
Replace junk entries like ".", ">", "No", "a", "PB" with an empty value.


In [None]:
# Drop rows missing required columns (except isbn)
df = df.dropna(subset=['rating', 'title', 'author', 'matched_genres']).copy()


In [None]:
# Convert columns to string for safe processing
df['title'] = df['title'].astype(str)
df['author'] = df['author'].astype(str)
df['matched_genres'] = df['matched_genres'].astype(str)
df['desc'] = df['desc'].astype(str)
df['normalized_title'] = df['normalized_title'].astype(str)


In [None]:
# Step 1: Remove titles that are purely numeric or at most 3 characters long
# (but keep legit short ones like "1984", "It")
# "1984" has to be listed explicitly: it is all digits, so it was rejected even though
# it is named above as a title to keep.
legit_short_titles = {'it', 'we', 's.', 'v.', 'v', 'go', '1984'}
def is_bad_title(title):
    """Return True when ``title`` looks like junk rather than a real book title.

    A title is bad when, after stripping and lowercasing, it is at most 3 characters
    long or consists only of digits, unless it is listed in ``legit_short_titles``.

    Parameters
    ----------
    title : str
        Raw title text.

    Returns
    -------
    bool
    """
    t = title.strip().lower()
    return ((len(t) <= 3 or t.isdigit()) and t not in legit_short_titles)


In [None]:
df = df[~df['title'].apply(is_bad_title)]


In [None]:
# Step 2: Replace junk author names (numbers or 1-2 character codes) with empty string
def clean_author(author):
    """Return the stripped author name, or '' when the name looks like junk.

    A name is treated as junk when it spells a null ('nan', 'none'), is all digits,
    is at most 2 characters long, or is one of the initials 'j.', 'a.', 'b.'.
    """
    name = author.strip()
    lowered = name.lower()

    spells_null = lowered in {'nan', 'none'}
    is_known_initial = lowered in {'j.', 'a.', 'b.'}
    too_short = len(name) <= 2

    if spells_null or name.isdigit() or too_short or is_known_initial:
        return ''
    return name
df['author'] = df['author'].apply(clean_author)


In [None]:
# Step 3: Drop rows where matched_genres is just an empty list
df = df[df['matched_genres'].str.strip() != '[]']


In [None]:
# Step 4: Replace junk descriptions with empty string
# Each description is compared after .strip().lower(), so the 'Â ' entry (uppercase, with a
# trailing space) can never match as written. Every entry in the set is also shorter than
# 10 characters, so the length check alone already blanks all of them.
junk_descs = {'.', '>', 'no', 'b', 'a', 'pb', 'Â '}
df['desc'] = df['desc'].apply(lambda x: '' if x.strip().lower() in junk_descs or len(x.strip()) < 10 else x)


In [None]:
# Step 5: Normalize title if not available
# A value counts as missing when it is NaN or the literal text 'nan' (what astype(str)
# turns a missing value into). Series.mask replaces those rows with the lowercased title
# without needing numpy: no cell visible in this notebook imports it, so np.nan raised
# NameError here.
normalized_title_missing = df['normalized_title'].isna() | df['normalized_title'].eq('nan')
df['normalized_title'] = df['normalized_title'].mask(normalized_title_missing, df['title'].str.lower())


In [None]:
# Reset index for safety
df = df.reset_index(drop=True)


In [None]:
df.shape

In [None]:
df['author'] = df['author'].apply(lambda x: x.title())

In [None]:
# index=False: otherwise the row index is written as an extra unnamed column, which the
# next read_csv of this file brings back as 'Unnamed: 0'.
df.to_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/ready_for_feature_engineering.csv", index=False)

In [None]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer
import ast
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
df = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/ready_for_feature_engineering.csv")

In [None]:
def create_combined_text(df):
    """Return one text Series per row: normalized title, genres and description.

    Missing values in any of the three columns are treated as empty strings and
    the parts are joined with single spaces.
    """
    title_part, genre_part, desc_part = (
        df[column].fillna('')
        for column in ('normalized_title', 'matched_genres', 'desc')
    )
    return title_part + ' ' + genre_part + ' ' + desc_part

df['combined_text'] = create_combined_text(df)


In [None]:

# Safely convert stringified lists to actual lists
# Non-string values (e.g. NaN) become []. ast.literal_eval only evaluates Python literals,
# but it still raises ValueError/SyntaxError when a string is not a valid literal.
df['genres_list'] = df['matched_genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# One-hot encode genres: one 0/1 column per distinct genre found in genres_list
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(df['genres_list'])

# Add back to DataFrame (optional)
# genre_df gets a default 0..n-1 index and concat(axis=1) aligns on index labels, so the
# rows only line up while df also carries a default 0..n-1 index.
genre_df = pd.DataFrame(genre_features, columns=mlb.classes_)
df_clean = pd.concat([df, genre_df], axis=1)


In [None]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_clean['combined_text'])

In [None]:
interaction_df = df_clean[['title', 'rating']].copy()

In [None]:
df.columns

In [None]:
# index=False: otherwise the row index is written as an extra unnamed column, which the
# next read_csv of this file brings back as 'Unnamed: 0'.
df.to_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/feature_engineering_done.csv", index=False)

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/feature_engineering_done.csv")

### Fix all-uppercase authors (e.g., “KAREN ROBARDS”)


In [None]:
# Authors that are missing after the CSV round trip (read_csv loads empty fields as NaN)
# are kept as '' here; astype(str) alone turned them into 'nan', which title-casing then
# made into the fake author name 'Nan'.
df['author'] = df['author'].fillna('').astype(str).apply(lambda x: x.title())

In [None]:
# Title-casing is repeated here; str.title() on an already title-cased name leaves it
# unchanged. This cell requires every author value to be a str.
df['author'] = df['author'].apply(lambda x: x.title())

In [None]:
df.shape

In [None]:
def get_books_by_genres(df, selected_genres, top_n=10):
    """Return up to ``top_n`` books that carry at least one of ``selected_genres``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'genres_list', 'title', 'author', 'matched_genres' and
        'rating'. 'genres_list' may hold real lists or their string form as read
        back from CSV.
    selected_genres : iterable of str
        Genres to look for; a book qualifies when it has any of them.
    top_n : int, default 10
        Maximum number of books returned.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'author', 'matched_genres', 'rating', highest rating
        first, one row per (title, author), index renumbered from 0.
    """
    # Local import: the import cell of this notebook section only brings in pandas.
    import ast

    wanted = set(selected_genres)

    def _as_genre_list(value):
        """Turn a genres_list cell into a list of genre names."""
        if isinstance(value, str):
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                # Not a Python literal: fall back to a comma-separated reading
                return [part.strip() for part in value.split(',') if part.strip()]
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return []

    # Exact genre membership. Testing ``genre in genres`` against the raw CSV string was a
    # substring search, so 'Romance' also matched "['Paranormal Romance']".
    has_wanted_genre = df['genres_list'].apply(
        lambda genres: not wanted.isdisjoint(_as_genre_list(genres))
    ).astype(bool)
    filtered_df = df[has_wanted_genre]

    # Optional: Sort by average rating or popularity if available
    if 'rating' in filtered_df.columns:
        top_books = filtered_df.sort_values(by='rating', ascending=False)
        top_books = top_books.drop_duplicates(subset=['title', 'author']).head(top_n)
    else:
        top_books = filtered_df.head(top_n)

    return top_books[['title', 'author', 'matched_genres', 'rating']].reset_index(drop=True)


In [None]:
# User-selected genres (simulate input)
user_likes = ['Fantasy', 'Thriller', 'Romance']

# Recommend books
recommended_books = get_books_by_genres(df, user_likes, top_n=10)

print(recommended_books)


In [None]:
# index=False: otherwise the row index is written as an extra unnamed column, which the
# next read_csv of this file brings back as 'Unnamed: 0'.
df.to_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/genre_based_filtering.csv", index=False)

In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors



In [None]:

books_df = pd.read_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/raw/Books.csv', sep=';')
ratings_df = pd.read_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/raw/Ratings.csv', sep=';')


def _halve_rating(raw_rating):
    """Halve a positive rating (rounded to 1 decimal); anything else becomes 0."""
    if raw_rating > 0:
        return round(raw_rating / 2, 1)
    return 0


# Rescale the ratings, then keep only the rows that still have a positive rating
ratings_df['Rating'] = ratings_df['Rating'].apply(_halve_rating)
ratings_df = ratings_df[ratings_df['Rating'] > 0]

# Look up each rated ISBN's title; ratings for ISBNs missing from books_df are dropped
isbn_to_title = books_df[['ISBN', 'Title']]
ratings_books = ratings_df.merge(isbn_to_title, on='ISBN', how='inner')

# Keep (user, item, rating) under the column names used by the rest of this section
surprise_column_names = {'User-ID': 'user_id', 'Title': 'title', 'Rating': 'rating'}
ratings_for_surprise = ratings_books.rename(columns=surprise_column_names)[['user_id', 'title', 'rating']]

# Titles are the item ids, so they are stripped and lowercased
ratings_for_surprise['title'] = ratings_for_surprise['title'].astype(str).str.strip().str.lower()


In [None]:
genre_based_df = pd.read_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/genre_based_filtering.csv')
genre_based_df['title'] = genre_based_df['title'].astype(str).str.strip().str.lower()
genre_based_df['genres_list'] = genre_based_df['genres_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])


In [None]:

# Ratings were halved upstream, so the scale runs from 0.5 to 5.0
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_for_surprise[['user_id', 'title', 'rating']], reader)

# Train on every available rating (no hold-out split)
trainset = data.build_full_trainset()

# SVD initialises its factors randomly; a fixed random_state makes the fitted model,
# and therefore the recommendations below, reproducible across kernel restarts.
model = SVD(random_state=42)
model.fit(trainset)


In [None]:
def get_svd_predictions(model, ratings_df, user_id, top_n=50):
    """Predict ratings for every title the user has not rated and return the best ones.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(user_id, title)``; each returned
        prediction must have an ``est`` attribute.
    ratings_df : pandas.DataFrame
        Needs the columns 'user_id' and 'title'.
    user_id : hashable
        User to recommend for.
    top_n : int, default 50
        Maximum number of predictions returned.

    Returns
    -------
    list
        Predictions sorted by ``est``, highest first, at most ``top_n`` long.
    """
    # A set gives constant-time membership tests; the list used before was rescanned
    # once for every distinct title in ratings_df.
    rated_books = set(ratings_df.loc[ratings_df['user_id'] == user_id, 'title'])
    unseen_books = [book for book in ratings_df['title'].unique() if book not in rated_books]
    predictions = [model.predict(user_id, book) for book in unseen_books]
    return sorted(predictions, key=lambda x: x.est, reverse=True)[:top_n]

def merge_with_genres(predictions, genre_df, selected_genres):
    """Attach author and genre information to each prediction found in ``genre_df``.

    Each prediction's ``iid`` is looked up in ``genre_df['title']``; the first matching
    row supplies the author, the genres and the overlap with ``selected_genres``.
    Predictions without a matching title are left out.

    Returns a DataFrame with the columns 'title', 'estimated_rating',
    'genre_overlap', 'author' and 'matched_genres' (empty if nothing matched).
    """
    wanted = set(selected_genres)
    records = []

    for prediction in predictions:
        item_title, estimate = prediction.iid, prediction.est

        hits = genre_df.loc[genre_df['title'] == item_title]
        if hits.empty:
            continue

        first_hit = hits.iloc[0]
        shared_genres = set(first_hit['genres_list']) & wanted
        records.append(dict(
            title=item_title,
            estimated_rating=estimate,
            genre_overlap=len(shared_genres),
            author=first_hit['author'],
            matched_genres=first_hit['matched_genres'],
        ))

    return pd.DataFrame(records)


In [None]:
def hybrid_recommendation(model, ratings_df, genre_df, user_id, selected_genres, alpha=0.7, beta=0.3, top_n=10,
                          candidate_pool=100):
    """Rank unseen books by a weighted mix of predicted rating and genre overlap.

    Parameters
    ----------
    model : object
        Fitted model passed through to ``get_svd_predictions``.
    ratings_df : pandas.DataFrame
        Ratings with the columns 'user_id' and 'title'.
    genre_df : pandas.DataFrame
        Book metadata passed through to ``merge_with_genres``.
    user_id : hashable
        User to recommend for.
    selected_genres : iterable of str
        Genres the user asked for.
    alpha, beta : float
        Weights of the normalized predicted rating and the normalized genre overlap.
    top_n : int, default 10
        Number of recommendations returned.
    candidate_pool : int, default 100
        How many of the best predictions are considered before genre information is
        attached. Raise it when too few candidates have genre data.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'author', 'matched_genres', 'estimated_rating',
        'genre_overlap', 'hybrid_score', best score first. Empty, with the same
        columns, when no candidate has genre information.
    """
    result_columns = ['title', 'author', 'matched_genres', 'estimated_rating', 'genre_overlap', 'hybrid_score']

    preds = get_svd_predictions(model, ratings_df, user_id, top_n=candidate_pool)
    enriched = merge_with_genres(preds, genre_df, selected_genres)

    if enriched.empty:
        return pd.DataFrame(columns=result_columns)

    # Min-max scale the predicted ratings; 1e-6 avoids dividing by zero when they are all equal
    lowest = enriched['estimated_rating'].min()
    highest = enriched['estimated_rating'].max()
    enriched['norm_rating'] = (enriched['estimated_rating'] - lowest) / (highest - lowest + 1e-6)

    # Scale the overlap by its maximum; 1e-6 keeps this defined when no candidate overlaps
    enriched['norm_genre'] = enriched['genre_overlap'] / (enriched['genre_overlap'].max() + 1e-6)

    enriched['hybrid_score'] = alpha * enriched['norm_rating'] + beta * enriched['norm_genre']

    return enriched.sort_values(by='hybrid_score', ascending=False).head(top_n)[result_columns]


In [None]:
user_id = 276726
selected_genres = ['Fantasy', 'Romance', 'Mystery']

hybrid_results = hybrid_recommendation(model, ratings_for_surprise, genre_based_df, user_id, selected_genres)
print(hybrid_results)


In [None]:
selected_genres = ['Fantasy', 'Romance', 'Mystery']
user_id = 276726  # Pick from your real users in ratings_for_surprise

recommendations = hybrid_recommendation(
    model, ratings_for_surprise, genre_based_df, user_id, selected_genres
)

print(recommendations)


In [None]:
genre_based_df.to_csv(
    "/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/genre_based_filtering_cleaned.csv",
    index=False
)


In [None]:
# Load the CSV efficiently
df = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/genre_based_filtering_cleaned.csv")

In [None]:
# Combine text fields: title, genres and description joined by single spaces,
# with missing values treated as empty strings
_title_text = df['normalized_title'].fillna('')
_genre_text = df['matched_genres'].fillna('')
_desc_text = df['desc'].fillna('')
df['combined_text'] = _title_text + ' ' + _genre_text + ' ' + _desc_text


In [None]:

# TF-IDF vectorization (sparse!)
vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])


In [None]:
# Use Nearest Neighbors to get top 10 similar items
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(tfidf_matrix)


In [None]:
# Example: Get top 10 similar books for the first book
# NOTE(review): row 0 is part of the matrix knn was fitted on, so it is expected to come
# back as one of its own 10 neighbours (distance 0) — confirm whether 10 *other* books
# were intended.
# NOTE(review): presumably df can hold several rows for the same book (one per rating
# upstream); if so the neighbours may be copies of the query book — verify, and
# de-duplicate before fitting if needed.
distances, indices = knn.kneighbors(tfidf_matrix[0], n_neighbors=10)


In [None]:
# Print similar book titles
similar_books = df.iloc[indices[0]]['normalized_title']
print(similar_books)

In [None]:
print(distances[0])


In [None]:
print(df.iloc[indices[0]]['combined_text'])