In [41]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import cross_validate
import pandas as pd

In [43]:
# Rename or extract just these columns
ratings_df = df[['title', 'rating']].copy()

# Simulate a single-session user (e.g., 'user_1') if no user_id is available
ratings_df['user_id'] = 'user_1'

# Reorder for Surprise: user, item, rating
ratings_df = ratings_df[['user_id', 'title', 'rating']]


In [44]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_df, reader)
trainset = data.build_full_trainset()


In [45]:
model = SVD()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x3063969c0>

In [46]:
# List of all unique books
all_books = df['title'].unique()

# Books already rated
rated_books = ratings_df['title'].unique()

# Predict for books not rated yet
unrated_books = [book for book in all_books if book not in rated_books]

# Predict ratings for unrated books
predictions = [model.predict('user_1', book) for book in unrated_books]

# Sort predictions by estimated rating
predicted_ratings = sorted(predictions, key=lambda x: x.est, reverse=True)

# Top N recommendations
top_n = 10
top_books = [pred.iid for pred in predicted_ratings[:top_n]]


In [47]:
recommended_df = df[df['title'].isin(top_books)][['title', 'author', 'matched_genres']]
recommended_df = recommended_df.drop_duplicates(subset=['title', 'author']).head(top_n)
print(recommended_df)


Empty DataFrame
Columns: [title, author, matched_genres]
Index: []


In [48]:
df.columns

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'isbn', 'rating', 'title',
       'author', 'normalized_title', 'matched_genres', 'desc', 'combined_text',
       'genres_list'],
      dtype='object')

In [49]:
# Load small samples
books_df = pd.read_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/raw/Books.csv', sep=';')
ratings_df = pd.read_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/raw/Ratings.csv', sep=';')
users_df = pd.read_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/raw/Users.csv', sep=';')

# Normalize rating: convert 0–10 scale to 0.5–5.0 (0 stays 0)
ratings_df['Rating'] = ratings_df['Rating'].apply(lambda x: round(x / 2, 1) if x > 0 else 0)

# Optional: drop 0 ratings (implies no opinion)
ratings_df = ratings_df[ratings_df['Rating'] > 0]

# Join ratings with book titles via ISBN
ratings_books = ratings_df.merge(books_df[['ISBN', 'Title']], on='ISBN', how='inner')

# Rename for Surprise format
ratings_for_surprise = ratings_books.rename(columns={
    'User-ID': 'user_id',
    'Title': 'title',
    'Rating': 'rating'
})[['user_id', 'title', 'rating']]

# Preview
print(ratings_for_surprise.head())


  users_df = pd.read_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/raw/Users.csv', sep=';')


   user_id                                              title  rating
0   276726                                   Rites of Passage     2.5
1   276729                                     Help!: Level 1     1.5
2   276729  The Amsterdam Connection : Level 4 (Cambridge ...     3.0
3   276744                                    A Painted House     3.5
4   276747                           Little Altars Everywhere     4.5


In [50]:
genre_based_df = pd.read_csv(
    "/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/genre_based_filtering.csv"
)

# Ensure title formatting matches collaborative filtering dataset
genre_based_df['title'] = genre_based_df['title'].astype(str).str.strip().str.lower()


In [51]:
ratings_for_surprise['title'] = ratings_for_surprise['title'].astype(str).str.strip().str.lower()

### 🔀 Hybrid =
α × SVD Estimated Rating

β × Genre Overlap Score



In [52]:
# Predict ratings for all books not yet rated by the user
def get_svd_predictions(model, ratings_df, user_id, top_n=50):
    rated_books = ratings_df[ratings_df['user_id'] == user_id]['title'].tolist()
    all_books = ratings_df['title'].unique()
    unseen_books = [book for book in all_books if book not in rated_books]

    predictions = [model.predict(user_id, book) for book in unseen_books]
    predictions_sorted = sorted(predictions, key=lambda x: x.est, reverse=True)
    
    return predictions_sorted[:top_n]


In [53]:
def merge_with_genres(predictions, genre_df, selected_genres):
    data = []
    selected_genres = set(selected_genres)
    
    for pred in predictions:
        title = pred.iid
        est_rating = pred.est

        match = genre_df[genre_df['title'].str.strip().str.lower() == title.strip().lower()]
        if not match.empty:
            genres = match.iloc[0]['genres_list'] if 'genres_list' in match.columns else []
            overlap = len(set(genres) & selected_genres)
            data.append({
                'title': title,
                'estimated_rating': est_rating,
                'genre_overlap': overlap,
                'author': match.iloc[0]['author'],
                'matched_genres': match.iloc[0]['matched_genres']
            })
    return pd.DataFrame(data)


In [54]:
def hybrid_recommendation(model, ratings_df, genre_df, user_id, selected_genres, alpha=0.7, beta=0.3, top_n=10):
    preds = get_svd_predictions(model, ratings_df, user_id, top_n=100)
    enriched = merge_with_genres(preds, genre_df, selected_genres)

    # Normalize scores (0 to 1)
    enriched['norm_rating'] = (enriched['estimated_rating'] - enriched['estimated_rating'].min()) / \
                              (enriched['estimated_rating'].max() - enriched['estimated_rating'].min())
    enriched['norm_genre'] = enriched['genre_overlap'] / enriched['genre_overlap'].max()

    # Blended score
    enriched['hybrid_score'] = alpha * enriched['norm_rating'] + beta * enriched['norm_genre']

    # Sort and return
    result = enriched.sort_values(by='hybrid_score', ascending=False).head(top_n)
    return result[['title', 'author', 'matched_genres', 'estimated_rating', 'genre_overlap', 'hybrid_score']]


In [55]:
selected_genres = ['Fantasy', 'Romance', 'Mystery']
user_id = 276726  # Pick from your real users in ratings_for_surprise

recommendations = hybrid_recommendation(
    model, ratings_for_surprise, genre_based_df, user_id, selected_genres
)

print(recommendations)


               title                                             author  \
0               9-11                                       Noam Chomsky   
1           stardust                                        Neil Gaiman   
2              1984.                                      George Orwell   
3                 it                                       Stephen King   
4            thirsty                                     M. T. Anderson   
5  waiting to exhale                                     Terry Mcmillan   
6      anna karenina  Leo Tolstoy,Constance Garnett,Aylmer Maude,Lou...   
7       false memory                                         Dan Krokos   
8  the boy next door                                    Annabelle Costa   
9        the contest                                    Nonny Hogrogian   

                                      matched_genres  estimated_rating  \
0  ['Politics', 'Nonfiction', 'History', 'Philoso...          4.153006   
1  ['Fantasy', 'Fiction', 