In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors



In [2]:

books_df = pd.read_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/raw/Books.csv', sep=';')
ratings_df = pd.read_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/raw/Ratings.csv', sep=';')

ratings_df['Rating'] = ratings_df['Rating'].apply(lambda x: round(x / 2, 1) if x > 0 else 0)
ratings_df = ratings_df[ratings_df['Rating'] > 0]

ratings_books = ratings_df.merge(books_df[['ISBN', 'Title']], on='ISBN', how='inner')

ratings_for_surprise = ratings_books.rename(columns={
    'User-ID': 'user_id', 'Title': 'title', 'Rating': 'rating'
})[['user_id', 'title', 'rating']]

ratings_for_surprise['title'] = ratings_for_surprise['title'].astype(str).str.strip().str.lower()


In [3]:
genre_based_df = pd.read_csv('/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/genre_based_filtering.csv')
genre_based_df['title'] = genre_based_df['title'].astype(str).str.strip().str.lower()
genre_based_df['genres_list'] = genre_based_df['genres_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])


In [4]:

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_for_surprise[['user_id', 'title', 'rating']], reader)
trainset = data.build_full_trainset()

model = SVD()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x3138dd1f0>

In [5]:
def get_svd_predictions(model, ratings_df, user_id, top_n=50):
    rated_books = ratings_df[ratings_df['user_id'] == user_id]['title'].tolist()
    all_books = ratings_df['title'].unique()
    unseen_books = [book for book in all_books if book not in rated_books]
    predictions = [model.predict(user_id, book) for book in unseen_books]
    return sorted(predictions, key=lambda x: x.est, reverse=True)[:top_n]

def merge_with_genres(predictions, genre_df, selected_genres):
    selected_genres = set(selected_genres)
    rows = []
    for pred in predictions:
        title = pred.iid
        est = pred.est
        match = genre_df[genre_df['title'] == title]
        if not match.empty:
            genres = match.iloc[0]['genres_list']
            overlap = len(set(genres) & selected_genres)
            rows.append({
                'title': title,
                'estimated_rating': est,
                'genre_overlap': overlap,
                'author': match.iloc[0]['author'],
                'matched_genres': match.iloc[0]['matched_genres']
            })
    return pd.DataFrame(rows)


In [6]:
def hybrid_recommendation(model, ratings_df, genre_df, user_id, selected_genres, alpha=0.7, beta=0.3, top_n=10):
    preds = get_svd_predictions(model, ratings_df, user_id, top_n=100)
    enriched = merge_with_genres(preds, genre_df, selected_genres)
    
    if enriched.empty:
        return pd.DataFrame()

    enriched['norm_rating'] = (enriched['estimated_rating'] - enriched['estimated_rating'].min()) / (
        enriched['estimated_rating'].max() - enriched['estimated_rating'].min() + 1e-6
    )
    enriched['norm_genre'] = enriched['genre_overlap'] / (enriched['genre_overlap'].max() + 1e-6)
    
    enriched['hybrid_score'] = alpha * enriched['norm_rating'] + beta * enriched['norm_genre']
    
    return enriched.sort_values(by='hybrid_score', ascending=False).head(top_n)[
        ['title', 'author', 'matched_genres', 'estimated_rating', 'genre_overlap', 'hybrid_score']
    ]


In [7]:
user_id = 276726
selected_genres = ['Fantasy', 'Romance', 'Mystery']

hybrid_results = hybrid_recommendation(model, ratings_for_surprise, genre_based_df, user_id, selected_genres)
print(hybrid_results)


                       title                                       author  \
1             redeeming love                              Francine Rivers   
0   a tree grows in brooklyn                                 Betty  Smith   
2                  mrs. mike             Benedict Freedman,Nancy Freedman   
10            the green mile                                 Stephen King   
3                fingersmith                                 Sarah Waters   
6       the velveteen rabbit  Margery Williams Bianco,William   Nicholson   
15              dragonflight                               Anne Mccaffrey   
4                  hiroshima                                  John Hersey   
5        west with the night                                Beryl Markham   
11                   phantom                               Dean R. Koontz   

                                       matched_genres  estimated_rating  \
1   ['Christian Fiction', 'Christian', 'Fiction', ...          4.521801   
0 

In [8]:
selected_genres = ['Fantasy', 'Romance', 'Mystery']
user_id = 276726  # Pick from your real users in ratings_for_surprise

recommendations = hybrid_recommendation(
    model, ratings_for_surprise, genre_based_df, user_id, selected_genres
)

print(recommendations)


                       title                                       author  \
1             redeeming love                              Francine Rivers   
0   a tree grows in brooklyn                                 Betty  Smith   
2                  mrs. mike             Benedict Freedman,Nancy Freedman   
10            the green mile                                 Stephen King   
3                fingersmith                                 Sarah Waters   
6       the velveteen rabbit  Margery Williams Bianco,William   Nicholson   
15              dragonflight                               Anne Mccaffrey   
4                  hiroshima                                  John Hersey   
5        west with the night                                Beryl Markham   
11                   phantom                               Dean R. Koontz   

                                       matched_genres  estimated_rating  \
1   ['Christian Fiction', 'Christian', 'Fiction', ...          4.521801   
0 

In [9]:
genre_based_df.to_csv(
    "/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/genre_based_filtering_cleaned.csv",
    index=False
)


In [10]:
# Load the CSV efficiently
df = pd.read_csv("/Users/whoseunassailable/Documents/coding_projects/college_projects/readiculous/data/processed/genre_based_filtering_cleaned.csv")

In [11]:
# Combine text fields
df['combined_text'] = (
    df['normalized_title'].fillna('') + ' ' +
    df['matched_genres'].fillna('') + ' ' +
    df['desc'].fillna('')
)


In [12]:

# TF-IDF vectorization (sparse!)
vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])


In [13]:
# Use Nearest Neighbors to get top 10 similar items
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(tfidf_matrix)


In [14]:
# Example: Get top 10 similar books for the first book
distances, indices = knn.kneighbors(tfidf_matrix[0], n_neighbors=10)


In [15]:
# Print similar book titles
similar_books = df.iloc[indices[0]]['normalized_title']
print(similar_books)

22837    alaska
25590    alaska
6709     alaska
14520    alaska
3810     alaska
12203    alaska
11046    alaska
4880     alaska
10161    alaska
16669    alaska
Name: normalized_title, dtype: object


In [16]:
print(distances[0])


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [17]:
print(df.iloc[indices[0]]['combined_text'])


22837    alaska ['Young Adult', 'Fiction', 'Contemporar...
25590    alaska ['Young Adult', 'Fiction', 'Contemporar...
6709     alaska ['Young Adult', 'Fiction', 'Contemporar...
14520    alaska ['Young Adult', 'Fiction', 'Contemporar...
3810     alaska ['Young Adult', 'Fiction', 'Contemporar...
12203    alaska ['Young Adult', 'Fiction', 'Contemporar...
11046    alaska ['Young Adult', 'Fiction', 'Contemporar...
4880     alaska ['Young Adult', 'Fiction', 'Contemporar...
10161    alaska ['Young Adult', 'Fiction', 'Contemporar...
16669    alaska ['Young Adult', 'Fiction', 'Contemporar...
Name: combined_text, dtype: object
