In [None]:




import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load the datasets
ratings = pd.read_csv("/content/ratings.csv")            # book_id, user_id, rating
books = pd.read_csv("/content/books_cleaned.csv")        # book_id, title, etc.

# -----------------------------------------
# 🧹 2. Preprocessing and Filtering
# -----------------------------------------
# Ensure correct datatypes
ratings['book_id'] = ratings['book_id'].astype(int)
books['book_id'] = books['book_id'].astype(int)
# Fill missing titles BEFORE casting to str: astype(str) turns NaN into the
# literal text 'nan', after which fillna('') has nothing left to fill and the
# book would be searchable under the bogus title "nan".
books['title'] = books['title'].fillna('').astype(str).str.lower()

# Get book_ids that appear in the books metadata file
valid_book_ids = set(books['book_id'].unique())

# Filter ratings to only include those for books we have metadata for
filtered_ratings = ratings[ratings['book_id'].isin(valid_book_ids)]

# Now keep only books that actually have ratings.
# value_counts() only lists ids that occur in filtered_ratings, so the `> 0`
# mask is a safety net rather than a real filter.
rated_books = filtered_ratings['book_id'].value_counts()
books_with_ratings = rated_books[rated_books > 0].index
filtered_books = books[books['book_id'].isin(books_with_ratings)]
# Keep ratings and books in sync: every remaining rating refers to a book in
# filtered_books and vice versa.
filtered_ratings = filtered_ratings[filtered_ratings['book_id'].isin(books_with_ratings)]

In [None]:
# Display (rows, columns) of the ratings that survived the filtering step.
filtered_ratings.shape

(79701, 3)

In [None]:
# Display (rows, columns) of the book metadata that has at least one rating.
filtered_books.shape

(812, 6)

In [None]:
# Two-way lookup tables between lower-cased titles and book ids.
# When a key occurs more than once, the last row in filtered_books wins.
title_to_id = dict(zip(filtered_books['title'], filtered_books['book_id']))
id_to_title = dict(zip(filtered_books['book_id'], filtered_books['title']))

In [None]:
# The rating scale is taken from the observed min/max of the filtered ratings.
reader = Reader(
    rating_scale=(
        filtered_ratings['rating'].min(),
        filtered_ratings['rating'].max(),
    )
)
# Surprise expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(
    filtered_ratings.loc[:, ['user_id', 'book_id', 'rating']],
    reader,
)


In [None]:
# Hold out 20% of the ratings as a test set; random_state=42 is passed so the
# split can be repeated on a re-run.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [None]:
# Re-display the ratings shape (same check as the earlier shape cell).
filtered_ratings.shape

(79701, 3)

In [None]:
from surprise.model_selection import GridSearchCV, KFold

# Hyper-parameter grid for SVD. random_state is pinned to a single value so
# every candidate is initialised the same way and the search is reproducible.
param_grid = {
    'n_factors': [50, 100],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1],
    'random_state': [42],
}

# Tune on the training ratings only. Searching on the full `data` would let
# the held-out test ratings influence the chosen hyper-parameters and make the
# later test-set RMSE/MAE optimistically biased.
train_ratings = pd.DataFrame(
    trainset.build_testset(),  # (raw user id, raw item id, rating) tuples
    columns=['user_id', 'book_id', 'rating'],
)
train_data = Dataset.load_from_df(train_ratings, reader)

# GridSearchCV with seeded 3-fold cross-validation
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=KFold(n_splits=3, random_state=42, shuffle=True),
    n_jobs=-1,
    joblib_verbose=1,
)
gs.fit(train_data)

# Output best score and parameters
print("✅ Best RMSE:", gs.best_score['rmse'])
print("✅ Best Parameters:", gs.best_params['rmse'])

# Train final model with best parameters.
# best_estimator holds an unfitted SVD configured with the winning parameters,
# so it still has to be fitted on the training set.
best_model = gs.best_estimator['rmse']

best_model.fit(trainset)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   40.1s finished


✅ Best RMSE: 0.901051039178887
✅ Best Parameters: {'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7a5cbfe96cd0>

In [None]:
# Score the held-out test ratings with the fitted model.
predictions = best_model.test(testset)
# accuracy.rmse / accuracy.mae print the metric (see the cell output) and the
# returned values are kept in `rmse` / `mae`.
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.8917
MAE:  0.7063


In [None]:
def get_collaborative_recommendations_by_title(book_title, top_n=10):
    """Recommend books for a given title using the fitted SVD model.

    The title is resolved to a book id (exact match first, otherwise the first
    known title that contains the query). The first user found who rated that
    book is used as a stand-in reader, and every book that user has not rated
    is scored with ``best_model.predict``.

    Args:
        book_title: Full or partial book title; case and surrounding
            whitespace are ignored.
        top_n: Maximum number of recommendations to return. Values <= 0
            yield an empty list.

    Returns:
        A list of ``(Title Cased Title, predicted_rating)`` tuples sorted by
        predicted rating (highest first, rounded to 2 decimals), or a ``str``
        message when the title cannot be resolved or has no ratings.
    """
    book_title = book_title.strip().lower()

    # An empty query is a substring of every title, so without this guard it
    # would silently resolve to an arbitrary book.
    if not book_title:
        return f"❌ Book title '{book_title}' not found in dataset."

    if book_title not in title_to_id:
        # Fall back to the first title that contains the query.
        partial_match = next((t for t in title_to_id if book_title in t), None)
        if partial_match is None:
            return f"❌ Book title '{book_title}' not found in dataset."
        book_title = partial_match

    book_id = title_to_id[book_title]

    users_who_rated = filtered_ratings.loc[
        filtered_ratings['book_id'] == book_id, 'user_id'
    ].unique()
    if len(users_who_rated) == 0:
        return f"⚠️ No user ratings found for the book '{book_title.title()}'"

    # Use the first rater of the book as the proxy user for predictions.
    example_user = users_who_rated[0]

    # A set makes the "already rated?" check O(1) per candidate.
    already_rated = set(
        filtered_ratings.loc[filtered_ratings['user_id'] == example_user, 'book_id']
    )
    # Drop books without a known title up front so that they cannot consume
    # slots in the top_n cut below.
    candidates = [
        b_id
        for b_id in filtered_ratings['book_id'].unique()
        if b_id not in already_rated and b_id in id_to_title
    ]

    scored = [(b_id, best_model.predict(example_user, b_id).est) for b_id in candidates]
    top_books = sorted(scored, key=lambda pair: pair[1], reverse=True)[:max(top_n, 0)]

    return [(id_to_title[b_id].title(), round(score, 2)) for b_id, score in top_books]

In [None]:
# Example query: ask for five recommendations and pretty-print them.
book_name_input = "memoirs of a geisha"
recs = get_collaborative_recommendations_by_title(book_name_input, top_n=5)

print(f"\n📘 Recommendations for '{book_name_input.title()}':")
if not isinstance(recs, list):
    # Anything that is not a list is a message string from the recommender.
    print(recs)
else:
    for i, (title, rating) in enumerate(recs, start=1):
        print(f"{i}. {title} (Predicted Rating: {rating})")


📘 Recommendations for 'Memoirs Of A Geisha':
1. Still Life With Woodpecker (Predicted Rating: 4.76)
2. The Beautiful And Damned (Predicted Rating: 4.65)
3. Girl With A Pearl Earring (Predicted Rating: 4.64)
4. Peter And The Shadow Thieves (Predicted Rating: 4.62)
5. The Rules Of Attraction (Predicted Rating: 4.6)


In [None]:
# Preview the first 30 rows of the filtered book metadata.
filtered_books.head(30)

Unnamed: 0,book_id,title,authors,goodreads_book_id,tags,content
1,3,harry potter and the philosophers stone,jk rowling mary grandpre,3,children-s-books kindle owned-books favorite-s...,harry potter and the philosophers stone jk row...
3,2657,to kill a mockingbird,harper lee,2657,kindle owned-books harper-lee race general-fic...,to kill a mockingbird harper lee kindle owned-...
4,4671,the great gatsby,f scott fitzgerald,4671,owned-books america romance general-fiction no...,the great gatsby f scott fitzgerald owned-book...
6,5907,the hobbit or there and back again,jrr tolkien,5907,children-s-books on-hold kindle fiction-fantas...,the hobbit or there and back again jrr tolkien...
7,5107,the catcher in the rye,jd salinger,5107,owned-books america j-d-salinger general-ficti...,the catcher in the rye jd salinger owned-books...
8,960,angels demons,dan brown,960,kindle owned-books thrillers crime-thriller cr...,angels demons dan brown kindle owned-books thr...
9,1885,pride and prejudice,jane austen,1885,on-hold kindle owned-books chick-lit austen br...,pride and prejudice jane austen on-hold kindle...
12,5470,nineteen eightyfour,george orwell erich fromm celal uster,5470,satire on-hold kindle owned-books philosophy g...,nineteen eightyfour george orwell erich fromm ...
13,7613,animal farm a fairy story,george orwell,7613,satire kindle owned-books philosophy general-f...,animal farm a fairy story george orwell satire...
17,5,harry potter and the prisoner of azkaban,jk rowling mary grandpre rufus beck,5,children-s-books kindle owned-books favorite-s...,harry potter and the prisoner of azkaban jk ro...


In [None]:
# Sanity-check the top recommendation: how many ratings does it have and what
# is its observed mean rating?
popular = filtered_ratings.loc[
    filtered_ratings['book_id'].eq(title_to_id['still life with woodpecker'])
]
print("Number of ratings:", len(popular))
print("Average rating:", popular['rating'].mean())


Number of ratings: 99
Average rating: 4.777777777777778


In [None]:
import pickle

# Serialise the fitted SVD model to disk so it can be reused without retraining.
with open('svd_model.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)


In [None]:
# Write the filtered book metadata to CSV; index=False leaves the DataFrame
# index out of the file.
filtered_books.to_csv("filtered_books.csv", index=False)


In [None]:
# Write the filtered ratings to CSV; index=False leaves the DataFrame index
# out of the file.
filtered_ratings.to_csv("filtered_ratings.csv", index=False)


In [None]:
# Send the pickled model file to the browser as a download.
# NOTE(review): google.colab is presumably only available inside the Colab
# runtime — confirm before running this cell elsewhere.
from google.colab import files
files.download('svd_model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>