In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, GridSearchCV


# data loading

In [4]:
user_df = pd.read_csv(r"data_for_db\users.csv")
user_review_df = pd.read_csv(r"data_for_db\user_reviews.csv")
user_wishlist_df = pd.read_csv(r"data_for_db\user_wishlists.csv")
book_df = pd.read_csv(r"data_for_db\books.csv")
author_df = pd.read_csv(r"data_for_db\authors.csv")
publisher_df = pd.read_csv(r"data_for_db\publishers.csv")

  user_df = pd.read_csv(r"data_for_db\users.csv")


# data preparation

## add plus score if is_favourite is True

In [5]:
fav_plus = 3
def add_fav_score(row):
    if row["is_favourite"] == True:
        if pd.isna(row["book_rating"]):
            return 10
        else:
            return row["book_rating"] + fav_plus
    else:
        return row["book_rating"]
user_review_df["book_rating_plus"] = user_review_df.apply(add_fav_score, axis=1)

## drop NA

In [6]:
user_review_df.dropna(subset="book_rating_plus", inplace=True)

In [7]:
user_review_df.shape

(416207, 6)

## remove deactive users & book (For model)

In [8]:
threshold_user=3

user_count = user_review_df.value_counts("user_id")
active_user_mask = user_count[user_count > threshold_user].index

# filterd
user_review_df = user_review_df[user_review_df["user_id"].isin(active_user_mask)]

# Model

## normal

In [9]:
# 2. Load data into Surprise format
# The Reader needs to know your rating scale (e.g., 1 to 5)
df= user_review_df.copy()
df.dropna(subset="book_rating", inplace=True)
reader = Reader(rating_scale=(1, df["book_rating"].max()))
data = Dataset.load_from_df(df[['user_id', 'book_id', 'book_rating']], reader)

# 4. Initialize the SVD model
param_grid = {
    "n_factors": [50, 100, 150],
    "lr_all": [0.002, 0.005, 0.01],
    "reg_all": [0.02, 0.1, 0.4],
    "n_epochs": [20, 30]
}

# 2. Initialize GridSearchCV
# cv=3 means 3-fold cross-validation (it splits data 3 times to ensure stability)
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3, n_jobs=-1)

# 5. Train the model
gs.fit(data)

# 4. Results
print(f"Best RMSE score: {gs.best_score['rmse']:.4f}")
print(f"Best Parameters: {gs.best_params['rmse']}")

best_params = gs.best_params['rmse']
final_model = SVD(
    n_factors=best_params['n_factors'],
    lr_all=best_params['lr_all'],
    reg_all=best_params['reg_all'],
    n_epochs=best_params['n_epochs']
)

trainset = data.build_full_trainset()
final_model.fit(trainset)

Best RMSE score: 1.5987
Best Parameters: {'n_factors': 50, 'lr_all': 0.005, 'reg_all': 0.1, 'n_epochs': 30}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2426cc611d0>

In [10]:
df.sample(5)

Unnamed: 0,user_id,book_id,book_rating,comment,is_favourite,book_rating_plus
770163,210625,154474,8.0,This is a comment,False,8.0
203247,55187,99449,8.0,This is a comment,False,8.0
36806,11676,114398,6.0,This is a comment,False,6.0
291382,79441,37803,8.0,This is a comment,False,8.0
890446,240568,93455,6.0,This is a comment,False,6.0


In [11]:
test_user_id = 	31471

test_book_id = 190605
pred = final_model.predict(test_user_id, test_book_id)    

print(f"Predicted rating for User {test_user_id} on Item {test_book_id}: {pred.est:.2f}")

Predicted rating for User 31471 on Item 190605: 6.21


In [13]:
def get_top_n_recommendations(model, user_id, df, n=10):
    # 1. Get a list of all unique book IDs in the system
    all_books = df['book_id'].unique()
    
    # 2. Get the list of books this specific user has already rated
    rated_books = df[df['user_id'] == user_id]['book_id'].tolist()
    
    # 3. Identify books the user hasn't seen yet
    books_to_predict = [b for b in all_books if b not in rated_books]
    
    # 4. Predict ratings for all 'unseen' books
    predictions = [model.predict(user_id, book_id) for book_id in books_to_predict]
    
    # 5. Sort predictions by the estimated rating (est) in descending order
    predictions.sort(key=lambda x: x.est, reverse=True)
    
    # 6. Return the top n results
    return predictions[:n]

# Example usage:
top_10 = get_top_n_recommendations(final_model, user_id=31471, df=df, n=10)

for pred in top_10:
    print(f"Book ID: {pred.iid} | Predicted Rating: {pred.est:.2f}")

Book ID: 159168 | Predicted Rating: 9.77
Book ID: 189908 | Predicted Rating: 9.74
Book ID: 130260 | Predicted Rating: 9.67
Book ID: 84909 | Predicted Rating: 9.60
Book ID: 246315 | Predicted Rating: 9.57
Book ID: 74968 | Predicted Rating: 9.53
Book ID: 180024 | Predicted Rating: 9.51
Book ID: 84143 | Predicted Rating: 9.47
Book ID: 138571 | Predicted Rating: 9.46
Book ID: 16670 | Predicted Rating: 9.44


## plus

In [None]:
# 2. Load data into Surprise format
# The Reader needs to know your rating scale (e.g., 1 to 5)
df= user_review_df.copy()
reader = Reader(rating_scale=(1, df["book_rating_plus"].max()))
data = Dataset.load_from_df(df[['user_id', 'book_id', 'book_rating_plus']], reader)

# 4. Initialize the SVD model
param_grid = {
    "n_factors": [50, 100, 150],
    "lr_all": [0.002, 0.005, 0.01],
    "reg_all": [0.02, 0.1, 0.4],
    "n_epochs": [20, 30]
}

# 2. Initialize GridSearchCV
# cv=3 means 3-fold cross-validation (it splits data 3 times to ensure stability)
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3, n_jobs=-1)

# 5. Train the model
gs.fit(data)

# 4. Results
print(f"Best RMSE score: {gs.best_score['rmse']:.4f}")
print(f"Best Parameters: {gs.best_params['rmse']}")

