# Recommender Systems
This project focuses on building a recommender system that predicts user ratings for books based on their past interactions. By understanding user preferences, the system aims to suggest books that align with individual tastes, enhancing the reading experience through personalized recommendations.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dis-project-2-recommender-systems-f2024/sample_submission.csv
/kaggle/input/dis-project-2-recommender-systems-f2024/books.csv
/kaggle/input/dis-project-2-recommender-systems-f2024/train.csv
/kaggle/input/dis-project-2-recommender-systems-f2024/test.csv


Let's import the necessary libraries.

In [None]:
# Import necessary libraries

from sklearn.model_selection import train_test_split  # For splitting data into training and validation sets
from tqdm import tqdm  # For displaying progress bars during iterations

Before we start, we need to load the two files needed for the project:
- `train.csv`: contains the user ratings for the books.
- `test.csv`: contains the user-book pairs for which we need to predict a rating.

In [3]:
# Load the training ratings and the test user-book pairs (in that order).
train_df, test_df = map(
    pd.read_csv,
    (
        '/kaggle/input/dis-project-2-recommender-systems-f2024/train.csv',
        '/kaggle/input/dis-project-2-recommender-systems-f2024/test.csv',
    ),
)

This notebook implements matrix factorization with bias terms (an SVD-like approach) to predict the rating a user might give to a book.

In the first step, we define a function to compute the Root Mean Squared Error (RMSE), a key metric for evaluating the accuracy of our model's predictions. RMSE is the square root of the mean squared difference between the actual ratings $r_i$ and the predicted ratings $\hat{r}_i$: $\mathrm{RMSE} = \sqrt{\frac{1}{N}\sum_{i=1}^{N}(r_i - \hat{r}_i)^2}$. We will use this function to evaluate the model's performance during hyperparameter tuning and on the final predictions.

In [4]:
# Function for the computation of RMSE
def compute_rmse(actual, predicted):
    """Return the Root Mean Squared Error between two sets of values.

    Both inputs are converted to float NumPy arrays first, so plain Python
    lists and tuples are accepted, and pandas Series are compared by position
    rather than being aligned on their index labels.

    Args:
        actual: Array-like of ground-truth values.
        predicted: Array-like of predicted values, broadcastable against
            ``actual``.

    Returns:
        The square root of the mean of the squared element-wise differences.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    return np.sqrt(np.mean((actual - predicted) ** 2))

The following function trains the model using the given hyperparameters and predicts ratings for the test dataset. It iteratively updates the model parameters (latent factors and biases) and saves the predicted ratings to a CSV file.

In [None]:
# Function for training the model using the provided parameters and predicting ratings on the test set
def train_and_predict(train_df, test_df, latent_dim, learning_rate, reg_param, n_epochs=20,
                      *, output_path="submission.csv", random_state=None, verbose=True):
    """Train a biased matrix-factorization model with SGD and predict test ratings.

    The predicted rating for a (user, book) pair is
    ``mu + b_u[user] + b_i[book] + dot(P[user], Q[book])`` where ``mu`` is the
    mean training rating, ``b_u`` / ``b_i`` are per-user / per-book biases and
    ``P`` / ``Q`` are the latent factor matrices. One SGD step is taken per
    training row per epoch, visiting rows in DataFrame order (no shuffling).

    Args:
        train_df: DataFrame with ``user_id``, ``book_id`` and ``rating`` columns.
        test_df: DataFrame with ``id``, ``user_id`` and ``book_id`` columns.
        latent_dim: Number of latent factors per user and per book.
        learning_rate: SGD step size.
        reg_param: L2 regularization strength applied to biases and factors.
        n_epochs: Number of passes over the training data.
        output_path: Where to write the predictions as CSV. ``None`` skips
            writing the file.
        random_state: Seed for the latent-factor initialization. ``None`` draws
            from NumPy's global random state.
        verbose: When true, show tqdm progress bars and print status messages.

    Returns:
        DataFrame with ``id`` and ``rating`` columns, one row per test row.
        Pairs whose user or book was not seen in training get ``mu``.
    """
    if verbose:
        print(f"Training model with params: Latent Dim: {latent_dim}, Learning Rate: {learning_rate}, Regularization: {reg_param}, Epochs: {n_epochs}")

    # Map user and book IDs to numerical indices
    user_to_index = {user_id: idx for idx, user_id in enumerate(train_df['user_id'].unique())}
    book_to_index = {book_id: idx for idx, book_id in enumerate(train_df['book_id'].unique())}

    # Number of unique users and books
    n_users = len(user_to_index)
    n_items = len(book_to_index)

    # Initialize latent factor matrices for users (P) and books (Q) and biases.
    # A dedicated RandomState is used only when a seed is given, so unseeded
    # calls keep drawing from the global NumPy state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    P = rng.normal(scale=0.01, size=(n_users, latent_dim))  # User latent factors
    Q = rng.normal(scale=0.01, size=(n_items, latent_dim))  # Book latent factors
    mu = train_df['rating'].mean()  # Global mean rating
    b_u = np.zeros(n_users)  # User biases
    b_i = np.zeros(n_items)  # Book biases

    # Resolve IDs to indices once, outside the epoch loop. Iterating columns
    # instead of iterrows() avoids building a Series per row.
    samples = []
    for user_id, book_id, rating in zip(train_df['user_id'], train_df['book_id'], train_df['rating']):
        user_idx = user_to_index.get(user_id)
        book_idx = book_to_index.get(book_id)
        # Skip rows whose ID cannot be resolved (e.g. a missing value)
        if user_idx is None or book_idx is None:
            continue
        samples.append((user_idx, book_idx, rating))

    # Training loop: iterate over epochs to optimize the model
    for epoch in range(n_epochs):
        total_loss = 0.0  # Track the cumulative squared error for the epoch
        epoch_samples = samples
        if verbose:
            epoch_samples = tqdm(samples, total=len(samples), desc=f"Epoch {epoch+1}/{n_epochs}")

        for user_idx, book_idx, rating in epoch_samples:
            # Predict the rating and compute the error
            pred_rating = mu + b_u[user_idx] + b_i[book_idx] + np.dot(P[user_idx], Q[book_idx])
            error = rating - pred_rating

            # Update biases using gradient descent
            b_u[user_idx] += learning_rate * (error - reg_param * b_u[user_idx])
            b_i[book_idx] += learning_rate * (error - reg_param * b_i[book_idx])

            # Update both factor vectors from their pre-update values: P[user_idx]
            # is a view, so it must be copied before the in-place update or the
            # book update would see the already-modified user factors.
            p_old = P[user_idx].copy()
            P[user_idx] += learning_rate * (error * Q[book_idx] - reg_param * p_old)
            Q[book_idx] += learning_rate * (error * p_old - reg_param * Q[book_idx])

            # Accumulate the squared error for loss tracking
            total_loss += error ** 2

        # Print the cumulative loss after each epoch
        if verbose:
            print(f"Epoch {epoch+1}/{n_epochs}, Loss: {total_loss:.4f}")

    # Generate predictions for the test set
    test_rows = zip(test_df['id'], test_df['user_id'], test_df['book_id'])
    if verbose:
        test_rows = tqdm(test_rows, total=len(test_df), desc="Predicting")

    predictions = []
    for row_id, user_id, book_id in test_rows:
        user_idx = user_to_index.get(user_id)
        book_idx = book_to_index.get(book_id)

        # Predict the rating for user-book pair or fallback to the global mean
        if user_idx is not None and book_idx is not None:
            pred_rating = mu + b_u[user_idx] + b_i[book_idx] + np.dot(P[user_idx], Q[book_idx])
        else:
            pred_rating = mu  # User or book is not in the training data

        predictions.append({'id': row_id, 'rating': pred_rating})

    # Explicit columns keep the header present even when the test set is empty
    predictions_df = pd.DataFrame(predictions, columns=['id', 'rating'])

    # Save predictions to CSV
    if output_path is not None:
        predictions_df.to_csv(output_path, index=False)
        if verbose:
            print(f"Predictions saved to '{output_path}'")

    return predictions_df

Here, we directly use the optimal hyperparameters, which were found in a separate notebook through hyperparameter tuning with grid search:
- Latent dimensions: 75
- Learning rate: 0.01
- Regularization: 0.1

We train for 20 epochs, providing enough iterations to ensure convergence while minimizing the risk of overfitting from excessive training.

The model is trained on the training dataset, and predictions for the test dataset are saved in `submission.csv`.

In [None]:
# Hyperparameters selected by the earlier grid search
latent_dim = 75
learning_rate = 0.01
reg_param = 0.1
# Number of passes over the training data
n_epochs = 20

# Fit the model on the training set and write the test-set predictions
train_and_predict(
    train_df,
    test_df,
    latent_dim=latent_dim,
    learning_rate=learning_rate,
    reg_param=reg_param,
    n_epochs=n_epochs,
)