# Recommender Systems
This project focuses on building a recommender system that predicts user ratings for books based on their past interactions. By understanding user preferences, the system aims to suggest books that align with individual tastes, enhancing the reading experience through personalized recommendations.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every
# file found. If the directory does not exist, os.walk yields nothing and the
# loop prints nothing.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory with the file name to get a usable path
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Let's import the necessary libraries.

In [None]:
# Import necessary libraries

from sklearn.metrics.pairwise import pairwise_distances  # For calculating cosine similarity

import pandas as pd  # For data manipulation and CSV file handling
import numpy as np  # For numerical operations

Before we start, we need to load the two files needed for the project:
- `train.csv`: contains the user ratings for the books.
- `test.csv`: contains the user-book pairs for which we need to predict a rating.

In [None]:
# Load the training and test data (paths are relative to the working directory).
# The rest of the notebook reads the columns 'user_id', 'book_id' and 'rating'
# from train_df, and 'id', 'user_id' and 'book_id' from test_df.
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")

## User-Based Collaborative Filtering Method
This section is devoted to the implementation of a user-based collaborative filtering method to predict the rating a user might give to a book.

First, we build a user-item matrix. We map each unique user_id and book_id to a unique index, which lets us easily locate users and books within the matrix. We then initialize the matrix with zeros and populate it with ratings from the training data, so that each row represents a user and each column represents a book. This gives us a structured format suitable for collaborative filtering algorithms.

In [None]:
# Extract unique user and book IDs (pandas returns them in order of first appearance)
users = train_df['user_id'].unique()
books = train_df['book_id'].unique()

# Create mappings for users and books to indices
# (row index = position of the user in `users`, column index = position of the book in `books`)
user_to_index = {user: i for i, user in enumerate(users)}
book_to_index = {book: i for i, book in enumerate(books)}

# Initialize the user-item matrix with zeros; a zero entry means "no rating recorded"
user_item_matrix = np.zeros((len(users), len(books)))

# Populate the matrix with ratings from the training data in a single vectorized
# assignment instead of a row-by-row iterrows() loop, which builds a Series per row.
# If a (user, book) pair occurs more than once, keep only its last rating so the
# result matches a sequential fill; NumPy does not guarantee which value wins when
# the same cell is assigned several times in one indexed assignment.
ratings_df = train_df.drop_duplicates(subset=['user_id', 'book_id'], keep='last')
row_indices = ratings_df['user_id'].map(user_to_index).to_numpy()
col_indices = ratings_df['book_id'].map(book_to_index).to_numpy()
user_item_matrix[row_indices, col_indices] = ratings_df['rating'].to_numpy()

To prepare for user-based collaborative filtering predictions, we calculate a cosine similarity matrix. We calculate the similarity between users based on their ratings of books, using the training data as is. This similarity matrix helps identify the closest matches, which are then used to make predictions.

In [None]:
# Function to calculate cosine similarity matrix
def calculate_cosine_similarity(user_item_matrix):
    """Return the pairwise cosine similarity between the rows of a matrix.

    Args:
        user_item_matrix: 2-D array-like in which each row is the vector to
            compare (here: one row of ratings per user).

    Returns:
        A square matrix whose entry ``[i, j]`` is one minus the cosine
        distance between row ``i`` and row ``j`` of ``user_item_matrix``.
    """
    # scikit-learn exposes cosine *distance*; similarity is its complement
    cosine_distance = pairwise_distances(user_item_matrix, metric='cosine')
    return 1 - cosine_distance

# Calculate user similarity using cosine similarity.
# Rows of user_item_matrix are users, so entry [i, j] of the result compares
# the rating vectors of user i and user j.
user_similarity = calculate_cosine_similarity(user_item_matrix)

The next step is to create a function to calculate the predicted ratings for a user-book pair. The function identifies “neighbors” and calculates the predicted rating as a weighted average of their contributions. This is done by multiplying the similarity score by the known rating and normalizing it using the sum of the absolute similarity scores.

In [None]:
# Function to predict a rating for a given user-book pair using collaborative filtering.
def calculate_prediction(user_idx, target_idx, similarity_matrix, data_matrix, is_user_based=True):
    """Predict the rating of one user for one book from the ratings of other users.

    The prediction is the similarity-weighted average of the ratings that
    neighbors (users with a rating > 0 for the target book) gave to the book,
    normalized by the sum of the absolute similarity scores.

    Args:
        user_idx: Row index of the target user in ``similarity_matrix``.
        target_idx: Column index of the target book in ``data_matrix``.
        similarity_matrix: 2-D NumPy array of user-user similarity scores.
        data_matrix: 2-D NumPy array of ratings (rows: users, columns: books);
            entries that are not greater than zero are treated as "not rated".
        is_user_based: Currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        The weighted-average rating when the absolute similarities sum to a
        positive value; otherwise the plain mean of the known ratings for the
        book; otherwise (nobody rated the book) a random value drawn uniformly
        from [0.0, 5.0).
    """
    ratings = data_matrix[:, target_idx]  # Ratings for the target book
    rated_users = np.where(ratings > 0)[0]  # Users who rated the target book

    # Vectorized weighted average: pull all neighbor similarities at once
    # instead of accumulating them one neighbor at a time in a Python loop.
    similarities = similarity_matrix[user_idx, rated_users]
    numerator = np.dot(similarities, ratings[rated_users])  # Sum of weighted ratings
    denominator = np.abs(similarities).sum()  # Sum of absolute similarity scores

    # Compute the predicted rating
    if denominator > 0:
        return numerator / denominator
    if rated_users.size > 0:
        # Neighbors exist but carry no usable weight: default to the average of known ratings
        return ratings[rated_users].mean()
    # If no neighbors exist, return a random rating between 0 and 5
    return np.random.uniform(0.0, 5.0)

Next, in order to predict the desired ratings, we need to create a function that iterates over each user-book pair in the test dataset.

In [None]:
# Function to predict ratings for all user-book pairs in the test dataset.
def predict_ratings(test_df, similarity_matrix, data_matrix):
    """Predict a rating for every user-book pair listed in ``test_df``.

    Relies on the module-level ``user_to_index`` and ``book_to_index``
    dictionaries to translate raw IDs into matrix indices.

    Args:
        test_df: DataFrame with the columns ``'id'``, ``'user_id'`` and
            ``'book_id'``.
        similarity_matrix: Similarity matrix forwarded to
            ``calculate_prediction``.
        data_matrix: Rating matrix forwarded to ``calculate_prediction``.

    Returns:
        A dictionary mapping each test ``id`` to its predicted rating, or to
        ``None`` when the user or the book is missing from the index mappings.
    """
    # Initialize an empty dictionary to store predictions
    predictions = {}

    # Iterate over the three needed columns directly. Unlike iterrows(), this
    # does not build a Series per row and does not coerce the values of a row
    # to one common dtype, so the ids keep their original type.
    for query_id, user_id, book_id in zip(test_df['id'], test_df['user_id'], test_df['book_id']):
        user_idx = user_to_index.get(user_id)  # None if the user is unknown
        book_idx = book_to_index.get(book_id)  # None if the book is unknown

        # Skip the pair if the user or book is not in the training dataset
        if user_idx is None or book_idx is None:
            predictions[query_id] = None  # Assign a `None` value for out-of-scope pairs
            continue

        # Predict and store the rating for this pair
        predictions[query_id] = calculate_prediction(
            user_idx=user_idx,
            target_idx=book_idx,
            similarity_matrix=similarity_matrix,
            data_matrix=data_matrix,
        )

    return predictions

Finally, we save predicted ratings into a CSV file.

In [None]:
# Function to save predictions to a CSV file
def save_predictions_to_csv(predictions, output_file):
    """Write a predictions dictionary to ``output_file`` as a two-column CSV.

    Args:
        predictions: Dictionary mapping an id to its predicted rating.
        output_file: Destination passed straight to ``DataFrame.to_csv``.

    The file gets the header ``id,rating`` followed by one line per dictionary
    entry; the DataFrame index is not written.
    """
    # One (id, rating) record per dictionary entry, in dictionary order
    records = list(predictions.items())
    table = pd.DataFrame(records, columns=['id', 'rating'])
    table.to_csv(output_file, index=False)

# Generate and save predictions for User-Based Collaborative Filtering.
# The result maps each test id to a predicted rating (None for users or books
# missing from the training data) and is written to the working directory.
user_based_predictions = predict_ratings(test_df, user_similarity, user_item_matrix)
save_predictions_to_csv(user_based_predictions, "user_based_predictions.csv")