# Recommender Systems
This project focuses on building a recommender system that predicts user ratings for books based on their past interactions. By understanding user preferences, the system aims to suggest books that align with individual tastes, enhancing the reading experience through personalized recommendations.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Let's import the necessary libraries.

In [None]:
# Import necessary libraries

from sklearn.metrics.pairwise import cosine_similarity  # To compute cosine similarity between vectors
from sentence_transformers import SentenceTransformer  # To generate sentence embeddings for semantic similarity
from tqdm import tqdm  # For progress bars 

import pandas as pd  # For data manipulation and CSV file handling
import numpy as np  # For numerical operations 

import requests  # To make HTTP requests 
import time  # To introduce delays 
import glob  # To find and list files using patterns 
import csv  # For handling CSV file operations

Before we start, we need to load the three files needed for the project:
- `train.csv`: contains the user ratings for the books.
- `test.csv`: contains the user-book pairs for which we need to predict a rating.
- `books.csv`: contains information about the books.

In [None]:
# Load the training, test, and books data.
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")
# ISBNs are identifiers, not numbers: read the column as text so that pandas
# does not infer a numeric dtype, which would drop leading zeros (or append
# ".0" when values are missing) before the ISBN is inserted into API URLs.
books_df = pd.read_csv("Data/books.csv", dtype={"ISBN": str})

## Content-Based Recommendation Method
This section is devoted to the implementation of a content-based recommendation method to predict the rating a user might give to a book.

First of all, we need to collect metadata (title, authors, themes and description) about the books to establish similarities between them. To do this, we use two APIs: Open Library for themes, titles and authors, and Google Books for detailed descriptions, based on the unique ISBN of the books. This metadata is then concatenated into a content column of a new DataFrame, which will serve as a basis to allow us to compare books and build a content-based recommendation system.

In [None]:
# List of personal Google API keys (use multiple keys to avoid request limits).
# NOTE(review): the values below appear to be masked placeholders; real keys
# should not be stored in the notebook itself — consider reading them from
# environment variables instead.
API_KEYS = [
    "***************************************",
    "***************************************",
    "***************************************"
]
api_index = 0  # Position in API_KEYS of the key currently in use; start with the first one

# Function to fetch metadata via Open Library API
def fetch_metadata_openlibrary(isbn, timeout=10):
    """Fetch the title, authors and themes of a book from the Open Library API.

    Args:
        isbn: ISBN of the book to look up.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller indefinitely.

    Returns:
        A ``(title, authors, themes)`` tuple of strings; authors and themes
        are comma-separated. All three are ``''`` when the response holds no
        entry for the ISBN, or when the request or the reading of its answer
        fails (the error is printed, not raised).
    """
    def entry_name(entry):
        # Entries are either dicts carrying a 'name' key or plain values
        if isinstance(entry, dict):
            return str(entry.get('name') or '')
        return '' if entry is None else str(entry)

    url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        book_data = response.json().get(f"ISBN:{isbn}")
        if not book_data:
            return '', '', ''

        title = book_data.get('title') or ''
        # Skip nameless entries instead of discarding the whole record
        author_names = [entry_name(author) for author in book_data.get('authors') or []]
        subject_names = [entry_name(subject) for subject in book_data.get('subjects') or []]
        authors = ', '.join(name for name in author_names if name)
        themes = ', '.join(name for name in subject_names if name)
        return title, authors, themes
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError) as e:
        # Best effort: network failures, invalid JSON or an unexpected payload
        # shape must not abort a long-running collection loop.
        print(f"Error fetching metadata for ISBN {isbn}: {e}")
        return '', '', ''

# Function to fetch description via Google Books API
def fetch_description_googlebooks(isbn, retries=3, delay=5, timeout=10):
    """Fetch the description of a book from the Google Books API.

    The request uses the key at ``API_KEYS[api_index]``. On HTTP 429 the
    module-level ``api_index`` is advanced (wrapping around) to the next key
    and the request is retried after ``delay`` seconds.

    Args:
        isbn: ISBN of the book to look up.
        retries: Maximum number of attempts; a rate-limited attempt counts.
        delay: Seconds to sleep before the next attempt.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The description of the first returned volume, or ``''`` when there is
        no volume, no description, or every attempt failed (errors are
        printed, not raised).
    """
    global api_index
    for attempt in range(retries):
        api_key = API_KEYS[api_index]
        url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}&key={api_key}"
        is_last_attempt = attempt == retries - 1
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 429:
                # Identify the key by position only: printing the key itself
                # would leak the secret into the notebook output.
                print(f"Rate limit reached with key #{api_index + 1}. Switching API key...")
                api_index = (api_index + 1) % len(API_KEYS)
                if not is_last_attempt:
                    time.sleep(delay)
                continue
            response.raise_for_status()
            items = response.json().get('items') or []
            if not items:
                return ''
            volume_info = items[0].get('volumeInfo') or {}
            return volume_info.get('description', '')
        except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError) as e:
            # Error messages can embed the request URL, which contains the key
            safe_message = str(e).replace(api_key, '***')
            print(f"Error fetching description for ISBN {isbn} (attempt {attempt + 1}): {safe_message}")
            if not is_last_attempt:
                time.sleep(delay)
    return ''

# Common prefix of every intermediate batch file
base_output_path = "books_content"

# Books are fetched one at a time and flushed to a new CSV file every 500 books
batch_size = 500
metadata = []
batch_number = 1

total_books = len(books_df)
numbered_books = enumerate(books_df.itertuples(), start=1)

for position, book in tqdm(numbered_books, total=total_books, desc="Fetching metadata"):
    isbn = book.ISBN
    title, authors, themes = fetch_metadata_openlibrary(isbn)
    description = fetch_description_googlebooks(isbn)

    # Join the four text fields with single spaces (missing ones count as
    # empty), then trim the ends
    text_fields = (title, authors, themes, description)
    content = ' '.join(field or '' for field in text_fields).strip()
    metadata.append({'book_id': book.book_id, 'content': content})

    # Flush when the batch is full or when the last book has been handled
    batch_is_full = position % batch_size == 0
    if batch_is_full or position == total_books:
        output_path = f"{base_output_path}_{batch_number}.csv"
        with open(output_path, mode='w', encoding='utf-8', newline='') as batch_file:
            batch_writer = csv.DictWriter(batch_file, fieldnames=['book_id', 'content'])
            batch_writer.writeheader()
            batch_writer.writerows(metadata)
        print(f"Saved {output_path}")
        batch_number += 1
        metadata = []  # Start the next batch from an empty list

    # Wait between books to stay under the API limits
    time.sleep(2)

# Glob pattern matching the intermediate batch files
file_pattern = "books_content_*.csv"


def _batch_index(path):
    """Return the integer found after the last '_' of *path*, up to the next '.'."""
    after_last_underscore = path.split('_')[-1]
    return int(after_last_underscore.split('.')[0])


# Order the batch files numerically rather than alphabetically
file_list = glob.glob(file_pattern)
file_list.sort(key=_batch_index)

# Read every batch and stack them into a single DataFrame with a fresh index
batch_frames = [pd.read_csv(batch_path) for batch_path in file_list]
all_books_content_df = pd.concat(batch_frames, ignore_index=True)

# Write the combined table to its final location
output_path = "all_books_content.csv"
all_books_content_df.to_csv(output_path, index=False)

The rest will be done using the dataframe created previously, so we need to import it. The file all_books_content.csv has two columns, book_id and content, corresponding to the metadata taken from the APIs.

In [None]:
# Load the all_books_content.csv file (columns: book_id, content)
all_books_content_df = pd.read_csv("all_books_content.csv")

We construct a user-item matrix from the training data. Each row represents a user, and each column represents a book. The cell values correspond to user ratings for books. This matrix will be used to find books that a user has already rated.

In [None]:
# Map user_id and book_id to matrix indices (ids are sorted, so index order
# follows id order)
unique_users = sorted(train_df['user_id'].unique())
unique_books = sorted(train_df['book_id'].unique())

user_to_index = {user_id: idx for idx, user_id in enumerate(unique_users)}
book_to_index = {book_id: idx for idx, book_id in enumerate(unique_books)}

# Initialize the user-item matrix with zeros (0 = no rating)
n_users = len(unique_users)
n_books = len(unique_books)
data_train = np.zeros((n_users, n_books))  # Rows: users, Columns: books

# Populate the matrix with one vectorized assignment instead of a Python loop
# over rows. NumPy does not guarantee which value wins when the same cell is
# assigned more than once, so only the last rating of each (user, book) pair
# is kept first — the value a row-by-row fill would have left in the cell.
latest_ratings = train_df.drop_duplicates(subset=['user_id', 'book_id'], keep='last')
row_positions = latest_ratings['user_id'].map(user_to_index).to_numpy()
col_positions = latest_ratings['book_id'].map(book_to_index).to_numpy()
data_train[row_positions, col_positions] = latest_ratings['rating'].to_numpy()

We can use the `SentenceTransformer` model to convert the textual content of books into dense numerical vectors (embeddings). These embeddings capture the semantic meaning of the book content, which will be used to calculate similarity between books.

In [None]:
# Books without any collected text get an empty string instead of NaN
content_column = all_books_content_df['content']
all_books_content_df['content'] = content_column.fillna('')

# Sentence-embedding model used to turn each book text into a vector
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode every book text, in DataFrame row order
book_texts = all_books_content_df['content'].tolist()
embeddings = model.encode(book_texts, show_progress_bar=True)

Using the embeddings, we calculate a cosine similarity matrix, where rows and columns represent books and the value at (i, j) represents the similarity between book i and book j. We also handle missing content (NaN) by setting their similarity values to zero.

In [None]:
# Pairwise cosine similarity between all book embeddings
similarity_matrix = cosine_similarity(embeddings)

# Books whose content is the empty string (formerly NaN) get a zero row and a
# zero column, so they are similar to nothing
nan_indices = all_books_content_df['content'].eq('')
empty_content_mask = nan_indices.to_numpy()
similarity_matrix[empty_content_mask, :] = 0
similarity_matrix[:, empty_content_mask] = 0

Let's save the obtained similarity matrix.

In [None]:
# Wrap the similarity matrix in a DataFrame so pandas can write it out
similarity_matrix_df = pd.DataFrame(data=similarity_matrix)

# Persist it as CSV without the row index
similarity_csv_path = 'similarity_matrix.csv'
similarity_matrix_df.to_csv(similarity_csv_path, index=False)

The next step is to predict ratings for the test data. For each user-book pair in the test dataset, we identify the books already rated by the user and compute a weighted average of those ratings, using the similarity scores between the target book and the rated books as weights. If no similar books are found, we fall back to the user's average rating or, failing that, to a random rating between zero and five. 

In [None]:
# Dictionary for storing predictions
predictions = {}

# similarity_matrix is laid out in the row order of all_books_content_df,
# whereas book_to_index gives the column of a book in data_train. The two
# orders only coincide if both tables hold the same books in the same order,
# so translate each training column to its similarity-matrix position
# explicitly (-1 marks a book without a content row).
content_position = {}
for position, content_book_id in enumerate(all_books_content_df['book_id']):
    content_position.setdefault(content_book_id, position)  # keep the first occurrence

train_col_to_sim = np.full(len(book_to_index), -1, dtype=int)
for known_book_id, train_col in book_to_index.items():
    train_col_to_sim[train_col] = content_position.get(known_book_id, -1)

# Predict ratings for each test pair
for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Predicting ratings"):
    query_id = row['id']
    user_id = row['user_id']
    book_id = row['book_id']

    # Skip if user or book is not in the matrix
    if user_id not in user_to_index or book_id not in book_to_index:
        predictions[query_id] = None
        continue

    # Books rated by the user (a rating of 0 means "not rated") and their ratings
    user_ratings = data_train[user_to_index[user_id], :]
    rated_cols = np.flatnonzero(user_ratings > 0)
    rated_values = user_ratings[rated_cols]

    # Similarity-matrix positions of the target book and of the rated books
    target_pos = train_col_to_sim[book_to_index[book_id]]
    neighbor_pos = train_col_to_sim[rated_cols]
    has_content = neighbor_pos >= 0

    # Similarity-weighted average: signed weights on top, absolute weights below
    numerator, denominator = 0.0, 0.0
    if target_pos >= 0 and has_content.any():
        similarities = similarity_matrix[target_pos, neighbor_pos[has_content]]
        numerator = float(np.dot(similarities, rated_values[has_content]))
        denominator = float(np.abs(similarities).sum())

    # Handle cases with no usable neighbors
    if denominator > 0:
        predicted_rating = numerator / denominator
    elif rated_values.size > 0:
        predicted_rating = rated_values.mean()
    else:
        predicted_rating = np.random.uniform(0.0, 5.0)

    # Store the prediction
    predictions[query_id] = predicted_rating

Finally, the predicted ratings are saved in a CSV file.


In [None]:
# Turn the {id: rating} mapping into a two-column table
prediction_rows = list(predictions.items())
predictions_df = pd.DataFrame(prediction_rows, columns=['id', 'rating'])

# Write the table to disk without the index column
predictions_csv_path = "content_based_predictions.csv"
predictions_df.to_csv(predictions_csv_path, index=False)