# Matrix factorization

#### Imports

In [1]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import surprise
from IPython.display import display
from surprise import Reader, Dataset, SVD, NMF
from surprise.model_selection import train_test_split, cross_validate

from data.data_helper_functions import *

In [2]:
%load_ext autoreload
%autoreload 2

#### Load data

In [61]:
# Directory holding the raw data files, relative to this notebook.
data_path = '../data/'

# load_data comes from the project's data helper module (wildcard import above).
books_df, users_df, ratings_df = load_data(data_path)

##### Prepare the data for the surprise library

In [62]:
# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Surprise is handed exactly three columns, in this order: user, item, rating.
data = Dataset.load_from_df(
    ratings_df[['user_id', 'book_id', 'rating']],
    reader,
)

##### Split the data into train and test sets

In [63]:
# Hold out 20% of the ratings as the test set; random_state=0 fixes the seed passed to the splitter.
# NOTE(review): the function is referenced by its fully qualified name; the bare `train_test_split`
# imported earlier could be rebound by the later wildcard import — confirm before shortening.
train, test = surprise.model_selection.train_test_split(data, test_size=0.2,random_state=0)

Get the train and test sets as pandas DataFrames

In [64]:
# The test split is passed straight to pandas as (user, book, rating) rows.
test_df = pd.DataFrame(test, columns=["user_id", "book_id", "rating"])

# The trainset yields inner ids from all_ratings(); map them back to the raw
# user/book ids so train_df uses the same identifiers as test_df.
trainset_to_tuples = [
    (train.to_raw_uid(uid), train.to_raw_iid(iid), rating)
    for (uid, iid, rating) in train.all_ratings()
]
train_df = pd.DataFrame(
    trainset_to_tuples,
    columns=["user_id", "book_id", "rating"],
)

##### Train the matrix factorization model using the Singular Value Decomposition (SVD) algorithm

In [14]:
# SVD matrix factorization: 20 latent factors, 40 epochs, one regularisation
# strength (0.1) applied to all parameters.
# random_state is pinned so the metrics reported below can be reproduced on a
# fresh "Restart & Run All"; without it every run trains from a different
# initialisation.
svd = surprise.SVD(n_factors=20, n_epochs=40, reg_all=0.1, random_state=0)
svd.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2bb109f6e90>

##### Evaluate the model performance

In [15]:
# Score the fitted model on the held-out ratings.
# NOTE(review): the previous "83, 65" note presumably recorded RMSE 0.83 / MAE 0.65 — confirm.
predictions = svd.test(test)

# verbose=False: the accuracy helpers only return the value; formatting is done here.
rmse = surprise.accuracy.rmse(predictions, verbose=False)
print(f"RMSE: {rmse:.2f}")

mae = surprise.accuracy.mae(predictions, verbose=False)
print(f"MAE: {mae:.2f}")

RMSE: 0.83
MAE: 0.65


##### Make recommendations for a specific user:

In [8]:
def get_top_n_recommendations(user_id, model, n=10):
    """Return the n unrated books with the highest predicted rating for a user.

    Candidate books are every book_id in the notebook-level ``ratings_df``
    minus the ones this user already has a rating for in that same frame.

    Args:
        user_id: Raw user id, as stored in ``ratings_df['user_id']``.
        model: Object exposing ``predict(user_id, book_id)`` whose result has
            ``iid`` and ``est`` attributes.
        n: Maximum number of recommendations to return.

    Returns:
        List of ``(book_id, estimated_rating)`` tuples, best estimate first.
    """
    catalogue = set(ratings_df['book_id'].unique())
    already_rated = set(ratings_df.loc[ratings_df['user_id'] == user_id, 'book_id'])
    candidates = list(catalogue - already_rated)

    scored = [model.predict(user_id, candidate) for candidate in candidates]
    # Stable in-place sort: ties keep the order in which candidates were scored.
    scored.sort(key=lambda pred: pred.est, reverse=True)

    return [(pred.iid, pred.est) for pred in scored[:n]]

In [9]:
# Example user to inspect.
query_user_id = 10

top_n_recommendations_with_ratings = get_top_n_recommendations(
    user_id=query_user_id,
    model=svd,
    n=20,
)

# Split the (book_id, estimate) pairs into two parallel tuples.
top_n_recommendations, predicted_ratings = zip(*top_n_recommendations_with_ratings)

In [None]:
# Books the query user has already rated, joined with the book metadata
# (books_df is matched on its index) and listed highest rating first.
user_book_ratings = (
    ratings_df[ratings_df['user_id'] == query_user_id]
    .merge(books_df, left_on='book_id', right_index=True, how='inner')
    [['book_id', 'title', 'authors', 'average_rating', 'rating']]
    .sort_values('rating', ascending=False)
)
print("Top rated books by user:")
display(user_book_ratings)

# Metadata rows of the recommended books, in recommendation order, with the
# model's estimated rating attached as an extra column.
recommended_books = books_df.loc[list(top_n_recommendations)]
recommended_books['predicted_rating'] = predicted_ratings
print("\nRecommended books:")
display(recommended_books)

### Evaluate recommendations

Get top n recommendations for each user

In [65]:
from tqdm import tqdm

def get_top_recommendation_for_user(user_id, all_books, model, n=500):
    """Rank the books a user has not rated in the training split.

    Only ratings found in the notebook-level ``train_df`` are excluded, so
    books the user rated in the test split stay in the candidate pool.

    Args:
        user_id: Raw user id, as stored in ``train_df['user_id']``.
        all_books: Set of every candidate book id.
        model: Object exposing ``predict(user_id, book_id)`` whose result has
            ``iid`` and ``est`` attributes.
        n: Maximum number of recommendations to return.

    Returns:
        List of ``(book_id, estimated_rating)`` tuples, best estimate first.
    """
    rated_in_train = set(train_df.loc[train_df['user_id'] == user_id, 'book_id'])
    # Exclude training ratings only; test-set books must remain recommendable.
    candidates = list(all_books - rated_in_train)

    scored = [model.predict(user_id, candidate) for candidate in candidates]
    # Stable in-place sort: ties keep the order in which candidates were scored.
    scored.sort(key=lambda pred: pred.est, reverse=True)
    return [(pred.iid, pred.est) for pred in scored[:n]]

def get_recommendations_for_all_users(user_ids, model, n=500):
    """Build the top-n recommended book ids for every user in ``user_ids``.

    The candidate pool is every book_id present in the notebook-level
    ``ratings_df``; the per-user ranking is delegated to
    ``get_top_recommendation_for_user``. Progress is reported through tqdm.

    Args:
        user_ids: Iterable of raw user ids.
        model: Prediction model forwarded to the per-user ranking function.
        n: Maximum number of recommendations kept per user.

    Returns:
        Dict mapping each user id to its list of recommended book ids, best
        first (the estimated ratings are dropped).
    """
    catalogue = set(ratings_df['book_id'].unique())
    return {
        uid: [
            book_id
            for book_id, _estimate in get_top_recommendation_for_user(uid, catalogue, model, n=n)
        ]
        for uid in tqdm(user_ids, desc="Getting recommendations for users")
    }

# Every distinct user in the ratings data gets a ranked list of up to 500 books.
# The recorded run of this cell took about 57 minutes (see the progress bar output).
user_ids = ratings_df['user_id'].unique()
recommendations = get_recommendations_for_all_users(user_ids=user_ids, model=svd, n=500)

Getting recommendations for users: 100%|██████████| 39686/39686 [56:49<00:00, 11.64it/s]  


Calculate precision and recall at K

In [94]:
def precision_recall_at_k(user_id, top_n_recommendations, k=100, *, test_ratings=None):
    """Compute precision@k and recall@k for one user against held-out ratings.

    A held-out book counts as relevant when the user rated it 3 or higher.
    Only the first ``k`` entries of ``top_n_recommendations`` are scored, so
    precision is ``hits / len(top_k)`` even when the list holds more than
    ``k`` items.

    Args:
        user_id: Raw user id to evaluate.
        top_n_recommendations: Ranked list of recommended book ids, best first.
        k: Cut-off rank.
        test_ratings: DataFrame with ``user_id``, ``book_id`` and ``rating``
            columns. Defaults to the notebook-level ``test_df``.

    Returns:
        ``(precision, recall)``. ``(-1, -1)`` is returned as a sentinel when
        the user has no relevant held-out items and cannot be evaluated.
    """
    if test_ratings is None:
        test_ratings = test_df  # fall back to the notebook's held-out split

    is_relevant = (test_ratings.user_id == user_id) & (test_ratings.rating >= 3)
    relevant_items = set(test_ratings[is_relevant]['book_id'])
    if not relevant_items:
        # Cannot evaluate this user: no relevant items in the test set.
        return -1, -1

    top_k = top_n_recommendations[:k]
    true_positives = len(relevant_items.intersection(top_k))

    # Count misses within the top-k slice only; using the full list length
    # here would deflate precision whenever more than k items are supplied.
    false_positives = len(top_k) - true_positives
    false_negatives = len(relevant_items) - true_positives

    recommended = true_positives + false_positives
    relevant = true_positives + false_negatives

    precision = true_positives / recommended if recommended > 0 else 0
    recall = true_positives / relevant if relevant > 0 else 0
    return precision, recall

In [None]:
# Cut-off rank used for both metrics.
k = 100

total_precision = 0
total_recall = 0
# Start from every user and drop the ones that turn out not to be evaluable.
count = len(user_ids)

for user_id in tqdm(user_ids, desc="Evaluating recommendations"):
    top_n_recommendations = recommendations[user_id]
    precision, recall = precision_recall_at_k(user_id, top_n_recommendations, k)

    if precision < 0:
        # Sentinel value: the user had no relevant items, so exclude them from the average.
        count -= 1
        continue

    if precision > 0:
        total_precision += precision
        total_recall += recall

# Mean over the users that could be evaluated.
average_precision = total_precision / count
average_recall = total_recall / count

print(f"Precision@{k}: {average_precision:.5f}")
print(f"Recall@{k}: {average_recall:.5f}")