# COLLABORATIVE FILTERING

https://medium.com/@eli.hatcher/how-to-build-a-recommendation-system-e72fe9efb086

In [69]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import surprise
from IPython.display import display
from surprise import Reader, Dataset, KNNBasic, KNNWithZScore, KNNWithMeans, KNNBaseline, accuracy
from surprise.model_selection import train_test_split, cross_validate
import random

from data.data_helper_functions import *

In [2]:
%load_ext autoreload
%autoreload 2

#### Load data

In [3]:
data_path = '../data/'
books_df, users_df, ratings_df = load_data(data_path)

##### Prepare the data for the surprise library

In [4]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'book_id', 'rating']], reader)

##### Split the data into train and test sets

In [88]:
train, test = surprise.model_selection.train_test_split(data, test_size=0.2,random_state=0) 

Get train test panda datagrames

In [62]:
test_df = pd.DataFrame(test, columns=["user_id", "book_id", "rating"])

trainset_to_tuples = [(train.to_raw_uid(uid), train.to_raw_iid(iid), rating) for (uid, iid, rating) in train.all_ratings()]
train_df = pd.DataFrame(trainset_to_tuples, columns=["user_id", "book_id", "rating"])

### USER-USER COLLABORATIVE FILTERING

KNN BASIC

In [9]:
knn_basic = KNNBasic(sim_options={'user_based': True})
knn_basic.fit(train)

cv_results = cross_validate(knn_basic, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

mean_rmse = cv_results['test_rmse'].mean()
mean_mae = cv_results['test_mae'].mean()
print("User-User Collaborative Filtering Average RMSE:", mean_rmse)
print("User-User Collaborative Filtering Average MAE:", mean_mae)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9064  0.9061  0.9078  0.9041  0.9071  0.9063  0.0012  
MAE (testset)     0.7054  0.7075  0.7064  0.7057  0.7077  0.7065  0.0009  
Fit time          79.19   73.12   120.22  120.73  102.59  99.17   19.99   
Test time         23.72   24.56   26.64   22.68   22.96   24.11   1.42    
User-User Collaborative Filtering Average RMSE: 0.9063168119559608
User-User Collaborative Filtering Average MAE: 0.706540987872

KNN Baseline

In [9]:
knn_baseline = KNNBaseline(sim_options={'user_based': True})
knn_baseline.fit(train)
predictions_baseline = knn_baseline.test(test)
rmse_baseline = accuracy.rmse(predictions_baseline, verbose=False)
mae_baseline = accuracy.mae(predictions_baseline, verbose=False)
print("RMSE:", rmse_baseline)
print("MAE:", mae_baseline)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8437132171149854
MAE: 0.6490251094617892


KNN With Means

In [28]:
knn_means = KNNWithMeans(sim_options={'user_based': True})
knn_means.fit(train)
predictions_means = knn_means.test(test)
rmse_means = accuracy.rmse(predictions_means, verbose=False)
mae_means = accuracy.mae(predictions_means, verbose=False)
print("RMSE:", rmse_baseline)
print("MAE:", mae_baseline)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8437132171149854
MAE: 0.6490251094617892


KNN With Z Score

In [None]:
knn_zscore = KNNWithZScore(sim_options={'user_based': True})
knn_zscore.fit(train)
predictions_zscore = knn_zscore.test(test)
rmse_zscore = accuracy.rmse(predictions_zscore, verbose=False)
mae_zscore = accuracy.mae(predictions_zscore, verbose=False)
print("RMSE:", rmse_baseline)
print("MAE:", mae_baseline)

Predictions

In [37]:
def get_top_n_recommendations(user_id, model, n=10):
    all_book_ids = set(ratings_df['book_id'].unique())
    user_rated_books = set(ratings_df[ratings_df['user_id'] == user_id]['book_id'])

    books_not_yet_rated = list(all_book_ids - user_rated_books)
    predictions = [model.predict(user_id, book_id) for book_id in books_not_yet_rated]
    sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)
    
    return [(prediction.iid, prediction.est) for prediction in sorted_predictions[:n]]

In [None]:
query_user_id = 7
top_n_recommendations_with_ratings = get_top_n_recommendations(query_user_id, knn_baseline, n=10)
top_n_recommendations, predicted_ratings = zip(*top_n_recommendations_with_ratings)

# Display the top rated books by the user
user_book_ratings = ratings_df[ratings_df['user_id'] == query_user_id]
user_book_ratings = user_book_ratings.merge(books_df, left_on='book_id', right_index=True, how='inner')
user_book_ratings = user_book_ratings[['book_id', 'title', 'authors', 'average_rating', 'rating', 'genres', 'year', 'language', 'pages']].sort_values('rating', ascending=False)
print("Top rated books by user:")
display(user_book_ratings)

# Display the information of the recommended books
recommended_books = books_df.loc[list(top_n_recommendations)]
recommended_books['predicted_rating'] = predicted_ratings
print("\nRecommended books:")
display(recommended_books.head(10))

### ITEM-ITEM COLLABORATIVE FILTERING

KNN BASELINE

In [25]:
knn_baseline_item = KNNBaseline(sim_options={'user_based': False})
knn_baseline_item.fit(train)
predictions_baseline_item = knn_baseline_item.test(test)
rmse_baseline_item = accuracy.rmse(predictions_baseline_item, verbose=False)
mae_baseline_item = accuracy.mae(predictions_baseline_item, verbose=False)

print("RMSE:", rmse_baseline_item)
print("MAE:", mae_baseline_item)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8447441028828974
MAE: 0.6433720078942631


KNN With Means

In [24]:
knn_means_item = KNNWithMeans(sim_options={'user_based': False})
knn_means_item.fit(train)
predictions_means_item = knn_means_item.test(test)
rmse_means_item = accuracy.rmse(predictions_means_item, verbose=False)
mae_means_item = accuracy.mae(predictions_means_item, verbose=False)

print("RMSE:", rmse_means_item)
print("MAE:", mae_means_item)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8497612719055213
MAE: 0.6479810569640901


In [53]:
query_user_id = 5
top_n_recommendations_with_ratings = get_top_n_recommendations(query_user_id, knn_baseline_item, n=10)
top_n_recommendations, predicted_ratings = zip(*top_n_recommendations_with_ratings)

# Display the top rated books by the user
user_book_ratings = ratings_df[ratings_df['user_id'] == query_user_id]
user_book_ratings = user_book_ratings.merge(books_df, left_on='book_id', right_index=True, how='inner')
user_book_ratings = user_book_ratings[['title', 'authors', 'average_rating', 'rating', 'genres', 'year', 'language', 'pages']].sort_values('rating', ascending=False)
print("Top rated books by user:")
display(user_book_ratings)

# Display the information of the recommended books
recommended_books = books_df.loc[list(top_n_recommendations)]
recommended_books['predicted_rating'] = predicted_ratings
print("\nRecommended books:")
display(recommended_books)

Top rated books by user:


Unnamed: 0,title,authors,average_rating,rating,genres,year,language,pages
732046,The Kept Woman,Karin Slaughter,4.1,5,"['comics', 'graphic-novels', 'fiction']",2016.0,eng,128.0
784735,Wait Till Helen Comes,Mary Downing Hahn,4.14,5,"['spirituality', 'self-help', 'nonfiction', 'b...",1986.0,eng,264.0
653626,Silent Scream,Angela Marsons,4.02,4,"['fantasy', 'young-adult', 'fiction', 'romance']",2015.0,eng,403.0
659112,A Hidden Fire,Elizabeth Hunter,4.05,4,"['fiction', 'science-fiction', 'contemporary']",2011.0,eng,307.0
479038,The One That Got Away,Simon Wood,3.77,3,"['nonfiction', 'classics', 'history', 'memoir']",2015.0,eng,106.0



Recommended books:


Unnamed: 0_level_0,authors,year,title,average_rating,num_ratings,num_1,num_2,num_3,num_4,num_5,genres,language,pages,predicted_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
143,Anthony Doerr,2014.0,All the Light We Cannot See,4.31,470001,6209,14527,61020,185239,280832,"['fantasy', 'fiction', 'classics', 'young-adult']",eng,176.0,5
146,Donna Tartt,2013.0,The Goldfinch,3.87,396756,19164,35300,93459,153459,162465,"['classics', 'fiction', 'philosophy']",eng,123.0,5
165,George R.R. Martin,2005.0,A Feast for Crows,4.1,428186,3170,18574,94401,175973,189012,"['fantasy', 'young-adult', 'paranormal', 'roma...",eng,481.0,5
193,Malcolm Gladwell,2008.0,Outliers: The Story of Success,4.11,353011,4494,12545,62510,148978,140599,"['fantasy', 'young-adult', 'fiction']",eng,320.0,5
250,R.J. Palacio,2012.0,Wonder,4.43,228538,4504,7833,34567,105977,224910,"['poetry', 'classics', 'fiction', 'young-adult']",eng,176.0,5
307,Patrick Rothfuss,2011.0,The Wise Man's Fear,4.57,245686,1486,4018,17932,66415,192498,"['science-fiction', 'fiction', 'fantasy', 'cla...",eng,815.0,5
318,Christina Baker Kline,2013.0,Orphan Train,4.14,226091,1703,6769,42434,116003,97059,"['young-adult', 'contemporary', 'mystery', 'fi...",eng,242.0,5
437,Sue Monk Kidd,2014.0,The Invention of Wings,4.23,168006,1466,4021,25745,80524,82018,"['young-adult', 'romance', 'contemporary', 'fi...",eng,297.0,5
482,Kathleen Grissom,2010.0,The Kitchen House,4.18,168082,1830,5689,26149,71058,74551,"['art', 'nonfiction']",eng,240.0,5
504,Francine Rivers,1991.0,Redeeming Love,4.48,157506,2949,4277,13328,33009,110074,"['classics', 'science-fiction', 'fiction', 'fa...",eng,192.0,5


## Evaluating top k recommendations

In [7]:
knn_baseline = KNNBaseline(sim_options={'user_based': True})
knn_baseline.fit(train)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1ac6bbefb80>

In [11]:
knn_means = KNNWithMeans(sim_options={'user_based': True})
knn_means.fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1ac6fd0a1a0>

Get top n recommendations for each user

In [72]:
from tqdm import tqdm

def get_top_recommendation_for_user(user_id, all_books, model, n=500):
    user_rated_books = set(train_df[train_df['user_id'] == user_id]['book_id'])
    books_not_yet_rated = list(all_books - user_rated_books) # Remove only from train!
    
    predictions = [model.predict(user_id, book_id) for book_id in books_not_yet_rated]
    sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)
    return [(prediction.iid, prediction.est) for prediction in sorted_predictions[:n]]

def get_recommendations_for_all_users(user_ids, model, n=500):
    all_book_ids = set(ratings_df['book_id'].unique())
    recommendations = {}
    for user_id in tqdm(user_ids, desc="Getting recommendations for users"):
        top_n_recommendations = get_top_recommendation_for_user(user_id, all_book_ids, model, n=n)
        recommendations[user_id] = [book_id for (book_id, _) in top_n_recommendations]
    return recommendations

user_ids = ratings_df['user_id'].unique()
sample_user_ids = random.sample(list(user_ids), 5000)
recommendations = get_recommendations_for_all_users(sample_user_ids, knn_baseline, n=500)

Getting recommendations for users: 100%|██████████| 5000/5000 [1:07:44<00:00,  1.23it/s]


In [31]:
with open('knn_means_recommendation.pkl', 'wb') as f:
    pickle.dump(recommendations, f)

Calculate precision and recall at K

In [86]:
def precision_recall_at_k(user_id, top_n_recommendations, k= 100):
    relevant_items  = set(test_df[(test_df.user_id == user_id) & (test_df.rating>=3)]['book_id'])
    if len (relevant_items ) == 0: 
        return -1, -1 # Can not evaluate this user if no relavant items in test set
    true_positives = len(relevant_items.intersection(set(top_n_recommendations[:k])))

    false_positives = len(top_n_recommendations) - true_positives
    false_negatives = len(relevant_items) - true_positives

    tp, fp, fn = true_positives, false_positives, false_negatives

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    return precision, recall

In [90]:
k = 200

total_precision = 0
total_recall = 0
count = len(user_ids)

for user_id in tqdm(sample_user_ids, desc="Evaluating recommendations"):
    top_n_recommendations = recommendations[user_id]
    precision, recall = precision_recall_at_k(user_id, top_n_recommendations, k)

    if precision > 0:
        total_precision += precision
        total_recall += recall
    elif precision < 0: # There were no relavant items
        count -=1

average_precision = total_precision / count
average_recall = total_recall / count

print(f"Precision@{k}: {average_precision:.8f}")
print(f"Recall@{k}: {average_recall:.8f}")

Evaluating recommendations: 100%|██████████| 5000/5000 [00:06<00:00, 819.11it/s]

Precision@200: 0.00000923
Recall@200: 0.00125178



