In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from surprise import SVD, KNNWithMeans, NMF
from surprise import accuracy

from neighborhood_eval.neighborhood_accuracy import critical_nbhds_accuracy
from util.helpers import load_train_test_surpriselib, load_dataset_explicit
from util.knn import get_knn

# Dataset selection: the folder name under ../data and the path built from it.
dataset_name = 'ml-latest-small'
dataset_path = '../data/' + dataset_name

# Load the explicit-feedback ratings through the project loader.
# NOTE(review): total_users is passed straight to load_dataset_explicit;
# presumably it caps the number of users kept -- confirm in util.helpers.
ratings = load_dataset_explicit(dataset_name, dataset_path, total_users=16000)
print('Dataset size:', len(ratings))
# nunique() counts the distinct user ids directly on the column instead of
# materialising a Python list and a set first.
print('Total no of Users:', ratings['user_id'].nunique())

Dataset size: 100836
Total no of Users: 610


In [2]:
# Split the ratings into train/test partitions.
# The split is drawn from a locally seeded generator so that the partition,
# and therefore every metric computed from it, is identical on each
# Restart Kernel -> Run All.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.85  # expected share of rows kept for training (~15% test)

dataset = ratings.copy()
split_rng = np.random.default_rng(SPLIT_SEED)
# One uniform draw per row; rows whose draw falls below TRAIN_FRACTION go to train.
msk = split_rng.random(len(dataset)) < TRAIN_FRACTION
x = dataset[msk]    # training rows
y = dataset[~msk]   # held-out test rows

# Every row must land in exactly one of the two partitions.
assert len(x) + len(y) == len(dataset)

print("trainset size:", len(x))
print("testset size:", len(y))
print("full dataset size", len(dataset))

# load train, test compatible with surpriselib
trainset, testset = load_train_test_surpriselib(x, y)

trainset size: 85722
testset size: 15114
full dataset size 100836
trainset and testset successfully created.


In [3]:
# Main rating predictor (SVD) plus a user-based KNN model that is only used
# later to derive user neighbourhoods.
# SVD is given a fixed random_state so the reported errors are repeatable
# across kernel restarts.
MODEL_SEED = 42
algo = SVD(random_state=MODEL_SEED).fit(trainset)
clustering_algo = KNNWithMeans(sim_options={'name': 'pearson', 'user_based': True}).fit(trainset)

# Score the held-out ratings and keep the predictions as a DataFrame for the
# per-prediction analysis in later cells.
predictions = algo.test(testset)
predictions_df = pd.DataFrame(predictions)

# compute system metrics (each call prints its own value)
accuracy.mae(predictions)
accuracy.rmse(predictions)
accuracy.mse(predictions)

Computing the pearson similarity matrix...
Done computing similarity matrix.
MAE:  0.6747
RMSE: 0.8792
MSE: 0.7729


0.7729459891822047

In [4]:
# NOTE(review): nbhd_size is forwarded to get_knn and p_thresh to
# critical_nbhds_accuracy; presumably neighbours per user and a significance
# threshold respectively -- confirm against those helpers.
nbhd_size = 10
p_thresh = 0.5

# MSE loss equation (base of test-1): squared error of every prediction,
# computed column-wise instead of with a row-by-row apply.
predictions_df['prediction_loss'] = (predictions_df['r_ui'] - predictions_df['est']) ** 2

# Get the neighborhoods (knn) for every unique user in the dataset
nbhd_clusters = get_knn(x, clustering_algo, nbhd_size)
critical_nbhds_df = critical_nbhds_accuracy(nbhd_clusters, predictions_df, p_thresh)

# Create the output folder first so the export also works on a fresh checkout.
output_dir = Path('output/clustering_exp')
output_dir.mkdir(parents=True, exist_ok=True)
critical_nbhds_df.to_csv(output_dir / (dataset_name + '.csv'), index=False)

Clustering method used: PCC
total nbhds - test1: 387
total nbhds - test2: 87
total nbhds - test3 - MSE: 66
total nbhds - test3 - MAE: 66
total nbhds - test3 - RMSE: 66
Critical nbhd % 14.26




In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One panel per error metric, each comparing the '<metric>_nbhd' column
# (labelled N) with the '<metric>_equiv' column (labelled D') of
# critical_nbhds_df.
fig, ax = plt.subplots(1, 3, figsize=(18, 4))

for panel, metric in zip(ax, ('mse', 'mae', 'rmse')):
    nbhd_col = metric + '_nbhd'
    equiv_col = metric + '_equiv'

    # Reshape the two columns to long form; after the title-casing rename the
    # frame has the columns Index / Variable / Value used below.
    long_df = (
        critical_nbhds_df[[nbhd_col, equiv_col]]
        .rename(columns={nbhd_col: 'N', equiv_col: "D'"})
        .reset_index()
        .melt(id_vars='index')
        .rename(columns=str.title)
    )

    # whis=(0, 100) draws the whiskers at the 0th/100th percentiles, i.e. over
    # the full data range, and stays well-defined when the IQR is zero.
    sns.boxplot(x="Value", y="Variable", data=long_df, whis=(0, 100), ax=panel)
    # Overlay the individual observations on top of the boxes.
    sns.stripplot(x="Value", y="Variable", data=long_df, color=".3", ax=panel)

    panel.set_xlabel(metric.upper())
    panel.set(ylabel=None)