In [None]:
import os

import numpy as np
import pandas as pd
from surprise import SVD, KNNWithMeans, NMF, SlopeOne, NMF
from surprise import accuracy

from util.helpers import load_train_test_surpriselib, load_dataset_explicit
from util.knn import get_knn
from neighborhood_eval.neighborhood_accuracy import critical_nbhds_accuracy

# Dataset selection: the dataset's folder lives under ../data and is named
# after the dataset itself.
dataset_name = 'ml-latest'
dataset_path = f'../data/{dataset_name}'

# Read the explicit ratings through the project loader.
# NOTE(review): total_users=16000 presumably caps how many users are loaded —
# confirm against util.helpers.load_dataset_explicit.
ratings = load_dataset_explicit(
    dataset_name,
    dataset_path,
    total_users=16000,
)

# Quick sanity report: number of rating rows and number of distinct user ids.
print('Dataset size:', len(ratings))
distinct_user_ids = set(ratings['user_id'].tolist())
print('Total no of Users:', len(distinct_user_ids))

In [None]:
# Randomly partition the ratings into a train part (~85% of rows) and a
# test part (~15% of rows).
#
# The split uses its own seeded generator so that re-running this cell, or
# restarting the kernel and running everything, always yields the same
# partition. With the unseeded global RNG every run produced a different
# split and therefore different downstream metrics.
SPLIT_SEED = 42        # change to draw a different (but still repeatable) split
TRAIN_FRACTION = 0.85  # expected share of rows assigned to the train part

dataset = ratings.copy()
split_rng = np.random.default_rng(SPLIT_SEED)
# One uniform draw per row; True means the row goes to the train part.
msk = split_rng.random(len(dataset)) < TRAIN_FRACTION
x = dataset[msk]   # train rows (later cells refer to this frame as `x`)
y = dataset[~msk]  # test rows

# Every row must land in exactly one of the two parts.
assert len(x) + len(y) == len(dataset), "train/test split lost or duplicated rows"

print("trainset size:", len(x))
print("testset size:", len(y))
print("full dataset size", len(dataset))

# Convert both frames into the train/test objects that the Surprise
# algorithms are fitted and tested on in the following cells.
trainset, testset = load_train_test_surpriselib(x, y)

In [None]:
# Seed for SVD's random initialisation, so that the reported metrics and the
# per-prediction errors used later are reproducible across runs.
SVD_SEED = 42

# Main recommender whose predictions are evaluated.
algo = SVD(random_state=SVD_SEED).fit(trainset)

# Separate user-based KNN model (pearson_baseline similarity); it is handed to
# get_knn in a later cell to build the user neighbourhoods.
clustering_algo = KNNWithMeans(
    sim_options={'name': 'pearson_baseline', 'user_based': True}
).fit(trainset)

# Predict every test rating and keep a DataFrame copy of the predictions;
# later cells read its `r_ui` (true rating) and `est` (estimate) columns.
predictions = algo.test(testset)
predictions_df = pd.DataFrame(predictions)

# compute system metrics (each call reports its metric over all predictions)
accuracy.mae(predictions)
accuracy.rmse(predictions)
accuracy.mse(predictions)

In [None]:
# Parameters forwarded to the neighbourhood helpers below.
# NOTE(review): nbhd_size is presumably the number of neighbours per user and
# p_thresh the threshold used to flag a neighbourhood as critical — confirm
# against util.knn.get_knn and critical_nbhds_accuracy.
nbhd_size = 10
p_thresh = 0.5

# Squared-error loss per prediction (base error for test-1).
# Computed column-wise: same values as a row-by-row apply, without invoking a
# Python function once per row.
predictions_df['prediction_loss'] = (predictions_df['r_ui'] - predictions_df['est']) ** 2

# Get the neighborhoods (knn) from the training rows and the fitted KNN model
neighborhoods = get_knn(x, clustering_algo, nbhd_size)
critical_nbhds_df = critical_nbhds_accuracy(neighborhoods, predictions_df, p_thresh)

# Write the result to output/knn_methods_exp/<dataset_name>.csv, creating the
# output folder first so a fresh checkout does not fail on a missing directory.
output_dir = 'output/knn_methods_exp'
os.makedirs(output_dir, exist_ok=True)
critical_nbhds_df.to_csv(os.path.join(output_dir, dataset_name + '.csv'), index=False)