In [None]:
import os

import numpy as np
import pandas as pd
from surprise import SVDpp, KNNWithMeans
from surprise import Dataset, Reader, accuracy

from util.helpers import load_train_test_surpriselib, load_dataset_explicit
from util.knn import get_knn
from neighborhood_eval.neighborhood_rankings import precision_recall_at_k, get_critical_nbhds

# Dataset selection: the folder under ../data shares the dataset's name.
dataset_name = 'ml-latest'
dataset_path = '../data/' + dataset_name

# Load the explicit-feedback ratings through the project helper.
# NOTE(review): an earlier note here read "old value was 16000", which is the
# same as the current total_users value -- confirm whether it ever changed.
ratings = load_dataset_explicit(
    dataset_name,
    dataset_path,
    total_users=16000,
)

# Quick sanity summary: number of rating rows and number of distinct user ids.
distinct_user_ids = set(ratings.user_id.to_list())
print('Dataset size:', len(ratings))
print('Total no of Users:', len(distinct_user_ids))

In [4]:
# Split the ratings into train/test sets.
dataset = ratings.copy()

# Fixed seed so the random split -- and every metric computed from it -- is
# identical after a kernel restart. A local generator is used so the global
# NumPy random state is left untouched.
SPLIT_SEED = 42
# Probability that a row lands in the training set; the rest (~15%) is test.
TRAIN_FRACTION = 0.85

# Each row is assigned independently, so the realised sizes are only
# approximately 85/15.
msk = np.random.default_rng(SPLIT_SEED).random(len(dataset)) < TRAIN_FRACTION
x = dataset[msk]
y = dataset[~msk]

# The mask and its complement must partition the dataset exactly.
assert len(x) + len(y) == len(dataset)

print("trainset size:", len(x))
print("testset size:", len(y))
print("full dataset size", len(dataset))

# load train, test compatible with surpriselib
trainset, testset = load_train_test_surpriselib(x, y)

# Neighbourhoods are built on the full dataset (train and test rows together),
# using the observed min/max rating as the rating scale.
rating_reader = Reader(rating_scale=(dataset.rating.min(), dataset.rating.max()))
full_data = Dataset.load_from_df(dataset[['user_id', 'item_id', 'rating']], rating_reader)
trainset_nbhds = full_data.build_full_trainset()

trainset and testset successfully created.


In [None]:
# Main rating-prediction model, fitted on the training split only.
# random_state pins SVD++'s random initialisation so the reported metrics are
# reproducible from one run to the next.
MODEL_SEED = 42
algo = SVDpp(random_state=MODEL_SEED).fit(trainset)

# User-based KNN with Pearson similarity, fitted on the full-data trainset;
# it is only used to derive user neighbourhoods, not to score the test set.
clustering_algo = KNNWithMeans(sim_options={'name': 'pearson', 'user_based': True}).fit(trainset_nbhds)

# Score the held-out ratings and keep a tabular copy of the predictions.
predictions = algo.test(testset)
predictions_df = pd.DataFrame(predictions)

# compute system metrics
accuracy.mae(predictions)
accuracy.rmse(predictions)
accuracy.mse(predictions)

In [None]:
nbhd_size = 10
# Not read anywhere in this cell; kept because later cells may depend on it.
p_thresh = 0.5
neighborhoods = get_knn(x, clustering_algo, nbhd_size)

precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)


def _per_user_frame(metric_by_user, metric_name):
    """Turn a per-user metric mapping into a two-column DataFrame.

    Args:
        metric_by_user: mapping whose keys become the ``user_id`` column and
            whose values become the metric column (presumably
            ``{user_id: score}`` -- confirm against precision_recall_at_k).
        metric_name: name given to the value column.

    Returns:
        DataFrame with columns ``user_id`` and ``metric_name``.
    """
    # Build a single-row frame (one column per key), then transpose so that
    # every key becomes its own row.
    return (
        pd.DataFrame(metric_by_user, index=[0])
        .T.reset_index()
        .rename({'index': 'user_id', 0: metric_name}, axis=1)
    )


precisions_df = _per_user_frame(precisions, 'precision')
recalls_df = _per_user_frame(recalls, 'recall')

critical_nbhds = get_critical_nbhds(neighborhoods, predictions_df, precisions_df, recalls_df)

# Create the output folder first so the export cannot fail on a fresh checkout
# after the expensive training and neighbourhood steps have already run.
output_dir = 'output/ranking_algo_exp/'
os.makedirs(output_dir, exist_ok=True)
critical_nbhds.to_csv(output_dir + dataset_name + '.csv', index=False)