In [None]:
import os
import sys

import cornac
import numpy as np
import pandas as pd
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.constants import SEED
from recommenders.utils.timer import Timer
from surprise import SVD, KNNWithMeans
from surprise import Dataset, Reader, accuracy

from neighborhood_eval.neighborhood_rankings import precision_recall_at_k, get_critical_nbhds, precision_recall_at_k_dfs
from util.helpers import load_train_test_surpriselib, load_dataset_explicit
from util.knn import get_knn

# Report the interpreter and Cornac versions used for this run.
print(
    "System version: {}".format(sys.version),
    "Cornac version: {}".format(cornac.__version__),
    sep="\n",
)

In [None]:
# Dataset identifier and the folder (relative to this notebook) that holds it.
dataset_name = 'ml-latest-small'
dataset_path = '../data/' + dataset_name

# Read the explicit-feedback ratings through the project helper.
ratings = load_dataset_explicit(
    dataset_name,
    dataset_path,
    total_users=16000,
)

# Summarise what was loaded: number of rating rows and distinct user ids.
n_ratings = len(ratings)
n_users = len(set(ratings['user_id'].to_list()))
print('Dataset size:', n_ratings)
print('Total no of Users:', n_users)

In [None]:
# Switch to the userID/itemID column names and keep only the
# (user, item, rating) columns before splitting.
data = ratings.rename(columns={'user_id': 'userID', 'item_id': 'itemID'})[
    ['userID', 'itemID', 'rating']
]

# Random split with a 0.75 ratio; the first frame returned is used for training.
train, test = python_random_split(data, ratio=0.75)

In [None]:
# Length of the recommendation list.
TOP_K = 10

# BPR hyper-parameters: NUM_FACTORS is passed as `k` and NUM_EPOCHS as
# `max_iter` when the model is constructed.
NUM_FACTORS = 200
NUM_EPOCHS = 100

# Build the Cornac dataset from the rows of the training split
# (columns in userID, itemID, rating order).
train_rows = train.itertuples(index=False)
train_set = cornac.data.Dataset.from_uir(train_rows, seed=SEED)

print(
    'Number of users: {}'.format(train_set.num_users),
    'Number of items: {}'.format(train_set.num_items),
    sep='\n',
)

In [None]:
# Hyper-parameters for the BPR model, gathered in one place and then
# unpacked into the constructor.
bpr_params = {
    'k': NUM_FACTORS,
    'max_iter': NUM_EPOCHS,
    'learning_rate': 0.01,
    'lambda_reg': 0.001,
    'verbose': True,
    'seed': SEED,
}
bpr = cornac.models.BPR(**bpr_params)

In [None]:
# Fit BPR on the Cornac training set and report the time measured by Timer.
with Timer() as fit_timer:
    bpr.fit(train_set)
print("Took {} seconds for training.".format(fit_timer))

In [None]:
# Produce ranking scores with the trained model. `remove_seen=True` is
# passed so that pairs already present in `train` are left out
# (per the predict_ranking helper -- confirm against its documentation).
with Timer() as t:
    all_predictions = predict_ranking(
        bpr,
        train,
        usercol='userID',
        itemcol='itemID',
        remove_seen=True,
    )
print("Took {} seconds for prediction.".format(t))

In [None]:
# Evaluate at the cutoff defined in the configuration cell (TOP_K) instead of
# repeating the literal, so the two values cannot drift apart. `k` is kept as
# a name because other cells may refer to it.
k = TOP_K

# Ranking metrics of the predictions against the held-out test split.
eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=k)

# One metric per output line.
print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

Neighborhood-based evaluation

In [None]:
# Rename the id columns of both splits back to user_id/item_id, the naming
# used by the `ratings` frame.
surprise_columns = {'userID': 'user_id', 'itemID': 'item_id'}
train_surp_compat = train.rename(columns=surprise_columns)
test_surp_compat = test.rename(columns=surprise_columns)

# Neighborhoods are computed on a copy of the complete ratings table rather
# than on a concatenation of the two splits.
dataset = ratings.copy()

# Train/test objects for Surprise, built by the project helper from the renamed splits.
trainset, testset = load_train_test_surpriselib(train_surp_compat, test_surp_compat)

# Full-data Surprise trainset; the rating scale is taken from the observed
# minimum and maximum rating.
rating_bounds = (dataset.rating.min(), dataset.rating.max())
full_data = Dataset.load_from_df(
    dataset[['user_id', 'item_id', 'rating']],
    Reader(rating_scale=rating_bounds),
)
trainset_nbhds = full_data.build_full_trainset()

# User-based KNNWithMeans with Pearson similarity, fitted on the full data.
knn_sim_options = {'name': 'pearson', 'user_based': True}
clustering_algo = KNNWithMeans(sim_options=knn_sim_options).fit(trainset_nbhds)

In [None]:
# Number of neighbours requested from get_knn.
nbhd_size = 10
# NOTE(review): p_thresh is not used in the cells visible here -- confirm it is still needed.
p_thresh = 0.5

# Keep only the predictions whose (userID, itemID) pair also occurs in the
# test split (merge defaults to an inner join), then expose them under the
# uid/iid/est column names as well.
predictions = all_predictions.merge(test, on=['userID', 'itemID'])
predictions_df = predictions.rename({'userID':'uid', 'itemID':'iid', 'prediction':'est'}, axis=1)

# get neighborhoods and ranking metrics
neighborhoods = get_knn(train_surp_compat, clustering_algo, nbhd_size)
# NOTE(review): this call receives `predictions` (userID/itemID/prediction
# columns) while get_critical_nbhds below receives the renamed
# `predictions_df`; it also uses k=5 rather than TOP_K -- confirm both are intended.
precisions, recalls = precision_recall_at_k_dfs(predictions, k=5, threshold=4)

# Reshape the returned mappings into two-column frames: user_id plus the metric value.
precisions_df = pd.DataFrame(precisions, index=[0]).T.reset_index().rename({'index':'user_id', 0:'precision'}, axis=1)
recalls_df = pd.DataFrame(recalls, index=[0]).T.reset_index().rename({'index':'user_id', 0:'recall'}, axis=1)
critical_nbhds = get_critical_nbhds(neighborhoods, predictions_df, precisions_df, recalls_df)

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout would otherwise fail with OSError).
output_dir = 'output/ranking_algo_exp/'
os.makedirs(output_dir, exist_ok=True)
critical_nbhds.to_csv(output_dir + dataset_name + '.csv', index=False)