In [1]:
import os

# Change to the parent directory so that custom imports resolve from there.
# The starting directory is remembered the first time this cell runs, which
# keeps the cell idempotent: re-running it in the same kernel always lands in
# the same place instead of climbing one directory further on every run.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, '..'))

In [2]:
import pandas as pd

# Dataset whose evaluation artefacts are read from output/<dataset_name>/.
dataset_name = 'ml-1m'
output_prefix = f'output/{dataset_name}/'

# Held-out ratings and the model's predictions, both stored as CSV.
testset = pd.read_csv(output_prefix + 'test.csv')
all_predictions = pd.read_csv(output_prefix + 'all_predictions.csv')

In [3]:
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k

# Cut-off and column mapping shared by all four top-k ranking metrics.
k = 10
shared_metric_args = dict(
    col_user='userId',
    col_item='movieId',
    col_prediction='prediction',
    k=k,
)

# Each metric compares the held-out ratings with the predictions.
eval_map = map_at_k(testset, all_predictions, **shared_metric_args)
eval_ndcg = ndcg_at_k(testset, all_predictions, **shared_metric_args)
eval_precision = precision_at_k(testset, all_predictions, **shared_metric_args)
eval_recall = recall_at_k(testset, all_predictions, **shared_metric_args)

# Optional report of the four values (left disabled):
# print(
#     "MAP:\t%f" % eval_map,
#     "NDCG:\t%f" % eval_ndcg,
#     "Precision@K:\t%f" % eval_precision,
#     "Recall@K:\t%f" % eval_recall, sep='\n'
# )

In [4]:
# Display the loaded predictions frame (columns userId, movieId, prediction);
# the bare expression on the last line is what renders the cell output.
all_predictions

Unnamed: 0,userId,movieId,prediction
0,1605,3549,4.083208
1,1605,2370,3.528950
2,1605,1959,2.961816
3,1605,3594,3.412933
4,1605,2091,3.090755
...,...,...,...
16218226,4355,3772,3.556495
16218227,4355,3828,3.517083
16218228,4355,3904,3.752912
16218229,4355,3722,3.530454


# Metrics

In [5]:
from recommenders.evaluation.python_evaluation import merge_ranking_true_pred

# Cut-off and column names handed to the recommenders helper.
k = 10
col_user, col_item = "userId", "movieId"
col_rating, col_prediction = "rating", "prediction"

# Keyword arguments for the merge of true ratings and predictions.
merge_args = {
    "rating_true": testset,
    "rating_pred": all_predictions,
    "col_user": col_user,
    "col_item": col_item,
    "col_rating": col_rating,
    "col_prediction": col_prediction,
    "relevancy_method": "top_k",
    "k": k,
}

# The call returns three objects, unpacked here as the hit rows, the per-user
# hit counts and the number of users.
df_hit, df_hit_count, n_users = merge_ranking_true_pred(**merge_args)

In [6]:
import numpy as np

# Discounted gain of every hit item (relevance is always 1, so the gain is just
# the rank discount), summed per user to obtain the discounted cumulative gain.
df_dcg = (
    df_hit
    .assign(dcg=lambda hits: 1 / np.log1p(hits["rank"]))
    .groupby(col_user, as_index=False, sort=False)
    .agg({"dcg": "sum"})
)


def _ideal_dcg(n_actual):
    """Best achievable DCG for a user with ``n_actual`` relevant items, capped at ``k``."""
    best_ranks = range(1, min(n_actual, k) + 1)
    return sum(1 / np.log1p(best_ranks))


# Attach the per-user hit counts, then the ideal DCG derived from "actual".
df_ndcg = pd.merge(df_dcg, df_hit_count, on=[col_user])
df_ndcg["idcg"] = df_ndcg["actual"].apply(_ideal_dcg)

In [7]:
# Global nDCG, precision and recall: per-user ratios summed and divided by n_users.
ndcg = df_ndcg["dcg"].div(df_ndcg["idcg"]).sum() / n_users
precision = df_hit_count["hit"].div(k).sum() / n_users
recall = df_hit_count["hit"].div(df_hit_count["actual"]).sum() / n_users

# Group Metrics - nDCG, Precision and Recall

In [8]:
clustered_df = pd.read_csv('output/' + dataset_name + '/clusters.csv')

# Collect the userIds of every cluster, then build a frame with one row per
# cluster holding its de-duplicated user list.
users_by_cluster = clustered_df.groupby('cluster')['userId'].apply(list)
grouped_clusters = pd.DataFrame({
    'cluster': users_by_cluster.index,
    'users_per_cluster': [list(set(users)) for users in users_by_cluster],
})

In [9]:
def _averaged_group_metrics(df_group, n_group_users):
    """Return ``(nDCG, precision, recall)`` averaged over one group of users.

    Args:
        df_group: rows of ``df_ndcg`` belonging to the group; the "dcg",
            "idcg", "hit" and "actual" columns are read.
        n_group_users: denominator of the three averages.

    Returns:
        Tuple ``(ndcg, precision, recall)``. Each is the sum of the per-user
        ratio over ``df_group`` divided by ``n_group_users``; precision uses
        the notebook-level cut-off ``k``.
    """
    group_ndcg = (df_group["dcg"] / df_group["idcg"]).sum() / n_group_users
    group_precision = (df_group["hit"] / k).sum() / n_group_users
    group_recall = (df_group["hit"] / df_group["actual"]).sum() / n_group_users
    return group_ndcg, group_precision, group_recall


# cluster label -> [nDCG, nDCG-eq, precision, precision-eq, recall, recall-eq]
group_metric = {}
all_cluster_labels = grouped_clusters["cluster"].to_list()
all_clusters_list = grouped_clusters.users_per_cluster.to_list()
# number of distinct users that received predictions
all_users = all_predictions["userId"].nunique()

# Key the results by the real cluster label rather than by position, so labels
# that are not exactly 0..n-1 (e.g. -1 or 1-based ids) are reported correctly.
for cluster_id, cluster in zip(all_cluster_labels, all_clusters_list):
    # users in the cluster vs. users in the equiv group (everyone else)
    n_cluster_users = len(cluster)
    n_cluster_users_equiv = all_users - n_cluster_users

    # Only users present in df_ndcg contribute to the sums; the denominators
    # are the group sizes computed above.
    in_cluster = df_ndcg["userId"].isin(cluster)
    df_ndcg_cluster = df_ndcg.loc[in_cluster]
    df_ndcg_cluster_equiv = df_ndcg.loc[~in_cluster]

    # group metrics
    cluster_ndcg, cluster_precision, cluster_recall = _averaged_group_metrics(
        df_ndcg_cluster, n_cluster_users
    )

    # group equiv. metrics
    cluster_ndcg_equiv, cluster_precision_equiv, cluster_recall_equiv = _averaged_group_metrics(
        df_ndcg_cluster_equiv, n_cluster_users_equiv
    )

    group_metric[cluster_id] = [
        cluster_ndcg,
        cluster_ndcg_equiv,
        cluster_precision,
        cluster_precision_equiv,
        cluster_recall,
        cluster_recall_equiv
    ]

In [10]:
# Positional metric columns 0..5 produced by from_dict, mapped to readable names;
# the former index (the group_metric keys) becomes the 'cluster' column.
column_names = dict(enumerate([
    'cluster-nDCG',
    'cluster-nDCG-eq',
    'cluster-precision',
    'cluster-precision-eq',
    'cluster-recall',
    'cluster-recall-eq',
]))
column_names['index'] = 'cluster'

per_cluster = pd.DataFrame.from_dict(group_metric, orient='index').reset_index()
# Repeat the global metrics on every row for side-by-side comparison.
group_metric_df = per_cluster.rename(columns=column_names).assign(
    ndcg=ndcg,
    precision=precision,
    recall=recall,
)

# save results in csv
group_metric_df.to_csv('validation.csv', index=False)