## RQ1

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import ndcg_score

from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()
# PROJECT_ROOT must be defined in the environment; a missing variable raises KeyError here.
project_root = os.environ["PROJECT_ROOT"]
# Make the project root importable — presumably so the `src.*` imports in the next cell resolve.
sys.path.append(project_root)

In [3]:
import src.modules.result_analysis.loading as result_loading
import src.modules.result_analysis.model_standardization as ms

In [4]:
# <project_root>/latex/figures — presumably the output location for figures (no writes are visible in this section).
figures_root = os.path.join(project_root, "latex", "figures")
# Create the directory (and any missing parents); a no-op when it already exists.
os.makedirs(figures_root, exist_ok=True)

In [5]:
# Global matplotlib styling: 20pt font and LaTeX text rendering with the listed packages.
plt.rcParams.update({
    'font.size': 20,
    'text.usetex': True,
    'text.latex.preamble': r'\usepackage{amsmath,amssymb,bm,bbm,lmodern}',
})

In [6]:
def loglik(df, mass_cols=None, eps=1e-6):
    """Summed log-likelihood of the observed ratings under the per-bin masses.

    For every row, the observed ``rating`` is mapped to a bin index with
    ``rating * 2 - 1`` (so 0.5 -> bin 0, 1.0 -> bin 1, ...; this assumes a
    half-step rating scale — TODO confirm against the data), the smoothed mass
    of that bin is looked up, and the logs are summed over all rows.

    Smoothing adds ``eps`` to every bin and divides by ``1 + eps * n_bins``,
    so a row that summed to 1 still sums to 1 afterwards and an empty bin
    never produces ``log(0)``. With the defaults and 10 bins this is the
    ``(mass + 1e-6) / (1 + 1e-5)`` smoothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the bin-mass columns and a ``rating`` column.
    mass_cols : sequence of str, optional
        Bin-mass column names ordered by bin index. Defaults to the
        notebook-level ``bins_mass_cols``.
    eps : float, optional
        Additive smoothing constant applied to every bin.

    Returns
    -------
    float
        Sum over rows of ``log(smoothed mass of the observed bin)``.

    Raises
    ------
    ValueError
        If a rating maps to a bin index outside ``[0, n_bins)``. Without this
        check a negative index would silently wrap around to the last bins.
    """
    if mass_cols is None:
        # Resolved at call time, so the notebook cell defining it may come later.
        mass_cols = bins_mass_cols
    masses = df[list(mass_cols)].values
    n_bins = masses.shape[1]

    # Normaliser is derived from the bin count instead of being hard-coded.
    smoothed = (masses + eps) / (1. + eps * n_bins)

    # Round (rather than truncate) so float error in a rating cannot shift it
    # into the bin below.
    bin_idx = np.rint(df["rating"].values * 2 - 1).astype(int)
    if bin_idx.size and (bin_idx.min() < 0 or bin_idx.max() >= n_bins):
        raise ValueError(
            f"rating maps to a bin index outside [0, {n_bins}): "
            f"min={bin_idx.min()}, max={bin_idx.max()}"
        )

    return np.log(np.take_along_axis(smoothed, bin_idx[:, None], axis=1)).sum()

In [7]:
# Names of the ten `bins_mass_<i>` columns (i = 0..9) in index order; read by `loglik`.
bins_mass_cols = ["bins_mass_" + str(bin_idx) for bin_idx in range(10)]

In [8]:
# Number of per-model exports to load; each index below fills the "{}" slot of a template.
NUM_FOLDS = 10

# One export-directory template per model variant:
#   <project_root>/logs/LBD_results/<model>/<model>-{}-0/export
data_path_templates = {
    model_name: os.path.join(
        project_root, "logs", "LBD_results", model_name, model_name + "-{}-0", "export"
    )
    for model_name in (
        "LBDS_512_sum_no_bias",
        "LBDS_512_sum_mn",
        "LBDS_512_sum_ab",
        "LBDS_512_norm_ab",
        "LBDS_512_dot_ab",
        "LBDS_256_256_ab",
        "LBDA_512_sum_ab",
    )
}

# model name -> list of NUM_FOLDS dataframes, one per formatted export path.
print("Loading data")
data = {
    model_name: [
        result_loading.path_to_df(template.format(fold))
        for fold in range(NUM_FOLDS)
    ]
    for model_name, template in data_path_templates.items()
}

# Same layout as `data`, with every dataframe passed through `ms.standardise_model`.
print("Standardising")
confidence_models = {
    model_name: [ms.standardise_model(model_name, fold_df) for fold_df in fold_dfs]
    for model_name, fold_dfs in data.items()
}

Loading data
Standardising


### Table 1

In [9]:
# RMSE: root of the mean squared `err_mean`, computed per dataframe, then averaged per model.
metric = {
    model_name: [np.sqrt(fold_df["err_mean"].pow(2).mean()) for fold_df in fold_dfs]
    for model_name, fold_dfs in confidence_models.items()
}
print("RMSE")
print({model_name: np.mean(per_fold) for model_name, per_fold in metric.items()})

RMSE
{'LBDS_512_sum_no_bias': 0.79313594, 'LBDS_512_sum_mn': 0.7863374, 'LBDS_512_sum_ab': 0.78310776, 'LBDS_512_norm_ab': 0.8252937, 'LBDS_512_dot_ab': 0.86486995, 'LBDS_256_256_ab': 0.8018311, 'LBDA_512_sum_ab': 0.784263}


In [10]:
# MAE: mean absolute `err_mean`, computed per dataframe, then averaged per model.
metric = {
    model_name: [np.mean(np.abs(fold_df["err_mean"])) for fold_df in fold_dfs]
    for model_name, fold_dfs in confidence_models.items()
}
print("MAE")
print({model_name: np.mean(per_fold) for model_name, per_fold in metric.items()})

MAE
{'LBDS_512_sum_no_bias': 0.61304545, 'LBDS_512_sum_mn': 0.6057754, 'LBDS_512_sum_ab': 0.5958842, 'LBDS_512_norm_ab': 0.6257069, 'LBDS_512_dot_ab': 0.66480464, 'LBDS_256_256_ab': 0.61231214, 'LBDA_512_sum_ab': 0.59612036}


In [11]:
# Accuracy: mean of the `highest_correct` column per dataframe, then averaged per model.
metric = {
    model_name: [np.mean(fold_df["highest_correct"]) for fold_df in fold_dfs]
    for model_name, fold_dfs in confidence_models.items()
}
print("Accuracy")
print({model_name: np.mean(per_fold) for model_name, per_fold in metric.items()})

Accuracy
{'LBDS_512_sum_no_bias': 0.3016418917893139, 'LBDS_512_sum_mn': 0.3091412490075098, 'LBDS_512_sum_ab': 0.3089746485284124, 'LBDS_512_norm_ab': 0.29452723778295564, 'LBDS_512_dot_ab': 0.27667260271719435, 'LBDS_256_256_ab': 0.2996180770083929, 'LBDA_512_sum_ab': 0.42542173373523023}


In [12]:
# Loglik: summed log-likelihood (`loglik`) per dataframe, then averaged per model.
metric = {
    model_name: [loglik(fold_df) for fold_df in fold_dfs]
    for model_name, fold_dfs in confidence_models.items()
}
print("Loglik")
print({model_name: np.mean(per_fold) for model_name, per_fold in metric.items()})

Loglik
{'LBDS_512_sum_no_bias': -1769400.6, 'LBDS_512_sum_mn': -1759908.2, 'LBDS_512_sum_ab': -1757444.8, 'LBDS_512_norm_ab': -1887146.0, 'LBDS_512_dot_ab': -1856558.4, 'LBDS_256_256_ab': -1791266.8, 'LBDA_512_sum_ab': -1450483.8}


In [13]:
# NDCG@3: scored per `uid` group, averaged over groups per dataframe, then averaged per model.
def ndcg_fn(x):
    """NDCG@3 of one group's `mean` scores against its `rating`s; 1. for a single-row group."""
    if len(x) > 1:
        return ndcg_score(x["rating"].values[None, :], x["mean"].values[None, :], k=3)
    # Groups with one row skip ndcg_score and are assigned 1.
    return 1.


metric = {
    model_name: [
        np.mean(fold_df.groupby("uid")[["rating", "mean"]].apply(ndcg_fn))
        for fold_df in fold_dfs
    ]
    for model_name, fold_dfs in confidence_models.items()
}
print("NDCG@3")
print({model_name: np.mean(per_fold) for model_name, per_fold in metric.items()})

NDCG@3
{'LBDS_512_sum_no_bias': 0.9330784663474194, 'LBDS_512_sum_mn': 0.9330805048016233, 'LBDS_512_sum_ab': 0.933683484803771, 'LBDS_512_norm_ab': 0.9248010918893647, 'LBDS_512_dot_ab': 0.9068579586061526, 'LBDS_256_256_ab': 0.9284288528290142, 'LBDA_512_sum_ab': 0.9329311919496576}


In [14]:
# NDCG@10: scored per `uid` group, averaged over groups per dataframe, then averaged per model.
def ndcg_fn(x):
    """NDCG@10 of one group's `mean` scores against its `rating`s; 1. for a single-row group."""
    if len(x) > 1:
        return ndcg_score(x["rating"].values[None, :], x["mean"].values[None, :], k=10)
    # Groups with one row skip ndcg_score and are assigned 1.
    return 1.


metric = {
    model_name: [
        np.mean(fold_df.groupby("uid")[["rating", "mean"]].apply(ndcg_fn))
        for fold_df in fold_dfs
    ]
    for model_name, fold_dfs in confidence_models.items()
}
print("NDCG@10")
print({model_name: np.mean(per_fold) for model_name, per_fold in metric.items()})

NDCG@10
{'LBDS_512_sum_no_bias': 0.9568087133552832, 'LBDS_512_sum_mn': 0.9567515198335643, 'LBDS_512_sum_ab': 0.9570736381335326, 'LBDS_512_norm_ab': 0.9515151694240258, 'LBDS_512_dot_ab': 0.9406176803042717, 'LBDS_256_256_ab': 0.9537905880832037, 'LBDA_512_sum_ab': 0.9565138316171383}
