## RQ2

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as ss

from sklearn.metrics import ndcg_score

from dotenv import load_dotenv

# Load variables from a .env file into the process environment, then read the
# project root from it. A missing PROJECT_ROOT raises KeyError here, early.
load_dotenv()
project_root = os.environ["PROJECT_ROOT"]
# Make the project's `src` package importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate sys.path entry.
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import src.modules.result_analysis.loading as result_loading
import src.modules.result_analysis.model_standardization as ms

In [4]:
# Output directory <project_root>/latex/figures; created on first run and
# left untouched if it already exists (exist_ok=True).
figures_root = os.path.join(project_root, "latex", "figures")
os.makedirs(figures_root, exist_ok=True)

In [5]:
# Global matplotlib defaults applied to every figure drawn after this cell.
plt.rc('font', size=20)
# Route all figure text through LaTeX (needs a LaTeX installation on the machine).
plt.rc('text', usetex=True)
# Packages made available to that LaTeX text rendering.
plt.rc('text.latex', preamble=r'\usepackage{amsmath,amssymb,bm,bbm,lmodern}')

In [6]:
def loglik(df):
    """Return the summed log of the bin mass selected by each row's rating.

    Steps, per row of ``df``:
      1. take the columns listed in the module-level ``bins_mass_cols``;
      2. smooth them: add 1e-6 to every bin and divide by (1 + 1e-5), so a
         bin with zero mass never produces log(0);
      3. pick the bin at index ``rating * 2 - 1`` (cast to int);
      4. take the natural log.
    The per-row logs are summed into a single scalar.

    NOTE(review): the index formula presumably assumes ratings on a
    half-step grid 0.5 .. 5.0 (-> indices 0 .. 9) -- confirm against the data.
    """
    smoothed_mass = (df[bins_mass_cols].values + 1e-6) / (1. + 1e-5)
    # Column vector of bin indices, one per row, as take_along_axis expects.
    rating_bin = (df["rating"] * 2 - 1).astype(int).values[:, None]
    observed_mass = np.take_along_axis(smoothed_mass, rating_bin, axis=1)
    return np.log(observed_mass).sum()

In [7]:
# Names of the ten per-bin columns, bins_mass_0 ... bins_mass_9, that loglik() reads from each DataFrame.
bins_mass_cols = [f"bins_mass_{x}" for x in range(10)]

In [8]:
NUM_FOLDS = 10

# Every model's export directory follows the same layout,
#   <project_root>/logs/LBD_results/<model>/<model>-<fold>-0/export
# so each template is derived from the model name. The "{}" placeholder is
# filled with the fold index when loading. Dict order follows the tuple below.
data_path_templates = {
    model_name: os.path.join(project_root, "logs", "LBD_results", model_name, model_name + "-{}-0", "export")
    for model_name in (
        "MF_128",
        "MF_512",  #TH
        "CMF_128",
        "CMF_512",  #TH
        "OrdRec-U_512",  #TH
        "OrdRec-UI_512",
        "LBDS_512_sum_ab",
        "LBDA_512_sum_ab",
    )
}

# One DataFrame per fold (0 .. NUM_FOLDS - 1) for every model.
print("Loading data")
data = {}
for model_name, path_template in data_path_templates.items():
    data[model_name] = [
        result_loading.path_to_df(path_template.format(fold)) for fold in range(NUM_FOLDS)
    ]

# Pass each fold's frame through the project's standardisation step, keyed by model name.
print("Standardising")
confidence_models = {}
for model_name, fold_dfs in data.items():
    confidence_models[model_name] = [ms.standardise_model(model_name, fold_df) for fold_df in fold_dfs]

Loading data
Standardising


### Table 2

In [9]:
# RMSE: root of the mean squared "err_mean" value, one number per fold and model.
alternative = "less"  # lower RMSE is better, so test "row model < column model"
metric = {}
for model_name, fold_dfs in confidence_models.items():
    metric[model_name] = [np.sqrt((fold_df["err_mean"]**2).mean()) for fold_df in fold_dfs]
keys = list(metric.keys())

# Pairwise one-sided Wilcoxon signed-rank tests on the per-fold values.
# Entry (i, j) is the p-value for "model i is <alternative> than model j";
# the diagonal is never tested and keeps its initial 0.
stat_sign = np.zeros((len(metric), len(metric)))
for i, row_model in enumerate(keys):
    for j, col_model in enumerate(keys):
        if i != j:
            stat_sign[i, j] = ss.wilcoxon(metric[row_model], metric[col_model], alternative=alternative).pvalue

# Mean (and standard deviation) over folds for each model.
print("RMSE") #TH
for model_name, fold_values in metric.items(): #TH
    print(f"{model_name}: {np.mean(fold_values)} ({np.std(fold_values)})") #TH

print(f"RMSE: (i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))

RMSE
MF_128: 0.8082108497619629 (0.0006533297710120678)
MF_512: 0.7882583737373352 (0.0007137320353649557)
CMF_128: 0.7914099262585901 (0.0006288243754967642)
CMF_512: 0.779554545879364 (0.0007619544048793614)
OrdRec-U_512: 0.7820902466773987 (0.0006716593052260578)
OrdRec-UI_512: 0.7764747142791748 (0.0006628120318055153)
LBDS_512_sum_ab: 0.7831077575683594 (0.0006951719406060874)
LBDA_512_sum_ab: 0.784263014793396 (0.00043219627696089447)
RMSE: (i,j) is p-value for alternative hypothesis that i is less than j.
                   MF_128    MF_512   CMF_128   CMF_512  OrdRec-U_512  \
MF_128           0.000000  1.000000  1.000000  1.000000      1.000000   
MF_512           0.000977  0.000000  0.000977  1.000000      1.000000   
CMF_128          0.000977  1.000000  0.000000  1.000000      1.000000   
CMF_512          0.000977  0.000977  0.000977  0.000000      0.000977   
OrdRec-U_512     0.000977  0.000977  0.000977  1.000000      0.000000   
OrdRec-UI_512    0.000977  0.000977  0.00097

In [10]:
# MAE: mean absolute "err_mean" value, one number per fold and model.
metric = {}
for model_name, fold_dfs in confidence_models.items():
    metric[model_name] = [np.mean(np.abs(fold_df["err_mean"])) for fold_df in fold_dfs]
alternative = "less"  # lower MAE is better, so test "row model < column model"
keys = list(metric.keys())

# Pairwise one-sided Wilcoxon signed-rank tests on the per-fold values.
# Entry (i, j) is the p-value for "model i is <alternative> than model j";
# the diagonal is never tested and keeps its initial 0.
stat_sign = np.zeros((len(metric), len(metric)))
for i, row_model in enumerate(keys):
    for j, col_model in enumerate(keys):
        if i != j:
            stat_sign[i, j] = ss.wilcoxon(metric[row_model], metric[col_model], alternative=alternative).pvalue

# Mean (and standard deviation) over folds for each model, then the p-value matrix.
print("MAE")
for model_name, fold_values in metric.items():
    print(f"{model_name}: {np.mean(fold_values)} ({np.std(fold_values)})")
print(f"\n(i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))

MAE
MF_128: 0.6196640729904175 (0.0005219845334067941)
MF_512: 0.6018463373184204 (0.0005039231618866324)
CMF_128: 0.6069215342591894 (0.00047553317762655126)
CMF_512: 0.5965022444725037 (0.0005485996953211725)
OrdRec-U_512: 0.6042482256889343 (0.00040677032666280866)
OrdRec-UI_512: 0.5896550416946411 (0.0006094011478126049)
LBDS_512_sum_ab: 0.5958842039108276 (0.0005075912340544164)
LBDA_512_sum_ab: 0.5961203575134277 (0.0006764851277694106)

(i,j) is p-value for alternative hypothesis that i is less than j.
                   MF_128    MF_512   CMF_128   CMF_512  OrdRec-U_512  \
MF_128           0.000000  1.000000  1.000000  1.000000      1.000000   
MF_512           0.000977  0.000000  0.000977  1.000000      0.000977   
CMF_128          0.000977  1.000000  0.000000  1.000000      1.000000   
CMF_512          0.000977  0.000977  0.000977  0.000000      0.000977   
OrdRec-U_512     0.000977  1.000000  0.000977  1.000000      0.000000   
OrdRec-UI_512    0.000977  0.000977  0.000977  

In [11]:
# met

In [12]:
# Accuracy: mean of the "highest_correct" column, one number per fold and model.
metric = {}
for model_name, fold_dfs in confidence_models.items():
    metric[model_name] = [np.mean(fold_df["highest_correct"]) for fold_df in fold_dfs]
alternative = "greater"  # higher accuracy is better, so test "row model > column model"
keys = list(metric.keys())

# Pairwise one-sided Wilcoxon signed-rank tests on the per-fold values.
# Entry (i, j) is the p-value for "model i is <alternative> than model j";
# the diagonal is never tested and keeps its initial 0.
stat_sign = np.zeros((len(metric), len(metric)))
for i, row_model in enumerate(keys):
    for j, col_model in enumerate(keys):
        if i != j:
            stat_sign[i, j] = ss.wilcoxon(metric[row_model], metric[col_model], alternative=alternative).pvalue

# Mean (and standard deviation) over folds for each model, then the p-value matrix.
print("Accuracy")
for model_name, fold_values in metric.items():
    print(f"{model_name}: {np.mean(fold_values)} ({np.std(fold_values)})")
print(f"\n(i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))

Accuracy
MF_128: 0.07386756164163595 (0.0003484194214023142)
MF_512: 0.07582877602765585 (0.0002603589335166856)
CMF_128: 0.31004825596117036 (0.0003256249495231721)
CMF_512: 0.29065730627636693 (0.00223521099639586)
OrdRec-U_512: 0.23236346597439642 (0.0002788825642972985)
OrdRec-UI_512: 0.4187010821614837 (0.0005703560242397978)
LBDS_512_sum_ab: 0.3089746485284124 (0.00036166354474327016)
LBDA_512_sum_ab: 0.42542173373523023 (0.0005158957410708795)

(i,j) is p-value for alternative hypothesis that i is greater than j.
                   MF_128    MF_512   CMF_128   CMF_512  OrdRec-U_512  \
MF_128           0.000000  1.000000  1.000000  1.000000      1.000000   
MF_512           0.000977  0.000000  1.000000  1.000000      1.000000   
CMF_128          0.000977  0.000977  0.000000  0.000977      0.000977   
CMF_512          0.000977  0.000977  1.000000  0.000000      0.000977   
OrdRec-U_512     0.000977  0.000977  1.000000  1.000000      0.000000   
OrdRec-UI_512    0.000977  0.000977 

In [13]:
# Log-likelihood: loglik() of each fold's frame, one number per fold and model.
metric = {}
for model_name, fold_dfs in confidence_models.items():
    metric[model_name] = [loglik(fold_df) for fold_df in fold_dfs]
alternative = "greater"  # higher log-likelihood is better, so test "row model > column model"
keys = list(metric.keys())

# Pairwise one-sided Wilcoxon signed-rank tests on the per-fold values.
# Entry (i, j) is the p-value for "model i is <alternative> than model j";
# the diagonal is never tested and keeps its initial 0.
stat_sign = np.zeros((len(metric), len(metric)))
for i, row_model in enumerate(keys):
    for j, col_model in enumerate(keys):
        if i != j:
            stat_sign[i, j] = ss.wilcoxon(metric[row_model], metric[col_model], alternative=alternative).pvalue

# Mean (and standard deviation) over folds for each model, then the p-value matrix.
print("Loglik")
for model_name, fold_values in metric.items():
    print(f"{model_name}: {np.mean(fold_values)} ({np.std(fold_values)})")
print(f"\n(i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))

Loglik
MF_128: -5716996.386884504 (12578.77540970421)
MF_512: -5742655.395883513 (12503.224457324974)
CMF_128: -1803117.0853134033 (871.6316401816556)
CMF_512: -1804091.6142892234 (2422.228607769613)
OrdRec-U_512: -1880640.75 (478.5470275878906)
OrdRec-UI_512: -1569041.75 (1402.498046875)
LBDS_512_sum_ab: -1757444.75 (1018.4052734375)
LBDA_512_sum_ab: -1450483.75 (1057.6474609375)

(i,j) is p-value for alternative hypothesis that i is greater than j.
                   MF_128    MF_512   CMF_128   CMF_512  OrdRec-U_512  \
MF_128           0.000000  0.004883  1.000000  1.000000      1.000000   
MF_512           0.997070  0.000000  1.000000  1.000000      1.000000   
CMF_128          0.000977  0.000977  0.000000  0.065430      0.000977   
CMF_512          0.000977  0.000977  0.947266  0.000000      0.000977   
OrdRec-U_512     0.000977  0.000977  1.000000  1.000000      0.000000   
OrdRec-UI_512    0.000977  0.000977  0.000977  0.000977      0.000977   
LBDS_512_sum_ab  0.000977  0.00097

In [14]:
# NDCG@3
def ndcg_fn(x):
    """NDCG@3 for one group of rows: true relevance "rating", predicted score "mean".

    A group with a single row is scored 1. without calling ndcg_score.
    """
    if len(x) > 1:
        # ndcg_score expects 2-D (n_samples, n_labels) input; [None, :] adds the sample axis.
        return ndcg_score(x["rating"].values[None, :], x["mean"].values[None, :], k=3)
    return 1.


# Per fold: NDCG@3 for every "uid" group, averaged over groups.
metric = {}
for model_name, fold_dfs in confidence_models.items():
    metric[model_name] = [
        np.mean(fold_df.groupby("uid")[["rating", "mean"]].apply(ndcg_fn)) for fold_df in fold_dfs
    ]
alternative = "greater"  # higher NDCG is better, so test "row model > column model"
keys = list(metric.keys())

# Pairwise one-sided Wilcoxon signed-rank tests on the per-fold values.
# Entry (i, j) is the p-value for "model i is <alternative> than model j";
# the diagonal is never tested and keeps its initial 0.
stat_sign = np.zeros((len(metric), len(metric)))
for i, row_model in enumerate(keys):
    for j, col_model in enumerate(keys):
        if i != j:
            stat_sign[i, j] = ss.wilcoxon(metric[row_model], metric[col_model], alternative=alternative).pvalue

# Mean (and standard deviation) over folds for each model, then the p-value matrix.
print("NDCG@3")
for model_name, fold_values in metric.items():
    print(f"{model_name}: {np.mean(fold_values)} ({np.std(fold_values)})")
print(f"\n(i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))

NDCG@3
MF_128: 0.9273200054380386 (0.0002976981535892033)
MF_512: 0.9320953750747633 (0.0002663760513933267)
CMF_128: 0.9293129705932444 (0.00032171611500622573)
CMF_512: 0.9338366047196851 (0.00020902085624341282)
OrdRec-U_512: 0.9342867915678141 (0.00028085871784768444)
OrdRec-UI_512: 0.9349102495114007 (0.0002468387112372472)
LBDS_512_sum_ab: 0.933683484803771 (0.00032464374433531355)
LBDA_512_sum_ab: 0.9329311919496576 (0.00021384616027958543)

(i,j) is p-value for alternative hypothesis that i is greater than j.
                   MF_128    MF_512   CMF_128   CMF_512  OrdRec-U_512  \
MF_128           0.000000  1.000000  1.000000  1.000000      1.000000   
MF_512           0.000977  0.000000  0.000977  1.000000      1.000000   
CMF_128          0.000977  1.000000  0.000000  1.000000      1.000000   
CMF_512          0.000977  0.000977  0.000977  0.000000      1.000000   
OrdRec-U_512     0.000977  0.000977  0.000977  0.000977      0.000000   
OrdRec-UI_512    0.000977  0.000977  0.

In [15]:
# NDCG@10
def ndcg_fn(x):
    """NDCG@10 for one group of rows: true relevance "rating", predicted score "mean".

    A group with a single row is scored 1. without calling ndcg_score.
    """
    if len(x) > 1:
        # ndcg_score expects 2-D (n_samples, n_labels) input; [None, :] adds the sample axis.
        return ndcg_score(x["rating"].values[None, :], x["mean"].values[None, :], k=10)
    return 1.


# Per fold: NDCG@10 for every "uid" group, averaged over groups.
metric = {}
for model_name, fold_dfs in confidence_models.items():
    metric[model_name] = [
        np.mean(fold_df.groupby("uid")[["rating", "mean"]].apply(ndcg_fn)) for fold_df in fold_dfs
    ]
alternative = "greater"  # higher NDCG is better, so test "row model > column model"
keys = list(metric.keys())

# Pairwise one-sided Wilcoxon signed-rank tests on the per-fold values.
# Entry (i, j) is the p-value for "model i is <alternative> than model j";
# the diagonal is never tested and keeps its initial 0.
stat_sign = np.zeros((len(metric), len(metric)))
for i, row_model in enumerate(keys):
    for j, col_model in enumerate(keys):
        if i != j:
            stat_sign[i, j] = ss.wilcoxon(metric[row_model], metric[col_model], alternative=alternative).pvalue

# Mean (and standard deviation) over folds for each model, then the p-value matrix.
print("NDCG@10")
for model_name, fold_values in metric.items():
    print(f"{model_name}: {np.mean(fold_values)} ({np.std(fold_values)})")
print(f"\n(i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))

NDCG@10
MF_128: 0.9533271343478582 (0.00015554102568800455)
MF_512: 0.9560467228802819 (0.0001089099366321796)
CMF_128: 0.9546371424249005 (0.00014954737951962927)
CMF_512: 0.9572011700734204 (9.114925540612637e-05)
OrdRec-U_512: 0.9574918214693682 (0.00013271729456202388)
OrdRec-UI_512: 0.9577981375520881 (0.0001304416035401275)
LBDS_512_sum_ab: 0.9570736381335326 (0.00015807128040599966)
LBDA_512_sum_ab: 0.9565138316171383 (0.00011342171667260674)

(i,j) is p-value for alternative hypothesis that i is greater than j.
                   MF_128    MF_512   CMF_128   CMF_512  OrdRec-U_512  \
MF_128           0.000000  1.000000  1.000000  1.000000      1.000000   
MF_512           0.000977  0.000000  0.000977  1.000000      1.000000   
CMF_128          0.000977  1.000000  0.000000  1.000000      1.000000   
CMF_512          0.000977  0.000977  0.000977  0.000000      1.000000   
OrdRec-U_512     0.000977  0.000977  0.000977  0.000977      0.000000   
OrdRec-UI_512    0.000977  0.000977  