# Extract eval results

Analyze evaluation results for BMNIST
- Clean data

In [None]:
import sys
# Add the parent of the current working directory to the module search path.
sys.path.append("./../")

In [None]:
import os
import json
import glob
import pickle

import numpy as np
import pandas as pd

In [None]:
def extract_results(model_dir):
    """
        Collect the evaluation metrics stored in a single model directory.

        The run settings are read from ``config.json`` and one record is
        built for every ``ece_results_*.pkl`` file found in the directory.

        Args:
            model_dir: Path of a directory holding ``config.json`` and the
                ``ece_results_<corruption>.pkl`` result files.

        Returns:
            A list with one dict per result file, with keys ``method``,
            ``alpha``, ``ds_size``, ``corruption``, ``ece``, ``acc``, ``nll``
            and ``auroc``. The list is empty when the directory contains no
            result file, so callers can always ``extend`` with it.

        Raises:
            OSError: If ``config.json`` or a result file cannot be opened.
            KeyError: If an expected key is missing from the config or from
                a result record.
    """

    # Get config (the context manager closes the file handle right away)
    config_json = os.path.join(model_dir, 'config.json')
    with open(config_json, 'r') as f:
        config = json.load(f)

    # Extract config values
    method = config['method']
    alpha = config['method_params'].get('alpha', 1.0)
    ds_size = config['ds_params'].get('size', 'Full')

    # Get OOD result files. The directory part is escaped so that glob
    # metacharacters in its name are matched literally, and the matches are
    # sorted because glob returns them in arbitrary order.
    pattern = os.path.join(glob.escape(model_dir), "ece_results_*.pkl")
    ood_result_files = sorted(glob.glob(pattern))

    results = []
    for rfile in ood_result_files:
        filename = os.path.basename(rfile)
        # Get corruption name from file name: drop the "ece_results" prefix,
        # join the remaining parts with spaces and strip the ".pkl" suffix.
        corr_name = ' '.join(filename.split('_')[2:])[:-4]
        with open(rfile, 'rb') as f:
            # NOTE: pickle.load executes arbitrary code from the file; only
            # point this at result files you produced yourself.
            logs = pickle.load(f)[0]

        results.append({
            'method': method,
            'alpha': alpha,
            'ds_size': ds_size,
            'corruption': corr_name,
            'ece': logs['ece_uncal'],
            'acc': logs['acc'],
            'nll': logs['nll_uncal_test'],
            'auroc': logs['auroc'],
        })

    return results

## Model dirs

In [None]:
# Root directories of the trained models to evaluate, one per method family:
#   models_root       -> zoo/sl   (three prior variants; keep a single one active)
#   elbo_models_root  -> zoo/mfvi
#   ls_models_root    -> zoo/ls
#   edl_models_root   -> zoo/edl
# Uncomment exactly one configuration (architecture + dataset size) at a time.
# The configuration currently active is the last group below.

# # LeNet + 1000
# models_root = "./../zoo/sl/half-prior-alphavar/BinaryMNISTC-1000-53-identity/LeNet"
# models_root = "./../zoo/sl/uniform-prior-alphavar/BinaryMNISTC-1000-53-identity/LeNet"
# models_root = "./../zoo/sl/auto-prior-alphavar/BinaryMNISTC-1000-53-identity/LeNet"
# elbo_models_root = "./../zoo/mfvi/BinaryMNISTC-1000-53-identity/LeNet"
# ls_models_root = "./../zoo/ls/BinaryMNISTC-1000-53-identity/LeNet"
# edl_models_root = "./../zoo/edl/BinaryMNISTC-1000-53-identity/LeNetEDL"

# # LeNet + 10000
# NOTE(review): the heading says 10000 but the paths use BinaryMNISTC-8000 — confirm which is intended.
# models_root = "./../zoo/sl/half-prior-alphavar/BinaryMNISTC-8000-53-identity/LeNet"
# models_root = "./../zoo/sl/uniform-prior-alphavar/BinaryMNISTC-8000-53-identity/LeNet"
# models_root = "./../zoo/sl/auto-prior-alphavar/BinaryMNISTC-8000-53-identity/LeNet"
# elbo_models_root = "./../zoo/mfvi/BinaryMNISTC-8000-53-identity/LeNet"
# ls_models_root = "./../zoo/ls/BinaryMNISTC-8000-53-identity/LeNet"
# edl_models_root = "./../zoo/edl/BinaryMNISTC-8000-53-identity/LeNetEDL"


# ConvNet + 1000
# models_root = "./../zoo/sl/half-prior-alphavar/BinaryMNISTC-1000-53-identity/ConvNet"
# models_root = "./../zoo/sl/auto-prior-alphavar/BinaryMNISTC-1000-53-identity/ConvNet"
# models_root = "./../zoo/sl/uniform-prior-alphavar/BinaryMNISTC-1000-53-identity/ConvNet"
# elbo_models_root = "./../zoo/mfvi/BinaryMNISTC-1000-53-identity/ConvNet"
# ls_models_root = "./../zoo/ls/BinaryMNISTC-1000-53-identity/ConvNet"
# edl_models_root = "./../zoo/edl/BinaryMNISTC-1000-53-identity/ConvNetEDL"

# ConvNet + 10000
# NOTE(review): the heading says 10000 but the paths use BinaryMNISTC-8000 — confirm which is intended.
# models_root = "./../zoo/sl/half-prior-alphavar/BinaryMNISTC-8000-53-identity/ConvNet"
# models_root = "./../zoo/sl/auto-prior-alphavar/BinaryMNISTC-8000-53-identity/ConvNet"
models_root = "./../zoo/sl/uniform-prior-alphavar/BinaryMNISTC-8000-53-identity/ConvNet"
elbo_models_root = "./../zoo/mfvi/BinaryMNISTC-8000-53-identity/ConvNet"
ls_models_root = "./../zoo/ls/BinaryMNISTC-8000-53-identity/ConvNet"
edl_models_root = "./../zoo/edl/BinaryMNISTC-8000-53-identity/ConvNetEDL"

## S-ELBO results

In [None]:
# One path per entry found directly under models_root.
model_dirs = [os.path.join(models_root, entry) for entry in os.listdir(models_root)]

In [None]:
# Start the combined record list with the metrics of every model directory.
results = []
for _m in model_dirs:
    results += extract_results(_m)

## ELBO results

In [None]:
# Append the metrics of every run under elbo_models_root, then rebuild the
# combined results table.
model_dirs = [os.path.join(elbo_models_root, entry) for entry in os.listdir(elbo_models_root)]
for _m in model_dirs:
    results += extract_results(_m)
df_results = pd.DataFrame(results)

## LS results

In [None]:
# Append the metrics of every run under ls_models_root, then rebuild the
# combined results table.
model_dirs = [os.path.join(ls_models_root, entry) for entry in os.listdir(ls_models_root)]
for _m in model_dirs:
    results += extract_results(_m)
df_results = pd.DataFrame(results)

## EDL Results

In [None]:
# Append the metrics of every run under edl_models_root, then rebuild the
# combined results table.
model_dirs = [os.path.join(edl_models_root, entry) for entry in os.listdir(edl_models_root)]
for _m in model_dirs:
    results += extract_results(_m)
df_results = pd.DataFrame(results)

In [None]:
# Patch up alpha for the additional (baseline) methods: each one gets its own
# sentinel value so that it forms a separate group when the results are
# grouped by alpha.
# Assign through .loc in a single indexing step. The chained form
# `df_results.alpha[mask] = value` may write to a temporary object (it raises
# SettingWithCopyWarning and has no effect under pandas Copy-on-Write).
df_results.loc[df_results.method == 'mfvi', 'alpha'] = -5.0  # For MFVI
df_results.loc[df_results.method == 'ls', 'alpha'] = -1.0  # For label smoothing
df_results.loc[df_results.method == 'edl', 'alpha'] = 0.0  # For EDL

In [None]:
# Show the combined results table (one row per model directory and result file).
df_results

In [None]:
def _std_err(values):
    """Return np.std of the group divided by the square root of its size."""
    return np.std(values) / np.sqrt(values.shape[0])


# Per-alpha summary over all rows: run count plus mean and error estimate of
# each metric, written with the (column, aggfunc) tuple form of named
# aggregation.
metrics_summ = df_results.groupby('alpha').agg(
    n=('acc', 'count'),
    acc_mean=('acc', 'mean'),
    acc_err=('acc', _std_err),
    ece_mean=('ece', 'mean'),
    ece_err=('ece', _std_err),
    nll_mean=('nll', 'mean'),
    nll_err=('nll', _std_err),
    auroc_mean=('auroc', 'mean'),
    auroc_err=('auroc', _std_err),
)

In [None]:
# Group the results by corruption type so each group can be ranked separately.
gdf_corr = df_results.groupby(by='corruption')

In [None]:
# For every corruption, average each metric per alpha and rank the alphas
# against each other: lower is better for ece and nll, higher is better for
# acc and auroc.
rdfs = []
for _, corr_df in gdf_corr:
    per_alpha = corr_df.groupby('alpha').agg({
        'corruption': 'first',
        'acc': 'mean',
        'ece': 'mean',
        'nll': 'mean',
        'auroc': 'mean',
    })
    per_alpha = per_alpha.reset_index()
    per_alpha = per_alpha.assign(
        ece_rank=per_alpha['ece'].rank(ascending=True),
        acc_rank=per_alpha['acc'].rank(ascending=False),
        nll_rank=per_alpha['nll'].rank(ascending=True),
        auroc_rank=per_alpha['auroc'].rank(ascending=False),
    )
    rdfs.append(per_alpha)

# The index is not reset here, so the per-corruption row numbers repeat.
df_ranked = pd.concat(rdfs)


In [None]:
# Show the per-alpha metric summary.
metrics_summ

In [None]:
def _rank_std_err(values):
    """Return np.std of the group divided by the square root of its size."""
    return np.std(values) / np.sqrt(values.shape[0])


# Per-alpha mean rank (and its error estimate) for each metric, written with
# the (column, aggfunc) tuple form of named aggregation.
df_rank_results = df_ranked.groupby('alpha').agg(
    ece_rank_mean=('ece_rank', 'mean'),
    ece_rank_err=('ece_rank', _rank_std_err),
    acc_rank_mean=('acc_rank', 'mean'),
    acc_rank_err=('acc_rank', _rank_std_err),
    nll_rank_mean=('nll_rank', 'mean'),
    nll_rank_err=('nll_rank', _rank_std_err),
    auroc_rank_mean=('auroc_rank', 'mean'),
    auroc_rank_err=('auroc_rank', _rank_std_err),
)

In [None]:
# Join the metric summary and the rank summary on alpha.
df_final = pd.merge(metrics_summ, df_rank_results, on='alpha')

## Printout final results

In [None]:
# Show the merged table of metric summaries and rank summaries.
df_final

Print out LaTeX table

In [None]:
# Print one LaTeX row fragment per row of df_final: accuracy (mean \pm err)
# and its mean rank, then ECE (mean \pm err) and its mean rank.
# Raw strings are used because "\p" is an invalid escape sequence in a normal
# string literal (a warning today, an error in a future Python); the printed
# text is unchanged.
for row in df_final.itertuples():
    print(
        # r"${:.0e}$".format(row.Index),
        r"& ${:.3f} \pm {:.3f}$".format(row.acc_mean, row.acc_err),
        # r"& ${:.2f} \pm {:.2f}$".format(row.acc_rank_mean, row.acc_rank_err),
        r"& ${:.2f}$".format(row.acc_rank_mean),
        r"& ${:.3f} \pm {:.3f}$".format(row.ece_mean, row.ece_err),
        # r"& ${:.2f} \pm {:.2f}$".format(row.ece_rank_mean, row.ece_rank_err)
        r"& ${:.2f}$".format(row.ece_rank_mean)
    )

Create LaTeX table for aggregate OOD performance over all corruptions

In [None]:
#  Table 13 - 16
# Print one LaTeX row fragment per row of df_final, in the order NLL,
# accuracy, AUROC, ECE; each cell is "mean \pm err (mean rank)".
# Raw strings are used because "\p" is an invalid escape sequence in a normal
# string literal (a warning today, an error in a future Python); the printed
# text is unchanged.
for row in df_final.itertuples():
    print(
        r"& ${:.3f} \pm {:.3f} ({:.2f})$".format(row.nll_mean, row.nll_err, row.nll_rank_mean),
        r"& ${:.3f} \pm {:.3f} ({:.2f})$".format(row.acc_mean, row.acc_err, row.acc_rank_mean),
        r"& ${:.3f} \pm {:.3f} ({:.2f})$".format(row.auroc_mean, row.auroc_err, row.auroc_rank_mean),
        r"& ${:.3f} \pm {:.3f} ({:.2f})$".format(row.ece_mean, row.ece_err, row.ece_rank_mean)
    )

## For only identity

In [None]:
# Keep only the rows of the 'identity' corruption and drop the now-constant
# corruption column; reset_index() keeps the old row labels as an 'index' column.
_is_identity = df_results.corruption == 'identity'
df_iden = df_results[_is_identity].drop(columns=['corruption']).reset_index()

In [None]:
# Show the results restricted to the 'identity' corruption.
df_iden

In [None]:
def _iden_std_err(values):
    """Return np.std of the group divided by the square root of its size."""
    return np.std(values) / np.sqrt(values.shape[0])


# Per-alpha summary restricted to the identity rows: run count plus mean and
# error estimate of each metric. This rebinds metrics_summ.
metrics_summ = df_iden.groupby('alpha').agg(
    n=('acc', 'count'),
    acc_mean=('acc', 'mean'),
    acc_err=('acc', _iden_std_err),
    ece_mean=('ece', 'mean'),
    ece_err=('ece', _iden_std_err),
    nll_mean=('nll', 'mean'),
    nll_err=('nll', _iden_std_err),
    auroc_mean=('auroc', 'mean'),
    auroc_err=('auroc', _iden_std_err),
)

In [None]:
# Show the per-alpha metric summary.
metrics_summ

In [None]:
# For Table 1
# Print one LaTeX row fragment per row of metrics_summ, in the order NLL,
# accuracy, AUROC, ECE; each cell is "mean \pm err".
# Raw strings are used because "\p" is an invalid escape sequence in a normal
# string literal (a warning today, an error in a future Python); the printed
# text is unchanged.
for row in metrics_summ.itertuples():
    print(
        r"& ${:.3f} \pm {:.3f}$".format(row.nll_mean, row.nll_err),
        r"& ${:.3f} \pm {:.3f}$".format(row.acc_mean, row.acc_err),
        r"& ${:.3f} \pm {:.3f}$".format(row.auroc_mean, row.auroc_err),
        r"& ${:.3f} \pm {:.3f}$".format(row.ece_mean, row.ece_err)
    )

In [None]:
# For Table 7 -
# NOTE(review): the end of this table range is missing in the original comment.
# Print one LaTeX row fragment per row of metrics_summ, in the order NLL,
# accuracy, AUROC, ECE; each cell is "mean \pm err".
# Raw strings are used because "\p" is an invalid escape sequence in a normal
# string literal (a warning today, an error in a future Python); the printed
# text is unchanged.
for row in metrics_summ.itertuples():
    print(
        r"& ${:.3f} \pm {:.3f}$".format(row.nll_mean, row.nll_err),
        r"& ${:.3f} \pm {:.3f}$".format(row.acc_mean, row.acc_err),
        r"& ${:.3f} \pm {:.3f}$".format(row.auroc_mean, row.auroc_err),
        r"& ${:.3f} \pm {:.3f}$".format(row.ece_mean, row.ece_err)
    )