# Extract eval results - CIFAR

In [None]:
import sys
sys.path.append("./../")

In [None]:
import os
import json
import glob
import pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid')

#### Utility functions

In [None]:
# Accuracy threshold for models to be included in analysis.
# The QA filters further down keep only rows whose `acc` is strictly greater
# than this value.
ACC_THRESHOLD = 0.75 # For CIFAR10

In [None]:
def extract_results(model_dir):
    """
        Get metrics from model dir

        Reads ``config.json`` from ``model_dir`` and every
        ``ece_results_*.pkl`` file found next to it, and builds one record
        per result file.

        Args:
            model_dir: directory holding ``config.json`` (with the keys
                ``method`` and ``method_params``) and the pickled results.

        Returns:
            A list of dicts with the keys ``method``, ``params``,
            ``corruption``, ``severity``, ``ece``, ``acc``, ``nll``,
            ``auroc`` and their ``*_test`` counterparts. The list is empty
            when the directory holds no result files.
    """

    # Get config (context manager so the handle is closed right away)
    config_json = os.path.join(model_dir, 'config.json')
    with open(config_json, 'r') as f:
        config = json.load(f)

    # Extract config values
    method = config['method']

    # Create a table entry for parameters as string; numbers are zero padded
    # so that the resulting labels sort in numeric order
    param = method  # start with method label
    for _p, _p_value in config['method_params'].items():
        if isinstance(_p_value, int):
            _p_value_str = '{:04d}'.format(_p_value)
        elif isinstance(_p_value, float):
            _p_value_str = '{:08.3f}'.format(_p_value)
        else:
            _p_value_str = '{}'.format(_p_value)
        param += '-{}={}'.format(_p, _p_value_str)

    # Always a list, so callers can extend() with it even when nothing is found
    results = []

    # Get result files; sorted because glob returns them in arbitrary order
    result_files = sorted(glob.glob(model_dir + "/ece_results_*.pkl"))

    # Get results
    for rfile in result_files:
        filename = os.path.basename(rfile)
        # Get corruption name from file name (drop the 'ece_results_' prefix
        # and the '.pkl' suffix, remaining underscores become spaces)
        corr_name = ' '.join(filename.split('_')[2:])[:-4]
        # Split corruption name and identify severity
        severity = 3  # default
        _s = corr_name.split('-')
        if len(_s) > 1:
            corr_name = _s[0]
            # Keep the column type consistent with the integer default;
            # a non-numeric suffix is passed through unchanged
            try:
                severity = int(_s[1])
            except ValueError:
                severity = _s[1]
        # NOTE: pickle can execute arbitrary code - only load trusted files
        with open(rfile, 'rb') as f:
            logs = pickle.load(f)[0]
        results.append({
            'method': method,
            'params': param,
            'corruption': corr_name,
            'severity': severity,
            'ece': logs['ece_uncal_val'],
            'acc': logs['acc_val'],
            'nll': logs['nll_uncal_val'],
            'auroc': logs['auroc_val'],
            'ece_test': logs['ece_uncal_test'],
            'acc_test': logs['acc_test'],
            'nll_test': logs['nll_uncal_test'],
            'auroc_test': logs['auroc_test']
        })

    return results

In [None]:
def extract_ood_results(model_dir, dataset_str='FMNIST'):
    """
        Get OOD metrics from model dir

        Reads ``config.json`` and ``ood_results_<dataset_str>.pkl`` from
        ``model_dir``; the accuracy stored in ``ece_results_identity-1.pkl``
        is attached to the record so the caller can apply quality checks.

        Args:
            model_dir: directory holding ``config.json`` and the pickled
                result files.
            dataset_str: name used in the OOD result file name.

        Returns:
            A list of dicts with the keys ``method``, ``params``,
            ``ent_ood``, ``ent_test``, ``ent_delta`` and ``acc``. The list
            is empty when no matching OOD result file exists.

        Raises:
            AssertionError: if more than one OOD result file matches.
    """

    # Get config (context manager so the handle is closed right away)
    config_json = os.path.join(model_dir, 'config.json')
    with open(config_json, 'r') as f:
        config = json.load(f)

    # Extract config values
    method = config['method']

    # Create a table entry for parameters as string; numbers are zero padded
    # so that the resulting labels sort in numeric order
    param = method  # start with method label
    for _p, _p_value in config['method_params'].items():
        if isinstance(_p_value, int):
            _p_value_str = '{:04d}'.format(_p_value)
        elif isinstance(_p_value, float):
            _p_value_str = '{:08.3f}'.format(_p_value)
        else:
            _p_value_str = '{}'.format(_p_value)
        param += '-{}={}'.format(_p, _p_value_str)

    # Get OOD result files
    ood_result_files = glob.glob(model_dir + "/ood_results_{}.pkl".format(dataset_str))

    assert len(ood_result_files) <= 1, "More than one OOD results exists"

    # Nothing evaluated for this dataset: hand back an empty list (not None)
    # so callers can extend() with the result unconditionally
    if not ood_result_files:
        return []

    # Get accuracy on clean dataset also for quality checks; read only when
    # there is an OOD file to pair it with
    # NOTE: pickle can execute arbitrary code - only load trusted files
    acc_results_file = os.path.join(model_dir, "ece_results_identity-1.pkl")
    with open(acc_results_file, 'rb') as f:
        clean_results = pickle.load(f)[0]

    # Get results
    results = []
    for rfile in ood_result_files:
        with open(rfile, 'rb') as f:
            logs = pickle.load(f)[0]
        results.append({
            'method': method,
            'params': param,
            'ent_ood': logs['ent_ood'],
            'ent_test': logs['ent_test'],
            'ent_delta': logs['ent_delta'],
            'acc': clean_results['acc_val']
        })

    return results

#### Specify experiments

In [None]:
# CIFAR10 + VGG11
# Root directories to scan; every entry found directly inside a root is
# treated as one model directory by the loading cells below.
# The commented-out entries are kept for reference and are not loaded.
result_dirs = [
    "./../zoo/multiclass-v2/sl/CIFAR10/VGG11",
    "./../zoo/multiclass-v2/mfvi/CIFAR10/VGG11",
    # "./../zoo/multiclass/ls/CIFAR10/VGG11",
    # "./../zoo/multiclass/edl/computed-prior/CIFAR10/VGG11EDL",
    # "./../zoo/multiclass/edl/skewed-prior/CIFAR10/VGG11EDL",
    "./../zoo/multiclass-v2/edl/CIFAR10/VGG11EDL"
]

#### Load results

In [None]:
# Enumerate model directories and load evaluation results
results = []
for models_root in result_dirs:
    # Only sub-directories are model runs; stray files in the root (logs,
    # .DS_Store, ...) are skipped. sorted() makes the row order reproducible
    # since os.listdir returns entries in arbitrary order.
    model_dirs = [
        os.path.join(models_root, d)
        for d in sorted(os.listdir(models_root))
        if os.path.isdir(os.path.join(models_root, d))
    ]
    for _m in model_dirs:
        # A run without result files may yield None; treat it as "no rows"
        results.extend(extract_results(_m) or [])
df_results = pd.DataFrame(results)

In [None]:
# Basic QA: keep only the rows whose accuracy is above the threshold, i.e.
# drop every model which failed to train satisfactorily
df_results = df_results.loc[df_results['acc'] > ACC_THRESHOLD]

In [None]:
df_results

## Get results for Clean dataset

In [None]:
# Keep the rows evaluated on the uncorrupted ('identity') data only. The
# 'corruption' column is constant afterwards and dropped; reset_index() moves
# the old row labels into an 'index' column.
df_clean = (
    df_results
    .loc[df_results['corruption'] == 'identity']
    .drop(columns=['corruption'])
    .reset_index()
)
# Alternative without the identity filter:
# df_clean = df_results.drop(['corruption'], axis=1).reset_index()

In [None]:
df_clean

In [None]:
# Summary per parameter setting: number of runs plus, for every metric, the
# mean and its error (np.std of the group divided by sqrt of the group size)
_clean_agg = {'n': pd.NamedAgg(column='acc', aggfunc='count')}
for _metric in ('acc', 'ece', 'nll', 'auroc'):
    _clean_agg[_metric + '_mean'] = pd.NamedAgg(column=_metric, aggfunc='mean')
    _clean_agg[_metric + '_err'] = pd.NamedAgg(
        column=_metric,
        aggfunc=lambda x: np.std(x) / np.sqrt(x.shape[0]),
    )

metrics_summ = df_clean.groupby('params').agg(**_clean_agg)

In [None]:
metrics_summ

In [None]:
# For Table 1
# Prints one LaTeX table row per parameter setting (NLL, accuracy, AUROC,
# ECE as "mean \pm err"); the best value of each column is set in bold.

nll_min = metrics_summ.nll_mean.min()
acc_max = metrics_summ.acc_mean.max()
auroc_max = metrics_summ.auroc_mean.max()
ece_min = metrics_summ.ece_mean.min()

# Raw strings: "\m" and "\p" are invalid escape sequences in ordinary string
# literals and trigger a warning on recent Python versions
_cell_best = r"& $\mathbf{{{:.3f} \pm {:.3f}}}$"
_cell_other = r"&          ${:.3f} \pm {:.3f}$"

for row in metrics_summ.itertuples():
    buffer = "{:50s}".format(row.Index)

    # (mean, err, best value of the column) in output column order
    _cells = [
        (row.nll_mean, row.nll_err, nll_min),
        (row.acc_mean, row.acc_err, acc_max),
        (row.auroc_mean, row.auroc_err, auroc_max),
        (row.ece_mean, row.ece_err, ece_min),
    ]
    for _mean, _err, _best in _cells:
        # Exact comparison is fine: _best was taken from this very column
        _fmt = _cell_best if _mean == _best else _cell_other
        buffer += _fmt.format(_mean, _err)

    print(buffer)

#### Get results for test dataset

In [None]:
# Same summary as for the validation metrics, computed on the '*_test'
# columns; the output column names carry no '_test' suffix
_test_agg = {'n': pd.NamedAgg(column='acc_test', aggfunc='count')}
for _metric in ('acc', 'ece', 'nll', 'auroc'):
    _column = _metric + '_test'
    _test_agg[_metric + '_mean'] = pd.NamedAgg(column=_column, aggfunc='mean')
    _test_agg[_metric + '_err'] = pd.NamedAgg(
        column=_column,
        aggfunc=lambda x: np.std(x) / np.sqrt(x.shape[0]),
    )

metrics_summ_test = df_clean.groupby('params').agg(**_test_agg)

In [None]:
metrics_summ_test

In [None]:
def get_prefix(x):
    """
        Return x without its last '-'-separated field; a string containing
        no '-' is returned unchanged
    """
    head, sep, _tail = x.rpartition('-')
    return head if sep else x

# Find groups of experiments without looking at last param
_val_prefixes = metrics_summ.index.map(get_prefix)
unique_prefixes = sorted(set(_val_prefixes))

# For each of the unique prefixes, find the best in validation group according to
# NLL
r = []
for pfx in unique_prefixes:
    # Match the group exactly. A plain startswith(pfx) test would also pull in
    # settings of other groups whose label merely begins with the same text
    # (e.g. 'sl-a=1' is a textual prefix of 'sl-a=10-b=2').
    _df_val = metrics_summ[_val_prefixes == pfx]
    idx = _df_val.nll_mean.idxmin()

    # Now get the corresponding results from test set
    r.append(metrics_summ_test.loc[idx])

df_nll_best = pd.DataFrame(r)

In [None]:
unique_prefixes

In [None]:
# Prints one LaTeX table row per selected configuration (NLL, accuracy,
# AUROC, ECE as "mean \pm err"); the best value of each column is set in bold.
nll_min = df_nll_best.nll_mean.min()
acc_max = df_nll_best.acc_mean.max()
auroc_max = df_nll_best.auroc_mean.max()
ece_min = df_nll_best.ece_mean.min()

# Raw strings: "\m" and "\p" are invalid escape sequences in ordinary string
# literals and trigger a warning on recent Python versions
_cell_best = r" & $\mathbf{{{:.3f} \pm {:.3f}}}$"
_cell_other = r" &          ${:.3f} \pm {:.3f}$"

for row in df_nll_best.itertuples():
    buffer = "{:50s}".format(row.Index)

    # (mean, err, best value of the column) in output column order
    _cells = [
        (row.nll_mean, row.nll_err, nll_min),
        (row.acc_mean, row.acc_err, acc_max),
        (row.auroc_mean, row.auroc_err, auroc_max),
        (row.ece_mean, row.ece_err, ece_min),
    ]
    for _mean, _err, _best in _cells:
        # Exact comparison is fine: _best was taken from this very column
        _fmt = _cell_best if _mean == _best else _cell_other
        buffer += _fmt.format(_mean, _err)

    print(buffer)

## OOD Results

In [None]:
# Enumerate model directories and load evaluation results
ood_results = []
for models_root in result_dirs:
    # Only sub-directories are model runs; stray files in the root (logs,
    # .DS_Store, ...) are skipped. sorted() makes the row order reproducible
    # since os.listdir returns entries in arbitrary order.
    model_dirs = [
        os.path.join(models_root, d)
        for d in sorted(os.listdir(models_root))
        if os.path.isdir(os.path.join(models_root, d))
    ]
    for _m in model_dirs:
        # A run without OOD results may yield None; treat it as "no rows"
        ood_results.extend(extract_ood_results(_m, dataset_str='SVHN') or [])
df_ood_results = pd.DataFrame(ood_results)

In [None]:
df_ood_results

In [None]:
# Same QA as for the calibration results: keep only the models whose
# accuracy is above the threshold
df_ood_results = df_ood_results.loc[df_ood_results['acc'] > ACC_THRESHOLD]

In [None]:
# Summary per parameter setting: number of runs plus mean and error (np.std
# of the group divided by sqrt of the group size) of the entropy columns and
# of the accuracy
_ood_agg = {'n': pd.NamedAgg(column='ent_ood', aggfunc='count')}
# (output name stem, source column) - accuracy is reported as 'ent_acc_*'
_ood_columns = [
    ('ent_ood', 'ent_ood'),
    ('ent_test', 'ent_test'),
    ('ent_delta', 'ent_delta'),
    ('ent_acc', 'acc'),
]
for _stem, _column in _ood_columns:
    _ood_agg[_stem + '_mean'] = pd.NamedAgg(column=_column, aggfunc='mean')
    _ood_agg[_stem + '_err'] = pd.NamedAgg(
        column=_column,
        aggfunc=lambda x: np.std(x) / np.sqrt(x.shape[0]),
    )

ood_metrics_summ = df_ood_results.groupby('params').agg(**_ood_agg)

In [None]:
ood_metrics_summ

In [None]:
def get_prefix(x):
    """
        Return x without its last '-'-separated field; a string containing
        no '-' is returned unchanged
    """
    # rsplit with maxsplit=1 cuts at the last '-' only (or not at all)
    return x.rsplit('-', 1)[0]

# Find groups of experiments without looking at last param
unique_prefixes = sorted(set(ood_metrics_summ.index.map(get_prefix)))

# Group label of every validation summary row, computed once for all groups
_val_prefixes = metrics_summ.index.map(get_prefix)

# For each of the unique prefixes, find the best in validation group according to
# NLL and printout the entropy in OOD
r = []
for pfx in unique_prefixes:
    # Match the group exactly. A plain startswith(pfx) test would also pull in
    # settings of other groups whose label merely begins with the same text
    # (e.g. 'sl-a=1' is a textual prefix of 'sl-a=10-b=2').
    _df_val = metrics_summ[_val_prefixes == pfx]
    idx = _df_val.nll_mean.idxmin()

    # Now get the corresponding results from OOD set
    r.append(ood_metrics_summ.loc[idx])

df_ood_best = pd.DataFrame(r)

In [None]:
df_ood_best

In [None]:
# Table - Results OOD
# Prints one LaTeX table row per selected configuration: in-domain entropy,
# OOD entropy and their delta; the largest delta is set in bold.
ent_delta_max = df_ood_best.ent_delta_mean.max()

# Raw strings: "\m" and "\p" are invalid escape sequences in ordinary string
# literals and trigger a warning on recent Python versions
_cell = r"& ${:.3f} \pm {:.3f}$"
_cell_best = r"& $\mathbf{{{:.3f} \pm {:.3f}}}$"
_cell_other = r"&          ${:.3f} \pm {:.3f}$"

for row in df_ood_best.itertuples():
    buffer = "{:50s}".format(row.Index)

    # In-domain entropy
    buffer += _cell.format(row.ent_test_mean, row.ent_test_err)

    # OOD entropy
    buffer += _cell.format(row.ent_ood_mean, row.ent_ood_err)

    # Entropy delta. Exact comparison is fine: the maximum was taken from
    # this very column.
    _fmt = _cell_best if row.ent_delta_mean == ent_delta_max else _cell_other
    buffer += _fmt.format(row.ent_delta_mean, row.ent_delta_err)

    print(buffer)

## Get results for Corrupted dataset

In [None]:
# Everything except the uncorrupted ('identity') evaluations, renumbered from
# zero; the old row labels are discarded
df_corrupted = df_results.loc[df_results['corruption'] != 'identity'].reset_index(drop=True)

In [None]:
# Get the best method with NLL with each algorithm
def get_prefix(x):
    """
        Return the first '-'-separated field of x (all of x when it
        contains no '-')
    """
    return x.split('-', 1)[0]

# Group the experiments by the label get_prefix() assigns to them
_val_prefixes = metrics_summ.index.map(get_prefix)
unique_prefixes = sorted(set(_val_prefixes))

# For each of the unique prefixes, find the best in validation group according to
# NLL
r = []
for pfx in unique_prefixes:
    # Match the group exactly. A plain startswith(pfx) test would also pull in
    # settings of other groups whose label merely begins with the same text.
    _df_val = metrics_summ[_val_prefixes == pfx]
    idx = _df_val.nll_mean.idxmin()

    # Now get the corresponding results from test set
    r.append(metrics_summ_test.loc[idx])

df_nll_best_config = pd.DataFrame(r)

In [None]:
df_nll_best_config

In [None]:
# Restrict the corrupted-data results to the parameter settings selected as
# best configuration of their family, renumbering the rows from zero
_best_params = df_nll_best_config.index.tolist()
_is_best = df_corrupted['params'].isin(_best_params)
df_corrupted_test = df_corrupted.loc[_is_best].reset_index(drop=True)

In [None]:
# Mean ECE / NLL / accuracy on the test split per (corruption, method) pair
_plot_columns = ['corruption', 'method', 'ece_test', 'nll_test', 'acc_test']
gdf_corrupted_ece_mean = (
    df_corrupted_test[_plot_columns]
    .groupby(['corruption', 'method'])
    .mean()
)

# Flatten for plotting: upper-case method labels and display-friendly column
# names ('nll_test' and 'acc_test' keep their names)
df = gdf_corrupted_ece_mean.reset_index()
df['method'] = df['method'].str.upper()
df = df.rename(columns={'ece_test': 'ECE', 'corruption': 'Corruption', 'method': 'Method'})

# Legend names of the methods; labels not listed here stay as they are
df['Method'] = df['Method'].replace({
    'SL': 'Proposed',
    'LS': 'Label Smoothing',
    'MFVI': 'ELBO',
})

### Overall corrupted

##### ECE

In [None]:
# Grouped bar chart of the mean ECE per corruption, one bar per method
fig, g = plt.subplots(figsize=(12, 3))
sns.barplot(data=df, x='Corruption', y='ECE', hue='Method', ax=g)
_ = g.set_xticklabels(g.get_xticklabels(), rotation=45)
_ = g.set(xlabel="", ylabel="ECE")

##### NLL

In [None]:
# Grouped bar chart of the mean NLL per corruption, one bar per method
fig, g = plt.subplots(figsize=(12, 3))
sns.barplot(data=df, x='Corruption', y='nll_test', hue='Method', ax=g)
_ = g.set_xticklabels(g.get_xticklabels(), rotation=45)
_ = g.set(xlabel="", ylabel="NLL")

##### Accuracy

In [None]:
# Grouped bar chart of the mean accuracy per corruption, one bar per method
fig, g = plt.subplots(figsize=(12, 3))
sns.barplot(data=df, x='Corruption', y='acc_test', hue='Method', ax=g)
_ = g.set_xticklabels(g.get_xticklabels(), rotation=45)
# _ = g.set_ylim(0.5, 1.0)
_ = g.set(xlabel="", ylabel="Accuracy")
_ = g.legend(loc='lower right')

### Level-wise corrupted

In [None]:
# One row of plots per corruption type: NLL, ECE and accuracy on the test
# split against the corruption severity, one line per method
corruptions = sorted(df_corrupted_test.corruption.unique())
n = len(corruptions)

# .copy() so the relabelling below works on an independent frame instead of a
# filtered slice of df_corrupted_test (avoids SettingWithCopyWarning)
df = df_corrupted_test[df_corrupted_test.corruption.isin(corruptions)].copy()
df['method'] = df['method'].replace({
    'sl': 'Proposed',
    'ls': 'Label Smoothing',
    'mfvi': 'ELBO',
    'edl': 'EDL',
})

# squeeze=False keeps `ax` two-dimensional even for a single corruption, so
# the ax[i, j] indexing below always works
fig, ax = plt.subplots(n, 3, figsize=(3*4, n*2.5), squeeze=False)

gdf_corruption = df.groupby(by='corruption')

# Metrics drawn below; selecting them before averaging keeps the string
# columns (method, params, ...) out of mean(), which pandas >= 2.0 rejects
_plot_metrics = ['nll_test', 'ece_test', 'acc_test']

for i, (_corr, _df) in enumerate(gdf_corruption):
    gdf_method = _df.groupby(by='method')
    for _method, _df_method in gdf_method:
        _r = _df_method.groupby(by='severity')[_plot_metrics].mean()

        # Plot NLL
        ax[i, 0].plot(_r.index, _r.nll_test, label=_method)

        # Plot ECE
        ax[i, 1].plot(_r.index, _r.ece_test, label=_method)

        # Plot Acc
        ax[i, 2].plot(_r.index, _r.acc_test, label=_method)

    # Fix labels
    if i == 0:
        ax[i, 0].set_title("NLL")
        ax[i, 1].set_title("ECE")
        ax[i, 2].set_title("Acc")
    ax[i, 0].legend()
    ax[i, 0].set_ylabel(_corr)
