# Extract eval results - MNIST - Clean test set

Analyze evaluation results for BMNIST
- Clean data

In [None]:
import sys
# Append the parent directory to the module search path. The path is relative,
# so it resolves against the current working directory of the kernel.
sys.path.append("./../")

In [None]:
import os
import json
import glob
import pickle

import numpy as np
import pandas as pd

In [None]:
# Accuracy threshold for models to be included in analysis.
# The QA step further down keeps only result rows whose `acc` is strictly
# greater than this value.
ACC_THRESHOLD = 0.80

In [None]:
def extract_results(model_dir):
    """
    Collect the evaluation metrics stored in a single model directory.

    Reads ``config.json`` for the method name and its parameters, then loads
    every ``ece_results_*.pkl`` file found directly inside ``model_dir``.

    Args:
        model_dir: Path to a directory holding ``config.json`` and zero or
            more ``ece_results_<corruption>.pkl`` files.

    Returns:
        A list with one dict per result file, ordered by file path. Each dict
        has the keys ``method``, ``params``, ``corruption``, ``ece``, ``acc``,
        ``nll`` and ``auroc``. The list is empty when the directory contains
        no result files.

    Raises:
        OSError: If ``config.json`` or a result file cannot be opened.
        KeyError: If the config or a result log lacks an expected key.
    """

    # Get config (context manager so the file handle is always closed)
    config_json = os.path.join(model_dir, 'config.json')
    with open(config_json, 'r') as f:
        config = json.load(f)

    # Extract config values
    method = config['method']

    # Create a table entry for parameters as string. Numeric values are
    # zero-padded to a fixed width so the labels sort consistently.
    param_str = method  # start with method label
    for _p, _p_value in config['method_params'].items():
        if isinstance(_p_value, int):
            # NOTE: bool is a subclass of int, so True/False become 0001/0000.
            _p_value_str = '{:04d}'.format(_p_value)
        elif isinstance(_p_value, float):
            _p_value_str = '{:08.3f}'.format(_p_value)
        else:
            _p_value_str = '{}'.format(_p_value)
        param_str += '-{}={}'.format(_p, _p_value_str)
    param = param_str

    # Always return a list so callers can safely ``extend`` with the result.
    results = []

    # Get OOD result files. The directory part is escaped so glob
    # metacharacters in the path are matched literally; sorting makes the
    # output order reproducible.
    pattern = os.path.join(glob.escape(model_dir), "ece_results_*.pkl")
    ood_result_files = sorted(glob.glob(pattern))

    # Get results
    for rfile in ood_result_files:
        filename = os.path.basename(rfile)
        # Get corruption name from file name: drop the "ece_results_" prefix
        # and the ".pkl" suffix, turning remaining underscores into spaces.
        corr_name = ' '.join(filename.split('_')[2:])[:-4]
        # NOTE: unpickling can execute arbitrary code; only point this at
        # result files you trust.
        with open(rfile, 'rb') as f:
            logs = pickle.load(f)[0]
        results.append({
            'method': method,
            'params': param,
            'corruption': corr_name,
            'ece': logs['ece_uncal'],
            'acc': logs['acc'],
            'nll': logs['nll_uncal_test'],
            'auroc': logs['auroc']
        })

    return results

#### Load results

In [None]:
# # LeNet + 1000
# models_root = "./../zoo/sl/half-prior-alphavar/BinaryMNISTC-1000-53-identity/LeNet"
# # models_root = "./../zoo/sl/uniform-prior-alphavar/BinaryMNISTC-1000-53-identity/LeNet"
# # models_root = "./../zoo/sl/auto-prior-alphavar/BinaryMNISTC-1000-53-identity/LeNet"
# elbo_models_root = "./../zoo/mfvi/BinaryMNISTC-1000-53-identity/LeNet"
# ls_models_root = "./../zoo/ls/BinaryMNISTC-1000-53-identity/LeNet"
# edl_models_root = "./../zoo/edl/BinaryMNISTC-1000-53-identity/LeNetEDL"

# LeNet + D1000
# result_dirs = [
#     "./../zoo/sl/half-prior-alphavar/BinaryMNISTC-1000-53-identity/LeNet", # SL
#     # "./../zoo/sl/uniform-prior-alphavar/BinaryMNISTC-1000-53-identity/LeNet", #SL
#     # "./../zoo/sl/auto-prior-alphavar/BinaryMNISTC-1000-53-identity/LeNet", #SL
#     "./../zoo/mfvi/BinaryMNISTC-1000-53-identity/LeNet", #MFVI
#     "./../zoo/ls/BinaryMNISTC-1000-53-identity/LeNet", #LS
#     # "./../zoo/edl/computed-prior/BinaryMNISTC-1000-53-identity/LeNetEDL", #EDL
#     # "./../zoo/edl/skewed-prior/BinaryMNISTC-1000-53-identity/LeNetEDL", #EDL
#     # "./../zoo/edl/uniform-prior/BinaryMNISTC-1000-53-identity/LeNetEDL", #EDL
# ]

# LeNet + D8000
# Each pair is (method sub-directory under the zoo, architecture directory);
# the comprehension expands it into the full result-directory path.
result_dirs = [
    "./../zoo/{}/BinaryMNISTC-8000-53-identity/{}".format(_family, _arch)
    for _family, _arch in [
        ("sl/half-prior-alphavar", "LeNet"),  # SL
        ("sl/uniform-prior-alphavar", "LeNet"),  # SL
        ("sl/auto-prior-alphavar", "LeNet"),  # SL
        ("mfvi", "LeNet"),  # MFVI
        ("ls", "LeNet"),  # LS
        ("edl/computed-prior", "LeNetEDL"),  # EDL
        ("edl/skewed-prior", "LeNetEDL"),  # EDL
        ("edl/uniform-prior", "LeNetEDL"),  # EDL
    ]
]


# ConvNet + 1000
# models_root = "./../zoo/sl/half-prior-alphavar/BinaryMNISTC-1000-53-identity/ConvNet"
# models_root = "./../zoo/sl/auto-prior-alphavar/BinaryMNISTC-1000-53-identity/ConvNet"
# models_root = "./../zoo/sl/uniform-prior-alphavar/BinaryMNISTC-1000-53-identity/ConvNet"
# elbo_models_root = "./../zoo/mfvi/BinaryMNISTC-1000-53-identity/ConvNet"
# ls_models_root = "./../zoo/ls/BinaryMNISTC-1000-53-identity/ConvNet"
# edl_models_root = "./../zoo/edl/BinaryMNISTC-1000-53-identity/ConvNetEDL"

# ConvNet + 8000
# models_root = "./../zoo/sl/half-prior-alphavar/BinaryMNISTC-8000-53-identity/ConvNet"
# models_root = "./../zoo/sl/auto-prior-alphavar/BinaryMNISTC-8000-53-identity/ConvNet"
# models_root = "./../zoo/sl/uniform-prior-alphavar/BinaryMNISTC-8000-53-identity/ConvNet"
# elbo_models_root = "./../zoo/mfvi/BinaryMNISTC-8000-53-identity/ConvNet"
# ls_models_root = "./../zoo/ls/BinaryMNISTC-8000-53-identity/ConvNet"
# edl_models_root = "./../zoo/edl/BinaryMNISTC-8000-53-identity/ConvNetEDL"

In [None]:
# Enumerate model directories and load evaluation results.
# `results` collects one dict per (model, corruption) result file and is then
# turned into a DataFrame with one row per dict.
results = []
for models_root in result_dirs:
    # listdir order is arbitrary; sort so the row order is reproducible
    for _name in sorted(os.listdir(models_root)):
        _m = os.path.join(models_root, _name)
        # Skip stray non-directory entries, which cannot be model directories
        if not os.path.isdir(_m):
            continue
        # Guard against a None/empty return for models without result files
        results.extend(extract_results(_m) or [])
df_results = pd.DataFrame(results)

In [None]:
# Basic QA: keep only result rows whose accuracy is strictly above the
# threshold. The filter is applied per row (one row per result file), so a
# model may keep some of its rows and lose others.
df_results = df_results.loc[df_results['acc'] > ACC_THRESHOLD]

In [None]:
# Show the results table that remains after the accuracy filter
df_results

#### Get results for clean dataset

In [None]:
# Keep only the uncorrupted ('identity') rows. The now-constant 'corruption'
# column is removed; reset_index() renumbers the rows and keeps the previous
# index as an 'index' column.
df_clean = (
    df_results.loc[df_results['corruption'] == 'identity']
    .drop(columns=['corruption'])
    .reset_index()
)

In [None]:
def _std_err(x):
    """Standard error of the mean, using np.std's default (ddof=0)."""
    return np.std(x) / np.sqrt(x.shape[0])


# Per parameter setting: number of runs, plus mean and standard error of each
# metric. Column order: n, then <metric>_mean / <metric>_err for each metric.
_agg_spec = {'n': pd.NamedAgg(column='acc', aggfunc='count')}
for _metric in ('acc', 'ece', 'nll', 'auroc'):
    _agg_spec[_metric + '_mean'] = pd.NamedAgg(column=_metric, aggfunc='mean')
    _agg_spec[_metric + '_err'] = pd.NamedAgg(column=_metric, aggfunc=_std_err)

metrics_summ = df_clean.groupby('params').agg(**_agg_spec)

In [None]:
# Show the per-parameter-setting summary table
metrics_summ

In [None]:
# For Table 1: print one LaTeX table row per parameter setting, with columns
# in the order NLL, accuracy, AUROC, ECE (each as "mean \pm error").
# The template is a raw string so that "\p" is a literal backslash rather than
# an invalid escape sequence; the printed text is unchanged.
_cell = r"& ${:.3f} \pm {:.3f}$"
for row in metrics_summ.itertuples():
    print(
        "{:50s}".format(row.Index),
        _cell.format(row.nll_mean, row.nll_err),
        _cell.format(row.acc_mean, row.acc_err),
        _cell.format(row.auroc_mean, row.auroc_err),
        _cell.format(row.ece_mean, row.ece_err)
    )