# Extract eval results

Analyze evaluation results for BMNIST with modified $s_0$
- Clean data

In [4]:
import sys
# Append the parent of the current working directory to the module search path.
# NOTE(review): presumably so project modules one level up are importable
# (possibly needed when unpickling result files) — confirm.
sys.path.append("./../")

In [5]:
import os
import json
import glob
import pickle

import numpy as np
import pandas as pd

In [6]:
def extract_results(model_dir):
    """Collect the uncalibrated evaluation metrics stored in one model directory.

    Reads ``config.json`` inside ``model_dir`` for the run settings and every
    file matched by ``<model_dir>/ece_results.pkl`` for the metrics.

    Args:
        model_dir: Path of a model run directory containing ``config.json``.

    Returns:
        A list with one dict per result file found, holding the keys
        ``method``, ``lam_sl``, ``ds_size``, ``ece``, ``acc``, ``nll`` and
        ``auroc``. The list is empty when the directory has no result file,
        so callers can always ``extend`` an accumulator with it.

    Raises:
        FileNotFoundError: If ``config.json`` does not exist.
        KeyError: If the config lacks ``method``, ``method_params`` or
            ``ds_params``, or the stored logs lack one of the metric keys.
    """
    # Get config; the context manager closes the handle deterministically.
    config_json = os.path.join(model_dir, 'config.json')
    with open(config_json, 'r') as f:
        config = json.load(f)

    # Extract config values; runs without these entries fall back to defaults.
    method = config['method']
    lam_sl = config['method_params'].get('lam_sl', 0.0)
    ds_size = config['ds_params'].get('size', 'Full')

    # Always a list (never None) so a directory without results is harmless.
    results = []

    # Get result files
    result_files = glob.glob(model_dir + "/ece_results.pkl")

    for rfile in result_files:
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at result files produced by trusted runs.
        with open(rfile, 'rb') as f:
            # The pickle holds a sequence; its first element is the metrics dict.
            logs = pickle.load(f)[0]

        results.append({
            'method': method,
            'lam_sl': lam_sl,
            'ds_size': ds_size,
            'ece': logs['ece_uncal'],
            'acc': logs['acc'],
            'nll': logs['nll_uncal_test'],
            'auroc': logs['auroc'],
        })

    return results

## Model dirs

In [10]:
# Root directories; each immediate child is treated as one model run directory
# by the cells below (os.listdir is applied to these roots).
# Alternative root, currently disabled:
# models_root = "./../zoo/abl-alpha100-unibin/CIFAR10/VGG11"
models_root = "./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11"
elbo_models_root = "./../zoo/abl-alpha100-unibin-mfvi-cifar10/CIFAR10/VGG11"

## S-ELBO results

In [11]:
# Full path of every entry found directly under models_root.
model_dirs = [os.path.join(models_root, entry) for entry in os.listdir(models_root)]

In [12]:
# Gather the metric rows of every model directory into one flat list,
# echoing each directory path as it is processed.
results = []
for _m in model_dirs:
    print(_m)
    results += extract_results(_m)

./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-02-2-20220728134850
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-02-1-20220728134748
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-04-2-20220728125431
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-05-1-20220728122554
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-04-1-20220728125300
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-03-2-20220728132148
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-04-5-20220728134033
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-04-4-20220728125405
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-01-1-20220728141516
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-06-5-20220728112300
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-01-4-20220728141417
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR10/VGG11/slim-lam1e-03-3-20220728131945
./../zoo/abl-alpha100-slim-CIFAR10/CIFAR

## ELBO results

In [13]:
# Full path of every entry found directly under elbo_models_root.
model_dirs = [os.path.join(elbo_models_root, entry) for entry in os.listdir(elbo_models_root)]

In [14]:
# Append the rows of this second set of model directories to the rows already
# collected, then build one DataFrame over the combined list.
for _m in model_dirs:
    results += extract_results(_m)
df_results = pd.DataFrame(data=results)

In [15]:
# Rebuild the DataFrame from the accumulated `results` list.
# NOTE(review): the previous cell already ends by building this same
# DataFrame, so this cell looks redundant — confirm before removing.
df_results = pd.DataFrame(results)

In [16]:
# Display the per-run results table (one row per result file collected above).
df_results

Unnamed: 0,method,lam_sl,ds_size,ece,acc,nll,auroc
0,slim,0.01,Full,0.069834,0.805733,0.673393,0.974516
1,slim,0.01,Full,0.034837,0.797333,0.63091,0.976188
2,slim,0.0001,Full,0.10795,0.814933,0.800451,0.975679
3,slim,1e-05,Full,0.025263,0.8064,0.609941,0.978025
4,slim,0.0001,Full,0.072083,0.809733,0.657443,0.976529
5,slim,0.001,Full,0.104092,0.821333,0.733193,0.978337
6,slim,0.0001,Full,0.009465,0.808267,0.592921,0.977695
7,slim,0.0001,Full,0.035677,0.800933,0.631788,0.976978
8,slim,0.1,Full,0.033291,0.790933,0.647654,0.973956
9,slim,1e-06,Full,0.075332,0.806,0.667382,0.976297


In [17]:
def _std_err(values):
    """Return np.std(values) (NumPy default ddof=0) divided by sqrt(len)."""
    return np.std(values) / np.sqrt(values.shape[0])


# Summarise the runs per lam_sl value: run count, plus the mean and the
# error estimate from _std_err for accuracy, ECE, NLL and AUROC.
metrics_summ = df_results.groupby('lam_sl').agg(
    n=pd.NamedAgg(column='acc', aggfunc='count'),
    acc_mean=pd.NamedAgg(column='acc', aggfunc='mean'),
    acc_err=pd.NamedAgg(column='acc', aggfunc=_std_err),
    ece_mean=pd.NamedAgg(column='ece', aggfunc='mean'),
    ece_err=pd.NamedAgg(column='ece', aggfunc=_std_err),
    nll_mean=pd.NamedAgg(column='nll', aggfunc='mean'),
    nll_err=pd.NamedAgg(column='nll', aggfunc=_std_err),
    auroc_mean=pd.NamedAgg(column='auroc', aggfunc='mean'),
    auroc_err=pd.NamedAgg(column='auroc', aggfunc=_std_err),
)

In [18]:
# Display the per-lam_sl summary table.
metrics_summ

Unnamed: 0_level_0,n,acc_mean,acc_err,ece_mean,ece_err,nll_mean,nll_err,auroc_mean,auroc_err
lam_sl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,5,0.809147,0.002108,0.082463,0.00681,0.702708,0.023684,0.976076,0.000456
1e-06,5,0.80928,0.001577,0.075725,0.0072,0.675843,0.022738,0.976856,0.0003
1e-05,5,0.809387,0.001064,0.070603,0.013664,0.681951,0.028311,0.976576,0.000409
0.0001,5,0.80928,0.00213,0.063573,0.016242,0.676575,0.031797,0.976914,0.00034
0.001,5,0.809227,0.003367,0.07528,0.010444,0.683155,0.015706,0.976551,0.000509
0.01,5,0.793253,0.009226,0.053784,0.00724,0.657615,0.022892,0.97443,0.001723
0.1,4,0.783133,0.00349,0.047076,0.012329,0.686653,0.022948,0.972836,0.000872
1.0,2,0.158867,0.018903,0.029095,0.007775,2.205565,0.054852,0.626081,0.043268


In [19]:
def _std_err(col):
    """Return np.std(col) (NumPy default ddof=0) over the square root of the group size."""
    return np.std(col) / np.sqrt(col.shape[0])


# Group the runs by lam_sl and compute, for each metric, its mean and the
# _std_err estimate; `n` is the number of runs in the group.
# Each keyword maps an output column to a (source column, aggregation) pair.
metrics_summ = df_results.groupby('lam_sl').agg(
    n=('acc', 'count'),
    acc_mean=('acc', 'mean'),
    acc_err=('acc', _std_err),
    ece_mean=('ece', 'mean'),
    ece_err=('ece', _std_err),
    nll_mean=('nll', 'mean'),
    nll_err=('nll', _std_err),
    auroc_mean=('auroc', 'mean'),
    auroc_err=('auroc', _std_err),
)

In [20]:
# Display the per-lam_sl summary table.
metrics_summ

Unnamed: 0_level_0,n,acc_mean,acc_err,ece_mean,ece_err,nll_mean,nll_err,auroc_mean,auroc_err
lam_sl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,5,0.809147,0.002108,0.082463,0.00681,0.702708,0.023684,0.976076,0.000456
1e-06,5,0.80928,0.001577,0.075725,0.0072,0.675843,0.022738,0.976856,0.0003
1e-05,5,0.809387,0.001064,0.070603,0.013664,0.681951,0.028311,0.976576,0.000409
0.0001,5,0.80928,0.00213,0.063573,0.016242,0.676575,0.031797,0.976914,0.00034
0.001,5,0.809227,0.003367,0.07528,0.010444,0.683155,0.015706,0.976551,0.000509
0.01,5,0.793253,0.009226,0.053784,0.00724,0.657615,0.022892,0.97443,0.001723
0.1,4,0.783133,0.00349,0.047076,0.012329,0.686653,0.022948,0.972836,0.000872
1.0,2,0.158867,0.018903,0.029095,0.007775,2.205565,0.054852,0.626081,0.043268


In [21]:
# Print one LaTeX table fragment per row of metrics_summ, with the columns
# NLL, accuracy and ECE, each formatted as "& $mean \pm err$" to 3 decimals.
# The template is a raw string: "\p" is not a valid escape sequence, so a
# normal literal triggers an invalid-escape warning on current Python versions.
_cell = r"& ${:.3f} \pm {:.3f}$"
for row in metrics_summ.itertuples():
    print(
        _cell.format(row.nll_mean, row.nll_err),
        _cell.format(row.acc_mean, row.acc_err),
        # AUROC (row.auroc_mean / row.auroc_err) is available in metrics_summ
        # but is currently not printed.
        _cell.format(row.ece_mean, row.ece_err),
    )

& $0.703 \pm 0.024$ & $0.809 \pm 0.002$ & $0.082 \pm 0.007$
& $0.676 \pm 0.023$ & $0.809 \pm 0.002$ & $0.076 \pm 0.007$
& $0.682 \pm 0.028$ & $0.809 \pm 0.001$ & $0.071 \pm 0.014$
& $0.677 \pm 0.032$ & $0.809 \pm 0.002$ & $0.064 \pm 0.016$
& $0.683 \pm 0.016$ & $0.809 \pm 0.003$ & $0.075 \pm 0.010$
& $0.658 \pm 0.023$ & $0.793 \pm 0.009$ & $0.054 \pm 0.007$
& $0.687 \pm 0.023$ & $0.783 \pm 0.003$ & $0.047 \pm 0.012$
& $2.206 \pm 0.055$ & $0.159 \pm 0.019$ & $0.029 \pm 0.008$
