In [1]:
# Sort rank of every augmentation strategy; a lower rank is listed first when
# result frames are ordered by strategy.
STRATEGIES_ORDER = {
    strategy_name: rank
    for rank, strategy_name in enumerate([
        'Baseline',
        'Salt&Pepper',
        'Gaussian',
        'DefaultAug',
        'DefaultAug+Gaussian',
        'DefaultAug+S&P',
    ])
}

In [2]:
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np

# Marker shapes cycled over the legend entries in plot_results_all.
marker_styles = ['o', 's', '^', 'v', '<', '>', 'p']

# One theme call instead of set_style -> set -> set_style: the earlier calls were
# overwritten by the later ones anyway. The serif font list goes through
# `set_theme(rc=...)` because `set_style(..., rc)` only applies keys that belong to
# seaborn's style definition, which drops "font.serif" silently.
sns.set_theme(
    style="white",
    font="serif",
    rc={"font.serif": ["Times", "Palatino", "serif"]},
)

In [3]:
# Load the per-fold evaluation results for every severity level.
results = pd.read_csv('../results/full_10folds_results_all_severities.csv')

# Remove the " 1" ... " 5" tokens from the evaluation-set names (presumably a
# severity-level suffix -- TODO confirm against the CSV). `regex=False` keeps the
# replacement literal regardless of the pandas version's default.
for severity_level in range(1, 6):
    results['evaluation_set'] = results['evaluation_set'].str.replace(
        f' {severity_level}', '', regex=False
    )

# `.assign` returns a new frame, so the filtered slices are never written to in
# place (which is what triggers SettingWithCopyWarning).

# Everything that is not in-distribution is pooled under a single label.
results_ood = (
    results[results['evaluation_set'] != 'In-Distribution']
    .assign(evaluation_set='Out-of-Distribution')
)

# In-distribution rows get the evaluation-set name as their severity.
results_in = (
    results[results['evaluation_set'] == 'In-Distribution']
    .assign(Severity='In-Distribution')
)


# Miscoverage
### Generating samples by leaving 2 consecutive folds out

In [4]:
# Nine overlapping fold subsets: subset k drops the two folds at positions
# k and k + 1 of the fold ids 1..10, leaving eight folds each.
replications = [
    np.delete(np.arange(1, 11), [first_dropped, first_dropped + 1])
    for first_dropped in range(9)
]

replications

[array([ 3,  4,  5,  6,  7,  8,  9, 10]),
 array([ 1,  4,  5,  6,  7,  8,  9, 10]),
 array([ 1,  2,  5,  6,  7,  8,  9, 10]),
 array([ 1,  2,  3,  6,  7,  8,  9, 10]),
 array([ 1,  2,  3,  4,  7,  8,  9, 10]),
 array([ 1,  2,  3,  4,  5,  8,  9, 10]),
 array([ 1,  2,  3,  4,  5,  6,  9, 10]),
 array([ 1,  2,  3,  4,  5,  6,  7, 10]),
 array([1, 2, 3, 4, 5, 6, 7, 8])]

In [5]:
def generate_miscoverage(df, fold_subsets=None, strategies_order=None):
    """Build mean-centred F1 scores for every fold subset.

    The frame is first expanded into one copy per fold subset (tagged with a
    1-based ``replication`` number). Each row's score is then centred on the mean
    of its (strategy, model, replication, Severity) group. Finally the expanded
    frame is filtered once more per fold subset, and the rows kept for subset
    ``i`` get their strategy renamed to ``"<strategy>_<i>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``fold``, ``strategy``, ``model``, ``Severity``,
        ``evaluation_set`` and ``f1-score(weighted avg)``. It is not modified.
    fold_subsets : sequence of array-like, optional
        Fold ids kept by each subset. Defaults to the notebook-level
        ``replications`` list.
    strategies_order : mapping, optional
        Sort rank per base strategy name. Defaults to the notebook-level
        ``STRATEGIES_ORDER``.

    Returns
    -------
    tuple
        ``(result, per_subset)`` where ``result`` is all per-subset frames
        concatenated and sorted by strategy rank and then strategy label, and
        ``per_subset`` is the list of unsorted per-subset frames.

    Raises
    ------
    KeyError
        If a base strategy name is missing from ``strategies_order``.
    """
    if fold_subsets is None:
        fold_subsets = replications
    if strategies_order is None:
        strategies_order = STRATEGIES_ORDER

    score_column = 'f1-score(weighted avg)'
    group_keys = ['strategy', 'model', 'replication', 'Severity']

    # One tagged copy of the data per fold subset.
    each_replication_df = pd.concat(
        [
            df[df['fold'].isin(selected_folds)].assign(replication=number)
            for number, selected_folds in enumerate(fold_subsets, start=1)
        ],
        ignore_index=True,
    )

    # The group means do not depend on the subset being extracted below, so
    # centre every row once, vectorised, instead of per row inside the loop.
    group_means = each_replication_df.groupby(group_keys)[score_column].transform('mean')
    centered_scores = each_replication_df[score_column] - group_means

    each_leave1out_df = []
    for i, selected_folds in enumerate(fold_subsets):
        subset = each_replication_df[each_replication_df['fold'].isin(selected_folds)]
        each_leave1out_df.append(pd.DataFrame({
            'model': subset['model'],
            'replication': subset['replication'],
            'fold': subset['fold'],
            'severity': subset['Severity'],
            'evaluation_set': subset['evaluation_set'],
            'strategy': subset['strategy'] + '_' + str(i),
            score_column: centered_scores.loc[subset.index],
        }))

    result_dataframe = pd.concat(each_leave1out_df, ignore_index=True)
    # rsplit removes only the subset index appended above, so a strategy name
    # that itself contains an underscore still resolves to the right rank.
    sort_key = result_dataframe['strategy'].map(
        lambda label: strategies_order[label.rsplit('_', 1)[0]]
    )
    result_dataframe_final = (
        result_dataframe
        .assign(sort_key=sort_key)
        .sort_values(by=['sort_key', 'strategy'])
        .drop(columns=['sort_key'])
    )

    return result_dataframe_final, each_leave1out_df

In [6]:
# In-distribution scores: every row is labelled with the same severity value.
result_dataframe_in, each_dataframe_fscore_in = generate_miscoverage(results_in)
result_dataframe_in = result_dataframe_in.assign(severity='In-Distribution')

# Out-of-distribution scores keep the severity values found in the data.
result_dataframe_ood, each_dataframe_fscore_ood = generate_miscoverage(results_ood)


In [None]:
def plot_results_all(df_in, df_out, x_label='Mean Centered F1-Score', figsize=(50, 25),
                     output_path='../output/miscoverage_plot.pdf'):
    """Plot mean-centred F1 point plots: one row per model, one column per panel.

    Columns are the in-distribution scores followed by the out-of-distribution
    scores at the 'Lowest', 'Mid-Range' and 'Highest' severities. Every hue level
    (a strategy label of the form ``"<strategy>_<index>"``) is drawn with the
    colour and marker of its base strategy, and a single figure-level legend
    lists the base strategies. The figure is saved to ``output_path`` and shown.

    Parameters
    ----------
    df_in : pandas.DataFrame
        In-distribution scores with the columns ``model``, ``strategy``,
        ``severity`` and ``f1-score(weighted avg)``. Its models define the rows.
    df_out : pandas.DataFrame
        Out-of-distribution scores with the same columns.
    x_label : str
        X-axis label, drawn on the bottom row only. The default is the label
        this figure previously hard-coded, so the default call renders the same.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    output_path : str
        Where the figure is written.

    Raises
    ------
    KeyError
        If a strategy label's base name has no colour assigned.
    """
    score_column = 'f1-score(weighted avg)'
    severity_levels = ['Lowest', 'Mid-Range', 'Highest']
    strategy_colors = {
        'Baseline': '#5471ab',
        'Gaussian': '#6aa66e',
        'Salt&Pepper': '#d1885c',
        'DefaultAug+S&P': '#7f73af',
        'DefaultAug+Gaussian': '#8f7963',
        'DefaultAug': '#b65655',
    }

    def base_strategy(label):
        # Drop the "_<index>" suffix from a hue label.
        return label.rsplit('_', 1)[0]

    # One marker per base strategy, in order of first appearance, so the panels
    # and the legend agree for any number of labels per strategy.
    all_labels = pd.concat([df_in['strategy'], df_out['strategy']]).unique()
    base_names = list(dict.fromkeys(base_strategy(label) for label in all_labels))
    strategy_markers = {
        name: marker_styles[position % len(marker_styles)]
        for position, name in enumerate(base_names)
    }

    # Rows are labelled with the model name found in the data, so a label can
    # never disagree with the rows that were actually plotted.
    models = list(df_in['model'].unique())
    n_rows = len(models)
    fig, axes = plt.subplots(n_rows, 1 + len(severity_levels), figsize=figsize, squeeze=False)

    x_min = -.03
    x_max = .03
    x_ticks = [x_min, 0, x_max]

    handles = []
    labels = []

    for row, model in enumerate(models):
        model_in = df_in[df_in['model'] == model]
        model_out = df_out[df_out['model'] == model]
        panels = [('In-Distribution', model_in)] + [
            (severity, model_out[model_out['severity'] == severity])
            for severity in severity_levels
        ]

        for col, (panel_title, panel_df) in enumerate(panels):
            # Colours and markers are resolved from the panel's own hue levels
            # rather than assumed to follow the order found in df_in.
            hue_levels = list(panel_df['strategy'].unique())
            ax = sns.pointplot(
                data=panel_df,
                x=score_column,
                y='severity',
                hue='strategy',
                hue_order=hue_levels,
                linestyles='-',
                markers=[strategy_markers[base_strategy(label)] for label in hue_levels],
                errorbar=("ci", 95),
                dodge=.9,
                palette={label: strategy_colors[base_strategy(label)] for label in hue_levels},
                err_kws={'linewidth': 3},
                ax=axes[row, col]
            )

            ax.set_xlim(x_min, x_max)
            ax.set_xticks(x_ticks)
            ax.set_yticks([])
            ax.axvline(x=0, color='k', linestyle='--')

            # Panel titles go on the top row, the x label on the bottom row and
            # the model name on the first column.
            if row == 0:
                ax.set_title(panel_title, fontsize=42)
            ax.set_xlabel(x_label if row == n_rows - 1 else "", fontsize=42)
            ax.set_ylabel(model if col == 0 else '', fontsize=42)
            ax.tick_params(labelsize=38)

            # An empty panel may not get a legend at all.
            panel_legend = ax.get_legend()
            if panel_legend is not None:
                panel_legend.remove()

            if row == 0 and col == 0:
                handles, labels = ax.get_legend_handles_labels()

    # Collapse the per-label handles of the first panel to one per base strategy.
    legend_entries = {}
    for handle, label in zip(handles, labels):
        legend_entries.setdefault(base_strategy(label), handle)

    for name, handle in legend_entries.items():
        handle.set_marker(strategy_markers[name])
        handle.set_markersize(30)

    fig.legend(list(legend_entries.values()), list(legend_entries.keys()), title="Strategy",
               loc='lower center', bbox_to_anchor=(0.5, -0.135), fontsize=42,
               title_fontsize=42, ncol=3)
    fig.suptitle('Distributions Domain Range', fontsize=50, y=1)
    plt.tight_layout()
    plt.savefig(output_path, bbox_inches='tight')
    plt.show()

# Draw the in-distribution vs. out-of-distribution comparison figure and save it as a PDF.
plot_results_all(result_dataframe_in, result_dataframe_ood)

# Miscoverage STD DEV

#### In-Distribution

In [150]:
def bootstrap_confidence_interval(data, num_samples=1000, ci=0.95, ddof=1, random_state=None):
    """Bootstrap a confidence interval for the standard deviation of ``data``.

    Parameters
    ----------
    data : array-like
        One-dimensional sample; SciPy requires at least two observations.
    num_samples : int
        Number of bootstrap resamples.
    ci : float
        Confidence level of the interval.
    ddof : int
        Delta degrees of freedom of the standard deviation. The default of 1
        matches ``pandas.Series.std``, so the interval belongs to the same
        estimator as a point estimate computed with pandas; pass 0 for the
        population standard deviation.
    random_state : optional
        Seed or generator forwarded to ``scipy.stats.bootstrap``; pass a fixed
        value for a reproducible interval.

    Returns
    -------
    tuple
        ``(low, high)`` bounds of the 'basic' bootstrap interval.
    """
    data = np.asarray(data)

    def _std(sample, axis):
        # Vectorised statistic: SciPy calls it along the resampling axis.
        return np.std(sample, axis=axis, ddof=ddof)

    res = stats.bootstrap(
        (data,),
        _std,
        confidence_level=ci,
        n_resamples=num_samples,
        method='basic',
        vectorized=True,
        random_state=random_state,
    )
    return res.confidence_interval.low, res.confidence_interval.high

In [151]:
# Pool the per-subset in-distribution frames and report, per model and strategy,
# the standard deviation of the centred scores with its bootstrap interval.
df = pd.concat(each_dataframe_fscore_in)
score_column = 'f1-score(weighted avg)'
# The trailing underscore keeps 'DefaultAug_' from also matching the
# 'DefaultAug+...' labels.
strategy_prefixes = [
    'Baseline', 'Gaussian', 'Salt&Pepper',
    'DefaultAug_', 'DefaultAug+Gaussian_', 'DefaultAug+S&P_',
]
for model in ['ResNet50', 'Xception']:
    print('\n')
    model_rows = df[df['model'] == model]
    for strategy in strategy_prefixes:
        curr_df = model_rows[model_rows['strategy'].str.startswith(strategy)]
        scores = curr_df[score_column]
        lower, upper = bootstrap_confidence_interval(scores)
        print(f"STD {model} - {strategy}: {scores.std()} ({lower}, {upper})")



STD ResNet50 - Baseline: 0.06353211548672956 (0.06106268065713018, 0.06614960458990626)
STD ResNet50 - Gaussian: 0.03164450354719362 (0.030283328025111134, 0.03325610237411661)
STD ResNet50 - Salt&Pepper: 0.02937196654963576 (0.028102874446607137, 0.030512578581924375)
STD ResNet50 - DefaultAug_: 0.01478300194118503 (0.013961867188080044, 0.015605737516210775)
STD ResNet50 - DefaultAug+Gaussian_: 0.015266434115414172 (0.01444592607265728, 0.01610775671889196)
STD ResNet50 - DefaultAug+S&P_: 0.014651179919667967 (0.013872420331671107, 0.01551129206304325)


STD Xception - Baseline: 0.048032519001490026 (0.04581286804226009, 0.0505023107582514)
STD Xception - Gaussian: 0.032909360337623796 (0.03143248878059454, 0.034648484687443665)
STD Xception - Salt&Pepper: 0.04709308426529416 (0.04544183009525114, 0.048702082615707944)
STD Xception - DefaultAug_: 0.006588904192786502 (0.006210673889275062, 0.006976752804731249)
STD Xception - DefaultAug+Gaussian_: 0.013328707915748922 (0.0128497504

#### Out-of-Distribution

In [38]:
# Pool the per-subset out-of-distribution frames and report, per model and
# strategy, the standard deviation of the centred scores with its bootstrap interval.
df = pd.concat(each_dataframe_fscore_ood)
score_column = 'f1-score(weighted avg)'
# The trailing underscore keeps 'DefaultAug_' from also matching the
# 'DefaultAug+...' labels.
strategy_prefixes = [
    'Baseline', 'Gaussian', 'Salt&Pepper',
    'DefaultAug_', 'DefaultAug+Gaussian_', 'DefaultAug+S&P_',
]
for model in ['ResNet50', 'Xception']:
    print('\n')
    model_rows = df[df['model'] == model]
    for strategy in strategy_prefixes:
        curr_df = model_rows[model_rows['strategy'].str.startswith(strategy)]
        scores = curr_df[score_column]
        lower, upper = bootstrap_confidence_interval(scores)
        print(f"STD {model} - {strategy}: {scores.std()} ({lower}, {upper})")



STD ResNet50 - Baseline: 0.08836883904380416 (0.08661553632395996, 0.09010103579488148)
STD ResNet50 - Gaussian: 0.10086383248365637 (0.09855908344785269, 0.10340442558739327)
STD ResNet50 - Salt&Pepper: 0.07783415296743314 (0.07573891441610651, 0.07985055332314206)
STD ResNet50 - DefaultAug_: 0.0791524405537469 (0.07792940298956437, 0.0804926112706884)
STD ResNet50 - DefaultAug+Gaussian_: 0.09299640679977839 (0.09062863591245778, 0.09559063499956144)
STD ResNet50 - DefaultAug+S&P_: 0.07479540932984444 (0.07249541050248078, 0.07694306077107199)


STD Xception - Baseline: 0.08456549411501284 (0.08336529330504239, 0.08570393206660082)
STD Xception - Gaussian: 0.0799014962999847 (0.07820283209482294, 0.08159153322774475)
STD Xception - Salt&Pepper: 0.06442951254670468 (0.06311279822113955, 0.06574203121748683)
STD Xception - DefaultAug_: 0.08351874879023403 (0.0824999507472845, 0.0846136712079276)
STD Xception - DefaultAug+Gaussian_: 0.0712837991858424 (0.06941056495728587, 0.0733000036