In [1]:
import pandas as pd

In [2]:
uncertainty_type_considered="softmax_response" 
# softmax_response or entropy

In [3]:
imgnet = "imagenet"
imgnetv2 = "imagenet_ood"
caltech256 = "caltech256"
iwc_ind = "iwildcam"
iwc_ood = "iwildcam_ood"
datasets = [imgnet,imgnetv2,iwc_ind,iwc_ood,caltech256]
prefix_single_model = "eval_res/evaluation_single_model"
save_prefix_single_model = "cleaned/evaluation_single_model"

backbone_df = pd.read_csv("backbones.csv")
backbone_df = backbone_df.rename(columns={"model_name": "model"})

In [4]:
architecture_family = {
    "vit": "ViT",
    "convnext": "ConvNeXt",
    "efficient": "EfficientNet",
    "mnas": "MNASNet",
    "regnet": "RegNet",
    "resnet": "ResNet",
    "regnext": "RegNeXt",
    "resnext": "ResNeXt",
    "shuffle": "ShuffleNet",
    "swin": "Swin"
}
def get_architecture(model_name):
    for key, value in architecture_family.items():
        if key in model_name.lower():
            return value
    return "Other"

## Cleaning Single Model dfs
1. Map Architecture Names
2. Filter out pre-temp-scaled results
3. Make Mode column and fill with "Base Model"

In [5]:
single_model_dfs = [pd.read_csv(f"{prefix_single_model}_{dataset}.csv") for dataset in datasets]
for i,df in enumerate(single_model_dfs):
    df["architecture"] = df["model"].apply(get_architecture)
    df=df[df["uncertainty_type"]==uncertainty_type_considered]
    df = df[df["wrapper"]=="TempScaleWrapper"]
    df["mode"]="Base Model"
    df_merged = df.merge(backbone_df, on="model", how="left")
    df_merged.to_csv(f"{save_prefix_single_model}_{datasets[i]}.csv",index=False)
    display(df_merged.head()) if i==0 else None

Unnamed: 0,Acc,F1,Brier,NLL,ECE,CP_AUROC,AURC,E-AURC,SAC@90,SAC@91,...,architecture_family,source,tv_weights,Acc@1,Acc@5,Params,GFLOPS,Architecture,Year,Version
0,0.8408,0.838734,0.00024,0.660703,0.038995,0.853991,0.046748,0.033342,0.874484,0.848463,...,ConvNeXt,torchvision,IMAGENET1K_V1,84.062,96.87,88.6,15.36,ConvNeXt_Base,2022.0,V1
1,0.843979,0.842046,0.000236,0.65291,0.038762,0.850796,0.047078,0.034218,0.884316,0.857895,...,ConvNeXt,torchvision,IMAGENET1K_V1,84.414,96.976,197.8,34.36,ConvNeXt_Large,2022.0,V1
2,0.836421,0.834847,0.000243,0.672291,0.027154,0.853674,0.048728,0.034552,0.866126,0.840105,...,ConvNeXt,torchvision,IMAGENET1K_V1,83.616,96.65,50.2,8.68,ConvNeXt_Small,2022.0,V1
3,0.825221,0.823448,0.000256,0.700695,0.031806,0.856791,0.050817,0.034565,0.846147,0.817558,...,ConvNeXt,torchvision,IMAGENET1K_V1,82.52,96.146,28.6,4.46,ConvNeXt_Tiny,2022.0,V1
4,0.777095,0.774309,0.000314,0.901992,0.018729,0.864319,0.065434,0.038505,0.743074,0.718421,...,EfficientNet,torchvision,IMAGENET1K_V1,77.692,93.532,5.3,0.39,EfficientNet_B0,2019.0,V1


## Cleaning Duo DFs
1. filter out non-sr unc
2. rename mode

In [6]:
prefix_duo="eval_res/evaluation_duo"
save_prefix_duo="cleaned/evaluation_duo"

In [7]:
mode_map = {
    "logit_average": "Duo: Unweighted",
    "dictatorial_weighteduncertainty": "Duo: UQ Only",
    "temperature_weighted": "Duo: Ours",
    "confident": "Duo: Confidence"
}

SAC_metrics = [f"SAC@{i}" for i in range(90, 100)]
metrics = ['Acc', 'F1', 'Brier', 'NLL', 'CP_AUROC', 'AURC','ECE',"SAC@98"] + SAC_metrics

In [8]:
duo_dfs = [pd.read_csv(f"{prefix_duo}_{dataset}.csv") for dataset in datasets]
for i,df in enumerate(duo_dfs):
    df = df[df["uncertainty_type"]==uncertainty_type_considered]
    df["gflops"]=df["gflops_small"]+df["gflops_large"]
    df["mode"].replace(mode_map,inplace=True)
    df = df[df["mode"].isin(mode_map.values())]
    duo_dfs[i]=df
    
for i, (duo_df, single_df, dataset_name) in enumerate(zip(duo_dfs, single_model_dfs, datasets)):
    if dataset_name in [imgnet,imgnetv2]:
        single_df = single_df[single_df["wrapper"]=="TempScaleWrapper"]
    else:
        single_df = single_df[single_df["wrapper"]=="TempScaleWrapper"]
    single_df = single_df.add_suffix("_baseline")
    merged = pd.merge(
        duo_df,
        single_df,
        left_on=["model_large", "uncertainty_type"],
        right_on=["model_baseline", "uncertainty_type_baseline"],
        how="left"
    )
    # Step 3: Compute delta and percentage improvement
    for metric in metrics:
        baseline_col = f"{metric}_baseline"
        delta_col = f"delta_{metric}"
        improvement_col = f"improvement_{metric}"
        print(merged.columns)
        merged[delta_col] = merged[metric] - merged[baseline_col]

        if metric in ["Acc", "F1", "CP_AUROC"]+SAC_metrics: # best is 1
            merged[improvement_col] = merged[delta_col] / (1-merged[baseline_col]) * 100
        elif metric in ["Brier", "NLL", "AURC", "ECE"]: # best is 0
            merged[improvement_col] = -merged[delta_col] / merged[baseline_col] * 100
        else:
            raise(f"{metric=} not assigned PTP perfection rule yet")

    # Add dataset column
    merged["dataset"] = dataset_name
    # display(merged.head()) if dataset_name==caltech256 else None
    merged.to_csv(f"{save_prefix_duo}_{datasets[i]}.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["gflops"]=df["gflops_small"]+df["gflops_large"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mode"].replace(mode_map,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["gflops"]=df["gflops_small"]+df["gflops_large"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://panda

Index(['Acc', 'F1', 'Brier', 'NLL', 'ECE', 'CP_AUROC', 'AURC', 'E-AURC',
       'SAC@90', 'SAC@91', 'SAC@92', 'SAC@93', 'SAC@94', 'SAC@95', 'SAC@96',
       'SAC@97', 'SAC@98', 'SAC@99', 'model_large', 'model_small', 'mode',
       'wrapper', 'uncertainty_type', 'gflops_large', 'gflops_small',
       'gflops_balance', 'split', 'gflops', 'Acc_baseline', 'F1_baseline',
       'Brier_baseline', 'NLL_baseline', 'ECE_baseline', 'CP_AUROC_baseline',
       'AURC_baseline', 'E-AURC_baseline', 'SAC@90_baseline',
       'SAC@91_baseline', 'SAC@92_baseline', 'SAC@93_baseline',
       'SAC@94_baseline', 'SAC@95_baseline', 'SAC@96_baseline',
       'SAC@97_baseline', 'SAC@98_baseline', 'SAC@99_baseline',
       'model_baseline', 'wrapper_baseline', 'uncertainty_type_baseline',
       'gflops_baseline', 'params_baseline', 'split_baseline',
       'architecture_baseline'],
      dtype='object')
Index(['Acc', 'F1', 'Brier', 'NLL', 'ECE', 'CP_AUROC', 'AURC', 'E-AURC',
       'SAC@90', 'SAC@91', 'SAC@9