In [1]:
import os
import git
from pathlib import Path
from typing import List
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display
import numpy as np
import plotly.graph_objects as go
from IPython.display import clear_output
import scipy
import pylustrator
import math

ROOT_DIR =  Path(git.Repo('.', search_parent_directories=True).working_tree_dir)
SAVE_FIGS = False

os.chdir(os.path.join(ROOT_DIR, "utilities"))
from testing import * # If MATLAB is not installed, open utilities and set to False
from plotting import *
os.chdir(os.path.join(ROOT_DIR, "results", "combined_results"))
np.random.seed(0)
plots_path = os.path.join(ROOT_DIR, "publication", "paper", "draft_plots")

def variance_prior(r, eta, scale=1):
    beta = (eta+1.5)/r
    var_prior = scale * scipy.special.gamma(beta + 1/r)/scipy.special.gamma(beta)
    return var_prior

def kurtosis_prior(r, eta, fisher=True):
    beta = (eta+1.5)/r
    kurtosis = 3*scipy.special.gamma(beta + 2/r)*scipy.special.gamma(beta)/scipy.special.gamma(beta+1/r)**2 
    if fisher:
        return kurtosis - 3
    else:
        return kurtosis 

fixed_palette = {
    'gray': 'xkcd:gray',
    'green': 'xkcd:shamrock green',
    'red': 'xkcd:light red',
    'blue': 'xkcd:blue',
}

def find_master_dfs(root_dir: str) -> List[str]:
    root_path = Path(root_dir)
    if not root_path.exists():
        raise FileNotFoundError(f"Directory not found: {root_dir}")

    master_df_paths = []
    for current_dir, _, files in os.walk(root_path):
        if 'master_df.csv' in files:
            master_df_path = Path(os.path.join(current_dir, 'master_df.csv'))
            master_df_paths.append(str(master_df_path.absolute()))
    return master_df_paths

def add_hull(master_df, rEtaKsstats_dict, GROUP='group', debug=False):

    master_df_copy = master_df.copy()
    master_df_copy = master_df.set_index(GROUP)
    groups = master_df_copy.index
    master_df_copy["hull"] = ""

    for group in groups:
        if master_df_copy.loc[group, "total_samples"] < 10:
            master_df_copy.loc[group, "hull"] = np.nan
           
        else:
            drop_keys =list(rEtaKsstats_dict[group].keys())[-3:]
            if debug:
                print(drop_keys)
            pre_optimization = pd.DataFrame(rEtaKsstats_dict[group]).drop(drop_keys, axis = 1 )
            optimization = pd.DataFrame(rEtaKsstats_dict[group])[drop_keys]
            optimization = optimization.rename(columns = {"r_optimize": "r", "eta_optimize": "eta", drop_keys[-1]: "ksstat"})
            optimization = optimization.dropna()
            full_df = pre_optimization.merge(optimization, on=["r", "eta"], how="outer")
            full_df = full_df.set_index(["r", "eta"])
            full_df["ksstat"] = full_df.min(axis=1)
            full_df = full_df.reset_index()
            full_df = full_df[["r", "eta", "ksstat"]]
            full_df["1/beta"] = full_df["r"]/(full_df["eta"] + 1.5)
            MULT = 1.2
            cutoff = max(min(full_df["ksstat"]) * MULT, master_df_copy.loc[group, "kstest_stat_cutoff_0.05"], 0.01)
            filtered_df = full_df[full_df["ksstat"] < cutoff]
            points = np.column_stack((filtered_df["r"], filtered_df["1/beta"])) + stats.norm.rvs(size=(len(filtered_df), 2)) * 0.001  # Adding small noise for convex hull computation
            hull = ConvexHull(points)
            master_df_copy.loc[group, "hull"] = hull

    return master_df_copy.reset_index()

relevant_cols = ['group', 'obs_var', 'var_lower', 'var_upper', 'obs_kurt', 'kurt_lower',
       'kurt_upper', 'total_samples', 'initial_r', 'initial_eta', # 'initial_scale',
       'kstest_stat_initial', 'kstest_stat_cutoff_0.05',
       'best_r', 'best_eta', # 'best_scale', 
       'kstest_stat_best', # 'n_pval_0.05',
        'param_gaussian',
       'kstest_stat_gaussian', 'kstest_pval_gaussian', 'param_laplace',
       'kstest_stat_laplace', 'kstest_pval_laplace', 'param_t',
       'kstest_stat_t', 'kstest_pval_t', 'kstest_pval_gengamma', 
       'dataset', 'subset', 'transform', 'orientation', 'channel', 'github_plot', 'dataset_type', 'hull']

all_paths = find_master_dfs(os.path.join(ROOT_DIR, "results", "case-studies"))
all_master_dfs = []
github_plots_path = "https://github.com/yashdave003/hierarchical-bayesian-model-validation/blob/main/results/case-studies/"

for path in all_paths:
    if 'scaleTesting' in path:
        continue
    if 'standardTesting' in path:
        continue
    master_df = pd.read_csv(path)
    master_df = master_df.rename(columns={master_df.columns[0]: 'group'})
    parts = Path(path).parts[-7:]
    if parts[0] == 'case-studies':
        parts = parts[1:]
    if "MRI" in path:
        dataset, slice, transform, orientation, _, _ = parts
        master_df['dataset'] = dataset
        master_df['transform'] = transform
        master_df['subset'] = slice
        master_df['channel'] = np.nan
        master_df['orientation'] = orientation
        master_df['github_plot'] = [github_plots_path+'/'.join([dataset, slice, transform, orientation, 'plots', f'compare_cdf_pdf_layer_{group}.jpg']) for group in master_df['group']]
    
    elif len(parts) > 6:
        dataset, subset, transform, orientation, channel, _, _ = parts
        master_df['dataset'] = dataset
        master_df['transform'] = transform
        master_df['subset'] = subset
        master_df['channel'] = channel
        master_df['orientation'] = orientation
        master_df['github_plot'] = [github_plots_path+'/'.join([dataset, subset, transform, orientation, channel, 'plots', f'compare_cdf_pdf_layer_{group}.jpg']) for group in master_df['group']]
    else:
        dataset, size, transform, channel, _, _ = parts
        master_df['dataset'] = dataset
        master_df['transform'] = transform
        master_df['subset'] = size
        master_df['channel'] = channel
        master_df['orientation'] = np.nan
        master_df['github_plot'] = [github_plots_path+'/'.join([dataset, size, transform, channel, 'plots', f'compare_cdf_pdf_layer_{group}.jpg']) for group in master_df['group']]
    
    if dataset in ['pastis', 'agriVision', 'spaceNet']:
        master_df['dataset_type'] = 'remote sensing'
    elif dataset in ['syntheticMRI2D', 'syntheticMRI3D']:
        master_df['dataset_type'] = 'medical'
    elif dataset in ['coco', 'segmentAnything']:
        master_df['dataset_type'] = 'natural'

    GROUP = 'layer' if transform.split("-")[0] == 'wavelet' else ('band' if transform.split("-")[0] == 'fourier' else 'error')
    rEtaKsstatsDict = pd.read_pickle(path[:-18] + "cache/rEtaKsstats_dict.pickle")
    master_df = add_hull(master_df, rEtaKsstatsDict)


    all_master_dfs.append(master_df[relevant_cols])
    
main_df = pd.concat(all_master_dfs)
# main_df['prior_var'] = main_df.apply(lambda row : variance_prior(row.loc['best_r'], row.loc['best_eta']), axis = 1)
# main_df['prior_kurt'] = main_df.apply(lambda row : kurtosis_prior(row.loc['best_r'], row.loc['best_eta']), axis = 1)
# main_df['gaussian_kurt'] = np.zeros(main_df.shape[0])
# main_df['laplace_kurt'] = 3*np.ones(main_df.shape[0])
main_df['best_beta'] = (main_df['best_eta'] + 1.5)/main_df['best_r'] 
main_df['best_1/beta'] = 1/main_df['best_beta']
main_df['total_samples'] = main_df['total_samples']//10 # TODO: look into why total_samples*10
main_df['kstest_stat_cutoff_0.10'] = stats.kstwo(n=main_df['total_samples']).isf(0.1)
kstest_stat_cutoff_2 = 0.2
kstest_stat_cutoff_2_name = 'kstest_stat_cutoff_0.20'
main_df[kstest_stat_cutoff_2_name] = stats.kstwo(n=main_df['total_samples']).isf(kstest_stat_cutoff_2)
main_df['beat_all_priors'] = (main_df['kstest_stat_best'] < np.minimum.reduce([main_df['kstest_stat_gaussian'], main_df['kstest_stat_laplace'], main_df['kstest_stat_t']])).astype(int)
main_df["best_prior"] = np.array(["GenGamma", "Gaussian", "Laplace", "Student-T", np.nan])[
                                        np.nanargmin(np.array([main_df['kstest_stat_best'], main_df['kstest_stat_gaussian'], 
                                                 main_df['kstest_stat_laplace'], main_df['kstest_stat_t'], 0.99*np.ones_like(main_df['kstest_stat_t'])]).T, axis=1)]


frequency_map = pd.read_csv(os.path.join(ROOT_DIR, "transformed-data", "master-frequency-map.csv")).set_index(['dataset', 'transform', 'group'])
# main_df = main_df.set_index(['dataset', 'subset', 'transform', 'group']).merge(frequency_map, left_index = True, right_index=True).reset_index()
main_df = main_df.set_index(['dataset', 'subset', 'transform', 'group']).reset_index() #[(main_df['dataset'] == 'pastis') | (main_df['dataset'] == 'agriVision') | (main_df['dataset'] == 'spaceNet')]
print(main_df.shape)
main_df = main_df.merge(pd.read_csv('result_categorization - combined_categories.csv')[['github_plot', 'failure_category', 'failure_type', 'which_ones']], on='github_plot', how='left')
main_df['dataset'].value_counts()
main_df.head()

KeyError: "['kstest_pval_gengamma', 'dataset_type'] not in index"

In [None]:
main_df2 = main_df.fillna("None", inplace=False)

In [None]:
failCatDF = main_df.copy()
failCatDF = failCatDF[failCatDF["failure_type"]!= "low samples"]
failCatDF["pass"] = failCatDF["failure_category"].apply(lambda x: x in ["practically_pass", "actually_pass"])
failCatDF = failCatDF[["dataset_type","dataset", "subset", "transform", "orientation", "failure_category", "pass", "beat_all_priors", "best_prior"]]
failCatDF.fillna("None", inplace=True)
failCatDF["number"] = (
    failCatDF.groupby(["dataset_type", "dataset", "subset", "transform", "orientation", "failure_category"])["pass"]
    .transform("count")
)

failCatDF["pass_percentage"] = np.round(
    failCatDF.groupby(["dataset_type", "dataset", "subset", "transform", "orientation"])["pass"]
    .transform("mean") * 100
)

failCatDF["beat_all_priors_percentage"] = np.round(
    failCatDF.groupby(["dataset_type", "dataset", "subset", "transform", "orientation"])["beat_all_priors"]
    .transform("mean") * 100
)

failCatDF = failCatDF.groupby(["dataset_type", "dataset", "subset", "transform", "orientation", "failure_category"]).first().reset_index().sort_values(by=["dataset_type", "dataset", "subset", "transform", "orientation", "failure_category"])
failCatDF


Unnamed: 0,dataset_type,dataset,subset,transform,orientation,failure_category,pass,beat_all_priors,best_prior,number,pass_percentage,beat_all_priors_percentage
0,medical,syntheticMRI2D,axial,wavelet,diagonal,practically_pass,True,1,GenGamma,6,86.0,71.0
1,medical,syntheticMRI2D,axial,wavelet,diagonal,trivial_failure,False,0,Gaussian,1,86.0,71.0
2,medical,syntheticMRI2D,axial,wavelet,horizontal,practically_pass,True,1,GenGamma,5,71.0,86.0
3,medical,syntheticMRI2D,axial,wavelet,horizontal,trivial_failure,False,0,Gaussian,2,71.0,86.0
4,medical,syntheticMRI2D,axial,wavelet,vertical,practically_pass,True,1,GenGamma,5,71.0,71.0
...,...,...,...,...,...,...,...,...,...,...,...,...
73,remote sensing,spaceNet,full,wavelet,diagonal,actually_pass,True,1,GenGamma,3,62.0,91.0
74,remote sensing,spaceNet,full,wavelet,diagonal,practically_pass,True,1,GenGamma,17,62.0,91.0
75,remote sensing,spaceNet,full,wavelet,diagonal,trivial_failure,False,0,Gaussian,12,62.0,91.0
76,remote sensing,spaceNet,full,wavelet,horizVert,practically_pass,True,1,GenGamma,20,62.0,100.0


In [None]:
ordered_failcat_cols = ["actually_pass", "practically_pass", "TO DISCUSS", "interesting_failure", "trivial_failure"]
ordered_prior_cols = ["GenGamma", "Gaussian", "Laplace", "Student-T"]

# Remote Sensing

In [None]:
DATASET_TYPE = "remote sensing" 

In [None]:
medical = failCatDF.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical = medical.groupby(["dataset_type", "dataset", "subset", "transform", "orientation"]).first()[["pass_percentage", "beat_all_priors_percentage"]]
medical

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pass_percentage,beat_all_priors_percentage
dataset_type,dataset,subset,transform,orientation,Unnamed: 5_level_1,Unnamed: 6_level_1
remote sensing,agriVision,full,fourier,,95.0,98.0
remote sensing,agriVision,full,wavelet,diagonal,100.0,100.0
remote sensing,agriVision,full,wavelet,horizVert,100.0,100.0
remote sensing,pastis,full,fourier,,100.0,100.0
remote sensing,pastis,full,wavelet,diagonal,100.0,100.0
remote sensing,pastis,full,wavelet,horizVert,100.0,96.0
remote sensing,spaceNet,full,fourier,,100.0,100.0
remote sensing,spaceNet,full,wavelet,diagonal,62.0,91.0
remote sensing,spaceNet,full,wavelet,horizVert,62.0,100.0


In [None]:

medical = main_df2.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical["number"] = 1
medical_pivot = medical.pivot_table(
    index=["dataset_type", "dataset", "subset", "transform", "orientation"],
    columns="failure_category",
    values="number",
    aggfunc="sum",
    fill_value=0,
)

# Calculate row sums for percentage calculation
row_sums = medical_pivot.sum(axis=1)
medical_pivot_percent = np.round(medical_pivot.div(row_sums, axis=0) * 100)

medical_pivot_percent
# Reorder columns if present in medical_pivot_percent
ordered_cols_present = [col for col in ordered_failcat_cols if col in medical_pivot_percent.columns]
medical_pivot_percent = medical_pivot_percent[ordered_cols_present]
medical_pivot_percent


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,failure_category,actually_pass,practically_pass,interesting_failure,trivial_failure
dataset_type,dataset,subset,transform,orientation,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
remote sensing,agriVision,full,fourier,,2.0,93.0,5.0,0.0
remote sensing,agriVision,full,wavelet,diagonal,17.0,83.0,0.0,0.0
remote sensing,agriVision,full,wavelet,horizVert,11.0,89.0,0.0,0.0
remote sensing,pastis,full,fourier,,91.0,9.0,0.0,0.0
remote sensing,pastis,full,wavelet,diagonal,57.0,43.0,0.0,0.0
remote sensing,pastis,full,wavelet,horizVert,57.0,43.0,0.0,0.0
remote sensing,spaceNet,full,fourier,,0.0,100.0,0.0,0.0
remote sensing,spaceNet,full,wavelet,diagonal,9.0,53.0,0.0,38.0
remote sensing,spaceNet,full,wavelet,horizVert,0.0,62.0,0.0,38.0


In [None]:

medical = main_df2.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical["number"] = 1
medical_pivot = medical.pivot_table(
    index=["dataset_type", "dataset", "subset", "transform", "orientation"],
    columns="best_prior",
    values="number",
    aggfunc="count",
    fill_value=0,
)

# Calculate row sums for percentage calculation
row_sums = medical_pivot.sum(axis=1)
medical_pivot_percent = np.round(medical_pivot.div(row_sums, axis=0) * 100)

medical_pivot_percent
ordered_cols_present = [col for col in ordered_prior_cols if col in medical_pivot_percent.columns]
medical_pivot_percent = medical_pivot_percent[ordered_cols_present]
medical_pivot_percent

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_prior,GenGamma,Gaussian,Student-T
dataset_type,dataset,subset,transform,orientation,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
remote sensing,agriVision,full,fourier,,98.0,0.0,2.0
remote sensing,agriVision,full,wavelet,diagonal,100.0,0.0,0.0
remote sensing,agriVision,full,wavelet,horizVert,100.0,0.0,0.0
remote sensing,pastis,full,fourier,,100.0,0.0,0.0
remote sensing,pastis,full,wavelet,diagonal,100.0,0.0,0.0
remote sensing,pastis,full,wavelet,horizVert,96.0,0.0,4.0
remote sensing,spaceNet,full,fourier,,100.0,0.0,0.0
remote sensing,spaceNet,full,wavelet,diagonal,91.0,9.0,0.0
remote sensing,spaceNet,full,wavelet,horizVert,100.0,0.0,0.0


In [None]:

medical = main_df2.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical["number"] = 1
medical_pivot = medical.pivot_table(
    index=["dataset_type", "dataset", "subset", "transform", "failure_category"],
    columns="best_prior",
    values="number",
    aggfunc="count",
    fill_value=0,
)

# Calculate row sums for percentage calculation
row_sums = medical_pivot.sum(axis=1)
medical_pivot_percent = np.round(medical_pivot.div(row_sums, axis=0) * 100)

medical_pivot_percent
ordered_cols_present = [col for col in ordered_prior_cols if col in medical_pivot_percent.columns]
medical_pivot_percent = medical_pivot_percent[ordered_cols_present]
medical_pivot_percent

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,best_prior,GenGamma,Gaussian,Student-T
dataset_type,dataset,subset,failure_category,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
remote sensing,agriVision,full,actually_pass,100.0,0.0,0.0
remote sensing,agriVision,full,interesting_failure,100.0,0.0,0.0
remote sensing,agriVision,full,practically_pass,99.0,0.0,1.0
remote sensing,pastis,full,actually_pass,99.0,0.0,1.0
remote sensing,pastis,full,practically_pass,100.0,0.0,0.0
remote sensing,spaceNet,full,actually_pass,100.0,0.0,0.0
remote sensing,spaceNet,full,practically_pass,100.0,0.0,0.0
remote sensing,spaceNet,full,trivial_failure,88.0,12.0,0.0


# Natural Images

In [None]:
DATASET_TYPE = "natural" 

In [None]:
medical = failCatDF.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical = medical.groupby(["dataset_type", "dataset", "subset", "transform", "orientation"]).first()[["pass_percentage", "beat_all_priors_percentage"]]
medical

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pass_percentage,beat_all_priors_percentage
dataset_type,dataset,subset,transform,orientation,Unnamed: 5_level_1,Unnamed: 6_level_1
natural,coco,indoor,wavelet,diagonal,62.0,100.0
natural,coco,indoor,wavelet,horizontal,25.0,81.0
natural,coco,indoor,wavelet,vertical,84.0,94.0
natural,coco,outdoor,wavelet,diagonal,62.0,100.0
natural,coco,outdoor,wavelet,horizontal,25.0,84.0
natural,coco,outdoor,wavelet,vertical,88.0,100.0
natural,segmentAnything,full,wavelet,diagonal,100.0,100.0
natural,segmentAnything,full,wavelet,horizontal,56.0,86.0
natural,segmentAnything,full,wavelet,vertical,100.0,100.0


In [None]:

medical = failCatDF.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical_pivot = medical.pivot_table(
    index=["dataset_type", "dataset", "subset", "transform", "orientation"],
    columns="failure_category",
    values="number",
    aggfunc="sum",
    fill_value=0,
)

# Calculate row sums for percentage calculation
row_sums = medical_pivot.sum(axis=1)
medical_pivot_percent = np.round(medical_pivot.div(row_sums, axis=0) * 100)

medical_pivot_percent
# Reorder columns if present in medical_pivot_percent
ordered_cols_present = [col for col in ordered_failcat_cols if col in medical_pivot_percent.columns]
medical_pivot_percent = medical_pivot_percent[ordered_cols_present]
medical_pivot_percent


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,failure_category,actually_pass,practically_pass,interesting_failure,trivial_failure
dataset_type,dataset,subset,transform,orientation,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
natural,coco,indoor,wavelet,diagonal,38.0,25.0,38.0,0.0
natural,coco,indoor,wavelet,horizontal,0.0,25.0,12.0,62.0
natural,coco,indoor,wavelet,vertical,38.0,47.0,16.0,0.0
natural,coco,outdoor,wavelet,diagonal,38.0,25.0,38.0,0.0
natural,coco,outdoor,wavelet,horizontal,0.0,25.0,16.0,59.0
natural,coco,outdoor,wavelet,vertical,38.0,50.0,12.0,0.0
natural,segmentAnything,full,wavelet,diagonal,42.0,58.0,0.0,0.0
natural,segmentAnything,full,wavelet,horizontal,0.0,56.0,0.0,44.0
natural,segmentAnything,full,wavelet,vertical,33.0,67.0,0.0,0.0


In [None]:

medical = main_df2.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical["number"] = 1
medical_pivot = medical.pivot_table(
    index=["dataset_type", "dataset", "subset", "transform", "orientation"],
    columns="best_prior",
    values="number",
    aggfunc="count",
    fill_value=0,
)

# Calculate row sums for percentage calculation
row_sums = medical_pivot.sum(axis=1)
medical_pivot_percent = np.round(medical_pivot.div(row_sums, axis=0) * 100)

medical_pivot_percent
ordered_cols_present = [col for col in ordered_prior_cols if col in medical_pivot_percent.columns]
medical_pivot_percent = medical_pivot_percent[ordered_cols_present]
medical_pivot_percent

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_prior,GenGamma,Gaussian,Laplace,Student-T
dataset_type,dataset,subset,transform,orientation,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
natural,coco,indoor,wavelet,diagonal,100.0,0.0,0.0,0.0
natural,coco,indoor,wavelet,horizontal,81.0,12.0,6.0,0.0
natural,coco,indoor,wavelet,vertical,94.0,6.0,0.0,0.0
natural,coco,outdoor,wavelet,diagonal,100.0,0.0,0.0,0.0
natural,coco,outdoor,wavelet,horizontal,84.0,6.0,6.0,3.0
natural,coco,outdoor,wavelet,vertical,100.0,0.0,0.0,0.0
natural,segmentAnything,full,wavelet,diagonal,100.0,0.0,0.0,0.0
natural,segmentAnything,full,wavelet,horizontal,86.0,3.0,8.0,3.0
natural,segmentAnything,full,wavelet,vertical,100.0,0.0,0.0,0.0


In [None]:

medical = main_df2.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical["number"] = 1
medical_pivot = medical.pivot_table(
    index=["dataset_type", "dataset", "subset", "transform", "failure_category"],
    columns="best_prior",
    values="number",
    aggfunc="count",
    fill_value=0,
)

# Calculate row sums for percentage calculation
row_sums = medical_pivot.sum(axis=1)
medical_pivot_percent = np.round(medical_pivot.div(row_sums, axis=0) * 100)

medical_pivot_percent
ordered_cols_present = [col for col in ordered_prior_cols if col in medical_pivot_percent.columns]
medical_pivot_percent = medical_pivot_percent[ordered_cols_present]
medical_pivot_percent

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,best_prior,GenGamma,Gaussian,Laplace,Student-T
dataset_type,dataset,subset,failure_category,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
natural,coco,indoor,actually_pass,92.0,8.0,0.0,0.0
natural,coco,indoor,interesting_failure,100.0,0.0,0.0,0.0
natural,coco,indoor,practically_pass,100.0,0.0,0.0,0.0
natural,coco,indoor,trivial_failure,70.0,20.0,10.0,0.0
natural,coco,outdoor,actually_pass,100.0,0.0,0.0,0.0
natural,coco,outdoor,interesting_failure,100.0,0.0,0.0,0.0
natural,coco,outdoor,practically_pass,100.0,0.0,0.0,0.0
natural,coco,outdoor,trivial_failure,74.0,11.0,11.0,5.0
natural,segmentAnything,full,actually_pass,100.0,0.0,0.0,0.0
natural,segmentAnything,full,practically_pass,100.0,0.0,0.0,0.0


# Medical

In [None]:
DATASET_TYPE = "medical"

In [None]:
medical = failCatDF.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical = medical.groupby(["dataset_type", "dataset", "subset", "transform", "orientation"]).first()[["pass_percentage", "beat_all_priors_percentage"]]
medical

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pass_percentage,beat_all_priors_percentage
dataset_type,dataset,subset,transform,orientation,Unnamed: 5_level_1,Unnamed: 6_level_1
medical,syntheticMRI2D,axial,wavelet,diagonal,86.0,71.0
medical,syntheticMRI2D,axial,wavelet,horizontal,71.0,86.0
medical,syntheticMRI2D,axial,wavelet,vertical,71.0,71.0
medical,syntheticMRI2D,coronal,wavelet,diagonal,100.0,86.0
medical,syntheticMRI2D,coronal,wavelet,horizontal,71.0,71.0
medical,syntheticMRI2D,coronal,wavelet,vertical,71.0,71.0
medical,syntheticMRI2D,sagittal,wavelet,diagonal,71.0,71.0
medical,syntheticMRI2D,sagittal,wavelet,horizontal,71.0,86.0
medical,syntheticMRI2D,sagittal,wavelet,vertical,100.0,86.0
medical,syntheticMRI3D,full,wavelet,aad,67.0,67.0


In [None]:

medical = failCatDF.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical_pivot = medical.pivot_table(
    index=["dataset_type", "dataset", "subset", "transform", "orientation"],
    columns="failure_category",
    values="number",
    aggfunc="sum",
    fill_value=0,
)

# Calculate row sums for percentage calculation
row_sums = medical_pivot.sum(axis=1)
medical_pivot_percent = np.round(medical_pivot.div(row_sums, axis=0) * 100)

medical_pivot_percent
# Reorder columns if present in medical_pivot_percent
ordered_cols_present = [col for col in ordered_failcat_cols if col in medical_pivot_percent.columns]
medical_pivot_percent = medical_pivot_percent[ordered_cols_present]
medical_pivot_percent


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,failure_category,actually_pass,practically_pass,interesting_failure,trivial_failure
dataset_type,dataset,subset,transform,orientation,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
medical,syntheticMRI2D,axial,wavelet,diagonal,0.0,86.0,0.0,14.0
medical,syntheticMRI2D,axial,wavelet,horizontal,0.0,71.0,0.0,29.0
medical,syntheticMRI2D,axial,wavelet,vertical,0.0,71.0,0.0,29.0
medical,syntheticMRI2D,coronal,wavelet,diagonal,29.0,71.0,0.0,0.0
medical,syntheticMRI2D,coronal,wavelet,horizontal,0.0,71.0,14.0,14.0
medical,syntheticMRI2D,coronal,wavelet,vertical,0.0,71.0,0.0,29.0
medical,syntheticMRI2D,sagittal,wavelet,diagonal,0.0,71.0,0.0,29.0
medical,syntheticMRI2D,sagittal,wavelet,horizontal,0.0,71.0,14.0,14.0
medical,syntheticMRI2D,sagittal,wavelet,vertical,0.0,100.0,0.0,0.0
medical,syntheticMRI3D,full,wavelet,aad,0.0,67.0,17.0,17.0


In [None]:

medical = main_df2.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical["number"] = 1
medical_pivot = medical.pivot_table(
    index=["dataset_type", "dataset", "subset", "transform", "orientation"],
    columns="best_prior",
    values="number",
    aggfunc="count",
    fill_value=0,
)

# Calculate row sums for percentage calculation
row_sums = medical_pivot.sum(axis=1)
medical_pivot_percent = np.round(medical_pivot.div(row_sums, axis=0) * 100)

medical_pivot_percent
ordered_cols_present = [col for col in ordered_prior_cols if col in medical_pivot_percent.columns]
medical_pivot_percent = medical_pivot_percent[ordered_cols_present]
medical_pivot_percent

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_prior,GenGamma,Gaussian,Laplace,Student-T
dataset_type,dataset,subset,transform,orientation,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
medical,syntheticMRI2D,axial,wavelet,diagonal,62.0,12.0,12.0,12.0
medical,syntheticMRI2D,axial,wavelet,horizontal,75.0,25.0,0.0,0.0
medical,syntheticMRI2D,axial,wavelet,vertical,62.0,38.0,0.0,0.0
medical,syntheticMRI2D,coronal,wavelet,diagonal,75.0,0.0,0.0,12.0
medical,syntheticMRI2D,coronal,wavelet,horizontal,62.0,25.0,0.0,0.0
medical,syntheticMRI2D,coronal,wavelet,vertical,62.0,25.0,0.0,0.0
medical,syntheticMRI2D,sagittal,wavelet,diagonal,62.0,12.0,0.0,12.0
medical,syntheticMRI2D,sagittal,wavelet,horizontal,75.0,12.0,0.0,0.0
medical,syntheticMRI2D,sagittal,wavelet,vertical,75.0,12.0,0.0,0.0
medical,syntheticMRI3D,full,wavelet,aad,57.0,29.0,0.0,0.0


In [None]:

medical = main_df2.copy()
medical = medical[medical["dataset_type"] == DATASET_TYPE]
medical["number"] = 1
medical_pivot = medical.pivot_table(
    index=["dataset_type", "dataset", "subset", "transform", "failure_category"],
    columns="best_prior",
    values="number",
    aggfunc="count",
    fill_value=0,
)

# Calculate row sums for percentage calculation
row_sums = medical_pivot.sum(axis=1)
medical_pivot_percent = np.round(medical_pivot.div(row_sums, axis=0) * 100)

medical_pivot_percent
ordered_cols_present = [col for col in ordered_prior_cols if col in medical_pivot_percent.columns]
medical_pivot_percent = medical_pivot_percent[ordered_cols_present]
medical_pivot_percent

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,best_prior,GenGamma,Gaussian,Laplace,Student-T
dataset_type,dataset,subset,failure_category,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
medical,syntheticMRI2D,axial,practically_pass,94.0,0.0,0.0,6.0
medical,syntheticMRI2D,axial,trivial_failure,12.0,75.0,12.0,0.0
medical,syntheticMRI2D,coronal,actually_pass,100.0,0.0,0.0,0.0
medical,syntheticMRI2D,coronal,interesting_failure,0.0,100.0,0.0,0.0
medical,syntheticMRI2D,coronal,practically_pass,93.0,0.0,0.0,7.0
medical,syntheticMRI2D,coronal,trivial_failure,0.0,50.0,0.0,0.0
medical,syntheticMRI2D,sagittal,interesting_failure,100.0,0.0,0.0,0.0
medical,syntheticMRI2D,sagittal,practically_pass,88.0,6.0,0.0,6.0
medical,syntheticMRI2D,sagittal,trivial_failure,17.0,33.0,0.0,0.0
medical,syntheticMRI3D,full,actually_pass,100.0,0.0,0.0,0.0
