In [2]:
import os

import numpy as np
import pandas as pd

#### Combine the results of CBMI, IClf and RF into one CSV file

In [None]:
def combine_df(data_name, test_missing, dir):
    """
    Merge the base results and the RF results of one dataset into one CSV.

    Reads ``{dir}/{data_name}_{test_missing}_results.csv`` and
    ``{dir}/{data_name}_{test_missing}_RF.csv``, inner-joins them on the
    ``MissingRate`` and ``run`` columns, sorts by those keys and writes the
    result to ``{dir}/{data_name}_{test_missing}_combined.csv``.

    Parameters:
        data_name: str
            Dataset name used as the file-name prefix.
        test_missing: bool or str
            Flag interpolated verbatim into the file names (e.g. ``True``).
        dir: str
            Directory holding the input files; the output is written there too.
            (The name shadows the builtin ``dir`` but is kept so that keyword
            callers keep working.)

    Returns:
        pd.DataFrame: the merged, sorted frame that was written to disk.
    """
    import warnings

    key_cols = ['MissingRate', 'run']

    df_base = pd.read_csv(f'{dir}/{data_name}_{test_missing}_results.csv')
    df_rf = pd.read_csv(f'{dir}/{data_name}_{test_missing}_RF.csv')

    # Columns present in both files (other than the keys) get the suffixes.
    df_combined = pd.merge(
        df_base, df_rf, on=key_cols, suffixes=('_base', '_rf')
    ).sort_values(key_cols)

    # An inner join silently drops unmatched rows (or multiplies rows on
    # duplicate keys); surface that instead of hiding it.
    if not (len(df_combined) == len(df_base) == len(df_rf)):
        warnings.warn(
            f"{data_name} (test_missing={test_missing}): merged {len(df_combined)} rows "
            f"from {len(df_base)} base rows and {len(df_rf)} RF rows"
        )

    df_combined.to_csv(f'{dir}/{data_name}_{test_missing}_combined.csv', index=False)
    return df_combined

In [None]:
# Directory holding the per-dataset result CSVs. Set the CBMI_RESULTS_DIR
# environment variable to run the notebook on another machine; the original
# local path remains the default.
# NOTE: `dir` shadows the builtin, but later cells read this name, so it is kept.
dir = os.environ.get(
    'CBMI_RESULTS_DIR',
    '/Volumes/Macintosh HD/Projects/PAPER/10.IUL-CBMI/results/csv',
)
test_missings = [True, False]

# NOTE(review): later cells read 'pakinsons_*_combined.csv' and
# 'diabetes_*_combined.csv', which this list does not produce ('parkinson' is
# spelled differently, 'diabetes' is absent) - confirm the intended names.
data_names = ['iris', 'liver', 'soybean', 'parkinson', 'heart', 'glass', 'car']

# Write one combined CSV per (dataset, test_missing) pair.
for data_name in data_names:
    for test_missing in test_missings:
        combine_df(data_name, test_missing, dir)


### Statistical Test

In [4]:
from scipy.stats import ttest_rel, wilcoxon, permutation_test

def pairwise_tests_save(df, metric, model_labels, alpha=0.05,
                        n_resamples=10000, random_state=0):
    """
    Performs a paired t-test, a Wilcoxon signed-rank test and a paired
    permutation test for every pair of models and returns the p-values as a
    DataFrame of LaTeX-ready strings.

    Parameters:
        df: pd.DataFrame
            DataFrame containing one column per model and metric, named
            "<label>_<metric>" (e.g. CBMI_acc, IClf_acc). Rows are treated as
            paired observations.
        metric: str
            Metric suffix of the columns to compare, e.g. "acc", "f1", "recall".
        model_labels: list of str
            List of model prefixes, e.g. ["CBMI", "IClf", "RF"]. Every unordered
            pair is compared once, in list order.
        alpha: float
            Significance level; p-values below it are wrapped in LaTeX bold.
        n_resamples: int
            Number of random resamples for the permutation test.
        random_state: int, np.random.Generator, np.random.RandomState or None
            Seed passed to each permutation test so that repeated calls give
            the same p-values. Pass None for unseeded (non-reproducible)
            resampling.

    Returns:
        pd.DataFrame with columns "Metric", "Comparison", "t-test p",
        "Wilcoxon p" and "Permutation p"; one row per model pair. P-values are
        strings with four decimals ("nan" when a test is undefined).

    Raises:
        KeyError: if an expected "<label>_<metric>" column is missing from df.
    """

    def format_p(p):
        # NaN compares False against alpha, so undefined p-values are never bold.
        return f"\\textbf{{{p:.4f}}}" if p < alpha else f"{p:.4f}"

    def mean_difference(x, y, axis):
        # Accepting `axis` lets scipy evaluate whole batches of resamples at
        # once instead of calling the statistic once per resample.
        return np.mean(x - y, axis=axis)

    results = []

    for i in range(len(model_labels)):
        for j in range(i + 1, len(model_labels)):
            label1 = model_labels[i]
            label2 = model_labels[j]

            metric1 = df[f"{label1}_{metric}"].to_numpy()
            metric2 = df[f"{label2}_{metric}"].to_numpy()

            # Paired t-test
            t_pval = ttest_rel(metric1, metric2).pvalue

            # Wilcoxon test; some inputs (e.g. all paired differences equal to
            # zero) make scipy raise ValueError, in which case the p-value is
            # reported as undefined.
            try:
                w_pval = wilcoxon(metric1, metric2).pvalue
            except ValueError:
                w_pval = np.nan

            # Paired permutation test on the mean difference
            perm_pval = permutation_test(
                (metric1, metric2),
                statistic=mean_difference,
                permutation_type='samples',
                vectorized=True,
                alternative='two-sided',
                n_resamples=n_resamples,
                random_state=random_state,
            ).pvalue

            results.append({
                "Metric": metric.upper(),
                "Comparison": f"{label1} vs {label2}",
                "t-test p": format_p(t_pval),
                "Wilcoxon p": format_p(w_pval),
                "Permutation p": format_p(perm_pval)
            })

    return pd.DataFrame(results)


In [None]:
# Model prefixes compared pairwise; pairwise_tests_save joins each with a
# metric as f"{label}_{metric}" to pick the column to test.
model_labels = ["CBMI", "IClf", "RF"]
# Metric suffixes tested in the dataset cells below (rebound later for the
# diabetes results).
metrics = ["acc", "f1", "recall"]

### IRIS DATASET

In [None]:
data_name = 'iris'

# Combined results for the two test_missing settings (True first, then False).
df_1, df_2 = (
    pd.read_csv(f'{dir}/{data_name}_True_combined.csv'),
    pd.read_csv(f'{dir}/{data_name}_False_combined.csv'),
)

# For each frame, run the pairwise tests per metric and stack the tables.
df1_results, df2_results = (
    pd.concat(
        [pairwise_tests_save(frame, m, model_labels) for m in metrics],
        ignore_index=True,
    )
    for frame in (df_1, df_2)
)

df1_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,\textbf{0.0165},0.2189,\textbf{0.0156}
1,ACC,CBMI vs RF,0.1220,0.2484,0.1350
2,ACC,IClf vs RF,0.5355,0.9088,0.5397
3,F1,CBMI vs IClf,0.0824,0.7021,0.0848
4,F1,CBMI vs RF,0.1626,0.4496,0.1668
5,F1,IClf vs RF,0.8561,0.7628,0.8709
6,RECALL,CBMI vs IClf,\textbf{0.0165},0.2189,\textbf{0.0152}
7,RECALL,CBMI vs RF,0.1220,0.2484,0.1204
8,RECALL,IClf vs RF,0.5355,0.9088,0.5541


In [7]:
# Show the test table computed from iris_False_combined.csv.
df2_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
1,ACC,CBMI vs RF,0.2039,0.4622,0.2202
2,ACC,IClf vs RF,\textbf{0.0000},\textbf{0.0001},\textbf{0.0002}
3,F1,CBMI vs IClf,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
4,F1,CBMI vs RF,0.2711,0.6341,0.2660
5,F1,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
6,RECALL,CBMI vs IClf,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
7,RECALL,CBMI vs RF,0.2039,0.4622,0.2280
8,RECALL,IClf vs RF,\textbf{0.0000},\textbf{0.0001},\textbf{0.0002}


### LIVER DATASET

In [None]:
data_name = 'liver'

# Combined results for the two test_missing settings (True first, then False).
df_1, df_2 = (
    pd.read_csv(f'{dir}/{data_name}_True_combined.csv'),
    pd.read_csv(f'{dir}/{data_name}_False_combined.csv'),
)

# For each frame, run the pairwise tests per metric and stack the tables.
df1_results, df2_results = (
    pd.concat(
        [pairwise_tests_save(frame, m, model_labels) for m in metrics],
        ignore_index=True,
    )
    for frame in (df_1, df_2)
)

df1_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,\textbf{0.0142},\textbf{0.0086},\textbf{0.0176}
1,ACC,CBMI vs RF,0.5511,0.3317,0.5581
2,ACC,IClf vs RF,\textbf{0.0047},\textbf{0.0052},\textbf{0.0040}
3,F1,CBMI vs IClf,0.2113,0.0747,0.2106
4,F1,CBMI vs RF,0.2659,0.3832,0.2540
5,F1,IClf vs RF,\textbf{0.0345},\textbf{0.0288},\textbf{0.0330}
6,RECALL,CBMI vs IClf,\textbf{0.0142},\textbf{0.0086},\textbf{0.0180}
7,RECALL,CBMI vs RF,\textbf{0.0047},\textbf{0.0051},\textbf{0.0040}
8,RECALL,IClf vs RF,0.5511,0.3303,0.5779


In [9]:
# Show the test table computed from liver_False_combined.csv.
df2_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,\textbf{0.0222},\textbf{0.0030},\textbf{0.0218}
1,ACC,CBMI vs RF,0.7810,0.9874,0.7925
2,ACC,IClf vs RF,\textbf{0.0255},\textbf{0.0223},\textbf{0.0248}
3,F1,CBMI vs IClf,0.3421,0.1469,0.3506
4,F1,CBMI vs RF,\textbf{0.0338},\textbf{0.0432},\textbf{0.0370}
5,F1,IClf vs RF,\textbf{0.0054},\textbf{0.0050},\textbf{0.0048}
6,RECALL,CBMI vs IClf,\textbf{0.0222},\textbf{0.0030},\textbf{0.0232}
7,RECALL,CBMI vs RF,\textbf{0.0255},\textbf{0.0222},\textbf{0.0210}
8,RECALL,IClf vs RF,0.7810,0.9851,0.7969


### HEART DATASET

In [None]:
data_name = 'heart'

# Combined results for the two test_missing settings (True first, then False).
df_1, df_2 = (
    pd.read_csv(f'{dir}/{data_name}_True_combined.csv'),
    pd.read_csv(f'{dir}/{data_name}_False_combined.csv'),
)

# For each frame, run the pairwise tests per metric and stack the tables.
df1_results, df2_results = (
    pd.concat(
        [pairwise_tests_save(frame, m, model_labels) for m in metrics],
        ignore_index=True,
    )
    for frame in (df_1, df_2)
)

df1_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,0.8572,0.6982,0.8879
1,ACC,CBMI vs RF,0.7612,0.6803,0.7781
2,ACC,IClf vs RF,0.8340,0.6993,0.8585
3,F1,CBMI vs IClf,\textbf{0.0058},\textbf{0.0242},\textbf{0.0064}
4,F1,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
5,F1,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
6,RECALL,CBMI vs IClf,0.8572,0.6982,0.8723
7,RECALL,CBMI vs RF,0.8340,0.6993,0.8431
8,RECALL,IClf vs RF,0.7612,0.6803,0.7737


In [11]:
# Show the test table computed from heart_False_combined.csv.
df2_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,0.3125,0.2459,0.3224
1,ACC,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
2,ACC,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
3,F1,CBMI vs IClf,\textbf{0.0394},\textbf{0.0058},\textbf{0.0400}
4,F1,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
5,F1,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
6,RECALL,CBMI vs IClf,0.3125,0.2459,0.3356
7,RECALL,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
8,RECALL,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}


### SOYBEAN DATASET

In [None]:
data_name = 'soybean'

# Combined results for the two test_missing settings (True first, then False).
df_1, df_2 = (
    pd.read_csv(f'{dir}/{data_name}_True_combined.csv'),
    pd.read_csv(f'{dir}/{data_name}_False_combined.csv'),
)

# For each frame, run the pairwise tests per metric and stack the tables.
df1_results, df2_results = (
    pd.concat(
        [pairwise_tests_save(frame, m, model_labels) for m in metrics],
        ignore_index=True,
    )
    for frame in (df_1, df_2)
)

df1_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,\textbf{0.0443},\textbf{0.0492},0.0534
1,ACC,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
2,ACC,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
3,F1,CBMI vs IClf,0.8149,0.5866,0.8177
4,F1,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
5,F1,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
6,RECALL,CBMI vs IClf,\textbf{0.0443},\textbf{0.0492},0.0532
7,RECALL,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
8,RECALL,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}


In [14]:
# Show the test table computed from soybean_False_combined.csv.
df2_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
1,ACC,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
2,ACC,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
3,F1,CBMI vs IClf,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
4,F1,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
5,F1,IClf vs RF,\textbf{0.0000},\textbf{0.0001},\textbf{0.0002}
6,RECALL,CBMI vs IClf,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
7,RECALL,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
8,RECALL,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}


### PARKINSON'S DATASET

In [15]:
# NOTE(review): the combine cell's `data_names` lists 'parkinson', so it does
# not write 'pakinsons_*_combined.csv' - confirm which spelling the result
# files on disk actually use.
data_name = 'pakinsons'

# Combined results for the two test_missing settings (True first, then False).
df_1, df_2 = (
    pd.read_csv(f'{dir}/{data_name}_True_combined.csv'),
    pd.read_csv(f'{dir}/{data_name}_False_combined.csv'),
)

# For each frame, run the pairwise tests per metric and stack the tables.
df1_results, df2_results = (
    pd.concat(
        [pairwise_tests_save(frame, m, model_labels) for m in metrics],
        ignore_index=True,
    )
    for frame in (df_1, df_2)
)

df1_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,0.7800,0.7495,0.8077
1,ACC,CBMI vs RF,\textbf{0.0002},\textbf{0.0001},\textbf{0.0004}
2,ACC,IClf vs RF,\textbf{0.0003},\textbf{0.0002},\textbf{0.0010}
3,F1,CBMI vs IClf,0.7660,0.8302,0.7701
4,F1,CBMI vs RF,\textbf{0.0098},\textbf{0.0100},\textbf{0.0118}
5,F1,IClf vs RF,\textbf{0.0072},\textbf{0.0026},\textbf{0.0068}
6,RECALL,CBMI vs IClf,0.7800,0.7467,0.8073
7,RECALL,CBMI vs RF,\textbf{0.0003},\textbf{0.0002},\textbf{0.0006}
8,RECALL,IClf vs RF,\textbf{0.0002},\textbf{0.0001},\textbf{0.0002}


In [16]:
# Show the test table computed from pakinsons_False_combined.csv.
df2_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,0.8385,0.9987,0.8545
1,ACC,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
2,ACC,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
3,F1,CBMI vs IClf,0.6230,0.3300,0.6213
4,F1,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
5,F1,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
6,RECALL,CBMI vs IClf,0.8385,0.9987,0.8689
7,RECALL,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
8,RECALL,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}


### GLASS DATASET

In [9]:
data_name = 'glass'

# Combined results for the two test_missing settings (True first, then False).
df_1, df_2 = (
    pd.read_csv(f'{dir}/{data_name}_True_combined.csv'),
    pd.read_csv(f'{dir}/{data_name}_False_combined.csv'),
)

# For each frame, run the pairwise tests per metric and stack the tables.
df1_results, df2_results = (
    pd.concat(
        [pairwise_tests_save(frame, m, model_labels) for m in metrics],
        ignore_index=True,
    )
    for frame in (df_1, df_2)
)

df1_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,0.3554,0.6285,0.3678
1,ACC,CBMI vs RF,\textbf{0.0002},\textbf{0.0001},\textbf{0.0010}
2,ACC,IClf vs RF,\textbf{0.0004},\textbf{0.0005},\textbf{0.0014}
3,F1,CBMI vs IClf,0.0803,0.0798,0.0822
4,F1,CBMI vs RF,0.0780,0.0825,0.0790
5,F1,IClf vs RF,\textbf{0.0003},\textbf{0.0003},\textbf{0.0008}
6,RECALL,CBMI vs IClf,0.3554,0.6325,0.3736
7,RECALL,CBMI vs RF,\textbf{0.0004},\textbf{0.0005},\textbf{0.0014}
8,RECALL,IClf vs RF,\textbf{0.0002},\textbf{0.0001},\textbf{0.0004}


In [10]:
# Show the test table computed from glass_False_combined.csv.
df2_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,ACC,CBMI vs IClf,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
1,ACC,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
2,ACC,IClf vs RF,0.9535,0.9643,0.9635
3,F1,CBMI vs IClf,\textbf{0.0006},\textbf{0.0014},\textbf{0.0008}
4,F1,CBMI vs RF,\textbf{0.0215},\textbf{0.0101},\textbf{0.0182}
5,F1,IClf vs RF,0.5649,0.6752,0.5543
6,RECALL,CBMI vs IClf,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
7,RECALL,CBMI vs RF,0.9535,0.9643,0.9627
8,RECALL,IClf vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}


### DIABETES DATASET

In [17]:
# Metric suffixes for the diabetes results. This rebinds the global `metrics`
# that the earlier dataset cells read as ["acc", "f1", "recall"], so re-run
# that earlier assignment before re-running any of those cells.
metrics = ["mae", "mse", "rmse"]

In [18]:
# NOTE(review): 'diabetes' is not in the combine cell's `data_names`, so the
# combined CSVs read here are not written by that cell - confirm where they
# come from.
data_name = 'diabetes'

# Combined results for the two test_missing settings (True first, then False).
df_1, df_2 = (
    pd.read_csv(f'{dir}/{data_name}_True_combined.csv'),
    pd.read_csv(f'{dir}/{data_name}_False_combined.csv'),
)

# For each frame, run the pairwise tests per metric and stack the tables.
df1_results, df2_results = (
    pd.concat(
        [pairwise_tests_save(frame, m, model_labels) for m in metrics],
        ignore_index=True,
    )
    for frame in (df_1, df_2)
)

df1_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,MAE,CBMI vs IClf,0.3391,0.3860,0.3488
1,MAE,CBMI vs RF,0.1529,0.1949,0.1392
2,MAE,IClf vs RF,\textbf{0.0304},\textbf{0.0278},\textbf{0.0340}
3,MSE,CBMI vs IClf,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
4,MSE,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
5,MSE,IClf vs RF,0.1362,0.1774,0.1368
6,RMSE,CBMI vs IClf,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
7,RMSE,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
8,RMSE,IClf vs RF,0.1586,0.1985,0.1532


In [19]:
# Show the test table computed from diabetes_False_combined.csv.
df2_results

Unnamed: 0,Metric,Comparison,t-test p,Wilcoxon p,Permutation p
0,MAE,CBMI vs IClf,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
1,MAE,CBMI vs RF,\textbf{0.0000},\textbf{0.0000},\textbf{0.0002}
2,MAE,IClf vs RF,0.9941,0.9041,0.9985
3,MSE,CBMI vs IClf,0.4802,0.8504,0.4630
4,MSE,CBMI vs RF,0.8428,0.9103,0.8349
5,MSE,IClf vs RF,0.7885,0.9520,0.8003
6,RMSE,CBMI vs IClf,0.5449,0.8917,0.5365
7,RMSE,CBMI vs RF,0.8773,0.9166,0.8793
8,RMSE,IClf vs RF,0.8097,0.9749,0.8231
