In [5]:
!python -m pip install scipy pandas numpy




[notice] A new release of pip is available: 25.3 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon, rankdata

# ==== INPUT FILE PATHS ====
baseline_file = "./../../dataset/cross_baseline_5fold_values.csv"
proposed_file = "./../../dataset/cross_proposed_5fold_values.csv"


def _read_fold_values(csv_path):
    """Read one results CSV and add normalised dataset-name columns.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to load. The file must contain ``Train`` and
        ``Test`` columns.

    Returns
    -------
    pandas.DataFrame
        The loaded table plus ``Train_norm`` and ``Test_norm``: the ``Train``
        and ``Test`` values cast to ``str`` with every literal ``".arff"``
        occurrence removed.
    """
    frame = pd.read_csv(csv_path)
    normalised_columns = {
        f"{column}_norm": frame[column].astype(str).str.replace(".arff", "", regex=False)
        for column in ("Train", "Test")
    }
    return frame.assign(**normalised_columns)


# ==== READ DATA ====
baseline = _read_fold_values(baseline_file)
proposed = _read_fold_values(proposed_file)

# One row per compared metric:
#   (metric name in the baseline file, metric name in the proposed file, direction)
# Direction +1 means higher is better, -1 means lower is better (e.g., ECE).
_metric_spec = [
    ("AUC_mean", "AUC", 1),
    ("F1_mean", "F1", 1),
    ("Precision_mean", "Precision", 1),
    ("Recall_mean", "Recall", 1),
    ("Generalizability_mean", "GLR", 1),
    ("Stability", "ECE", -1),
    ("ReliabilityIndex", "ReliabilityScore", 1),
]

# Baseline metric name -> proposed metric name.
metric_map = {base_name: proposed_name for base_name, proposed_name, _ in _metric_spec}

# Baseline metric name -> improvement direction (+1 / -1).
metric_improvement_sign = {base_name: direction for base_name, _, direction in _metric_spec}

def _signed_rank_summary(diffs):
    """Summarise paired differences with the Wilcoxon signed-rank test.

    Parameters
    ----------
    diffs : numpy.ndarray
        One-dimensional array of paired differences, already oriented so that
        a positive value means the proposed method is better.

    Returns
    -------
    dict
        ``W_plus`` / ``W_minus``: rank sums of the positive / negative
        non-zero differences. ``W_signed``: ``W_plus - W_minus``.
        ``effect_size_r``: ``W_signed / (W_plus + W_minus)``.
        ``W_min_scipy``: statistic of SciPy's two-sided test.
        ``p_value``: one-sided p-value (alternative ``"greater"``).
        ``p_value_two_sided``: two-sided p-value.
        When every difference is numerically zero the rank sums and the effect
        size are ``0.0`` and the SciPy-derived values are ``NaN``.
    """
    summary = {
        "W_min_scipy": np.nan,
        "W_plus": 0.0,
        "W_minus": 0.0,
        "W_signed": 0.0,
        "p_value": np.nan,
        "p_value_two_sided": np.nan,
        "effect_size_r": 0.0,
    }

    # Nothing to rank when all differences are (numerically) zero.
    if np.allclose(diffs, 0):
        return summary

    # Mirror zero_method="wilcox": exact zeros are dropped before ranking.
    nonzero = diffs[diffs != 0]
    ranks = rankdata(np.abs(nonzero))
    w_plus = float(ranks[nonzero > 0].sum())
    w_minus = float(ranks[nonzero < 0].sum())
    w_signed = w_plus - w_minus
    rank_total = w_plus + w_minus

    summary["W_plus"] = w_plus
    summary["W_minus"] = w_minus
    summary["W_signed"] = w_signed
    summary["effect_size_r"] = (w_signed / rank_total) if rank_total != 0 else np.nan

    try:
        one_sided = wilcoxon(
            diffs,
            alternative="greater",
            zero_method="wilcox",
            mode="auto",
        )
        two_sided = wilcoxon(
            diffs,
            alternative="two-sided",
            zero_method="wilcox",
            mode="auto",
        )
    except ValueError:
        # SciPy rejected the sample: keep the hand-computed rank sums and
        # leave the SciPy-derived fields as NaN.
        return summary

    # SciPy reports min(W+, W-) only for the two-sided alternative; for
    # alternative="greater" the statistic is W+, so the minimum must be read
    # from the two-sided result.
    summary["W_min_scipy"] = float(two_sided.statistic)
    summary["p_value"] = float(one_sided.pvalue)
    summary["p_value_two_sided"] = float(two_sided.pvalue)
    return summary


# Only models present in both result files can be compared.
models = sorted(set(baseline["Model"]) & set(proposed["Model"]))
results = []

for model in models:
    base_model = baseline[baseline["Model"] == model]
    prop_model = proposed[proposed["Model"] == model]

    for base_metric, prop_metric in metric_map.items():
        base_rows = base_model[base_model["Metric"] == base_metric]
        prop_rows = prop_model[prop_model["Metric"] == prop_metric]

        if base_rows.empty or prop_rows.empty:
            continue

        # Pair baseline and proposed values fold by fold for each
        # (train, test) dataset combination.
        merged = base_rows.merge(
            prop_rows,
            on=["Train_norm", "Test_norm", "Fold"],
            suffixes=("_baseline", "_proposed"),
        )

        if merged.empty:
            continue

        # The direction depends only on the metric, so look it up once.
        improve_sign = metric_improvement_sign.get(base_metric, 1)

        for (train, test), subset in merged.groupby(["Train_norm", "Test_norm"]):
            b_vals = subset["Value_baseline"].to_numpy()
            p_vals = subset["Value_proposed"].to_numpy()

            # Orient differences so that positive always means "proposed is better".
            diffs = (p_vals - b_vals) * improve_sign
            baseline_mean = float(b_vals.mean())
            proposed_mean = float(p_vals.mean())
            difference = float(proposed_mean - baseline_mean)

            stats = _signed_rank_summary(diffs)
            p_one_sided = stats["p_value"]
            has_p_value = not np.isnan(p_one_sided)

            # Key order defines the column order of the exported CSV.
            results.append(
                {
                    "Model": model,
                    "Train_dataset": train,
                    "Test_dataset": test,
                    "BaseMetric": base_metric,
                    "ProposedMetric": prop_metric,
                    "Baseline_mean": baseline_mean,
                    "Proposed_mean": proposed_mean,
                    "Difference": difference,
                    "Improvement": float(difference * improve_sign),
                    "N_pairs": len(diffs),
                    "W_min_scipy": stats["W_min_scipy"],
                    "W_plus": stats["W_plus"],
                    "W_minus": stats["W_minus"],
                    "W_signed": stats["W_signed"],
                    "p_value": p_one_sided,
                    "p_value_two_sided": stats["p_value_two_sided"],
                    "significant_0.05": bool(has_p_value and p_one_sided <= 0.05),
                    "significant_0.01": bool(has_p_value and p_one_sided <= 0.01),
                    "effect_size_r": stats["effect_size_r"],
                }
            )

# Turn the collected per-comparison records into one table and export it as CSV.
output_file = "./../../dataset/wsr_results_cross.csv"
wsr_df = pd.DataFrame(results)
wsr_df.to_csv(output_file, index=False)

# Short run summary: destination, row count, and a preview of the first rows.
print(
    f"WSR cross results saved to: {output_file}",
    f"Total rows: {len(wsr_df)}",
    wsr_df.head(),
    sep="\n",
)


WSR cross results saved to: ./../../dataset/wsr_results_cross.csv
Total rows: 728
      Model Train_dataset Test_dataset BaseMetric ProposedMetric  \
0  adaboost          MEAN         MEAN   AUC_mean            AUC   
1  adaboost           pc1          pc2   AUC_mean            AUC   
2  adaboost           pc1          pc3   AUC_mean            AUC   
3  adaboost           pc1          pc4   AUC_mean            AUC   
4  adaboost           pc2          pc1   AUC_mean            AUC   

   Baseline_mean  Proposed_mean  Difference  Improvement  N_pairs  \
0       0.766667       0.754418   -0.012249    -0.012249        5   
1       0.794000       0.826308    0.032308     0.032308        5   
2       0.806000       0.802661   -0.003339    -0.003339        5   
3       0.800000       0.771981   -0.028019    -0.028019        5   
4       0.678000       0.680820    0.002820     0.002820        5   

   W_min_scipy  W_plus  W_minus  W_signed  p_value  p_value_two_sided  \
0          0.0     0.