In [2]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon, rankdata

# ---- Locations of the per-fold metric tables ----
baseline_file = "./../../dataset/baseline_5fold_values.csv"
proposed_file = "./../../dataset/proposed_5fold_values.csv"

# ---- Load both tables (baseline first, then proposed) ----
# The comparison loop below filters these frames on "Model", "Dataset" and
# "Metric", and reads the "Fold" and "Value" columns.
baseline, proposed = (pd.read_csv(path) for path in (baseline_file, proposed_file))

# ---- Metric pairing: baseline metric name -> proposed metric name ----
# Insertion order is preserved and determines the row order of the results.
# "Train AUC mean" (baseline) and "Brier mean" (proposed) are deliberately
# left out of the comparison.
# NOTE(review): several pairs map differently named metrics onto each other
# (e.g. "Stability" -> "ECE", "Concordance mean" -> "Hit@10"); confirm they
# share scale and direction before interpreting the sign of the differences.
metric_map = dict(
    [
        ("AUC mean", "AUC mean"),
        ("F1 mean", "F1 mean"),
        ("Precision mean", "Precision mean"),
        ("Recall mean", "Recall mean"),
        ("Generalizability mean", "GLR mean"),
        ("Concordance mean", "Hit@10"),
        ("Stability", "ECE"),
        ("Reliability Index", "Reliability Score"),
    ]
)

# ---- Datasets and models to iterate over ----
# Rows whose Dataset is "Mean" are excluded from the per-dataset comparison.
datasets = [name for name in baseline["Dataset"].unique() if name != "Mean"]
models = baseline["Model"].unique()

# One dict per (model, dataset, metric pair) is appended here by the loop below.
results = []

def _signed_rank_summary(b_vals, p_vals):
    """Summarise a paired Wilcoxon signed-rank comparison (proposed - baseline).

    Parameters
    ----------
    b_vals, p_vals : numpy.ndarray
        Equal-shaped arrays of paired values; element i of each array is
        treated as one pair.

    Returns
    -------
    (stats, error) : tuple
        ``stats`` is a dict with keys "W_min_scipy", "W_plus", "W_minus",
        "W_signed", "p_value" and "effect_size_r".  ``error`` is the
        ``ValueError`` raised by ``scipy.stats.wilcoxon`` (in which case
        "W_min_scipy" and "p_value" are NaN), or ``None`` on success.
    """
    diffs = p_vals - b_vals

    # Every difference is numerically zero (np.allclose default tolerance):
    # the test is undefined, so report empty rank sums and a NaN p-value.
    if np.allclose(diffs, 0):
        return (
            {
                "W_min_scipy": np.nan,
                "W_plus": 0.0,
                "W_minus": 0.0,
                "W_signed": 0.0,
                "p_value": np.nan,
                "effect_size_r": 0.0,
            },
            None,
        )

    # Drop exact zeros before ranking (Wilcoxon "wilcox" zero handling).
    # At least one difference is non-zero here, otherwise the guard above
    # would have returned, so `nonzero` is never empty.
    nonzero = diffs[diffs != 0]
    ranks = rankdata(np.abs(nonzero))
    w_plus = ranks[nonzero > 0].sum()
    w_minus = ranks[nonzero < 0].sum()

    # SciPy's two-sided statistic is min(W_plus, W_minus).
    error = None
    try:
        w_res = wilcoxon(
            p_vals, b_vals,
            alternative="two-sided",
            zero_method="wilcox",
            mode="auto",
        )
        w_min = float(w_res.statistic)
        pval = float(w_res.pvalue)
    except ValueError as exc:
        error = exc
        w_min = np.nan
        pval = np.nan

    # Effect size r (matched rank biserial): (W+ - W-) / (W+ + W-).
    rank_total = w_plus + w_minus
    effect_r = np.nan if rank_total == 0 else (w_plus - w_minus) / rank_total

    return (
        {
            "W_min_scipy": w_min,
            "W_plus": w_plus,
            "W_minus": w_minus,
            "W_signed": w_plus - w_minus,
            "p_value": pval,
            "effect_size_r": effect_r,
        },
        error,
    )


# Bookkeeping so skipped or suspicious comparisons are reported, not hidden.
n_missing = 0
n_fold_label_mismatch = 0

for model in models:
    proposed_model = f"{model} (Proposed)"

    for dataset in datasets:
        for base_metric, prop_metric in metric_map.items():
            # Baseline rows for this model / dataset / metric
            b_group = baseline[
                (baseline["Model"] == model)
                & (baseline["Dataset"] == dataset)
                & (baseline["Metric"] == base_metric)
            ]

            # Matching rows from the proposed table
            p_group = proposed[
                (proposed["Model"] == proposed_model)
                & (proposed["Dataset"] == dataset)
                & (proposed["Metric"] == prop_metric)
            ]

            # Nothing to compare if either side has no rows
            if b_group.empty or p_group.empty:
                n_missing += 1
                continue

            # Sort by Fold so that position i on both sides is the same fold
            b_sorted = b_group.sort_values("Fold")
            p_sorted = p_group.sort_values("Fold")
            b_vals = b_sorted["Value"].to_numpy()
            p_vals = p_sorted["Value"].to_numpy()

            if b_vals.shape != p_vals.shape:
                print(
                    "Shape mismatch for",
                    model, dataset, base_metric, "vs", prop_metric,
                    b_vals.shape, p_vals.shape
                )
                continue

            # Sorting only aligns the pairs when both sides carry the same
            # fold labels; count disagreements so they can be reported.
            if not np.array_equal(
                b_sorted["Fold"].to_numpy(), p_sorted["Fold"].to_numpy()
            ):
                n_fold_label_mismatch += 1

            stats, error = _signed_rank_summary(b_vals, p_vals)
            if error is not None:
                print("Wilcoxon error for", model, dataset, base_metric, ":", error)

            pval = stats["p_value"]

            # NOTE(review): with only 5 paired folds (as the input file names
            # suggest - confirm), the smallest exact two-sided p-value is
            # 2 / 2**5 = 0.0625, so neither flag below can be True when the
            # exact method is used.
            results.append(
                {
                    "Model": model,
                    "Dataset": dataset,
                    "BaselineMetric": base_metric,
                    "ProposedMetric": prop_metric,
                    # Different W variants:
                    "W_min_scipy": stats["W_min_scipy"],  # what SciPy returns (often 0)
                    "W_plus": stats["W_plus"],            # sum of positive ranks
                    "W_minus": stats["W_minus"],          # sum of negative ranks
                    "W_signed": stats["W_signed"],        # W_plus - W_minus
                    # p-value & significance (a NaN p-value is never significant)
                    "p_value": pval,
                    "significant_0.05": bool(not np.isnan(pval) and pval <= 0.05),
                    "significant_0.01": bool(not np.isnan(pval) and pval <= 0.01),
                    # Effect size
                    "effect_size_r": stats["effect_size_r"],
                }
            )

if n_missing:
    print(
        f"Skipped {n_missing} model/dataset/metric combinations "
        "with no rows in one of the input files"
    )
if n_fold_label_mismatch:
    print(
        f"Warning: {n_fold_label_mismatch} comparisons were paired by sorted "
        "fold order although the fold labels differ between the two files"
    )

# ---- Assemble the summary table and write it to disk ----
output_file = "./../../dataset/wsr_results_all_metrics.csv"

# One row per collected comparison; column order follows the result dicts.
wsr_df = pd.DataFrame(results)
wsr_df.to_csv(output_file, index=False)

# Short textual report: destination, row count, and a preview of the table.
for report_item in (
    f"WSR results saved to: {output_file}",
    f"Total rows: {len(wsr_df)}",
    wsr_df.head(),
):
    print(report_item)

WSR results saved to: ./../../dataset/wsr_results_all_metrics.csv
Total rows: 512
      Model Dataset         BaselineMetric  ProposedMetric  W_min_scipy  \
0  Adaboost     MC1               AUC mean        AUC mean          0.0   
1  Adaboost     MC1                F1 mean         F1 mean          0.0   
2  Adaboost     MC1         Precision mean  Precision mean          0.0   
3  Adaboost     MC1            Recall mean     Recall mean          0.0   
4  Adaboost     MC1  Generalizability mean        GLR mean          0.0   

   W_plus  W_minus  W_signed  p_value  significant_0.05  significant_0.01  \
0     0.0     15.0     -15.0   0.0625             False             False   
1    15.0      0.0      15.0   0.0625             False             False   
2    15.0      0.0      15.0   0.0625             False             False   
3     0.0     15.0     -15.0   0.0625             False             False   
4     0.0     15.0     -15.0   0.0625             False             False   

   e