In [None]:
import numpy as np
import pandas as pd
from scipy.stats import rankdata, wilcoxon

In [None]:
def wilcoxon_twosided_with_effects(base: np.ndarray, ours: np.ndarray, zero_method: str = 'wilcox'):
    """
    Run a two-sided Wilcoxon signed-rank test on paired samples and report effect sizes.

    The paired difference is defined as d = base - ours. Pairs in which either
    value is non-finite (NaN or +/-inf) are dropped before any computation.

    Parameters
    ----------
    base : array-like of float
        Values for the baseline method.
    ours : array-like of float
        Values for the proposed method; must have the same shape as ``base``.
    zero_method : str, default 'wilcox'
        Passed through to ``scipy.stats.wilcoxon`` for the test statistic and
        p-value. The rank-based effect sizes below are always computed on the
        non-zero differences only, so they do not depend on this setting.

    Returns
    -------
    dict
        ``n_total``, ``n_zero``, ``n_eff`` (pair counts), ``wilcoxon_T``,
        ``wilcoxon_p_two_sided``, ``rank_biserial_r``, ``CLES``, ``cohen_dz``,
        ``hedges_g`` and ``median_diff``. ``cohen_dz`` / ``hedges_g`` are NaN
        when they are undefined (zero variance or too few pairs).

    Raises
    ------
    ValueError
        If the inputs differ in shape, no finite pair remains, or every
        difference is exactly zero.
    """
    base = np.asarray(base, dtype=np.float64)
    ours = np.asarray(ours, dtype=np.float64)

    # Validate shapes BEFORE masking: once both arrays are indexed with the same
    # mask they necessarily have equal shape, so a later check could never fire
    # (and a length-1 input would silently broadcast into the mask).
    if base.shape != ours.shape:
        raise ValueError("Baseline and proposed method lengths do not match")

    # Keep only pairs where both members are finite.
    mask = np.isfinite(base) & np.isfinite(ours)
    x = base[mask]
    y = ours[mask]
    if x.size == 0:
        raise ValueError("Number of valid samples is 0")

    d = x - y
    n_total = d.size
    n_zero = int((d == 0).sum())
    n_eff = n_total - n_zero
    if n_eff == 0:
        raise ValueError("All differences are 0: Wilcoxon cannot be computed")

    # Two-sided Wilcoxon (report this p-value only). The method is left at its
    # default ('auto'), which is what the previous explicit argument requested.
    res_two = wilcoxon(x, y, zero_method=zero_method, alternative='two-sided')

    # Rank-biserial r from the signed ranks of the non-zero differences.
    # R+ is computed directly instead of being read from a one-sided wilcoxon()
    # call: with zero_method 'pratt' or 'zsplit' scipy's statistic is based on
    # ranks that include the zero differences, so dividing it by
    # n_eff * (n_eff + 1) / 2 could push r outside [-1, 1].
    d_nonzero = d[d != 0]
    ranks = rankdata(np.abs(d_nonzero))         # average ranks for ties
    R_plus = float(ranks[d_nonzero > 0].sum())  # rank sum of positive differences
    S = n_eff * (n_eff + 1) / 2.0               # total rank sum R+ + R-
    r_rb = (2.0 * R_plus / S) - 1.0             # equals (R+ - R-) / (R+ + R-)
    cles = 0.5 * (r_rb + 1.0)                   # equals R+ / (R+ + R-)

    # Paired Cohen's d_z (mean difference / SD of differences, all finite pairs)
    # and its small-sample corrected version, Hedges' g.
    d_mean = d.mean()
    d_std = d.std(ddof=1) if n_total > 1 else np.nan
    dz = np.nan if (d_std == 0 or not np.isfinite(d_std)) else d_mean / d_std
    # d_z is estimated from n_total differences, so its degrees of freedom are
    # n_total - 1 and the correction is J = 1 - 3 / (4*df - 1). The 4*n - 9
    # form belongs to two independent groups and does not apply here.
    dof = n_total - 1
    if dof >= 2 and np.isfinite(dz):
        J = 1.0 - 3.0 / (4.0 * dof - 1.0)
        g = J * dz
    else:
        g = np.nan

    median_diff = float(np.median(d))

    # —— Output (two-sided) ——
    print("=== Wilcoxon (two-sided) and effect sizes ===")
    print(f"Effective pairs n_eff         : {n_eff} / total pairs n_total: {n_total} (zero differences: {n_zero})")
    print(f"T statistic (min rank sum)    : {res_two.statistic:.6f}")
    print(f"p-value (two-sided)           : {res_two.pvalue:.3e}")
    print("--- Effect sizes ---")
    print(f"rank-biserial r                : {r_rb:.3f}")
    print(f"CLES（P[base>ours]）           : {cles:.3f}")
    print(f"Cohen’s d_z (paired)           : {dz:.3f}")
    print(f"Hedges' g                      : {g:.3f}")
    print(f"Median of paired differences d : {median_diff:.6f}")

    return {
        "n_total": n_total,
        "n_zero": n_zero,
        "n_eff": n_eff,
        "wilcoxon_T": float(res_two.statistic),
        "wilcoxon_p_two_sided": float(res_two.pvalue),
        "rank_biserial_r": float(r_rb),
        "CLES": float(cles),
        "cohen_dz": float(dz),
        "hedges_g": float(g),
        "median_diff": float(median_diff),
    }

In [None]:
# TODO: replace these placeholders with the real CSV locations before running.
# An assignment with no right-hand side is a SyntaxError, so the cell could not
# execute at all in its previous form.
ours_path = "path/to/ours_errors.csv"          # Proposed-method error file path
baseline_path = "path/to/baseline_errors.csv"  # Baseline error file path

def main(ours_path: str, baseline_path: str, zero_method: str = 'wilcox'):
    """
    Load two header-less CSV files of float values and compare them with the
    two-sided Wilcoxon signed-rank test.

    Parameters
    ----------
    ours_path : str
        CSV file with the proposed method's values.
    baseline_path : str
        CSV file with the baseline's values.
    zero_method : str, default 'wilcox'
        Forwarded to ``wilcoxon_twosided_with_effects``.

    Returns
    -------
    The value returned by ``wilcoxon_twosided_with_effects`` (previously it was
    discarded, so callers had no programmatic access to the statistics).

    Raises
    ------
    ValueError
        If the two loaded arrays do not have the same shape.
    """
    # header=None: the first row is data; squeeze("columns") collapses a
    # single-column frame into a Series before conversion to a NumPy array.
    ours = pd.read_csv(ours_path, header=None, dtype=np.float64).squeeze("columns").to_numpy()
    base = pd.read_csv(baseline_path, header=None, dtype=np.float64).squeeze("columns").to_numpy()
    if ours.shape != base.shape:
        raise ValueError("The two files have different lengths")
    return wilcoxon_twosided_with_effects(base, ours, zero_method=zero_method)

# Entry point: run the comparison with the paths configured above.
if __name__ == "__main__":
    main(
        ours_path=ours_path,
        baseline_path=baseline_path,
        zero_method='wilcox',
    )