In [10]:
import pandas as pd
from scipy import stats
import numpy as np


# Per-model evaluation scores. Each metric maps to a list of seven values;
# the i-th entry of every list is paired across models by the tests below.
all_data = {}

# Model 1 (Baseline)
all_data['model1'] = {
    'Precision': [0.6933, 0.6123, 0.7193, 0.6962, 0.5314, 0.6501, 0.5276],
    'Recall': [0.7556, 0.6207, 0.7000, 1.0000, 0.5865, 0.7068, 0.4234],
    'F1-Score': [0.7231, 0.6165, 0.7095, 0.8209, 0.5576, 0.6772, 0.4698],
    'mAP50': [0.7740, 0.6556, 0.7590, 0.9434, 0.6914, 0.6781, 0.4801],
    'mAP50-95': [0.4353, 0.3499, 0.4801, 0.8840, 0.3713, 0.3706, 0.2332],
}

# Model 2 (+Attention)
all_data['model2'] = {
    'Precision': [0.6610, 0.5712, 0.6256, 0.8111, 0.6400, 0.6182, 0.5149],
    'Recall': [0.8444, 0.6418, 0.7667, 0.9375, 0.6207, 0.7293, 0.4955],
    'F1-Score': [0.7416, 0.6044, 0.6890, 0.8697, 0.6302, 0.6692, 0.5050],
    'mAP50': [0.8519, 0.6579, 0.6630, 0.9492, 0.7295, 0.7093, 0.4874],
    'mAP50-95': [0.4778, 0.3533, 0.4478, 0.8871, 0.3632, 0.3748, 0.2375],
}

# Model 3 (+MONAI)
all_data['model3'] = {
    'Precision': [0.6298, 0.5988, 0.6593, 0.8907, 0.7117, 0.5847, 0.5603],
    'Recall': [0.7184, 0.6119, 0.7097, 0.9688, 0.5963, 0.7727, 0.5000],
    'F1-Score': [0.6712, 0.6053, 0.6836, 0.9281, 0.6489, 0.6657, 0.5284],
    'mAP50': [0.7945, 0.6343, 0.7536, 0.9749, 0.8241, 0.7105, 0.5160],
    'mAP50-95': [0.4545, 0.3533, 0.5332, 0.8939, 0.4396, 0.3632, 0.2509],
}

# Model 4 (+MONAI+Attention)
all_data['model4'] = {
    'Precision': [0.6736, 0.5966, 0.7142, 0.8255, 0.7789, 0.5988, 0.5134],
    'Recall': [0.8444, 0.6368, 0.8329, 1.0000, 0.7290, 0.7182, 0.5000],
    'F1-Score': [0.7494, 0.6161, 0.7690, 0.9044, 0.7531, 0.6531, 0.5066],
    'mAP50': [0.8371, 0.6293, 0.8053, 0.9629, 0.8606, 0.7180, 0.4946],
    'mAP50-95': [0.4860, 0.3351, 0.4969, 0.9029, 0.4783, 0.3934, 0.2390],
}


# --- 2. Batch Wilcoxon signed-rank testing helper ---
def perform_wilcoxon_tests(model_a_name, model_b_name, data_dict,
                           metrics=None, alpha=0.05):
    """
    Run one-sided paired Wilcoxon signed-rank tests of model B against model A.

    For every requested metric the paired differences ``B - A`` are tested
    with ``alternative='greater'`` (H1: model B scores higher than model A)
    and ``zero_method='zsplit'``. A header and a result table are printed to
    stdout. No multiple-comparison correction is applied across metrics.

    Args:
        model_a_name: Key of the reference model in ``data_dict``.
        model_b_name: Key of the candidate model in ``data_dict``.
        data_dict: Mapping ``model name -> metric name -> sequence of scores``.
            The two sequences for a metric are treated as paired samples.
        metrics: Metric names to test, in table order. Defaults to
            Precision, Recall, F1-Score, mAP50 and mAP50-95.
        alpha: Significance threshold used for the verdict text.

    Returns:
        dict mapping each metric name to
        ``{'p_value': float, 'verdict': str}``.

    Raises:
        KeyError: If a model or metric name is missing from ``data_dict``.
        ValueError: If the two samples of a metric are not 1-D sequences of
            equal length.
    """
    if metrics is None:
        metrics = ['Precision', 'Recall', 'F1-Score', 'mAP50', 'mAP50-95']

    print(f"\n--- 统计检验: {model_b_name} vs. {model_a_name} ---")
    print(f"零假设 (H0): {model_b_name} 的性能不优于 {model_a_name}")
    print(f"备择假设 (H1): {model_b_name} 的性能优于 {model_a_name}\n")

    results_summary = {}
    for metric in metrics:
        scores_a = np.asarray(data_dict[model_a_name][metric], dtype=float)
        scores_b = np.asarray(data_dict[model_b_name][metric], dtype=float)

        # Validate the pairing up front so a bad input fails with a message
        # that names the metric instead of an opaque broadcasting error.
        if scores_a.ndim != 1 or scores_a.shape != scores_b.shape:
            raise ValueError(
                f"Metric {metric!r}: paired samples must be 1-D and of equal "
                f"length, got shapes {scores_b.shape} ({model_b_name}) and "
                f"{scores_a.shape} ({model_a_name})"
            )

        diff = scores_b - scores_a
        if not np.any(diff):
            # No non-zero paired difference (identical or empty samples):
            # there is no evidence for H1, so report p = 1.0 directly rather
            # than relying on how SciPy treats an all-zero difference vector.
            p_value = 1.0
        else:
            p_value = float(
                stats.wilcoxon(
                    diff, alternative='greater', zero_method='zsplit'
                ).pvalue
            )

        if np.isnan(p_value):
            # NaN compares False against any threshold; labelling it as
            # "p >= alpha" would be a false statement.
            verdict = "无法判定 (p = NaN)"
        elif p_value < alpha:
            verdict = f"显著提升 (p < {alpha:g})"
        else:
            verdict = f"不显著 (p >= {alpha:g})"
        results_summary[metric] = {'p_value': p_value, 'verdict': verdict}

    print(f"{'指标':<12} | {'p-value':<10} | {'结论'}")
    print("-" * 45)
    for metric, result in results_summary.items():
        p_val_str = f"{result['p_value']:.4f}"
        print(f"{metric:<12} | {p_val_str:<10} | {result['verdict']}")

    return results_summary

# --- 3. Run the three core comparisons ---

# Comparison 1: model3 (+MONAI) vs. model1 (Baseline)
results1 = perform_wilcoxon_tests('model1', 'model3', all_data)

# Comparison 2: model2 (+Attention) vs. model1 (Baseline)
results2 = perform_wilcoxon_tests('model1', 'model2', all_data)

# Comparison 3: model4 (+MONAI+Attention, final model) vs. model1 (Baseline)
results3 = perform_wilcoxon_tests('model1', 'model4', all_data)


--- 统计检验: model3 vs. model1 ---
零假设 (H0): model3 的性能不优于 model1
备择假设 (H1): model3 的性能优于 model1

指标           | p-value    | 结论
---------------------------------------------
Precision    | 0.4688     | 不显著 (p >= 0.05)
Recall       | 0.2891     | 不显著 (p >= 0.05)
F1-Score     | 0.2891     | 不显著 (p >= 0.05)
mAP50        | 0.0547     | 不显著 (p >= 0.05)
mAP50-95     | 0.0234     | 显著提升 (p < 0.05)

--- 统计检验: model2 vs. model1 ---
零假设 (H0): model2 的性能不优于 model1
备择假设 (H1): model2 的性能优于 model1

指标           | p-value    | 结论
---------------------------------------------
Precision    | 0.5938     | 不显著 (p >= 0.05)
Recall       | 0.0547     | 不显著 (p >= 0.05)
F1-Score     | 0.1484     | 不显著 (p >= 0.05)
mAP50        | 0.1484     | 不显著 (p >= 0.05)
mAP50-95     | 0.3438     | 不显著 (p >= 0.05)

--- 统计检验: model4 vs. model1 ---
零假设 (H0): model4 的性能不优于 model1
备择假设 (H1): model4 的性能优于 model1

指标           | p-value    | 结论
---------------------------------------------
Precision    | 0.5938     | 不显著 (p >= 0.0