In [1]:
import os
import pickle
import argparse
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score

def compute_average_report_across_runs(reports):
    """Aggregate several classification reports into one "mean±std" table.

    Args:
        reports: Non-empty iterable of dicts shaped like the output of
            ``classification_report(..., output_dict=True)`` for the classes
            ``'HC'`` and ``'PD'``: an ``'accuracy'`` scalar plus ``'HC'``,
            ``'PD'``, ``'macro avg'`` and ``'weighted avg'`` sub-dicts keyed
            by ``'precision'``, ``'recall'``, ``'f1-score'`` and ``'support'``.

    Returns:
        pandas.DataFrame with one row per report section (``HC``, ``PD``,
        ``accuracy``, ``macro avg``, ``weighted avg``) and one column per
        metric. Every cell is the string ``'<mean>±<std>'`` rounded to four
        decimals, except that the accuracy row has empty precision/recall
        cells and reuses the ``macro avg`` support.

    Raises:
        ValueError: If ``reports`` is empty.
        KeyError: If a report contains a section or metric not listed above.
    """
    reports = list(reports)
    if not reports:
        raise ValueError("compute_average_report_across_runs needs at least one report")

    metric_names = ('precision', 'recall', 'f1-score', 'support')
    collected = {
        'HC': {metric: [] for metric in metric_names},
        'PD': {metric: [] for metric in metric_names},
        'accuracy': [],
        'macro avg': {metric: [] for metric in metric_names},
        'weighted avg': {metric: [] for metric in metric_names},
    }

    # Gather every run's value for each (section, metric) cell.
    for report in reports:
        for section, values in report.items():
            if section == 'accuracy':
                collected[section].append(values)
            else:
                for metric, value in values.items():
                    collected[section][metric].append(value)

    def _mean_std(samples):
        samples = np.array(samples)
        return f'{round(samples.mean(), 4)}±{round(samples.std(), 4)}'

    # Format from the accumulator's own keys rather than from whichever
    # report happened to be iterated last.
    summary = {}
    for section, values in collected.items():
        if section == 'accuracy':
            summary[section] = _mean_std(values)
        else:
            summary[section] = {
                metric: _mean_std(samples) for metric, samples in values.items()
            }

    # -- just for a more clean output
    # The scalar accuracy string is broadcast to every column when the frame
    # is built, so blank the cells where it is meaningless and show the
    # overall support instead.
    table = pd.DataFrame.from_dict(summary).T
    table.loc['accuracy', 'precision'] = ''
    table.loc['accuracy', 'recall'] = ''
    table.loc['accuracy', 'support'] = table.loc['macro avg', 'support']

    return table

def _load_pickled_output(model_output_dir, filename):
    """Unpickle one saved model-output file found in ``model_output_dir``."""
    # NOTE: unpickling can execute arbitrary code; only point this at
    # experiment directories you trust.
    with open(os.path.join(model_output_dir, filename), 'rb') as f:
        return pickle.load(f)


def get_reports(exps_dir):
    """Build one validation and one test classification report per run.

    Expected layout:
    ``exps_dir/<run>/<fold>/model_output/{validation,test}_classification.pkl``,
    where each pickle is a mapping with ``'preds'`` and ``'labels'`` entries.
    Predictions and labels of all folds of a run are pooled before the
    report is computed.

    Args:
        exps_dir: Directory containing one sub-directory per run.

    Returns:
        Tuple ``(val_reports, test_reports)``: two lists of
        ``classification_report(..., output_dict=True)`` dicts, one entry per
        run, ordered by sorted run-directory name.

    Raises:
        OSError: If an entry of ``exps_dir`` is not a directory or an
            expected pickle file is missing (raised by ``os.listdir`` /
            ``open``).
    """
    val_reports = []
    test_reports = []

    # Sorted so the order of the returned reports does not depend on the
    # filesystem's listing order.
    for run_dir in sorted(os.listdir(exps_dir)):
        run_dir_path = os.path.join(exps_dir, run_dir)

        val_preds, val_labels = [], []
        test_preds, test_labels = [], []

        for fold_dir in sorted(os.listdir(run_dir_path)):
            model_output_dir = os.path.join(run_dir_path, fold_dir, 'model_output')

            # -- validation set
            val_model_output = _load_pickled_output(
                model_output_dir, 'validation_classification.pkl'
            )
            val_preds.extend(val_model_output['preds'])
            val_labels.extend(val_model_output['labels'])

            # -- test set
            test_model_output = _load_pickled_output(
                model_output_dir, 'test_classification.pkl'
            )
            test_preds.extend(test_model_output['preds'])
            test_labels.extend(test_model_output['labels'])

        # -- computing reports on the pooled folds of this run
        val_reports.append(
            classification_report(
                val_labels,
                val_preds,
                target_names=['HC', 'PD'],
                output_dict=True,
            )
        )
        test_reports.append(
            classification_report(
                test_labels,
                test_preds,
                target_names=['HC', 'PD'],
                output_dict=True,
            )
        )

    return val_reports, test_reports


def get_report_perfold(exps_dir):
    """Collect a per-fold evaluation record for every run under ``exps_dir``.

    Runs and folds are visited in sorted directory-name order. For each fold
    the files ``model_output/validation_classification.pkl`` and
    ``model_output/test_classification.pkl`` are read when they exist; a
    missing file is skipped.

    Args:
        exps_dir: Directory containing one sub-directory per run.

    Returns:
        Tuple ``(val_fold_reports, test_fold_reports)``. Each maps a run
        directory name to a list of dicts with the keys ``'fold'``,
        ``'report'`` (classification report dict), ``'preds'``, ``'labels'``,
        ``'probs'`` (second entry of every stored probability row,
        presumably the PD-class probability) and ``'auc'`` (ROC AUC of those
        probabilities against the labels).
    """

    def _evaluate(pickle_path, fold_name):
        # Returns the record for one pickle, or None when the file is absent.
        if not os.path.exists(pickle_path):
            return None

        with open(pickle_path, 'rb') as handle:
            model_output = pickle.load(handle)

        positive_probs = [row[1] for row in model_output['probs']]
        model_output['probs'] = positive_probs

        auc = roc_auc_score(model_output['labels'], positive_probs)
        report = classification_report(
            model_output['labels'],
            model_output['preds'],
            target_names=['HC', 'PD'],
            output_dict=True,
        )

        return {
            'fold': fold_name,
            'report': report,
            'preds': model_output['preds'],
            'labels': model_output['labels'],
            'probs': model_output['probs'],
            'auc': auc,
        }

    val_fold_reports = {}
    test_fold_reports = {}

    for run_dir in sorted(os.listdir(exps_dir)):
        run_dir_path = os.path.join(exps_dir, run_dir)
        fold_dirs = sorted(os.listdir(run_dir_path))

        val_entries = val_fold_reports[run_dir] = []
        test_entries = test_fold_reports[run_dir] = []

        for fold_dir in fold_dirs:
            model_output_dir = os.path.join(run_dir_path, fold_dir, 'model_output')

            # -- validation set first, then test set
            for filename, entries in (
                ('validation_classification.pkl', val_entries),
                ('test_classification.pkl', test_entries),
            ):
                record = _evaluate(os.path.join(model_output_dir, filename), fold_dir)
                if record is not None:
                    entries.append(record)

    return val_fold_reports, test_fold_reports

def compute_fold_stats(fold_reports):
    """Summarise per-fold metrics of each run as mean and standard deviation.

    Args:
        fold_reports: Mapping of run name to a list of per-fold records, each
            holding a ``'report'`` dict (with ``'accuracy'``, ``'PD'`` and
            ``'HC'`` entries) and an ``'auc'`` value.

    Returns:
        Mapping of run name to ``{metric: {'mean': ..., 'std': ...}}`` for
        the metrics accuracy, f1, precision, recall, sensitivity,
        specificity and auc. F1, precision, recall and sensitivity are read
        from the ``'PD'`` entry; specificity is the ``'HC'`` recall.
    """
    # How to read each metric out of a single fold record.
    extractors = {
        'accuracy': lambda rec: rec['report']['accuracy'],
        'f1': lambda rec: rec['report']['PD']['f1-score'],
        'precision': lambda rec: rec['report']['PD']['precision'],
        'recall': lambda rec: rec['report']['PD']['recall'],
        'sensitivity': lambda rec: rec['report']['PD']['recall'],
        'specificity': lambda rec: rec['report']['HC']['recall'],
        'auc': lambda rec: rec['auc'],
    }

    summary = {}
    for run_name, records in fold_reports.items():
        per_metric = {}
        for metric, pick in extractors.items():
            values = [pick(rec) for rec in records]
            per_metric[metric] = {'mean': np.mean(values), 'std': np.std(values)}
        summary[run_name] = per_metric

    return summary



In [None]:
import scipy.stats as stats
from collections import defaultdict

def compare_two_exps_with_wilcoxon(exps_dir1, exps_dir2):
    """
    Compare the test-set results of two experiments with a paired Wilcoxon
    signed-rank test.

    For every fold present in both experiments, each metric is first averaged
    over the runs of that fold; the per-fold means of the two experiments are
    then paired and tested.

    Args:
        exps_dir1: First experiment directory path
        exps_dir2: Second experiment directory path

    Returns:
        Dictionary mapping a metric name to its ``p_value``, ``significant``
        flag (p < 0.05), the two overall means (``means1``, ``means2``) and
        their ``difference``. Metrics with fewer than five paired folds are
        left out.
    """

    def _group_by_fold(fold_reports):
        # fold name -> metric name -> values collected over all runs
        grouped = defaultdict(lambda: defaultdict(list))
        for fold_results in fold_reports.values():
            for entry in fold_results:
                report = entry['report']
                bucket = grouped[entry['fold']]
                bucket['accuracy'].append(report['accuracy'])
                bucket['f1'].append(report['PD']['f1-score'])
                bucket['precision'].append(report['PD']['precision'])
                bucket['recall'].append(report['PD']['recall'])
                bucket['sensitivity'].append(report['PD']['recall'])
                bucket['specificity'].append(report['HC']['recall'])
                bucket['auc'].append(entry['auc'])
        return grouped

    # Only the test-set records are compared.
    _, test_fold_reports1 = get_report_perfold(exps_dir1)
    _, test_fold_reports2 = get_report_perfold(exps_dir2)

    metrics_by_fold1 = _group_by_fold(test_fold_reports1)
    metrics_by_fold2 = _group_by_fold(test_fold_reports2)

    metrics_to_test = ['accuracy', 'f1', 'precision', 'sensitivity', 'specificity', 'auc']
    fold_means1 = defaultdict(list)
    fold_means2 = defaultdict(list)

    # Pairing is only meaningful for folds that both experiments have.
    common_folds = set(metrics_by_fold1.keys()) & set(metrics_by_fold2.keys())

    for fold in common_folds:
        for metric in metrics_to_test:
            values1 = metrics_by_fold1[fold][metric]
            if not values1:
                continue
            values2 = metrics_by_fold2[fold][metric]
            if not values2:
                continue
            fold_means1[metric].append(np.mean(values1))
            fold_means2[metric].append(np.mean(values2))

    results = {}
    for metric in metrics_to_test:
        paired1 = fold_means1[metric]
        if len(paired1) < 5:  # Need at least 5 pairs for reliable results
            continue
        paired2 = fold_means2[metric]

        _, p_value = stats.wilcoxon(paired1, paired2)
        overall1 = np.mean(paired1)
        overall2 = np.mean(paired2)
        results[metric] = {
            'p_value': p_value,
            'significant': p_value < 0.05,
            'means1': overall1,
            'means2': overall2,
            'difference': overall1 - overall2,
        }

    return results

# Example usage
exps_dir1 = "/home/yzhong/gits/interpretable-pd/exps/pcgita_splits_10foldnew/cross_full/allsent/"
exps_dir2 = "/home/yzhong/gits/interpretable-pd/exps/pcgita_splits_10foldnew/cross_full_fix_value/allsent/"


results = compare_two_exps_with_wilcoxon(exps_dir1, exps_dir2)

# Both directories end in the same task folder, so the last path component
# alone cannot tell the experiments apart; label each one with its last two
# components (<experiment>/<task>) instead.
label1 = '/'.join(exps_dir1.strip('/').split('/')[-2:])
label2 = '/'.join(exps_dir2.strip('/').split('/')[-2:])

# Print results in a nice format
print(f"Statistical comparison between: \n- M{label1} and \n- M{label2}\n")
print("Wilcoxon Signed-Rank Test Results:")
for metric, result in results.items():
    significance = "* significant *" if result['significant'] else "not significant"
    # Show "=" for identical means instead of claiming the first is smaller.
    if result['difference'] > 0:
        direction = ">"
    elif result['difference'] < 0:
        direction = "<"
    else:
        direction = "="
    print(f"{metric.upper()}: {result['means1']:.4f} {direction} {result['means2']:.4f}, p={result['p_value']:.4f} ({significance})")

Statistical comparison between: 
- Mcross_full and 
- Mallsent

Wilcoxon Signed-Rank Test Results:


In [None]:

def get_avg_F1_for_allfolds(exps_dir1):
    """
    Return the mean test-set PD F1 score of every fold of one experiment.

    The PD-class F1 scores of all runs are grouped by fold name and averaged
    per fold.

    Args:
        exps_dir1: Experiment directory path, as accepted by
            ``get_report_perfold``

    Returns:
        List with one mean F1 score per fold, ordered by sorted fold name so
        that the lists of different experiments line up fold by fold when
        they are used as paired samples.
    """

    print(f"Processing experiment directory: {exps_dir1}")
    _, test_fold_reports1 = get_report_perfold(exps_dir1)

    # fold name -> PD F1 score of that fold in every run
    f1_by_fold = defaultdict(list)
    for fold_results in test_fold_reports1.values():
        for fold_result in fold_results:
            f1_by_fold[fold_result['fold']].append(fold_result['report']['PD']['f1-score'])

    # Sorted rather than set order: set iteration order of strings depends on
    # their hashes and is not a stable basis for pairing.
    return [np.mean(f1_by_fold[fold]) for fold in sorted(f1_by_fold)]

# Names of the experiments (model variants) to compare; exp1..exp4 are
# aliases for the entries of `exps`, in order.
exps = [
    "cross_full",
    "cross_full_fix",
    "cross_full_fix_value",
    "cross_token_oldcate",
]
exp1, exp2, exp3, exp4 = exps

def get_report_perexp(exp):
    """Gather the per-fold mean F1 scores of one experiment over all tasks.

    Scores are read, in this order, from the ``allsent`` directory of the
    10-fold split root, the ``MONOLOGUE`` directory of the split-monologue
    root, and every task directory of the GITA root that exists.

    Args:
        exp: Experiment (model variant) directory name.

    Returns:
        Flat list with the per-fold mean F1 scores of all tasks, concatenated
        in the order listed above.
    """
    allsent_root = "/home/yzhong/gits/interpretable-pd/exps/pcgita_splits_10foldnew/"
    gita_root = "/home/yzhong/gits/interpretable-pd/exps/gita/"
    splitmono_root = "/home/yzhong/gits/interpretable-pd/exps/gita-splitmono/"

    # task label -> per-fold mean F1 scores (insertion order is kept)
    pertask2f1 = {}

    pertask2f1['ALLSENT'] = get_avg_F1_for_allfolds(
        os.path.join(allsent_root, exp, 'allsent')
    )
    pertask2f1['SPLIT-MONO'] = get_avg_F1_for_allfolds(
        os.path.join(splitmono_root, exp, 'MONOLOGUE')
    )

    # The per-task GITA directories are optional; absent ones are skipped.
    for task in ("DDK", "READ", "SENTENCES", "SUSTAINED-VOWELS", "WORDS"):
        task_dir = os.path.join(gita_root, exp, task)
        if not os.path.exists(task_dir):
            continue
        pertask2f1[task] = get_avg_F1_for_allfolds(task_dir)

    return [score for scores in pertask2f1.values() for score in scores]


# Pooled per-fold F1 scores of each experiment, in the order exp1..exp4.
all_f1_scores1, all_f1_scores2, all_f1_scores3, all_f1_scores4 = [
    get_report_perexp(exp_name) for exp_name in (exp1, exp2, exp3, exp4)
]


Processing experiment directory: /home/yzhong/gits/interpretable-pd/exps/pcgita_splits_10foldnew/cross_full/allsent
Processing experiment directory: /home/yzhong/gits/interpretable-pd/exps/gita-splitmono/cross_full/MONOLOGUE
Processing experiment directory: /home/yzhong/gits/interpretable-pd/exps/gita/cross_full/DDK
Processing experiment directory: /home/yzhong/gits/interpretable-pd/exps/gita/cross_full/MONOLOGUE
Processing experiment directory: /home/yzhong/gits/interpretable-pd/exps/gita/cross_full/READ
Processing experiment directory: /home/yzhong/gits/interpretable-pd/exps/gita/cross_full/SENTENCES
Processing experiment directory: /home/yzhong/gits/interpretable-pd/exps/gita/cross_full/SUSTAINED-VOWELS
Processing experiment directory: /home/yzhong/gits/interpretable-pd/exps/gita/cross_full/WORDS
Processing experiment directory: /home/yzhong/gits/interpretable-pd/exps/pcgita_splits_10foldnew/cross_full_fix/allsent
Processing experiment directory: /home/yzhong/gits/interpretable-pd/e

In [None]:
# Perform Wilcoxon signed-rank test
import scipy.stats as stats

# Paired Wilcoxon signed-rank test on the F1 scores of M1 and M2.
# Both score lists become numpy arrays first.
all_f1_scores1, all_f1_scores2 = np.array(all_f1_scores1), np.array(all_f1_scores2)

# Overall means, kept for the printed summary
mean1, mean2 = all_f1_scores1.mean(), all_f1_scores2.mean()
difference = mean1 - mean2

# The test itself treats the two arrays as paired samples
statistic1, p_value1 = stats.wilcoxon(all_f1_scores1, all_f1_scores2)

# Same result layout as the per-metric comparison, with F1 as the only metric
results = {
    'f1': dict(
        p_value=p_value1,
        significant=p_value1 < 0.05,
        means1=mean1,
        means2=mean2,
        difference=difference,
    )
}

print(f"Comparison between M{1} and M{2}: statistic={statistic1}, p-value={p_value1}")

for metric, result in results.items():
    if result['significant']:
        significance = "* significant *"
    else:
        significance = "not significant"
    if result['difference'] > 0:
        direction = ">"
    else:
        direction = "<"
    print(f"{metric.upper()}: {result['means1']:.4f} {direction} {result['means2']:.4f}, p={result['p_value']:.4f} ({significance})")

all_f1_scores1: [0.8066321610566094, 0.8073277419212397, 0.8796157094167605, 0.7681825026598317, 0.7358288159801576, 0.8620941645579328, 0.767299895931475, 0.8319605098779521, 0.6552673804761117, 0.6335934482958153, 0.6809972608950107, 0.7854279216177625, 0.8132027789547422, 0.7710879355442068, 0.8318101590687089, 0.6996374175859857, 0.8371729179476668, 0.8126861639325738, 0.7863773522370272, 0.7137153977520951, 0.6923413703599461, 0.7729249011857708, 0.8463431305536568, 0.6378947368421052, 0.8944927536231884, 0.7487445887445888, 0.8691729323308269, 0.8188594014680971, 0.5259803921568628, 0.8404784688995216, 0.6243287314331025, 0.7945997368225334, 0.813085680922493, 0.784384973170349, 0.8663242677179357, 0.650103972108884, 0.7712133341307609, 0.7748921441994566, 0.7250658414255435, 0.7350179902467038]
all_f1_scores2: [0.8142615365104608, 0.7873015586526346, 0.9202661847519436, 0.8033810448743395, 0.7738202916867264, 0.8870164133565528, 0.7842193310275445, 0.8334628210249871, 0.66750337

In [56]:
def compare_all_model_combinations(all_f1_scores_dict):
    """
    Compare all possible combinations of models using Wilcoxon signed-rank test

    The test is paired, so the score sequences of any two models must have
    the same length and be aligned element by element. The input dictionary
    is not modified.

    Args:
        all_f1_scores_dict: Dictionary with model names as keys and F1 score
            sequences (lists or arrays) as values

    Returns:
        results_dict: Dictionary keyed by ``"<model1>_vs_<model2>"`` (pairs in
            the input's key order) whose values hold ``statistic``,
            ``p_value``, ``significant`` (p < 0.05), ``means1``, ``means2``
            and ``difference`` (mean of model1 minus mean of model2)
    """
    import itertools
    import numpy as np
    import scipy.stats as stats

    # Convert into a private mapping so the caller's lists are not replaced
    # by arrays as a side effect.
    scores_by_model = {
        name: np.asarray(scores) for name, scores in all_f1_scores_dict.items()
    }

    results_dict = {}

    # Run Wilcoxon test on all unordered pairs
    for model1, model2 in itertools.combinations(scores_by_model, 2):
        scores1 = scores_by_model[model1]
        scores2 = scores_by_model[model2]

        # Mean values are reported alongside the test for orientation
        mean1 = np.mean(scores1)
        mean2 = np.mean(scores2)

        statistic, p_value = stats.wilcoxon(scores1, scores2)

        results_dict[f"{model1}_vs_{model2}"] = {
            'statistic': statistic,
            'p_value': p_value,
            'significant': p_value < 0.05,
            'means1': mean1,
            'means2': mean2,
            'difference': mean1 - mean2,
        }

    return results_dict

# Create a dictionary with your F1 scores
all_f1_scores_dict = {
    'M1': all_f1_scores1,
    'M2': all_f1_scores2,
    'M3': all_f1_scores3,
    'M4': all_f1_scores4
}

# Run the comparison
results = compare_all_model_combinations(all_f1_scores_dict)

# Print results in a nicely formatted table
print("=" * 80)
print("Wilcoxon Signed-Rank Test Results for All Model Combinations")
print("=" * 80)
print(f"{'Comparison':15s} | {'Mean 1':8s} | {'Mean 2':8s} | {'Diff':8s} | {'p-value':10s} | Result")
print("-" * 80)

for pair, result in results.items():
    comparison = pair.replace('_vs_', ' vs ')
    significance = "* significant *" if result['significant'] else "not significant"
    # Every cell is padded to the width of its header so the columns line up.
    print(f"{comparison:15s} | {result['means1']:8.4f} | {result['means2']:8.4f} | {result['difference']:8.4f} | {result['p_value']:10.6f} | {significance}")

print("=" * 80)

# Create a matrix of p-values for visualization
# A p-value printed with six decimals is already 8 characters wide, so the
# cells need to be wider than that to keep neighbouring columns apart.
cell_width = 10

print("\nP-value Matrix (row vs column):")
models = sorted(all_f1_scores_dict.keys())
print(f"{'':{cell_width}s}" + "".join(f"{model:{cell_width}s}" for model in models))

for model1 in models:
    cells = [f"{model1:{cell_width}s}"]
    for model2 in models:
        if model1 == model2:
            cells.append(f"{'---':{cell_width}s}")
            continue
        # Each pair is stored once, under either ordering of the two names.
        entry = results.get(f"{model1}_vs_{model2}") or results.get(f"{model2}_vs_{model1}")
        if entry:
            cells.append(f"{entry['p_value']:<{cell_width}.6f}")
        else:
            cells.append(f"{'N/A':{cell_width}s}")
    print("".join(cells))

Wilcoxon Signed-Rank Test Results for All Model Combinations
Comparison      | Mean 1   | Mean 2   | Diff     | p-value    | Result
--------------------------------------------------------------------------------
M1 vs M2    | 0.7545 | 0.7626 | -0.0081 | 0.400848 | not significant
M1 vs M3    | 0.7545 | 0.6685 | 0.0860 | 0.000000 | * significant *
M1 vs M4    | 0.7545 | 0.7285 | 0.0260 | 0.000717 | * significant *
M2 vs M3    | 0.7626 | 0.6685 | 0.0941 | 0.000000 | * significant *
M2 vs M4    | 0.7626 | 0.7285 | 0.0341 | 0.000010 | * significant *
M3 vs M4    | 0.6685 | 0.7285 | -0.0600 | 0.000000 | * significant *

P-value Matrix (row vs column):
        M1      M2      M3      M4      
M1      ---     0.4008480.0000000.000717
M2      0.400848---     0.0000000.000010
M3      0.0000000.000000---     0.000000
M4      0.0007170.0000100.000000---     
