In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import ast
from glob import glob
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

The performance of three models (CheXpert-trained, MIMIC-trained, and NIH-trained) is analyzed under their respective headers in this notebook.

# CheXpert

In [None]:
def evaluate_csv(csv_file_path, label_name):
    """Score one prediction CSV and print its AUROC / AUPRC.

    The CSV must contain a ``Gender_Probability`` column whose cells are
    strings parsed with ``ast.literal_eval`` into indexable sequences, and a
    ground-truth column named ``Gender``, ``sex``, ``Sex``, ``PatientSex`` or
    ``gender`` (the first one present, in that order, is used). Ground-truth
    values ``'Female'``/``'F'`` are encoded as 1 and ``'Male'``/``'M'`` as 0;
    rows holding any other value are excluded from scoring.

    Parameters
    ----------
    csv_file_path : str
        Path of the prediction CSV to read.
    label_name : str
        Name shown at the start of the printed summary line.

    Returns
    -------
    tuple
        ``(wAUROC, wAUPRC)`` on success. ``(None, None)`` when the file is
        missing, a required column is absent, or the values cannot be parsed
        or scored; in that case the error is printed rather than raised.
    """
    try:
        # low_memory=False makes pandas infer each column's dtype from the whole
        # file instead of chunk by chunk.
        # NOTE(review): the saved notebook output shows a warning raised on this
        # read for one file, presumably the mixed-type DtypeWarning that this
        # option avoids - confirm.
        predict_df = pd.read_csv(csv_file_path, low_memory=False)
        predict_df['Gender_Probability'] = predict_df['Gender_Probability'].apply(ast.literal_eval)

        # Pick correct label column: the first candidate present wins.
        label_column = next(
            (name for name in ('Gender', 'sex', 'Sex', 'PatientSex', 'gender')
             if name in predict_df.columns),
            None,
        )
        if label_column is None:
            raise KeyError("Neither 'Gender' nor 'Sex' column found in the data.")

        # Standardize labels: Female = 1, Male = 0; any other value becomes NaN.
        y_true = predict_df[label_column].map({'Female': 1, 'F': 1, 'Male': 0, 'M': 0})

        mask = y_true.notna()
        if not mask.any():
            # Fail with a clear message instead of an opaque scikit-learn error.
            raise ValueError(f"No rows with a recognised label in column '{label_column}'.")
        y_true = y_true[mask]
        # Element 0 of each parsed sequence is the score for the positive class.
        # NOTE(review): this assumes element 0 is the female (label 1)
        # probability - confirm against the code that wrote the CSV.
        y_score = predict_df.loc[mask, 'Gender_Probability'].apply(lambda x: x[0]).values

        # For a binary target scikit-learn ignores `average`, so these are the
        # plain AUROC / AUPRC; the argument is kept to leave the calls unchanged.
        wAUROC = metrics.roc_auc_score(y_true, y_score, average='weighted')
        wAUPRC = metrics.average_precision_score(y_true, y_score, average='weighted')

        print(f'{label_name} - wAUROC: {wAUROC:.4f}, wAUPRC: {wAUPRC:.4f}')
        return wAUROC, wAUPRC

    # SyntaxError is what ast.literal_eval raises for a malformed cell string.
    except (FileNotFoundError, KeyError, ValueError, SyntaxError) as e:
        print(f"Error processing {csv_file_path}: {e}")
        return None, None

# Prediction files written by the CheXpert-trained model, one per test set.
# Every path follows '/content/chexpert_gender_<stem>.csv'.
csv_files_with_labels = [
    (f'/content/chexpert_gender_{stem}.csv', dataset)
    for stem, dataset in (
        ('brax_test_full', 'BRAX'),
        ('chexpert_test', 'CheXpert'),
        ('jsrt_test', 'JSRT'),
        ('mimic_test', 'MIMIC'),
        ('nih_test', 'NIH'),
        ('padchest_test_full', 'PadChest'),
        ('vindr_test_full', 'VinDR'),
        ('vindr_peds_test', 'VinDr-PCXR'),
        ('shenzhen_test', 'Shenzhen'),
    )
]

# Print the point estimates for each test set.
for file_path, label in csv_files_with_labels:
    evaluate_csv(file_path, label)


BRAX - wAUROC: 0.9780, wAUPRC: 0.9805
CheXpert - wAUROC: 0.9971, wAUPRC: 0.9959
JSRT - wAUROC: 0.8168, wAUPRC: 0.8412
MIMIC - wAUROC: 0.9970, wAUPRC: 0.9965
NIH - wAUROC: 0.9935, wAUPRC: 0.9917


  predict_df = pd.read_csv(csv_file_path)


PadChest - wAUROC: 0.5445, wAUPRC: 0.5648
VinDR - wAUROC: 0.9652, wAUPRC: 0.9553
VinDr-PCXR - wAUROC: 0.5892, wAUPRC: 0.4752
Shenzhen - wAUROC: 0.9070, wAUPRC: 0.8237


In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn import metrics

import pandas as pd
import numpy as np
import ast
from sklearn import metrics

def bootstrap_wauc(csv_file_path, label_name, n_iterations=1000, random_state=2025):
    """Print and return percentile-bootstrap 95% CIs for AUROC and AUPRC.

    The CSV must contain a ``Gender_Probability`` column whose cells are
    strings parsed with ``ast.literal_eval`` into indexable sequences, and a
    ground-truth column named ``Gender``, ``sex``, ``Sex``, ``PatientSex`` or
    ``gender`` (the first one present, in that order, is used). Ground-truth
    values ``'Female'``/``'F'`` are encoded as 1 and ``'Male'``/``'M'`` as 0;
    rows holding any other value are excluded.

    Rows are resampled with replacement ``n_iterations`` times. Resamples on
    which a metric raises ``ValueError`` are skipped, and the number skipped is
    printed.

    Parameters
    ----------
    csv_file_path : str
        Path of the prediction CSV to read.
    label_name : str
        Name shown at the start of the printed lines.
    n_iterations : int, default 1000
        Number of bootstrap resamples to draw.
    random_state : int, default 2025
        Seed of the generator that produces the per-resample seeds.

    Returns
    -------
    tuple
        ``(auroc_ci, auprc_ci)``, each an array holding the 2.5th and 97.5th
        percentiles of the bootstrap scores. ``(None, None)`` when the file
        cannot be processed or no resample could be scored; in that case the
        error is printed rather than raised.
    """
    try:
        # low_memory=False makes pandas infer each column's dtype from the whole
        # file instead of chunk by chunk.
        # NOTE(review): the saved notebook output shows a warning raised on this
        # read for one file, presumably the mixed-type DtypeWarning that this
        # option avoids - confirm.
        df = pd.read_csv(csv_file_path, low_memory=False)
        df['Gender_Probability'] = df['Gender_Probability'].apply(ast.literal_eval)

        # Identify the correct label column: the first candidate present wins.
        label_column = next(
            (name for name in ('Gender', 'sex', 'Sex', 'PatientSex', 'gender')
             if name in df.columns),
            None,
        )
        if label_column is None:
            raise KeyError("Neither 'Gender' nor 'Sex' column found in the data.")

        # Normalize labels: Female = 1, Male = 0; any other value becomes NaN.
        y_true = df[label_column].map({'Female': 1, 'F': 1, 'Male': 0, 'M': 0})
        labelled = y_true.notna()
        # Element 0 of each parsed sequence is the score for the positive class.
        # NOTE(review): this assumes element 0 is the female (label 1)
        # probability - confirm against the code that wrote the CSV.
        y_score = df.loc[labelled, 'Gender_Probability'].apply(lambda x: x[0]).values

        # Combine into DataFrame for sampling
        data = pd.DataFrame({'y_true': y_true[labelled].values, 'y_score': y_score})

        rng = np.random.default_rng(seed=random_state)

        auroc_scores = []
        auprc_scores = []
        skipped = 0

        for _ in range(n_iterations):
            # One seed is drawn per iteration, including skipped ones, so the
            # random stream (and therefore the reported CIs) is reproducible.
            sample = data.sample(n=len(data), replace=True, random_state=rng.integers(1e9))
            try:
                # For a binary target scikit-learn ignores `average`; the
                # argument is kept to leave the calls unchanged.
                wauc = metrics.roc_auc_score(sample['y_true'], sample['y_score'], average='weighted')
                wauprc = metrics.average_precision_score(sample['y_true'], sample['y_score'], average='weighted')
            except ValueError:
                skipped += 1  # Invalid bootstrap sample (e.g. a single class)
                continue
            auroc_scores.append(wauc)
            auprc_scores.append(wauprc)

        if not auroc_scores:
            # np.percentile on an empty list gives NaNs or an uncaught error
            # depending on the NumPy version; report the situation explicitly.
            raise ValueError(
                f"no valid bootstrap sample out of {n_iterations} iterations "
                f"({len(data)} labelled rows)"
            )
        if skipped:
            print(f"{label_name} - skipped {skipped} of {n_iterations} invalid bootstrap samples")

        # Compute confidence intervals
        auroc_ci = np.percentile(auroc_scores, [2.5, 97.5])
        auprc_ci = np.percentile(auprc_scores, [2.5, 97.5])

        print(f"{label_name} - AUROC 95% CI: [{auroc_ci[0]:.4f}, {auroc_ci[1]:.4f}], "
              f"AUPRC 95% CI: [{auprc_ci[0]:.4f}, {auprc_ci[1]:.4f}]")

        return auroc_ci, auprc_ci

    # SyntaxError is what ast.literal_eval raises for a malformed cell string.
    except (FileNotFoundError, KeyError, ValueError, SyntaxError) as e:
        print(f"Error processing {csv_file_path}: {e}")
        return None, None


# Prediction files written by the CheXpert-trained model, one per test set.
# Every path follows '/content/chexpert_gender_<stem>.csv'.
csv_files_with_labels = [
    (f'/content/chexpert_gender_{stem}.csv', dataset)
    for stem, dataset in (
        ('brax_test_full', 'BRAX'),
        ('chexpert_test', 'CheXpert'),
        ('jsrt_test', 'JSRT'),
        ('mimic_test', 'MIMIC'),
        ('nih_test', 'NIH'),
        ('padchest_test_full', 'PadChest'),
        ('vindr_test_full', 'VinDR'),
        ('vindr_peds_test', 'VinDr-PCXR'),
        ('shenzhen_test', 'Shenzhen'),
    )
]
# Print the bootstrap confidence intervals for each test set.
for file_path, label in csv_files_with_labels:
    bootstrap_wauc(file_path, label)

BRAX - AUROC 95% CI: [0.9763, 0.9794], AUPRC 95% CI: [0.9786, 0.9821]
CheXpert - AUROC 95% CI: [0.9967, 0.9974], AUPRC 95% CI: [0.9954, 0.9964]
JSRT - AUROC 95% CI: [0.7663, 0.8651], AUPRC 95% CI: [0.7852, 0.8909]
MIMIC - AUROC 95% CI: [0.9966, 0.9972], AUPRC 95% CI: [0.9961, 0.9968]
NIH - AUROC 95% CI: [0.9928, 0.9942], AUPRC 95% CI: [0.9907, 0.9926]


  df = pd.read_csv(csv_file_path)


PadChest - AUROC 95% CI: [0.5413, 0.5481], AUPRC 95% CI: [0.5605, 0.5693]
VinDR - AUROC 95% CI: [0.9612, 0.9691], AUPRC 95% CI: [0.9487, 0.9616]
VinDr-PCXR - AUROC 95% CI: [0.5779, 0.6008], AUPRC 95% CI: [0.4598, 0.4920]
Shenzhen - AUROC 95% CI: [0.8797, 0.9330], AUPRC 95% CI: [0.7689, 0.8751]


# MIMIC

In [None]:
# Prediction files written by the MIMIC-trained model, one per test set.
# Every path follows '/content/mimic_gender_<stem>.csv'.
csv_files_with_labels = [
    (f'/content/mimic_gender_{stem}.csv', dataset)
    for stem, dataset in (
        ('brax_test_full', 'BRAX'),
        ('chexpert_test', 'CheXpert'),
        ('jsrt_test', 'JSRT'),
        ('mimic_test', 'MIMIC'),
        ('nih_test', 'NIH'),
        ('padchest_test_full', 'PadChest'),
        ('vindr_test_full', 'VinDR'),
        ('vindr_peds_test', 'VinDr-PCXR'),
        ('shenzhen_test', 'Shenzhen'),
    )
]

# Print the point estimates for each test set.
for file_path, label in csv_files_with_labels:
    evaluate_csv(file_path, label)


BRAX - wAUROC: 0.9687, wAUPRC: 0.9705
CheXpert - wAUROC: 0.9943, wAUPRC: 0.9922
JSRT - wAUROC: 0.9659, wAUPRC: 0.9687
MIMIC - wAUROC: 0.9969, wAUPRC: 0.9967
NIH - wAUROC: 0.9887, wAUPRC: 0.9848


  predict_df = pd.read_csv(csv_file_path)


PadChest - wAUROC: 0.5627, wAUPRC: 0.5675
VinDR - wAUROC: 0.9308, wAUPRC: 0.9075
VinDr-PCXR - wAUROC: 0.5675, wAUPRC: 0.4638
Shenzhen - wAUROC: 0.8109, wAUPRC: 0.7142


In [None]:
# Prediction files written by the MIMIC-trained model, one per test set.
# Every path follows '/content/mimic_gender_<stem>.csv'.
csv_files_with_labels = [
    (f'/content/mimic_gender_{stem}.csv', dataset)
    for stem, dataset in (
        ('brax_test_full', 'BRAX'),
        ('chexpert_test', 'CheXpert'),
        ('jsrt_test', 'JSRT'),
        ('mimic_test', 'MIMIC'),
        ('nih_test', 'NIH'),
        ('padchest_test_full', 'PadChest'),
        ('vindr_test_full', 'VinDR'),
        ('vindr_peds_test', 'VinDr-PCXR'),
        ('shenzhen_test', 'Shenzhen'),
    )
]
# Print the bootstrap confidence intervals for each test set.
for file_path, label in csv_files_with_labels:
    bootstrap_wauc(file_path, label)

BRAX - AUROC 95% CI: [0.9666, 0.9706], AUPRC 95% CI: [0.9682, 0.9727]
CheXpert - AUROC 95% CI: [0.9938, 0.9948], AUPRC 95% CI: [0.9915, 0.9929]
JSRT - AUROC 95% CI: [0.9393, 0.9876], AUPRC 95% CI: [0.9410, 0.9892]
MIMIC - AUROC 95% CI: [0.9966, 0.9972], AUPRC 95% CI: [0.9964, 0.9970]
NIH - AUROC 95% CI: [0.9877, 0.9897], AUPRC 95% CI: [0.9832, 0.9863]


  df = pd.read_csv(csv_file_path)


PadChest - AUROC 95% CI: [0.5592, 0.5662], AUPRC 95% CI: [0.5632, 0.5722]
VinDR - AUROC 95% CI: [0.9256, 0.9362], AUPRC 95% CI: [0.8980, 0.9171]
VinDr-PCXR - AUROC 95% CI: [0.5557, 0.5797], AUPRC 95% CI: [0.4486, 0.4808]
Shenzhen - AUROC 95% CI: [0.7740, 0.8463], AUPRC 95% CI: [0.6577, 0.7679]


# NIH

In [None]:
# Prediction files written by the NIH-trained model, one per test set.
# Every path follows '/content/nih_gender_<stem>.csv'.
csv_files_with_labels = [
    (f'/content/nih_gender_{stem}.csv', dataset)
    for stem, dataset in (
        ('brax_test_full', 'BRAX'),
        ('chexpert_test', 'CheXpert'),
        ('jsrt_test', 'JSRT'),
        ('mimic_test', 'MIMIC'),
        ('nih_test', 'NIH'),
        ('padchest_test_full', 'PadChest'),
        ('vindr_test_full', 'VinDR'),
        ('vindr_peds_test', 'VinDr-PCXR'),
        ('shenzhen_test', 'Shenzhen'),
    )
]

# Print the point estimates for each test set.
for file_path, label in csv_files_with_labels:
    evaluate_csv(file_path, label)


BRAX - wAUROC: 0.9709, wAUPRC: 0.9747
CheXpert - wAUROC: 0.9615, wAUPRC: 0.9473
JSRT - wAUROC: 0.9764, wAUPRC: 0.9740
MIMIC - wAUROC: 0.9769, wAUPRC: 0.9740
NIH - wAUROC: 0.9962, wAUPRC: 0.9953


  predict_df = pd.read_csv(csv_file_path)


PadChest - wAUROC: 0.5456, wAUPRC: 0.5909
VinDR - wAUROC: 0.9546, wAUPRC: 0.9491
VinDr-PCXR - wAUROC: 0.6049, wAUPRC: 0.4992
Shenzhen - wAUROC: 0.9613, wAUPRC: 0.9169


In [None]:
# Prediction files written by the NIH-trained model, one per test set.
# Every path follows '/content/nih_gender_<stem>.csv'.
csv_files_with_labels = [
    (f'/content/nih_gender_{stem}.csv', dataset)
    for stem, dataset in (
        ('brax_test_full', 'BRAX'),
        ('chexpert_test', 'CheXpert'),
        ('jsrt_test', 'JSRT'),
        ('mimic_test', 'MIMIC'),
        ('nih_test', 'NIH'),
        ('padchest_test_full', 'PadChest'),
        ('vindr_test_full', 'VinDR'),
        ('vindr_peds_test', 'VinDr-PCXR'),
        ('shenzhen_test', 'Shenzhen'),
    )
]

# Print the bootstrap confidence intervals for each test set.
for file_path, label in csv_files_with_labels:
    bootstrap_wauc(file_path, label)

BRAX - AUROC 95% CI: [0.9690, 0.9728], AUPRC 95% CI: [0.9727, 0.9764]
CheXpert - AUROC 95% CI: [0.9598, 0.9631], AUPRC 95% CI: [0.9448, 0.9500]
JSRT - AUROC 95% CI: [0.9523, 0.9965], AUPRC 95% CI: [0.9415, 0.9975]
MIMIC - AUROC 95% CI: [0.9757, 0.9781], AUPRC 95% CI: [0.9726, 0.9755]
NIH - AUROC 95% CI: [0.9957, 0.9966], AUPRC 95% CI: [0.9947, 0.9959]


  df = pd.read_csv(csv_file_path)


PadChest - AUROC 95% CI: [0.5421, 0.5489], AUPRC 95% CI: [0.5865, 0.5953]
VinDR - AUROC 95% CI: [0.9500, 0.9592], AUPRC 95% CI: [0.9425, 0.9555]
VinDr-PCXR - AUROC 95% CI: [0.5933, 0.6165], AUPRC 95% CI: [0.4835, 0.5160]
Shenzhen - AUROC 95% CI: [0.9432, 0.9766], AUPRC 95% CI: [0.8782, 0.9533]
