In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import ast
from glob import glob
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

The performance of three models (CheXpert-trained, MIMIC-trained, and NIH-trained) is analyzed in this notebook, each under its own header.

# CheXpert

In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
# Canonical race/ethnicity labels used for this model, listed alphabetically.
STANDARD_RACES = [
    'Hispanic/Latino',
    'Non-Hispanic Asian',
    'Non-Hispanic Black',
    'Non-Hispanic White',
]

# Raw dataset label -> canonical label, grouped by the canonical label that
# each raw spelling collapses into. 'Other' maps to itself.
RACE_STANDARDIZATION = {
    raw_label: canonical_label
    for canonical_label, raw_labels in (
        ('Non-Hispanic Asian', ('Asian', 'Non-Hispanic Asian')),
        ('Non-Hispanic Black', ('Black', 'Non-Hispanic Black')),
        ('Hispanic/Latino', ('Hispanic/Latino',)),
        ('Other', ('Other',)),
        ('Non-Hispanic White', ('White', 'Non-Hispanic White')),
    )
    for raw_label in raw_labels
}

def evaluate_race_csv(csv_file_path, label_name):
    """Print the prevalence-weighted one-vs-rest AUROC and AUPRC for one prediction CSV.

    The CSV must contain a 'Race/Ethnicity' column (ground-truth label) and a
    'Race/Ethnicity_Probability' column holding a stringified list of per-class
    scores. Labels are standardized through RACE_STANDARDIZATION, and rows whose
    standardized label is 'Other' are excluded before scoring.

    NOTE(review): LabelEncoder numbers classes by sorting the label strings, so
    the scores are presumed to be stored in that same order - confirm against
    the model's output layout.

    Args:
        csv_file_path: Path of the prediction CSV to read.
        label_name: Display name used as the prefix of the printed result line.

    Returns:
        None. Metrics are printed. FileNotFoundError, KeyError and ValueError
        are caught and reported as a printed message instead of propagating.
    """
    try:
        predict_df = pd.read_csv(csv_file_path)

        # Parse the stringified list into a real list
        predict_df['Race/Ethnicity_Probability'] = predict_df['Race/Ethnicity_Probability'].apply(ast.literal_eval)

        # Series.map() yields NaN for any label missing from RACE_STANDARDIZATION,
        # and NaN would slip past the "!= 'Other'" filter below. Reject it here so
        # it is reported through the except clause instead of being scored.
        standardized = predict_df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        unmapped = predict_df.loc[standardized.isna(), 'Race/Ethnicity'].unique().tolist()
        if unmapped:
            raise ValueError(f"Unmapped Race/Ethnicity labels: {unmapped}")
        predict_df['Race/Ethnicity'] = standardized
        predict_df = predict_df[predict_df['Race/Ethnicity'] != 'Other']

        # Encode the textual labels into integers
        label_encoder = LabelEncoder()
        y_true = label_encoder.fit_transform(predict_df['Race/Ethnicity'])

        # Get the list of probabilities
        y_score = predict_df['Race/Ethnicity_Probability'].tolist()

        # Compute weighted AUROC
        wAUROC = metrics.roc_auc_score(
            y_true,
            y_score,
            multi_class='ovr',
            average='weighted'
        )

        # Compute weighted AUPRC
        wAUPRC = metrics.average_precision_score(
            y_true,
            y_score,
            average='weighted'
        )

        print(f'{label_name} - wAUROC: {wAUROC:.4f}, wAUPRC: {wAUPRC:.4f}')

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# Define files and labels: each entry is (prediction CSV path, display name
# used as the prefix of the printed result line).
csv_files_with_labels = [
    ('/content/chexpert_race2_chexpert_test.csv', 'CheXpert'),
    ('/content/chexpert_race2_mimic_test.csv', 'MIMIC')
]

# Run evaluations; each call prints either its metrics or an error message.
for file_path, label in csv_files_with_labels:
    evaluate_race_csv(file_path, label)


CheXpert - wAUROC: 0.8931, wAUPRC: 0.8367
MIMIC - wAUROC: 0.9133, wAUPRC: 0.8824


In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Canonical race/ethnicity labels used for this model, listed alphabetically.
STANDARD_RACES = [
    'Hispanic/Latino',
    'Non-Hispanic Asian',
    'Non-Hispanic Black',
    'Non-Hispanic White',
]

# Raw dataset label -> canonical label, grouped by the canonical label that
# each raw spelling collapses into. 'Other' maps to itself.
RACE_STANDARDIZATION = {
    raw_label: canonical_label
    for canonical_label, raw_labels in (
        ('Non-Hispanic Asian', ('Asian', 'Non-Hispanic Asian')),
        ('Non-Hispanic Black', ('Black', 'Non-Hispanic Black')),
        ('Hispanic/Latino', ('Hispanic/Latino',)),
        ('Other', ('Other',)),
        ('Non-Hispanic White', ('White', 'Non-Hispanic White')),
    )
    for raw_label in raw_labels
}

def evaluate_race_csv(csv_file_path, label_name):
    """Print per-class and prevalence-weighted one-vs-rest AUROC/AUPRC for one CSV.

    The CSV must contain a 'Race/Ethnicity' column (ground-truth label) and a
    'Race/Ethnicity_Probability' column holding a stringified list of per-class
    scores. Labels are standardized through RACE_STANDARDIZATION, and rows whose
    standardized label is 'Other' are excluded. Column i of the score list is
    scored against the i-th class in LabelEncoder order.

    NOTE(review): LabelEncoder numbers classes by sorting the label strings, so
    the scores are presumed to be stored in that same order - confirm against
    the model's output layout.

    Args:
        csv_file_path: Path of the prediction CSV to read.
        label_name: Display name used in the printed headings.

    Returns:
        None. Results are printed. FileNotFoundError, KeyError and ValueError
        are caught and reported as a printed message instead of propagating.
    """
    try:
        predict_df = pd.read_csv(csv_file_path)

        # Parse the stringified list into a real list
        predict_df['Race/Ethnicity_Probability'] = predict_df['Race/Ethnicity_Probability'].apply(ast.literal_eval)

        # Series.map() yields NaN for any label missing from RACE_STANDARDIZATION,
        # and NaN would slip past the "!= 'Other'" filter below. Reject it here so
        # it is reported through the except clause instead of being scored.
        standardized = predict_df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        unmapped = predict_df.loc[standardized.isna(), 'Race/Ethnicity'].unique().tolist()
        if unmapped:
            raise ValueError(f"Unmapped Race/Ethnicity labels: {unmapped}")
        predict_df['Race/Ethnicity'] = standardized
        predict_df = predict_df[predict_df['Race/Ethnicity'] != 'Other']
        if predict_df.empty:
            raise ValueError("No rows left to evaluate after excluding 'Other'")

        # Encode the textual labels into integers
        label_encoder = LabelEncoder()
        y_true = label_encoder.fit_transform(predict_df['Race/Ethnicity'])
        classes = label_encoder.classes_
        n_classes = len(classes)

        # Get the list of probabilities and convert to numpy array
        y_score = np.array(predict_df['Race/Ethnicity_Probability'].tolist())

        # Indexing y_score[:, class_idx] would silently read the wrong columns if
        # the score width and the number of observed classes disagree.
        if y_score.ndim != 2 or y_score.shape[1] != n_classes:
            raise ValueError(
                f"Expected {n_classes} scores per row (one per class in {list(classes)}), "
                f"got an array of shape {y_score.shape}"
            )

        # Calculate class prevalences (weights)
        class_weights = np.bincount(y_true, minlength=n_classes) / len(y_true)

        auroc_scores = []
        auprc_scores = []

        # Compute metrics for each class using one-vs-rest approach
        for class_idx in range(n_classes):
            y_true_binary = (y_true == class_idx).astype(int)
            y_score_class = y_score[:, class_idx]

            # AUROC needs at least one positive and one negative sample
            if len(np.unique(y_true_binary)) >= 2:
                auroc_scores.append(metrics.roc_auc_score(y_true_binary, y_score_class))
            else:
                auroc_scores.append(np.nan)

            auprc_scores.append(metrics.average_precision_score(y_true_binary, y_score_class))

        # Calculate manually weighted averages (ignoring NaN values)
        auroc_array = np.array(auroc_scores)
        valid_auroc = ~np.isnan(auroc_array)
        weighted_auroc = np.sum(auroc_array[valid_auroc] * class_weights[valid_auroc]) / np.sum(class_weights[valid_auroc])

        weighted_auprc = np.sum(np.array(auprc_scores) * class_weights) / np.sum(class_weights)

        # Size the name column from the longest class name so the numeric
        # columns never run into it.
        name_width = max([len('Class')] + [len(str(name)) for name in classes]) + 2

        print(f'\n{label_name} Results:')
        print(f"{'Class':<{name_width}}{'Prevalence':<12}{'AUROC':<10}AUPRC")
        for i, class_name in enumerate(classes):
            print(f'{class_name:<{name_width}}{class_weights[i]:<12.4f}{auroc_scores[i]:<10.4f}{auprc_scores[i]:.4f}')

        print(f'\n{label_name} - Manually Weighted Average - wAUROC: {weighted_auroc:.4f}, wAUPRC: {weighted_auprc:.4f}')

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# Define files and labels: each entry is (prediction CSV path, display name
# used in the printed headings).
csv_files_with_labels = [
    ('/content/chexpert_race2_chexpert_test.csv', 'CheXpert'),
    ('/content/chexpert_race2_mimic_test.csv', 'MIMIC')
]

# Run evaluations; each call prints either its result table or an error message.
for file_path, label in csv_files_with_labels:
    evaluate_race_csv(file_path, label)


CheXpert Results:
Class		Prevalence	AUROC		AUPRC
Hispanic/Latino0.1360		0.8057		0.4892
Non-Hispanic Asian0.1432		0.9290		0.7646
Non-Hispanic Black0.0573		0.9491		0.6827
Non-Hispanic White0.6635		0.8984		0.9368

CheXpert - Manually Weighted Average - wAUROC: 0.8931, wAUPRC: 0.8367

MIMIC Results:
Class		Prevalence	AUROC		AUPRC
Hispanic/Latino0.0549		0.6975		0.1714
Non-Hispanic Asian0.0383		0.9436		0.6549
Non-Hispanic Black0.1764		0.9466		0.8227
Non-Hispanic White0.7303		0.9199		0.9623

MIMIC - Manually Weighted Average - wAUROC: 0.9133, wAUPRC: 0.8824


In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Canonical race/ethnicity labels used for this model, listed alphabetically.
STANDARD_RACES = [
    'Hispanic/Latino',
    'Non-Hispanic Asian',
    'Non-Hispanic Black',
    'Non-Hispanic White',
]

# Raw dataset label -> canonical label, grouped by the canonical label that
# each raw spelling collapses into. 'Other' maps to itself.
RACE_STANDARDIZATION = {
    raw_label: canonical_label
    for canonical_label, raw_labels in (
        ('Non-Hispanic Asian', ('Asian', 'Non-Hispanic Asian')),
        ('Non-Hispanic Black', ('Black', 'Non-Hispanic Black')),
        ('Hispanic/Latino', ('Hispanic/Latino',)),
        ('Other', ('Other',)),
        ('Non-Hispanic White', ('White', 'Non-Hispanic White')),
    )
    for raw_label in raw_labels
}

def bootstrap_race_metrics(csv_file_path, label_name, n_iterations=1000, random_state=2025):
    """Bootstrap 95% confidence intervals for prevalence-weighted AUROC and AUPRC.

    Rows of the prediction CSV are resampled with replacement `n_iterations`
    times. For every resample, one-vs-rest AUROC and AUPRC are computed per
    class and averaged using that resample's class prevalences as weights. The
    2.5th and 97.5th percentiles of those averages are printed and returned.

    Labels are standardized through RACE_STANDARDIZATION, and rows labelled
    'Other' are excluded. Column i of each score list is scored against the
    i-th class in LabelEncoder order.

    NOTE(review): LabelEncoder numbers classes by sorting the label strings, so
    the scores are presumed to be stored in that same order - confirm against
    the model's output layout.

    Args:
        csv_file_path: Path of the prediction CSV to read.
        label_name: Display name used in the printed heading.
        n_iterations: Number of bootstrap resamples.
        random_state: Seed of the generator that derives one seed per resample.

    Returns:
        (auroc_ci, auprc_ci), each a length-2 array [lower, upper]; or
        (None, None) when a FileNotFoundError, KeyError or ValueError was
        caught and reported.
    """
    try:
        df = pd.read_csv(csv_file_path)
        df['Race/Ethnicity_Probability'] = df['Race/Ethnicity_Probability'].apply(ast.literal_eval)

        # Series.map() yields NaN for any label missing from RACE_STANDARDIZATION,
        # and NaN would slip past the "!= 'Other'" filter below. Reject it here so
        # it is reported through the except clause instead of being scored.
        standardized = df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        unmapped = df.loc[standardized.isna(), 'Race/Ethnicity'].unique().tolist()
        if unmapped:
            raise ValueError(f"Unmapped Race/Ethnicity labels: {unmapped}")
        df['Race/Ethnicity'] = standardized
        df = df[df['Race/Ethnicity'] != 'Other'].copy()
        if df.empty:
            raise ValueError("No rows left to evaluate after excluding 'Other'")

        label_encoder = LabelEncoder()
        y_true_all = label_encoder.fit_transform(df['Race/Ethnicity'])
        y_score_all = np.array(df['Race/Ethnicity_Probability'].tolist())
        classes = label_encoder.classes_
        n_classes = len(classes)

        # Indexing y_score[:, class_idx] would silently read the wrong columns if
        # the score width and the number of observed classes disagree.
        if y_score_all.ndim != 2 or y_score_all.shape[1] != n_classes:
            raise ValueError(
                f"Expected {n_classes} scores per row (one per class in {list(classes)}), "
                f"got an array of shape {y_score_all.shape}"
            )

        # Resample row positions and index the prepared arrays with them, so the
        # score matrix is not rebuilt from Python lists on every iteration. The
        # positions frame has one row per sample and is sampled with the same
        # per-iteration seed sequence as before.
        n_rows = len(y_true_all)
        positions = pd.DataFrame({'row': np.arange(n_rows)})
        rng = np.random.default_rng(seed=random_state)

        def weighted_mean(scores, weights):
            # Prevalence-weighted mean over the classes whose score is defined.
            valid = ~np.isnan(scores)
            total = np.sum(weights[valid])
            return np.sum(scores[valid] * weights[valid]) / total if total > 0 else np.nan

        auroc_bootstrap = []
        auprc_bootstrap = []

        for _ in range(n_iterations):
            sample = positions.sample(n=n_rows, replace=True, random_state=rng.integers(1e9))
            picked = sample['row'].to_numpy()
            y_true = y_true_all[picked]
            y_score = y_score_all[picked]

            class_weights = np.bincount(y_true, minlength=n_classes) / n_rows
            auroc_scores = np.full(n_classes, np.nan)
            auprc_scores = np.full(n_classes, np.nan)

            for class_idx in range(n_classes):
                y_true_binary = (y_true == class_idx).astype(int)
                n_positive = int(y_true_binary.sum())
                if n_positive == 0:
                    # Class absent from this resample: its weight is 0 and both
                    # metrics are undefined, so leave them as NaN.
                    continue
                y_score_class = y_score[:, class_idx]
                auprc_scores[class_idx] = metrics.average_precision_score(y_true_binary, y_score_class)
                if n_positive < n_rows:  # AUROC also needs at least one negative
                    auroc_scores[class_idx] = metrics.roc_auc_score(y_true_binary, y_score_class)

            auroc_bootstrap.append(weighted_mean(auroc_scores, class_weights))
            auprc_bootstrap.append(weighted_mean(auprc_scores, class_weights))

        # Compute confidence intervals
        auroc_ci = np.percentile(auroc_bootstrap, [2.5, 97.5])
        auprc_ci = np.percentile(auprc_bootstrap, [2.5, 97.5])

        print(f"\n{label_name} - Bootstrapped Weighted Metrics:")
        print(f"wAUROC 95% CI: [{auroc_ci[0]:.4f}, {auroc_ci[1]:.4f}]")
        print(f"wAUPRC 95% CI: [{auprc_ci[0]:.4f}, {auprc_ci[1]:.4f}]")

        return auroc_ci, auprc_ci

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")
        return None, None


# Run bootstrap evaluation: each entry is (prediction CSV path, display name
# used in the printed heading). The returned confidence intervals are not
# stored; each call prints them.
csv_files_with_labels = [
    ('/content/chexpert_race2_chexpert_test.csv', 'CheXpert'),
    ('/content/chexpert_race2_mimic_test.csv', 'MIMIC')
]

for file_path, label in csv_files_with_labels:
    bootstrap_race_metrics(file_path, label)



CheXpert - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.8895, 0.8965]
wAUPRC 95% CI: [0.8323, 0.8414]

MIMIC - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.9101, 0.9162]
wAUPRC 95% CI: [0.8789, 0.8863]


In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Canonical race/ethnicity labels used for this model, listed alphabetically.
STANDARD_RACES = [
    'Hispanic/Latino',
    'Non-Hispanic Asian',
    'Non-Hispanic Black',
    'Non-Hispanic White',
]

# Raw dataset label -> canonical label, grouped by the canonical label that
# each raw spelling collapses into. 'Other' maps to itself.
RACE_STANDARDIZATION = {
    raw_label: canonical_label
    for canonical_label, raw_labels in (
        ('Non-Hispanic Asian', ('Asian', 'Non-Hispanic Asian')),
        ('Non-Hispanic Black', ('Black', 'Non-Hispanic Black')),
        ('Hispanic/Latino', ('Hispanic/Latino',)),
        ('Other', ('Other',)),
        ('Non-Hispanic White', ('White', 'Non-Hispanic White')),
    )
    for raw_label in raw_labels
}

def bootstrap_race_metrics(csv_file_path, label_name, n_iterations=1000, random_state=2025):
    """Bootstrap 95% CIs for weighted AUROC/AUPRC and for each class's AUROC.

    Rows of the prediction CSV are resampled with replacement `n_iterations`
    times. For every resample, one-vs-rest AUROC and AUPRC are computed per
    class and averaged using that resample's class prevalences as weights. The
    2.5th and 97.5th percentiles of the weighted averages, and of each class's
    AUROC values, are printed.

    Labels are standardized through RACE_STANDARDIZATION, and rows labelled
    'Other' are excluded. Column i of each score list is scored against the
    i-th class in LabelEncoder order.

    NOTE(review): LabelEncoder numbers classes by sorting the label strings, so
    the scores are presumed to be stored in that same order - confirm against
    the model's output layout.

    Args:
        csv_file_path: Path of the prediction CSV to read.
        label_name: Display name used in the printed headings.
        n_iterations: Number of bootstrap resamples.
        random_state: Seed of the generator that derives one seed per resample.

    Returns:
        (auroc_ci, auprc_ci) for the weighted metrics, each a length-2 array
        [lower, upper]; or (None, None) when a FileNotFoundError, KeyError or
        ValueError was caught and reported.
    """
    try:
        df = pd.read_csv(csv_file_path)
        df['Race/Ethnicity_Probability'] = df['Race/Ethnicity_Probability'].apply(ast.literal_eval)

        # Series.map() yields NaN for any label missing from RACE_STANDARDIZATION,
        # and NaN would slip past the "!= 'Other'" filter below. Reject it here so
        # it is reported through the except clause instead of being scored.
        standardized = df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        unmapped = df.loc[standardized.isna(), 'Race/Ethnicity'].unique().tolist()
        if unmapped:
            raise ValueError(f"Unmapped Race/Ethnicity labels: {unmapped}")
        df['Race/Ethnicity'] = standardized
        df = df[df['Race/Ethnicity'] != 'Other'].copy()
        if df.empty:
            raise ValueError("No rows left to evaluate after excluding 'Other'")

        label_encoder = LabelEncoder()
        y_true_all = label_encoder.fit_transform(df['Race/Ethnicity'])
        y_score_all = np.array(df['Race/Ethnicity_Probability'].tolist())
        classes = label_encoder.classes_
        n_classes = len(classes)

        # Indexing y_score[:, class_idx] would silently read the wrong columns if
        # the score width and the number of observed classes disagree.
        if y_score_all.ndim != 2 or y_score_all.shape[1] != n_classes:
            raise ValueError(
                f"Expected {n_classes} scores per row (one per class in {list(classes)}), "
                f"got an array of shape {y_score_all.shape}"
            )

        # Resample row positions and index the prepared arrays with them, so the
        # score matrix is not rebuilt from Python lists on every iteration. The
        # positions frame has one row per sample and is sampled with the same
        # per-iteration seed sequence as before.
        n_rows = len(y_true_all)
        positions = pd.DataFrame({'row': np.arange(n_rows)})
        rng = np.random.default_rng(seed=random_state)

        def weighted_mean(scores, weights):
            # Prevalence-weighted mean over the classes whose score is defined.
            valid = ~np.isnan(scores)
            total = np.sum(weights[valid])
            return np.sum(scores[valid] * weights[valid]) / total if total > 0 else np.nan

        auroc_bootstrap = []
        auprc_bootstrap = []
        per_class_aurocs = [[] for _ in range(n_classes)]

        for _ in range(n_iterations):
            sample = positions.sample(n=n_rows, replace=True, random_state=rng.integers(1e9))
            picked = sample['row'].to_numpy()
            y_true = y_true_all[picked]
            y_score = y_score_all[picked]

            class_weights = np.bincount(y_true, minlength=n_classes) / n_rows
            auroc_scores = np.full(n_classes, np.nan)
            auprc_scores = np.full(n_classes, np.nan)

            for class_idx in range(n_classes):
                y_true_binary = (y_true == class_idx).astype(int)
                n_positive = int(y_true_binary.sum())
                if n_positive > 0:
                    y_score_class = y_score[:, class_idx]
                    auprc_scores[class_idx] = metrics.average_precision_score(y_true_binary, y_score_class)
                    if n_positive < n_rows:  # AUROC also needs at least one negative
                        auroc_scores[class_idx] = metrics.roc_auc_score(y_true_binary, y_score_class)
                # A class absent from this resample has weight 0 and keeps NaN
                # for both metrics.

                # Store for per-class CI (NaN entries are dropped later)
                per_class_aurocs[class_idx].append(auroc_scores[class_idx])

            auroc_bootstrap.append(weighted_mean(auroc_scores, class_weights))
            auprc_bootstrap.append(weighted_mean(auprc_scores, class_weights))

        auroc_ci = np.percentile(auroc_bootstrap, [2.5, 97.5])
        auprc_ci = np.percentile(auprc_bootstrap, [2.5, 97.5])

        print(f"\n{label_name} - Bootstrapped Weighted Metrics:")
        print(f"wAUROC 95% CI: [{auroc_ci[0]:.4f}, {auroc_ci[1]:.4f}]")
        print(f"wAUPRC 95% CI: [{auprc_ci[0]:.4f}, {auprc_ci[1]:.4f}]\n")

        # Per-class AUROC confidence intervals
        print(f"{label_name} - Per-Class AUROC 95% CIs:")
        for i, class_name in enumerate(classes):
            class_aurocs = np.array(per_class_aurocs[i])
            class_aurocs = class_aurocs[~np.isnan(class_aurocs)]
            if len(class_aurocs) == 0:
                print(f"{class_name:<25} Insufficient data for AUROC")
            else:
                ci = np.percentile(class_aurocs, [2.5, 97.5])
                print(f"{class_name:<25} AUROC 95% CI: [{ci[0]:.4f}, {ci[1]:.4f}]")

        return auroc_ci, auprc_ci

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")
        return None, None

# Run bootstrap evaluation: each entry is (prediction CSV path, display name
# used in the printed headings). The returned confidence intervals are not
# stored; each call prints them.
csv_files_with_labels = [
    ('/content/chexpert_race2_chexpert_test.csv', 'CheXpert'),
    ('/content/chexpert_race2_mimic_test.csv', 'MIMIC')
]

for file_path, label in csv_files_with_labels:
    bootstrap_race_metrics(file_path, label)


CheXpert - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.8895, 0.8965]
wAUPRC 95% CI: [0.8323, 0.8414]

CheXpert - Per-Class AUROC 95% CIs:
Hispanic/Latino           AUROC 95% CI: [0.7979, 0.8130]
Non-Hispanic Asian        AUROC 95% CI: [0.9247, 0.9334]
Non-Hispanic Black        AUROC 95% CI: [0.9442, 0.9542]
Non-Hispanic White        AUROC 95% CI: [0.8944, 0.9020]

MIMIC - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.9101, 0.9162]
wAUPRC 95% CI: [0.8789, 0.8863]

MIMIC - Per-Class AUROC 95% CIs:
Hispanic/Latino           AUROC 95% CI: [0.6855, 0.7102]
Non-Hispanic Asian        AUROC 95% CI: [0.9375, 0.9496]
Non-Hispanic Black        AUROC 95% CI: [0.9438, 0.9493]
Non-Hispanic White        AUROC 95% CI: [0.9163, 0.9231]


# MIMIC

In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder


def evaluate_race_csv(csv_file_path, label_name):
    """Print the weighted one-vs-rest AUROC and weighted AUPRC for one prediction CSV.

    Reads the 'Race/Ethnicity' (ground truth) and 'Race/Ethnicity_Probability'
    (stringified score list) columns; labels are used exactly as stored.
    FileNotFoundError, KeyError and ValueError are caught and reported as a
    printed message; nothing is returned.
    """
    try:
        frame = pd.read_csv(csv_file_path)

        # Turn each stringified score list back into a Python list.
        frame['Race/Ethnicity_Probability'] = frame['Race/Ethnicity_Probability'].apply(ast.literal_eval)

        # Integer-encode the text labels and collect the per-row score lists.
        encoded_truth = LabelEncoder().fit_transform(frame['Race/Ethnicity'])
        row_scores = frame['Race/Ethnicity_Probability'].tolist()

        # Both metrics share the prevalence-weighted averaging mode.
        averaging = {'average': 'weighted'}
        wAUROC = metrics.roc_auc_score(encoded_truth, row_scores, multi_class='ovr', **averaging)
        wAUPRC = metrics.average_precision_score(encoded_truth, row_scores, **averaging)

        print(f'{label_name} - wAUROC: {wAUROC:.4f}, wAUPRC: {wAUPRC:.4f}')

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# Define files and labels: each entry is (prediction CSV path, display name
# used as the prefix of the printed result line).
csv_files_with_labels = [
    ('/content/mimic_race_mimic_test.csv', 'MIMIC')
]

# Run evaluations; each call prints either its metrics or an error message.
for file_path, label in csv_files_with_labels:
    evaluate_race_csv(file_path, label)


MIMIC - wAUROC: 0.9046, wAUPRC: 0.8439


In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Canonical race/ethnicity labels used for this model, listed alphabetically.
STANDARD_RACES = [
    'Asian',
    'Black',
    'Hispanic/Latino',
    'Other',
    'White',
]

# Raw dataset label -> canonical label, grouped by the canonical label that
# each raw spelling ("X" or "Non-Hispanic X") collapses into.
RACE_STANDARDIZATION = {
    raw_label: canonical_label
    for canonical_label, raw_labels in (
        ('Asian', ('Asian', 'Non-Hispanic Asian')),
        ('Black', ('Black', 'Non-Hispanic Black')),
        ('Hispanic/Latino', ('Hispanic/Latino',)),
        ('Other', ('Other',)),
        ('White', ('White', 'Non-Hispanic White')),
    )
    for raw_label in raw_labels
}

def evaluate_race_csv(csv_file_path, label_name, score_classes=None):
    """Print per-class and prevalence-weighted one-vs-rest AUROC/AUPRC for one CSV.

    The CSV must contain a 'Race/Ethnicity' column (ground-truth label) and a
    'Race/Ethnicity_Probability' column holding a stringified list of per-class
    scores. Labels are standardized through RACE_STANDARDIZATION.

    Each observed class is scored against the score column found by looking its
    name up in `score_classes`. A test set that lacks one of the model's classes
    is therefore still matched to the right columns. This replaces a branch that
    redirected class index 3 to column 4 only when `label_name` was 'CheXpert';
    with the default `score_classes` both configured files are scored exactly
    as before.

    Args:
        csv_file_path: Path of the prediction CSV to read.
        label_name: Display name used in the printed headings.
        score_classes: Class name for each position of the score list.
            Defaults to STANDARD_RACES.
            NOTE(review): presumes the model emits its scores in that order -
            confirm against the model's output layout.

    Returns:
        None. Results are printed. FileNotFoundError, KeyError and ValueError
        are caught and reported as a printed message instead of propagating.
    """
    try:
        predict_df = pd.read_csv(csv_file_path)

        # Parse probabilities and standardize race labels
        predict_df['Race/Ethnicity_Probability'] = predict_df['Race/Ethnicity_Probability'].apply(ast.literal_eval)

        # Series.map() yields NaN for any label missing from RACE_STANDARDIZATION;
        # reject it here so it is reported instead of being scored as a class.
        standardized = predict_df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        unmapped = predict_df.loc[standardized.isna(), 'Race/Ethnicity'].unique().tolist()
        if unmapped:
            raise ValueError(f"Unmapped Race/Ethnicity labels: {unmapped}")
        predict_df['Race/Ethnicity'] = standardized
        if predict_df.empty:
            raise ValueError("No rows to evaluate")

        # Encode the textual labels into integers
        label_encoder = LabelEncoder()
        y_true = label_encoder.fit_transform(predict_df['Race/Ethnicity'])
        classes = label_encoder.classes_
        n_classes = len(classes)

        # Get the list of probabilities and convert to numpy array
        y_score = np.array(predict_df['Race/Ethnicity_Probability'].tolist())

        # Resolve which score column belongs to each observed class.
        score_classes = list(STANDARD_RACES if score_classes is None else score_classes)
        if y_score.ndim != 2 or y_score.shape[1] != len(score_classes):
            raise ValueError(
                f"Expected {len(score_classes)} scores per row (one per class in {score_classes}), "
                f"got an array of shape {y_score.shape}"
            )
        unknown = [name for name in classes if name not in score_classes]
        if unknown:
            raise ValueError(f"No score column for classes {unknown}; known classes: {score_classes}")
        score_columns = [score_classes.index(name) for name in classes]

        # Calculate class prevalences (weights)
        class_weights = np.bincount(y_true, minlength=n_classes) / len(y_true)

        auroc_scores = []
        auprc_scores = []

        # Compute metrics for each class using one-vs-rest approach
        for class_idx, score_col in enumerate(score_columns):
            y_true_binary = (y_true == class_idx).astype(int)
            y_score_class = y_score[:, score_col]

            # AUROC needs at least one positive and one negative sample
            if len(np.unique(y_true_binary)) >= 2:
                auroc_scores.append(metrics.roc_auc_score(y_true_binary, y_score_class))
            else:
                auroc_scores.append(np.nan)

            auprc_scores.append(metrics.average_precision_score(y_true_binary, y_score_class))

        # Calculate manually weighted averages (ignoring NaN values)
        auroc_array = np.array(auroc_scores)
        valid_auroc = ~np.isnan(auroc_array)
        weighted_auroc = np.sum(auroc_array[valid_auroc] * class_weights[valid_auroc]) / np.sum(class_weights[valid_auroc])

        weighted_auprc = np.sum(np.array(auprc_scores) * class_weights) / np.sum(class_weights)

        # Size the name column from the longest class name so the numeric
        # columns never run into it.
        name_width = max([len('Class')] + [len(str(name)) for name in classes]) + 2

        print(f'\n{label_name} Results:')
        print(f"{'Class':<{name_width}}{'Prevalence':<12}{'AUROC':<10}AUPRC")
        for i, class_name in enumerate(classes):
            print(f'{class_name:<{name_width}}{class_weights[i]:<12.4f}{auroc_scores[i]:<10.4f}{auprc_scores[i]:.4f}')

        print(f'\n{label_name} - Manually Weighted Average - wAUROC: {weighted_auroc:.4f}, wAUPRC: {weighted_auprc:.4f}')

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# Define files and labels: each entry is (prediction CSV path, display name
# used in the printed headings).
csv_files_with_labels = [
    ('/content/mimic_race_chexpert_race_v2_test.csv', 'CheXpert'),
    ('/content/mimic_race_mimic_test.csv', 'MIMIC')
]

# Run evaluations; each call prints either its result table or an error message.
for file_path, label in csv_files_with_labels:
    evaluate_race_csv(file_path, label)


CheXpert Results:
Class		Prevalence	AUROC		AUPRC
Asian          0.1432		0.8565		0.5727
Black          0.0573		0.9222		0.5933
Hispanic/Latino0.1360		0.7098		0.2902
White          0.6635		0.8349		0.8933

CheXpert - Manually Weighted Average - wAUROC: 0.8259, wAUPRC: 0.7481

MIMIC Results:
Class		Prevalence	AUROC		AUPRC
Asian          0.0372		0.9246		0.5131
Black          0.1711		0.9487		0.8121
Hispanic/Latino0.0533		0.7892		0.1988
Other          0.0301		0.6410		0.0517
White          0.7083		0.9128		0.9511

MIMIC - Manually Weighted Average - wAUROC: 0.9046, wAUPRC: 0.8439


In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Canonical race/ethnicity labels used for this model, listed alphabetically.
STANDARD_RACES = [
    'Asian',
    'Black',
    'Hispanic/Latino',
    'Other',
    'White',
]

# Raw dataset label -> canonical label, grouped by the canonical label that
# each raw spelling ("X" or "Non-Hispanic X") collapses into.
RACE_STANDARDIZATION = {
    raw_label: canonical_label
    for canonical_label, raw_labels in (
        ('Asian', ('Asian', 'Non-Hispanic Asian')),
        ('Black', ('Black', 'Non-Hispanic Black')),
        ('Hispanic/Latino', ('Hispanic/Latino',)),
        ('Other', ('Other',)),
        ('White', ('White', 'Non-Hispanic White')),
    )
    for raw_label in raw_labels
}

def bootstrap_race_metrics(csv_file_path, label_name, n_iterations=1000, random_state=2025, score_classes=None):
    """Bootstrap 95% confidence intervals for prevalence-weighted AUROC and AUPRC.

    Rows of the prediction CSV are resampled with replacement `n_iterations`
    times. For every resample, one-vs-rest AUROC and AUPRC are computed per
    class and averaged using that resample's class prevalences as weights. The
    2.5th and 97.5th percentiles of those averages are printed and returned.

    Labels are standardized through RACE_STANDARDIZATION. Each observed class is
    scored against the score column found by looking its name up in
    `score_classes`. This replaces a branch that redirected class index 3 to
    column 4 only when `label_name` was 'CheXpert' (where index 3 is the last
    observed class, not 'Other'); with the default `score_classes` both
    configured files are scored exactly as before.

    Args:
        csv_file_path: Path of the prediction CSV to read.
        label_name: Display name used in the printed heading.
        n_iterations: Number of bootstrap resamples.
        random_state: Seed of the generator that derives one seed per resample.
        score_classes: Class name for each position of the score list.
            Defaults to STANDARD_RACES.
            NOTE(review): presumes the model emits its scores in that order -
            confirm against the model's output layout.

    Returns:
        (auroc_ci, auprc_ci), each a length-2 array [lower, upper]; or
        (None, None) when a FileNotFoundError, KeyError or ValueError was
        caught and reported.
    """
    try:
        df = pd.read_csv(csv_file_path)
        df['Race/Ethnicity_Probability'] = df['Race/Ethnicity_Probability'].apply(ast.literal_eval)

        # Series.map() yields NaN for any label missing from RACE_STANDARDIZATION;
        # reject it here so it is reported instead of being scored as a class.
        standardized = df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        unmapped = df.loc[standardized.isna(), 'Race/Ethnicity'].unique().tolist()
        if unmapped:
            raise ValueError(f"Unmapped Race/Ethnicity labels: {unmapped}")
        df['Race/Ethnicity'] = standardized
        if df.empty:
            raise ValueError("No rows to evaluate")

        label_encoder = LabelEncoder()
        y_true_all = label_encoder.fit_transform(df['Race/Ethnicity'])
        y_score_all = np.array(df['Race/Ethnicity_Probability'].tolist())
        classes = label_encoder.classes_
        n_classes = len(classes)

        # Resolve which score column belongs to each observed class.
        score_classes = list(STANDARD_RACES if score_classes is None else score_classes)
        if y_score_all.ndim != 2 or y_score_all.shape[1] != len(score_classes):
            raise ValueError(
                f"Expected {len(score_classes)} scores per row (one per class in {score_classes}), "
                f"got an array of shape {y_score_all.shape}"
            )
        unknown = [name for name in classes if name not in score_classes]
        if unknown:
            raise ValueError(f"No score column for classes {unknown}; known classes: {score_classes}")
        score_columns = [score_classes.index(name) for name in classes]

        # Resample row positions and index the prepared arrays with them, so the
        # score matrix is not rebuilt from Python lists on every iteration. The
        # positions frame has one row per sample and is sampled with the same
        # per-iteration seed sequence as before.
        n_rows = len(y_true_all)
        positions = pd.DataFrame({'row': np.arange(n_rows)})
        rng = np.random.default_rng(seed=random_state)

        def weighted_mean(scores, weights):
            # Prevalence-weighted mean over the classes whose score is defined.
            valid = ~np.isnan(scores)
            total = np.sum(weights[valid])
            return np.sum(scores[valid] * weights[valid]) / total if total > 0 else np.nan

        auroc_bootstrap = []
        auprc_bootstrap = []

        for _ in range(n_iterations):
            sample = positions.sample(n=n_rows, replace=True, random_state=rng.integers(1e9))
            picked = sample['row'].to_numpy()
            y_true = y_true_all[picked]
            y_score = y_score_all[picked]

            class_weights = np.bincount(y_true, minlength=n_classes) / n_rows
            auroc_scores = np.full(n_classes, np.nan)
            auprc_scores = np.full(n_classes, np.nan)

            for class_idx, score_col in enumerate(score_columns):
                y_true_binary = (y_true == class_idx).astype(int)
                n_positive = int(y_true_binary.sum())
                if n_positive == 0:
                    # Class absent from this resample: its weight is 0 and both
                    # metrics are undefined, so leave them as NaN.
                    continue
                y_score_class = y_score[:, score_col]
                auprc_scores[class_idx] = metrics.average_precision_score(y_true_binary, y_score_class)
                if n_positive < n_rows:  # AUROC also needs at least one negative
                    auroc_scores[class_idx] = metrics.roc_auc_score(y_true_binary, y_score_class)

            auroc_bootstrap.append(weighted_mean(auroc_scores, class_weights))
            auprc_bootstrap.append(weighted_mean(auprc_scores, class_weights))

        auroc_ci = np.percentile(auroc_bootstrap, [2.5, 97.5])
        auprc_ci = np.percentile(auprc_bootstrap, [2.5, 97.5])

        print(f"\n{label_name} - Bootstrapped Weighted Metrics:")
        print(f"wAUROC 95% CI: [{auroc_ci[0]:.4f}, {auroc_ci[1]:.4f}]")
        print(f"wAUPRC 95% CI: [{auprc_ci[0]:.4f}, {auprc_ci[1]:.4f}]")

        return auroc_ci, auprc_ci

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")
        return None, None


# Run bootstrap evaluation: each entry is (prediction CSV path, display name
# used in the printed heading). The returned confidence intervals are not
# stored; each call prints them.
csv_files_with_labels = [
    ('/content/mimic_race_chexpert_race_v2_test.csv', 'CheXpert'),
    ('/content/mimic_race_mimic_test.csv', 'MIMIC')
]

for path, label in csv_files_with_labels:
    bootstrap_race_metrics(path, label)



CheXpert - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.8210, 0.8304]
wAUPRC 95% CI: [0.7425, 0.7540]

MIMIC - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.9015, 0.9078]
wAUPRC 95% CI: [0.8401, 0.8480]


In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Canonical race/ethnicity labels used for this model, listed alphabetically.
STANDARD_RACES = [
    'Asian',
    'Black',
    'Hispanic/Latino',
    'Other',
    'White',
]

# Raw dataset label -> canonical label, grouped by the canonical label that
# each raw spelling ("X" or "Non-Hispanic X") collapses into.
RACE_STANDARDIZATION = {
    raw_label: canonical_label
    for canonical_label, raw_labels in (
        ('Asian', ('Asian', 'Non-Hispanic Asian')),
        ('Black', ('Black', 'Non-Hispanic Black')),
        ('Hispanic/Latino', ('Hispanic/Latino',)),
        ('Other', ('Other',)),
        ('White', ('White', 'Non-Hispanic White')),
    )
    for raw_label in raw_labels
}

def bootstrap_race_metrics(csv_file_path, label_name, n_iterations=1000, random_state=2025, score_classes=None):
    """Bootstrap 95% CIs for weighted AUROC/AUPRC and for each class's AUROC.

    Rows of the prediction CSV are resampled with replacement `n_iterations`
    times. For every resample, one-vs-rest AUROC and AUPRC are computed per
    class and averaged using that resample's class prevalences as weights. The
    2.5th and 97.5th percentiles of the weighted averages, and of each class's
    AUROC values, are printed.

    Labels are standardized through RACE_STANDARDIZATION. Each observed class is
    scored against the score column found by looking its name up in
    `score_classes`. This replaces a branch that redirected class index 3 to
    column 4 only when `label_name` was 'CheXpert' (where index 3 is the last
    observed class, not 'Other'); with the default `score_classes` both
    configured files are scored exactly as before.

    Args:
        csv_file_path: Path of the prediction CSV to read.
        label_name: Display name used in the printed headings.
        n_iterations: Number of bootstrap resamples.
        random_state: Seed of the generator that derives one seed per resample.
        score_classes: Class name for each position of the score list.
            Defaults to STANDARD_RACES.
            NOTE(review): presumes the model emits its scores in that order -
            confirm against the model's output layout.

    Returns:
        (auroc_ci, auprc_ci) for the weighted metrics, each a length-2 array
        [lower, upper]; or (None, None) when a FileNotFoundError, KeyError or
        ValueError was caught and reported.
    """
    try:
        df = pd.read_csv(csv_file_path)
        df['Race/Ethnicity_Probability'] = df['Race/Ethnicity_Probability'].apply(ast.literal_eval)

        # Series.map() yields NaN for any label missing from RACE_STANDARDIZATION;
        # reject it here so it is reported instead of being scored as a class.
        standardized = df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        unmapped = df.loc[standardized.isna(), 'Race/Ethnicity'].unique().tolist()
        if unmapped:
            raise ValueError(f"Unmapped Race/Ethnicity labels: {unmapped}")
        df['Race/Ethnicity'] = standardized
        if df.empty:
            raise ValueError("No rows to evaluate")

        label_encoder = LabelEncoder()
        y_true_all = label_encoder.fit_transform(df['Race/Ethnicity'])
        y_score_all = np.array(df['Race/Ethnicity_Probability'].tolist())
        classes = label_encoder.classes_
        n_classes = len(classes)

        # Resolve which score column belongs to each observed class.
        score_classes = list(STANDARD_RACES if score_classes is None else score_classes)
        if y_score_all.ndim != 2 or y_score_all.shape[1] != len(score_classes):
            raise ValueError(
                f"Expected {len(score_classes)} scores per row (one per class in {score_classes}), "
                f"got an array of shape {y_score_all.shape}"
            )
        unknown = [name for name in classes if name not in score_classes]
        if unknown:
            raise ValueError(f"No score column for classes {unknown}; known classes: {score_classes}")
        score_columns = [score_classes.index(name) for name in classes]

        # Resample row positions and index the prepared arrays with them, so the
        # score matrix is not rebuilt from Python lists on every iteration. The
        # positions frame has one row per sample and is sampled with the same
        # per-iteration seed sequence as before.
        n_rows = len(y_true_all)
        positions = pd.DataFrame({'row': np.arange(n_rows)})
        rng = np.random.default_rng(seed=random_state)

        def weighted_mean(scores, weights):
            # Prevalence-weighted mean over the classes whose score is defined.
            valid = ~np.isnan(scores)
            total = np.sum(weights[valid])
            return np.sum(scores[valid] * weights[valid]) / total if total > 0 else np.nan

        # Store AUROC and AUPRC for weighted and each class
        auroc_bootstrap = []
        auprc_bootstrap = []
        per_class_aurocs = [[] for _ in range(n_classes)]

        for _ in range(n_iterations):
            sample = positions.sample(n=n_rows, replace=True, random_state=rng.integers(1e9))
            picked = sample['row'].to_numpy()
            y_true = y_true_all[picked]
            y_score = y_score_all[picked]

            class_weights = np.bincount(y_true, minlength=n_classes) / n_rows
            auroc_scores = np.full(n_classes, np.nan)
            auprc_scores = np.full(n_classes, np.nan)

            for class_idx, score_col in enumerate(score_columns):
                y_true_binary = (y_true == class_idx).astype(int)
                n_positive = int(y_true_binary.sum())
                if n_positive > 0:
                    y_score_class = y_score[:, score_col]
                    auprc_scores[class_idx] = metrics.average_precision_score(y_true_binary, y_score_class)
                    if n_positive < n_rows:  # AUROC also needs at least one negative
                        auroc_scores[class_idx] = metrics.roc_auc_score(y_true_binary, y_score_class)
                # A class absent from this resample has weight 0 and keeps NaN
                # for both metrics.

                # Collect for CI computation (NaN entries are dropped later)
                per_class_aurocs[class_idx].append(auroc_scores[class_idx])

            auroc_bootstrap.append(weighted_mean(auroc_scores, class_weights))
            auprc_bootstrap.append(weighted_mean(auprc_scores, class_weights))

        # Compute weighted CIs
        auroc_ci = np.percentile(auroc_bootstrap, [2.5, 97.5])
        auprc_ci = np.percentile(auprc_bootstrap, [2.5, 97.5])

        print(f"\n{label_name} - Bootstrapped Weighted Metrics:")
        print(f"wAUROC 95% CI: [{auroc_ci[0]:.4f}, {auroc_ci[1]:.4f}]")
        print(f"wAUPRC 95% CI: [{auprc_ci[0]:.4f}, {auprc_ci[1]:.4f}]")

        # Compute and print per-class AUROC CIs
        print(f"\n{label_name} - Per-Class AUROC 95% Confidence Intervals:")
        for i, class_name in enumerate(classes):
            class_aurocs = [x for x in per_class_aurocs[i] if not np.isnan(x)]
            if len(class_aurocs) > 0:
                ci_low, ci_high = np.percentile(class_aurocs, [2.5, 97.5])
                print(f"{class_name:<20}: [{ci_low:.4f}, {ci_high:.4f}]")
            else:
                print(f"{class_name:<20}: AUROC CI not computable (insufficient variation)")

        return auroc_ci, auprc_ci

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")
        return None, None


# Run bootstrap evaluation: each entry is (prediction CSV path, display name
# used in the printed headings). The returned confidence intervals are not
# stored; each call prints them.
csv_files_with_labels = [
    ('/content/mimic_race_chexpert_race_v2_test.csv', 'CheXpert'),
    ('/content/mimic_race_mimic_test.csv', 'MIMIC')
]

for path, label in csv_files_with_labels:
    bootstrap_race_metrics(path, label)



CheXpert - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.8210, 0.8304]
wAUPRC 95% CI: [0.7425, 0.7540]

CheXpert - Per-Class AUROC 95% Confidence Intervals:
Asian               : [0.8506, 0.8631]
Black               : [0.9152, 0.9290]
Hispanic/Latino     : [0.7014, 0.7179]
White               : [0.8296, 0.8396]

MIMIC - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.9015, 0.9078]
wAUPRC 95% CI: [0.8401, 0.8480]

MIMIC - Per-Class AUROC 95% Confidence Intervals:
Asian               : [0.9187, 0.9311]
Black               : [0.9462, 0.9512]
Hispanic/Latino     : [0.7799, 0.7978]
Other               : [0.6250, 0.6558]
White               : [0.9095, 0.9160]


# We also trained a classifier on CheXpert in which only non-Hispanic Asian, non-Hispanic Black, and non-Hispanic White individuals were included. These are the results.

# CheXpert

In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
# Canonical race/ethnicity labels, listed alphabetically.
STANDARD_RACES = [
    'Hispanic/Latino',
    'Non-Hispanic Asian',
    'Non-Hispanic Black',
    'Non-Hispanic White',
]

# Raw dataset label -> canonical label, grouped by the canonical label that
# each raw spelling collapses into. 'Other' maps to itself.
RACE_STANDARDIZATION = {
    raw_label: canonical_label
    for canonical_label, raw_labels in (
        ('Non-Hispanic Asian', ('Asian', 'Non-Hispanic Asian')),
        ('Non-Hispanic Black', ('Black', 'Non-Hispanic Black')),
        ('Hispanic/Latino', ('Hispanic/Latino',)),
        ('Other', ('Other',)),
        ('Non-Hispanic White', ('White', 'Non-Hispanic White')),
    )
    for raw_label in raw_labels
}

def evaluate_race_csv(csv_file_path, label_name):
    """Print the weighted AUROC and AUPRC for one prediction CSV.

    Labels are standardized through RACE_STANDARDIZATION, rows labelled
    'Other' or 'Hispanic/Latino' are discarded, and the remaining labels are
    integer-encoded with LabelEncoder. The per-row probability lists are then
    scored with scikit-learn's weighted one-vs-rest AUROC and weighted
    average precision.

    Args:
        csv_file_path: CSV containing a 'Race/Ethnicity' column and a
            'Race/Ethnicity_Probability' column holding a stringified list.
        label_name: Tag printed in front of the two metrics.

    Returns:
        None. The metrics are printed; FileNotFoundError, KeyError and
        ValueError are caught and reported with a printed message.
    """
    dropped_groups = ['Other', 'Hispanic/Latino']
    try:
        frame = pd.read_csv(csv_file_path)

        # Turn each "[p0, p1, ...]" string into a real list of numbers
        frame['Race/Ethnicity_Probability'] = frame['Race/Ethnicity_Probability'].map(ast.literal_eval)
        frame['Race/Ethnicity'] = frame['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        kept = frame.loc[~frame['Race/Ethnicity'].isin(dropped_groups)]

        # Integer targets (sorted label order) and the matching score rows
        encoded_labels = LabelEncoder().fit_transform(kept['Race/Ethnicity'])
        probabilities = kept['Race/Ethnicity_Probability'].tolist()

        wAUROC = metrics.roc_auc_score(encoded_labels, probabilities, multi_class='ovr', average='weighted')
        wAUPRC = metrics.average_precision_score(encoded_labels, probabilities, average='weighted')

        print(f'{label_name} - wAUROC: {wAUROC:.4f}, wAUPRC: {wAUPRC:.4f}')

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# Define files and labels
# Each entry pairs a prediction CSV with the label printed in front of its metrics.
# NOTE(review): absolute '/content/...' paths — confirm they exist where the notebook is re-run.
csv_files_with_labels = [
    ('/content/chexpert_race1_chexpert_test.csv', 'CheXpert'),
    ('/content/chexpert_race1_mimic_test.csv', 'MIMIC')
]

# Run evaluations
# evaluate_race_csv prints its result; nothing is collected here.
for file_path, label in csv_files_with_labels:
    evaluate_race_csv(file_path, label)


CheXpert - wAUROC: 0.9362, wAUPRC: 0.9312
MIMIC - wAUROC: 0.9490, wAUPRC: 0.9454


In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Standardized race/ethnicity categories
# NOTE(review): this list is not referenced by the evaluation code in this cell, and it
# still contains 'Hispanic/Latino' although evaluate_race_csv below drops that group —
# confirm whether it should be the three-class list.
STANDARD_RACES = ['Hispanic/Latino', 'Non-Hispanic Asian', 'Non-Hispanic Black',  'Non-Hispanic White']

# Mapping from original labels to standardized labels
# Both the short and the 'Non-Hispanic ...' spellings collapse onto one name per group;
# 'Other' and 'Hispanic/Latino' keep their own labels so they can be filtered out by name.
RACE_STANDARDIZATION = {
    'Asian': 'Non-Hispanic Asian',
    'Non-Hispanic Asian': 'Non-Hispanic Asian',
    'Black': 'Non-Hispanic Black',
    'Non-Hispanic Black': 'Non-Hispanic Black',
    'Hispanic/Latino': 'Hispanic/Latino',
    'Other': 'Other',
    'White': 'Non-Hispanic White',
    'Non-Hispanic White': 'Non-Hispanic White'
}

def evaluate_race_csv(csv_file_path, label_name):
    """Print per-class and prevalence-weighted one-vs-rest AUROC/AUPRC for one CSV.

    Labels are standardized through RACE_STANDARDIZATION and rows labelled
    'Other' or 'Hispanic/Latino' are dropped. The remaining labels are encoded
    with LabelEncoder, and column ``i`` of the probability matrix is scored
    against the ``i``-th encoded class. Per-class scores are averaged with the
    class prevalences as weights; classes whose AUROC is undefined (NaN) are
    left out of the weighted AUROC.

    Args:
        csv_file_path: CSV containing a 'Race/Ethnicity' column and a
            'Race/Ethnicity_Probability' column holding a stringified list.
        label_name: Tag printed in the report headings.

    Returns:
        None. The report is printed; FileNotFoundError, KeyError and
        ValueError are caught and reported with a printed message.
    """
    try:
        predict_df = pd.read_csv(csv_file_path)

        # Parse the stringified list into a real list
        predict_df['Race/Ethnicity_Probability'] = predict_df['Race/Ethnicity_Probability'].apply(ast.literal_eval)
        predict_df['Race/Ethnicity'] = predict_df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        predict_df = predict_df[~predict_df['Race/Ethnicity'].isin(['Other', 'Hispanic/Latino'])]

        # Encode the textual labels into integers
        label_encoder = LabelEncoder()
        y_true = label_encoder.fit_transform(predict_df['Race/Ethnicity'])
        classes = label_encoder.classes_
        n_classes = len(classes)

        # One row of scores per sample
        y_score = np.array(predict_df['Race/Ethnicity_Probability'].tolist())

        # Class prevalences, used as averaging weights
        class_weights = np.bincount(y_true, minlength=n_classes) / len(y_true)

        auroc_scores = []
        auprc_scores = []
        for class_idx in range(n_classes):
            y_true_binary = (y_true == class_idx).astype(int)
            y_score_class = y_score[:, class_idx]

            # AUROC needs at least one positive and one negative sample
            if len(np.unique(y_true_binary)) >= 2:
                auroc_scores.append(metrics.roc_auc_score(y_true_binary, y_score_class))
            else:
                auroc_scores.append(np.nan)

            auprc_scores.append(metrics.average_precision_score(y_true_binary, y_score_class))

        # Prevalence-weighted averages (classes with undefined AUROC are skipped)
        valid_auroc = ~np.isnan(auroc_scores)
        weighted_auroc = np.sum(np.array(auroc_scores)[valid_auroc] * class_weights[valid_auroc]) / np.sum(class_weights[valid_auroc])
        weighted_auprc = np.sum(np.array(auprc_scores) * class_weights) / np.sum(class_weights)

        # Size the first column from the longest class name: a fixed width of 15
        # is shorter than e.g. 'Non-Hispanic Asian' and glued the name to the
        # prevalence value in the printed table.
        name_width = max([len('Class')] + [len(str(name)) for name in classes]) + 2

        print(f'\n{label_name} Results:')
        print(f"{'Class':<{name_width}}{'Prevalence':<12}{'AUROC':<10}{'AUPRC'}")
        for i, class_name in enumerate(classes):
            print(f'{str(class_name):<{name_width}}{class_weights[i]:<12.4f}{auroc_scores[i]:<10.4f}{auprc_scores[i]:.4f}')

        print(f'\n{label_name} - Manually Weighted Average - wAUROC: {weighted_auroc:.4f}, wAUPRC: {weighted_auprc:.4f}')

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# Define files and labels
# Each entry pairs a prediction CSV with the label printed in front of its report.
# NOTE(review): absolute '/content/...' paths — confirm they exist where the notebook is re-run.
csv_files_with_labels = [
    ('/content/chexpert_race1_chexpert_test.csv', 'CheXpert'),
    ('/content/chexpert_race1_mimic_test.csv', 'MIMIC')
]

# Run evaluations
# evaluate_race_csv prints its report; nothing is collected here.
for file_path, label in csv_files_with_labels:
    evaluate_race_csv(file_path, label)


CheXpert Results:
Class		Prevalence	AUROC		AUPRC
Non-Hispanic Asian0.1702		0.9435		0.8318
Non-Hispanic Black0.0722		0.9293		0.7120
Non-Hispanic White0.7576		0.9353		0.9744

CheXpert - Manually Weighted Average - wAUROC: 0.9362, wAUPRC: 0.9312

MIMIC Results:
Class		Prevalence	AUROC		AUPRC
Non-Hispanic Asian0.0406		0.9325		0.6014
Non-Hispanic Black0.1867		0.9533		0.8680
Non-Hispanic White0.7727		0.9489		0.9822

MIMIC - Manually Weighted Average - wAUROC: 0.9490, wAUPRC: 0.9454


In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Standardized race/ethnicity categories
# (the three groups kept for this evaluation; not referenced by the code in this cell)
STANDARD_RACES = ['Non-Hispanic Asian', 'Non-Hispanic Black',  'Non-Hispanic White']

# Mapping from original labels to standardized labels
# 'Hispanic/Latino' is folded into 'Other' here, so the single != 'Other' filter in
# bootstrap_race_metrics removes both groups.
RACE_STANDARDIZATION = {
    'Asian': 'Non-Hispanic Asian',
    'Non-Hispanic Asian': 'Non-Hispanic Asian',
    'Black': 'Non-Hispanic Black',
    'Non-Hispanic Black': 'Non-Hispanic Black',
    'Hispanic/Latino': 'Other',
    'Other': 'Other',
    'White': 'Non-Hispanic White',
    'Non-Hispanic White': 'Non-Hispanic White'
}

def bootstrap_race_metrics(csv_file_path, label_name, n_iterations=1000, random_state=2025):
    """Bootstrap 95% percentile CIs for prevalence-weighted one-vs-rest AUROC/AUPRC.

    Labels are standardized through RACE_STANDARDIZATION and rows labelled
    'Other' are dropped. Each iteration resamples the rows with replacement,
    scores every encoded class ``i`` against column ``i`` of the probability
    matrix, and averages the per-class scores with the resampled class
    prevalences as weights (classes with an undefined AUROC are skipped).

    Args:
        csv_file_path: CSV containing a 'Race/Ethnicity' column and a
            'Race/Ethnicity_Probability' column holding a stringified list.
        label_name: Tag printed in the report heading.
        n_iterations: Number of bootstrap resamples.
        random_state: Seed for the generator that produces one seed per resample.

    Returns:
        ``(auroc_ci, auprc_ci)``, each a length-2 array with the 2.5th and
        97.5th percentiles, or ``(None, None)`` if a FileNotFoundError,
        KeyError or ValueError was caught (a message is printed).
    """
    try:
        df = pd.read_csv(csv_file_path)
        df['Race/Ethnicity_Probability'] = df['Race/Ethnicity_Probability'].apply(ast.literal_eval)
        df['Race/Ethnicity'] = df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        df = df[df['Race/Ethnicity'] != 'Other'].copy()

        label_encoder = LabelEncoder()
        y_true_all = label_encoder.fit_transform(df['Race/Ethnicity'])
        y_score_all = np.array(df['Race/Ethnicity_Probability'].tolist())
        n_classes = len(label_encoder.classes_)

        # Resample row positions only and index the arrays with them, instead of
        # sampling a DataFrame of Python lists and rebuilding the score matrix on
        # every iteration. The sampling call and per-iteration seeds are unchanged,
        # so the drawn rows are expected to match the DataFrame-based version.
        row_positions = pd.Series(np.arange(len(y_true_all)))
        rng = np.random.default_rng(seed=random_state)

        auroc_bootstrap = []
        auprc_bootstrap = []

        for _ in range(n_iterations):
            picked = row_positions.sample(
                n=len(row_positions), replace=True, random_state=rng.integers(1e9)
            ).to_numpy()
            y_true = y_true_all[picked]
            y_score = y_score_all[picked]

            auroc_scores = []
            auprc_scores = []
            # minlength keeps one weight per class even if a class is absent from the resample
            class_weights = np.bincount(y_true, minlength=n_classes) / len(y_true)

            for class_idx in range(n_classes):
                y_true_binary = (y_true == class_idx).astype(int)
                y_score_class = y_score[:, class_idx]

                # AUROC is undefined when the resample holds only one of the two outcomes
                if len(np.unique(y_true_binary)) >= 2:
                    auroc = metrics.roc_auc_score(y_true_binary, y_score_class)
                else:
                    auroc = np.nan
                auprc = metrics.average_precision_score(y_true_binary, y_score_class)

                auroc_scores.append(auroc)
                auprc_scores.append(auprc)

            # Weighted average ignoring NaN in AUROC
            valid_auroc = ~np.isnan(auroc_scores)
            weighted_auroc = np.sum(np.array(auroc_scores)[valid_auroc] * class_weights[valid_auroc]) / np.sum(class_weights[valid_auroc])
            weighted_auprc = np.sum(np.array(auprc_scores) * class_weights) / np.sum(class_weights)

            auroc_bootstrap.append(weighted_auroc)
            auprc_bootstrap.append(weighted_auprc)

        # Percentile confidence intervals
        auroc_ci = np.percentile(auroc_bootstrap, [2.5, 97.5])
        auprc_ci = np.percentile(auprc_bootstrap, [2.5, 97.5])

        print(f"\n{label_name} - Bootstrapped Weighted Metrics:")
        print(f"wAUROC 95% CI: [{auroc_ci[0]:.4f}, {auroc_ci[1]:.4f}]")
        print(f"wAUPRC 95% CI: [{auprc_ci[0]:.4f}, {auprc_ci[1]:.4f}]")

        return auroc_ci, auprc_ci

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")
        return None, None


# Run bootstrap evaluation
# Each entry pairs a prediction CSV with the label printed in front of its report.
# NOTE(review): absolute '/content/...' paths — confirm they exist where the notebook is re-run.
csv_files_with_labels = [
    ('/content/chexpert_race1_chexpert_test.csv', 'CheXpert'),
    ('/content/chexpert_race1_mimic_test.csv', 'MIMIC')
]

# The returned confidence intervals are not kept; only the printed report is used.
for file_path, label in csv_files_with_labels:
    bootstrap_race_metrics(file_path, label)



CheXpert - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.9327, 0.9396]
wAUPRC 95% CI: [0.9280, 0.9345]

MIMIC - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.9466, 0.9513]
wAUPRC 95% CI: [0.9433, 0.9477]


In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Standardized race/ethnicity categories
# (the three groups kept for this evaluation; not referenced by the code in this cell)
STANDARD_RACES = ['Non-Hispanic Asian', 'Non-Hispanic Black',  'Non-Hispanic White']

# Mapping from original labels to standardized labels
# 'Hispanic/Latino' is folded into 'Other' here, so the single != 'Other' filter in
# bootstrap_race_metrics removes both groups.
RACE_STANDARDIZATION = {
    'Asian': 'Non-Hispanic Asian',
    'Non-Hispanic Asian': 'Non-Hispanic Asian',
    'Black': 'Non-Hispanic Black',
    'Non-Hispanic Black': 'Non-Hispanic Black',
    'Hispanic/Latino': 'Other',
    'Other': 'Other',
    'White': 'Non-Hispanic White',
    'Non-Hispanic White': 'Non-Hispanic White'
}

def bootstrap_race_metrics(csv_file_path, label_name, n_iterations=1000, random_state=2025):
    """Bootstrap 95% percentile CIs for weighted AUROC/AUPRC and per-class AUROC.

    Labels are standardized through RACE_STANDARDIZATION and rows labelled
    'Other' are dropped. Each iteration resamples the rows with replacement,
    scores every encoded class ``i`` against column ``i`` of the probability
    matrix, and averages the per-class scores with the resampled class
    prevalences as weights (classes with an undefined AUROC are skipped).
    Per-class AUROC values are also collected across iterations and reported
    as percentile intervals.

    Args:
        csv_file_path: CSV containing a 'Race/Ethnicity' column and a
            'Race/Ethnicity_Probability' column holding a stringified list.
        label_name: Tag printed in the report headings.
        n_iterations: Number of bootstrap resamples.
        random_state: Seed for the generator that produces one seed per resample.

    Returns:
        ``(auroc_ci, auprc_ci)`` for the weighted metrics, each a length-2
        array with the 2.5th and 97.5th percentiles, or ``(None, None)`` if a
        FileNotFoundError, KeyError or ValueError was caught (a message is
        printed). Per-class intervals are printed only.
    """
    try:
        df = pd.read_csv(csv_file_path)
        df['Race/Ethnicity_Probability'] = df['Race/Ethnicity_Probability'].apply(ast.literal_eval)
        df['Race/Ethnicity'] = df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        df = df[df['Race/Ethnicity'] != 'Other'].copy()

        label_encoder = LabelEncoder()
        y_true_all = label_encoder.fit_transform(df['Race/Ethnicity'])
        y_score_all = np.array(df['Race/Ethnicity_Probability'].tolist())
        classes = label_encoder.classes_
        n_classes = len(classes)

        # Resample row positions only and index the arrays with them, instead of
        # sampling a DataFrame of Python lists and rebuilding the score matrix on
        # every iteration. The sampling call and per-iteration seeds are unchanged,
        # so the drawn rows are expected to match the DataFrame-based version.
        row_positions = pd.Series(np.arange(len(y_true_all)))
        rng = np.random.default_rng(seed=random_state)

        auroc_bootstrap = []
        auprc_bootstrap = []
        per_class_aurocs = [[] for _ in range(n_classes)]

        for _ in range(n_iterations):
            picked = row_positions.sample(
                n=len(row_positions), replace=True, random_state=rng.integers(1e9)
            ).to_numpy()
            y_true = y_true_all[picked]
            y_score = y_score_all[picked]

            auroc_scores = []
            auprc_scores = []
            # minlength keeps one weight per class even if a class is absent from the resample
            class_weights = np.bincount(y_true, minlength=n_classes) / len(y_true)

            for class_idx in range(n_classes):
                y_true_binary = (y_true == class_idx).astype(int)
                y_score_class = y_score[:, class_idx]

                # AUROC is undefined when the resample holds only one of the two outcomes
                if len(np.unique(y_true_binary)) >= 2:
                    auroc = metrics.roc_auc_score(y_true_binary, y_score_class)
                else:
                    auroc = np.nan
                auprc = metrics.average_precision_score(y_true_binary, y_score_class)

                auroc_scores.append(auroc)
                auprc_scores.append(auprc)

                # Store for per-class CI
                per_class_aurocs[class_idx].append(auroc)

            valid_auroc = ~np.isnan(auroc_scores)
            weighted_auroc = np.sum(np.array(auroc_scores)[valid_auroc] * class_weights[valid_auroc]) / np.sum(class_weights[valid_auroc])
            weighted_auprc = np.sum(np.array(auprc_scores) * class_weights) / np.sum(class_weights)

            auroc_bootstrap.append(weighted_auroc)
            auprc_bootstrap.append(weighted_auprc)

        auroc_ci = np.percentile(auroc_bootstrap, [2.5, 97.5])
        auprc_ci = np.percentile(auprc_bootstrap, [2.5, 97.5])

        print(f"\n{label_name} - Bootstrapped Weighted Metrics:")
        print(f"wAUROC 95% CI: [{auroc_ci[0]:.4f}, {auroc_ci[1]:.4f}]")
        print(f"wAUPRC 95% CI: [{auprc_ci[0]:.4f}, {auprc_ci[1]:.4f}]\n")

        # Per-class AUROC confidence intervals (iterations with undefined AUROC are dropped)
        print(f"{label_name} - Per-Class AUROC 95% CIs:")
        for i, class_name in enumerate(classes):
            class_aurocs = np.array(per_class_aurocs[i])
            class_aurocs = class_aurocs[~np.isnan(class_aurocs)]
            if len(class_aurocs) == 0:
                print(f"{class_name:<25} Insufficient data for AUROC")
            else:
                ci = np.percentile(class_aurocs, [2.5, 97.5])
                print(f"{class_name:<25} AUROC 95% CI: [{ci[0]:.4f}, {ci[1]:.4f}]")

        return auroc_ci, auprc_ci

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")
        return None, None


# Run bootstrap evaluation
# Each entry pairs a prediction CSV with the label printed in front of its report.
# NOTE(review): absolute '/content/...' paths — confirm they exist where the notebook is re-run.
csv_files_with_labels = [
    ('/content/chexpert_race1_chexpert_test.csv', 'CheXpert'),
    ('/content/chexpert_race1_mimic_test.csv', 'MIMIC')
]

# The returned confidence intervals are not kept; only the printed report is used.
for file_path, label in csv_files_with_labels:
    bootstrap_race_metrics(file_path, label)



CheXpert - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.9327, 0.9396]
wAUPRC 95% CI: [0.9280, 0.9345]

CheXpert - Per-Class AUROC 95% CIs:
Non-Hispanic Asian        AUROC 95% CI: [0.9397, 0.9470]
Non-Hispanic Black        AUROC 95% CI: [0.9223, 0.9360]
Non-Hispanic White        AUROC 95% CI: [0.9317, 0.9387]

MIMIC - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.9466, 0.9513]
wAUPRC 95% CI: [0.9433, 0.9477]

MIMIC - Per-Class AUROC 95% CIs:
Non-Hispanic Asian        AUROC 95% CI: [0.9256, 0.9389]
Non-Hispanic Black        AUROC 95% CI: [0.9508, 0.9559]
Non-Hispanic White        AUROC 95% CI: [0.9464, 0.9514]


# MIMIC

In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Standardized race/ethnicity categories
# NOTE(review): alphabetical five-class list; presumably the output-column order of the
# MIMIC-trained model's probability vector — confirm against the training code.
STANDARD_RACES = ['Asian', 'Black', 'Hispanic/Latino', 'Other', 'White']

# Mapping from original labels to standardized labels
# ('Non-Hispanic X' spellings are collapsed onto the short names; no group is removed here)
RACE_STANDARDIZATION = {
    'Asian': 'Asian',
    'Non-Hispanic Asian': 'Asian',
    'Black': 'Black',
    'Non-Hispanic Black': 'Black',
    'Hispanic/Latino': 'Hispanic/Latino',
    'Other': 'Other',
    'White': 'White',
    'Non-Hispanic White': 'White'
}

def evaluate_race_csv(csv_file_path, label_name, model_classes=None):
    """Print per-class and prevalence-weighted one-vs-rest AUROC/AUPRC for one CSV.

    Labels are standardized through RACE_STANDARDIZATION and encoded with
    LabelEncoder. Each class present in the CSV is scored against the
    probability column found by looking its name up in ``model_classes``.
    This replaces a hard-coded rule (encoded class 2 -> column 4 when
    ``label_name == 'CheXpert'``) that only worked for a test set containing
    exactly Asian/Black/White; for that case the selected columns are the same.

    Args:
        csv_file_path: CSV containing a 'Race/Ethnicity' column and a
            'Race/Ethnicity_Probability' column holding a stringified list.
        label_name: Tag printed in the report headings.
        model_classes: Class names in the order of the probability columns.
            Defaults to the module-level STANDARD_RACES, which is assumed to
            match the model's output order — TODO confirm.

    Returns:
        None. The report is printed; FileNotFoundError, KeyError and
        ValueError (including a label/column mismatch) are caught and
        reported with a printed message.
    """
    try:
        predict_df = pd.read_csv(csv_file_path)

        # Parse probabilities and standardize race labels
        predict_df['Race/Ethnicity_Probability'] = predict_df['Race/Ethnicity_Probability'].apply(ast.literal_eval)
        predict_df['Race/Ethnicity'] = predict_df['Race/Ethnicity'].map(RACE_STANDARDIZATION)

        # Encode the textual labels into integers
        label_encoder = LabelEncoder()
        y_true = label_encoder.fit_transform(predict_df['Race/Ethnicity'])
        classes = label_encoder.classes_
        n_classes = len(classes)

        # One row of scores per sample
        y_score = np.array(predict_df['Race/Ethnicity_Probability'].tolist())

        # Resolve, by name, which probability column belongs to each encoded class
        model_classes = list(STANDARD_RACES if model_classes is None else model_classes)
        unknown = [name for name in classes if name not in model_classes]
        if unknown:
            raise ValueError(f"labels {unknown} are not among the model classes {model_classes}")
        score_columns = [model_classes.index(name) for name in classes]
        if y_score.ndim != 2 or (score_columns and max(score_columns) >= y_score.shape[1]):
            raise ValueError(f"probability vectors do not cover the model classes {model_classes}")

        # Class prevalences, used as averaging weights
        class_weights = np.bincount(y_true, minlength=n_classes) / len(y_true)

        auroc_scores = []
        auprc_scores = []
        for class_idx in range(n_classes):
            y_true_binary = (y_true == class_idx).astype(int)
            y_score_class = y_score[:, score_columns[class_idx]]

            # AUROC needs at least one positive and one negative sample
            if len(np.unique(y_true_binary)) >= 2:
                auroc_scores.append(metrics.roc_auc_score(y_true_binary, y_score_class))
            else:
                auroc_scores.append(np.nan)

            auprc_scores.append(metrics.average_precision_score(y_true_binary, y_score_class))

        # Prevalence-weighted averages (classes with undefined AUROC are skipped)
        valid_auroc = ~np.isnan(auroc_scores)
        weighted_auroc = np.sum(np.array(auroc_scores)[valid_auroc] * class_weights[valid_auroc]) / np.sum(class_weights[valid_auroc])
        weighted_auprc = np.sum(np.array(auprc_scores) * class_weights) / np.sum(class_weights)

        # Size the first column from the longest class name so a 15-character
        # name such as 'Hispanic/Latino' is not glued to the prevalence value.
        name_width = max([len('Class')] + [len(str(name)) for name in classes]) + 2

        print(f'\n{label_name} Results:')
        print(f"{'Class':<{name_width}}{'Prevalence':<12}{'AUROC':<10}{'AUPRC'}")
        for i, class_name in enumerate(classes):
            print(f'{str(class_name):<{name_width}}{class_weights[i]:<12.4f}{auroc_scores[i]:<10.4f}{auprc_scores[i]:.4f}')

        print(f'\n{label_name} - Manually Weighted Average - wAUROC: {weighted_auroc:.4f}, wAUPRC: {weighted_auprc:.4f}')

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# Define files and labels
# A single prediction CSV, reported under the 'CheXpert' label.
# NOTE(review): absolute '/content/...' path — confirm it exists where the notebook is re-run.
csv_files_with_labels = [
    ('/content/mimic_race_chexpert_race_v1_test.csv', 'CheXpert')
]

# Run evaluations
# evaluate_race_csv prints its report; nothing is collected here.
for file_path, label in csv_files_with_labels:
    evaluate_race_csv(file_path, label)


CheXpert Results:
Class		Prevalence	AUROC		AUPRC
Asian          0.1702		0.8773		0.6741
Black          0.0722		0.9119		0.6481
White          0.7576		0.8740		0.9481

CheXpert - Manually Weighted Average - wAUROC: 0.8773, wAUPRC: 0.8798


In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Standardized race/ethnicity categories
# NOTE(review): alphabetical five-class list; presumably the output-column order of the
# MIMIC-trained model's probability vector — confirm against the training code.
STANDARD_RACES = ['Asian', 'Black', 'Hispanic/Latino', 'Other', 'White']

# Mapping from original labels to standardized labels
# ('Non-Hispanic X' spellings are collapsed onto the short names; no group is removed here)
RACE_STANDARDIZATION = {
    'Asian': 'Asian',
    'Non-Hispanic Asian': 'Asian',
    'Black': 'Black',
    'Non-Hispanic Black': 'Black',
    'Hispanic/Latino': 'Hispanic/Latino',
    'Other': 'Other',
    'White': 'White',
    'Non-Hispanic White': 'White'
}

def bootstrap_race_metrics(csv_file_path, label_name, n_iterations=1000, random_state=2025, model_classes=None):
    """Bootstrap 95% percentile CIs for prevalence-weighted one-vs-rest AUROC/AUPRC.

    Labels are standardized through RACE_STANDARDIZATION and encoded with
    LabelEncoder. Each class present in the CSV is scored against the
    probability column found by looking its name up in ``model_classes``.
    This replaces a hard-coded rule (encoded class 2 -> column 4 when
    ``label_name == 'CheXpert'``, i.e. 'White' in an Asian/Black/White test
    set) whose comment described it as the "Other" class; for that case the
    selected columns are the same.

    Args:
        csv_file_path: CSV containing a 'Race/Ethnicity' column and a
            'Race/Ethnicity_Probability' column holding a stringified list.
        label_name: Tag printed in the report heading.
        n_iterations: Number of bootstrap resamples.
        random_state: Seed for the generator that produces one seed per resample.
        model_classes: Class names in the order of the probability columns.
            Defaults to the module-level STANDARD_RACES, which is assumed to
            match the model's output order — TODO confirm.

    Returns:
        ``(auroc_ci, auprc_ci)``, each a length-2 array with the 2.5th and
        97.5th percentiles, or ``(None, None)`` if a FileNotFoundError,
        KeyError or ValueError (including a label/column mismatch) was caught
        (a message is printed).
    """
    try:
        df = pd.read_csv(csv_file_path)
        df['Race/Ethnicity_Probability'] = df['Race/Ethnicity_Probability'].apply(ast.literal_eval)
        df['Race/Ethnicity'] = df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        label_encoder = LabelEncoder()
        y_true_all = label_encoder.fit_transform(df['Race/Ethnicity'])
        y_score_all = np.array(df['Race/Ethnicity_Probability'].tolist())
        classes = label_encoder.classes_
        n_classes = len(classes)

        # Resolve, by name, which probability column belongs to each encoded class
        model_classes = list(STANDARD_RACES if model_classes is None else model_classes)
        unknown = [name for name in classes if name not in model_classes]
        if unknown:
            raise ValueError(f"labels {unknown} are not among the model classes {model_classes}")
        score_columns = [model_classes.index(name) for name in classes]
        if y_score_all.ndim != 2 or (score_columns and max(score_columns) >= y_score_all.shape[1]):
            raise ValueError(f"probability vectors do not cover the model classes {model_classes}")

        # Resample row positions only and index the arrays with them, instead of
        # sampling a DataFrame of Python lists and rebuilding the score matrix on
        # every iteration. The sampling call and per-iteration seeds are unchanged,
        # so the drawn rows are expected to match the DataFrame-based version.
        row_positions = pd.Series(np.arange(len(y_true_all)))
        rng = np.random.default_rng(seed=random_state)

        auroc_bootstrap = []
        auprc_bootstrap = []

        for _ in range(n_iterations):
            picked = row_positions.sample(
                n=len(row_positions), replace=True, random_state=rng.integers(1e9)
            ).to_numpy()
            y_true = y_true_all[picked]
            y_score = y_score_all[picked]

            auroc_scores = []
            auprc_scores = []
            # minlength keeps one weight per class even if a class is absent from the resample
            class_weights = np.bincount(y_true, minlength=n_classes) / len(y_true)

            for class_idx in range(n_classes):
                y_true_binary = (y_true == class_idx).astype(int)
                y_score_class = y_score[:, score_columns[class_idx]]

                # AUROC is undefined when the resample holds only one of the two outcomes
                if len(np.unique(y_true_binary)) >= 2:
                    auroc = metrics.roc_auc_score(y_true_binary, y_score_class)
                else:
                    auroc = np.nan
                auprc = metrics.average_precision_score(y_true_binary, y_score_class)

                auroc_scores.append(auroc)
                auprc_scores.append(auprc)

            valid_auroc = ~np.isnan(auroc_scores)
            weighted_auroc = np.sum(np.array(auroc_scores)[valid_auroc] * class_weights[valid_auroc]) / np.sum(class_weights[valid_auroc])
            weighted_auprc = np.sum(np.array(auprc_scores) * class_weights) / np.sum(class_weights)

            auroc_bootstrap.append(weighted_auroc)
            auprc_bootstrap.append(weighted_auprc)

        auroc_ci = np.percentile(auroc_bootstrap, [2.5, 97.5])
        auprc_ci = np.percentile(auprc_bootstrap, [2.5, 97.5])

        print(f"\n{label_name} - Bootstrapped Weighted Metrics:")
        print(f"wAUROC 95% CI: [{auroc_ci[0]:.4f}, {auroc_ci[1]:.4f}]")
        print(f"wAUPRC 95% CI: [{auprc_ci[0]:.4f}, {auprc_ci[1]:.4f}]")

        return auroc_ci, auprc_ci

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")
        return None, None


# Run bootstrap evaluation
# A single prediction CSV, reported under the 'CheXpert' label.
# NOTE(review): absolute '/content/...' path — confirm it exists where the notebook is re-run.
csv_files_with_labels = [
    ('/content/mimic_race_chexpert_race_v1_test.csv', 'CheXpert')
]

# The returned confidence intervals are not kept; only the printed report is used.
for file_path, label in csv_files_with_labels:
    bootstrap_race_metrics(file_path, label)



CheXpert - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.8724, 0.8823]
wAUPRC 95% CI: [0.8755, 0.8846]


In [None]:
import pandas as pd
import ast
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Standardized race/ethnicity categories
# NOTE(review): alphabetical five-class list; presumably the output-column order of the
# MIMIC-trained model's probability vector — confirm against the training code.
STANDARD_RACES = ['Asian', 'Black', 'Hispanic/Latino', 'Other', 'White']

# Mapping from original labels to standardized labels
# ('Non-Hispanic X' spellings are collapsed onto the short names; no group is removed here)
RACE_STANDARDIZATION = {
    'Asian': 'Asian',
    'Non-Hispanic Asian': 'Asian',
    'Black': 'Black',
    'Non-Hispanic Black': 'Black',
    'Hispanic/Latino': 'Hispanic/Latino',
    'Other': 'Other',
    'White': 'White',
    'Non-Hispanic White': 'White'
}

def bootstrap_race_metrics(csv_file_path, label_name, n_iterations=1000, random_state=2025, model_classes=None):
    """Bootstrap 95% percentile CIs for weighted AUROC/AUPRC and per-class AUROC.

    Labels are standardized through RACE_STANDARDIZATION and encoded with
    LabelEncoder. Each class present in the CSV is scored against the
    probability column found by looking its name up in ``model_classes``.
    This replaces a hard-coded rule (encoded class 2 -> column 4 when
    ``label_name == 'CheXpert'``, i.e. 'White' in an Asian/Black/White test
    set) whose comment described it as the "Other" class; for that case the
    selected columns are the same. Per-class AUROC values are collected
    across iterations and reported as percentile intervals.

    Args:
        csv_file_path: CSV containing a 'Race/Ethnicity' column and a
            'Race/Ethnicity_Probability' column holding a stringified list.
        label_name: Tag printed in the report headings.
        n_iterations: Number of bootstrap resamples.
        random_state: Seed for the generator that produces one seed per resample.
        model_classes: Class names in the order of the probability columns.
            Defaults to the module-level STANDARD_RACES, which is assumed to
            match the model's output order — TODO confirm.

    Returns:
        ``(auroc_ci, auprc_ci)`` for the weighted metrics, each a length-2
        array with the 2.5th and 97.5th percentiles, or ``(None, None)`` if a
        FileNotFoundError, KeyError or ValueError (including a label/column
        mismatch) was caught (a message is printed). Per-class intervals are
        printed only.
    """
    try:
        df = pd.read_csv(csv_file_path)
        df['Race/Ethnicity_Probability'] = df['Race/Ethnicity_Probability'].apply(ast.literal_eval)
        df['Race/Ethnicity'] = df['Race/Ethnicity'].map(RACE_STANDARDIZATION)
        label_encoder = LabelEncoder()
        y_true_all = label_encoder.fit_transform(df['Race/Ethnicity'])
        y_score_all = np.array(df['Race/Ethnicity_Probability'].tolist())
        classes = label_encoder.classes_
        n_classes = len(classes)

        # Resolve, by name, which probability column belongs to each encoded class
        model_classes = list(STANDARD_RACES if model_classes is None else model_classes)
        unknown = [name for name in classes if name not in model_classes]
        if unknown:
            raise ValueError(f"labels {unknown} are not among the model classes {model_classes}")
        score_columns = [model_classes.index(name) for name in classes]
        if y_score_all.ndim != 2 or (score_columns and max(score_columns) >= y_score_all.shape[1]):
            raise ValueError(f"probability vectors do not cover the model classes {model_classes}")

        # Resample row positions only and index the arrays with them, instead of
        # sampling a DataFrame of Python lists and rebuilding the score matrix on
        # every iteration. The sampling call and per-iteration seeds are unchanged,
        # so the drawn rows are expected to match the DataFrame-based version.
        row_positions = pd.Series(np.arange(len(y_true_all)))
        rng = np.random.default_rng(seed=random_state)

        # Store AUROC and AUPRC for weighted and each class
        auroc_bootstrap = []
        auprc_bootstrap = []
        per_class_aurocs = [[] for _ in range(n_classes)]

        for _ in range(n_iterations):
            picked = row_positions.sample(
                n=len(row_positions), replace=True, random_state=rng.integers(1e9)
            ).to_numpy()
            y_true = y_true_all[picked]
            y_score = y_score_all[picked]

            auroc_scores = []
            auprc_scores = []
            # minlength keeps one weight per class even if a class is absent from the resample
            class_weights = np.bincount(y_true, minlength=n_classes) / len(y_true)

            for class_idx in range(n_classes):
                y_true_binary = (y_true == class_idx).astype(int)
                y_score_class = y_score[:, score_columns[class_idx]]

                # AUROC is undefined when the resample holds only one of the two outcomes
                if len(np.unique(y_true_binary)) >= 2:
                    auroc = metrics.roc_auc_score(y_true_binary, y_score_class)
                else:
                    auroc = np.nan
                auprc = metrics.average_precision_score(y_true_binary, y_score_class)

                auroc_scores.append(auroc)
                auprc_scores.append(auprc)

                # Collect for CI computation
                per_class_aurocs[class_idx].append(auroc)

            valid_auroc = ~np.isnan(auroc_scores)
            weighted_auroc = np.sum(np.array(auroc_scores)[valid_auroc] * class_weights[valid_auroc]) / np.sum(class_weights[valid_auroc])
            weighted_auprc = np.sum(np.array(auprc_scores) * class_weights) / np.sum(class_weights)

            auroc_bootstrap.append(weighted_auroc)
            auprc_bootstrap.append(weighted_auprc)

        # Compute weighted CIs
        auroc_ci = np.percentile(auroc_bootstrap, [2.5, 97.5])
        auprc_ci = np.percentile(auprc_bootstrap, [2.5, 97.5])

        print(f"\n{label_name} - Bootstrapped Weighted Metrics:")
        print(f"wAUROC 95% CI: [{auroc_ci[0]:.4f}, {auroc_ci[1]:.4f}]")
        print(f"wAUPRC 95% CI: [{auprc_ci[0]:.4f}, {auprc_ci[1]:.4f}]")

        # Compute and print per-class AUROC CIs (iterations with undefined AUROC are dropped)
        print(f"\n{label_name} - Per-Class AUROC 95% Confidence Intervals:")
        for i, class_name in enumerate(classes):
            class_aurocs = [x for x in per_class_aurocs[i] if not np.isnan(x)]
            if len(class_aurocs) > 0:
                ci_low, ci_high = np.percentile(class_aurocs, [2.5, 97.5])
                print(f"{class_name:<20}: [{ci_low:.4f}, {ci_high:.4f}]")
            else:
                print(f"{class_name:<20}: AUROC CI not computable (insufficient variation)")

        return auroc_ci, auprc_ci

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")
        return None, None


# Run bootstrap evaluation
# A single prediction CSV, reported under the 'CheXpert' label.
# NOTE(review): absolute '/content/...' path — confirm it exists where the notebook is re-run.
csv_files_with_labels = [
    ('/content/mimic_race_chexpert_race_v1_test.csv', 'CheXpert')
]

# The returned confidence intervals are not kept; only the printed report is used.
for file_path, label in csv_files_with_labels:
    bootstrap_race_metrics(file_path, label)



CheXpert - Bootstrapped Weighted Metrics:
wAUROC 95% CI: [0.8724, 0.8823]
wAUPRC 95% CI: [0.8755, 0.8846]

CheXpert - Per-Class AUROC 95% Confidence Intervals:
Asian               : [0.8718, 0.8832]
Black               : [0.9042, 0.9191]
White               : [0.8687, 0.8795]
