In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import ast
from glob import glob
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

The performance of three models (CheXpert-trained, MIMIC-trained, and NIH-trained) is analyzed in this notebook, each under its own header.

# CheXpert

In [None]:
def evaluate_age_csv(csv_file_path, label_name):
    """Print the percentage of rows per age group found in a CSV file.

    The ``Age_Tensor_Id`` column of the CSV is tallied, converted to
    fractions, and reported in ascending ID order. IDs without an entry in
    the age-group table are reported as ``Unknown (<id>)``.

    Args:
        csv_file_path: Path of the CSV file to read.
        label_name: Name shown in the report header.

    Returns:
        None. The report is written to stdout. A ``FileNotFoundError``,
        ``KeyError`` or ``ValueError`` raised along the way is caught and
        printed as an error line instead of propagating.
    """
    # Tensor ID -> age group label
    bracket_names = dict(enumerate(
        ['0-20 yo', '21-40 yo', '41-60 yo', '61-80 yo', '81+ yo']
    ))

    try:
        frame = pd.read_csv(csv_file_path)

        # Fraction of rows per tensor ID, ordered by ID
        shares = frame['Age_Tensor_Id'].value_counts(normalize=True)
        shares = shares.sort_index()

        print(f"\n=== Age Distribution for {label_name} ===")
        for tensor_id, share in shares.items():
            bracket = bracket_names.get(tensor_id, f'Unknown ({tensor_id})')
            print(f"{bracket}: {share * 100:.2f}% of samples")
    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# (CSV path, display label) pairs to report on
csv_files_with_labels = [('/content/chexpert_age_vindr_peds_test.csv', 'VinDr-PCXR')]

# Print the age-group distribution of every listed file
for file_path, label in csv_files_with_labels:
    evaluate_age_csv(csv_file_path=file_path, label_name=label)


=== Age Distribution for VinDr-PCXR ===
0-20 yo: 55.06% of samples
21-40 yo: 43.77% of samples
41-60 yo: 1.16% of samples
61-80 yo: 0.01% of samples


In [None]:
import pandas as pd
import numpy as np

def bootstrap_age_prevalence(csv_file_path, label_name, n_iterations=1000, random_state=2025):
    """Print bootstrapped 95% confidence intervals for each age group's prevalence.

    Rows whose ``Age_Tensor_Id`` is not one of the known group IDs (missing
    values included) are dropped. The remaining IDs are resampled with
    replacement ``n_iterations`` times, and the 2.5th and 97.5th percentiles
    of each group's share across the resamples are printed as percentages.

    Args:
        csv_file_path: Path of the CSV file to read; it must contain an
            ``Age_Tensor_Id`` column.
        label_name: Name shown in the report header.
        n_iterations: Number of bootstrap resamples; must be at least 1.
        random_state: Seed passed to ``np.random.default_rng``.

    Returns:
        None. The report is written to stdout. A ``FileNotFoundError``,
        ``KeyError`` or ``ValueError`` (including the ones raised here for
        an invalid ``n_iterations`` or a file without any known age ID) is
        caught and printed as an error line instead of propagating.
    """
    # Age group mapping
    age_map = {
        0: '0-20 yo',
        1: '21-40 yo',
        2: '41-60 yo',
        3: '61-80 yo',
        4: '81+ yo'
    }
    # One bin per possible ID so that bin index == age ID.
    n_bins = max(age_map) + 1

    try:
        if n_iterations < 1:
            raise ValueError("n_iterations must be at least 1")

        df = pd.read_csv(csv_file_path)
        known_ids = df.loc[df['Age_Tensor_Id'].isin(list(age_map)), 'Age_Tensor_Id']
        if known_ids.empty:
            # Without this guard the prevalence would be computed as 0 / 0
            # and the report would show nan intervals.
            raise ValueError("no rows with a recognised Age_Tensor_Id")

        # A single missing value makes pandas load the column as float64, and
        # np.bincount rejects float input with a TypeError. After the filter
        # above only whole-number IDs remain, so the cast is lossless.
        y = known_ids.to_numpy().astype(np.int64)
        rng = np.random.default_rng(seed=random_state)

        # prevalence[b, g] = share of group g in bootstrap resample b
        prevalence = np.empty((n_iterations, n_bins))
        for b in range(n_iterations):
            sample = rng.choice(y, size=len(y), replace=True)
            prevalence[b] = np.bincount(sample, minlength=n_bins) / len(sample)

        print(f"\n{label_name} - Bootstrapped Age Prevalence (95% CI):")
        for age_id, age_label in age_map.items():
            ci_low, ci_high = np.percentile(prevalence[:, age_id], [2.5, 97.5])
            print(f"{age_label:<10}: 95% CI: [{ci_low*100:.2f}, {ci_high*100:.2f}]")

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# (CSV path, display label) pairs to report on
csv_files_with_labels = [
    # Add more files as needed
    ('/content/chexpert_age_vindr_peds_test.csv', 'VinDr-PCXR'),
]

# Print bootstrapped prevalence intervals for every listed file
for file_path, label in csv_files_with_labels:
    bootstrap_age_prevalence(csv_file_path=file_path, label_name=label)



VinDr-PCXR - Bootstrapped Age Prevalence (95% CI):
0-20 yo   : 95% CI: [54.02, 56.07]
21-40 yo  : 95% CI: [42.76, 44.76]
41-60 yo  : 95% CI: [0.96, 1.39]
61-80 yo  : 95% CI: [0.00, 0.03]
81+ yo    : 95% CI: [0.00, 0.00]


# MIMIC

In [None]:
def evaluate_age_csv(csv_file_path, label_name):
    """Print the percentage of rows per age group found in a CSV file.

    The ``Age_Tensor_Id`` column of the CSV is tallied, converted to
    fractions, and reported in ascending ID order. IDs without an entry in
    the age-group table are reported as ``Unknown (<id>)``.

    Args:
        csv_file_path: Path of the CSV file to read.
        label_name: Name shown in the report header.

    Returns:
        None. The report is written to stdout. A ``FileNotFoundError``,
        ``KeyError`` or ``ValueError`` raised along the way is caught and
        printed as an error line instead of propagating.
    """
    # Tensor ID -> age group label
    bracket_names = dict(enumerate(
        ['0-20 yo', '21-40 yo', '41-60 yo', '61-80 yo', '81+ yo']
    ))

    try:
        frame = pd.read_csv(csv_file_path)

        # Fraction of rows per tensor ID, ordered by ID
        shares = frame['Age_Tensor_Id'].value_counts(normalize=True)
        shares = shares.sort_index()

        print(f"\n=== Age Distribution for {label_name} ===")
        for tensor_id, share in shares.items():
            bracket = bracket_names.get(tensor_id, f'Unknown ({tensor_id})')
            print(f"{bracket}: {share * 100:.2f}% of samples")
    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# (CSV path, display label) pairs to report on
csv_files_with_labels = [('/content/mimic_age_vindr_peds_test.csv', 'VinDr-PCXR')]

# Print the age-group distribution of every listed file
for file_path, label in csv_files_with_labels:
    evaluate_age_csv(csv_file_path=file_path, label_name=label)


=== Age Distribution for VinDr-PCXR ===
0-20 yo: 36.71% of samples
21-40 yo: 48.59% of samples
41-60 yo: 11.81% of samples
61-80 yo: 2.84% of samples
81+ yo: 0.04% of samples


In [None]:
import pandas as pd
import numpy as np

def bootstrap_age_prevalence(csv_file_path, label_name, n_iterations=1000, random_state=2025):
    """Print bootstrapped 95% confidence intervals for each age group's prevalence.

    Rows whose ``Age_Tensor_Id`` is not one of the known group IDs (missing
    values included) are dropped. The remaining IDs are resampled with
    replacement ``n_iterations`` times, and the 2.5th and 97.5th percentiles
    of each group's share across the resamples are printed as percentages.

    Args:
        csv_file_path: Path of the CSV file to read; it must contain an
            ``Age_Tensor_Id`` column.
        label_name: Name shown in the report header.
        n_iterations: Number of bootstrap resamples; must be at least 1.
        random_state: Seed passed to ``np.random.default_rng``.

    Returns:
        None. The report is written to stdout. A ``FileNotFoundError``,
        ``KeyError`` or ``ValueError`` (including the ones raised here for
        an invalid ``n_iterations`` or a file without any known age ID) is
        caught and printed as an error line instead of propagating.
    """
    # Age group mapping
    age_map = {
        0: '0-20 yo',
        1: '21-40 yo',
        2: '41-60 yo',
        3: '61-80 yo',
        4: '81+ yo'
    }
    # One bin per possible ID so that bin index == age ID.
    n_bins = max(age_map) + 1

    try:
        if n_iterations < 1:
            raise ValueError("n_iterations must be at least 1")

        df = pd.read_csv(csv_file_path)
        known_ids = df.loc[df['Age_Tensor_Id'].isin(list(age_map)), 'Age_Tensor_Id']
        if known_ids.empty:
            # Without this guard the prevalence would be computed as 0 / 0
            # and the report would show nan intervals.
            raise ValueError("no rows with a recognised Age_Tensor_Id")

        # A single missing value makes pandas load the column as float64, and
        # np.bincount rejects float input with a TypeError. After the filter
        # above only whole-number IDs remain, so the cast is lossless.
        y = known_ids.to_numpy().astype(np.int64)
        rng = np.random.default_rng(seed=random_state)

        # prevalence[b, g] = share of group g in bootstrap resample b
        prevalence = np.empty((n_iterations, n_bins))
        for b in range(n_iterations):
            sample = rng.choice(y, size=len(y), replace=True)
            prevalence[b] = np.bincount(sample, minlength=n_bins) / len(sample)

        print(f"\n{label_name} - Bootstrapped Age Prevalence (95% CI):")
        for age_id, age_label in age_map.items():
            ci_low, ci_high = np.percentile(prevalence[:, age_id], [2.5, 97.5])
            print(f"{age_label:<10}: 95% CI: [{ci_low*100:.2f}, {ci_high*100:.2f}]")

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# (CSV path, display label) pairs to report on
csv_files_with_labels = [
    # Add more files as needed
    ('/content/mimic_age_vindr_peds_test.csv', 'VinDr-PCXR'),
]

# Print bootstrapped prevalence intervals for every listed file
for file_path, label in csv_files_with_labels:
    bootstrap_age_prevalence(csv_file_path=file_path, label_name=label)



VinDr-PCXR - Bootstrapped Age Prevalence (95% CI):
0-20 yo   : 95% CI: [35.77, 37.73]
21-40 yo  : 95% CI: [47.61, 49.62]
41-60 yo  : 95% CI: [11.15, 12.48]
61-80 yo  : 95% CI: [2.50, 3.18]
81+ yo    : 95% CI: [0.01, 0.09]


# NIH

In [None]:
def evaluate_age_csv(csv_file_path, label_name):
    """Print the percentage of rows per age group found in a CSV file.

    The ``Age_Tensor_Id`` column of the CSV is tallied, converted to
    fractions, and reported in ascending ID order. IDs without an entry in
    the age-group table are reported as ``Unknown (<id>)``.

    Args:
        csv_file_path: Path of the CSV file to read.
        label_name: Name shown in the report header.

    Returns:
        None. The report is written to stdout. A ``FileNotFoundError``,
        ``KeyError`` or ``ValueError`` raised along the way is caught and
        printed as an error line instead of propagating.
    """
    # Tensor ID -> age group label
    bracket_names = dict(enumerate(
        ['0-20 yo', '21-40 yo', '41-60 yo', '61-80 yo', '81+ yo']
    ))

    try:
        frame = pd.read_csv(csv_file_path)

        # Fraction of rows per tensor ID, ordered by ID
        shares = frame['Age_Tensor_Id'].value_counts(normalize=True)
        shares = shares.sort_index()

        print(f"\n=== Age Distribution for {label_name} ===")
        for tensor_id, share in shares.items():
            bracket = bracket_names.get(tensor_id, f'Unknown ({tensor_id})')
            print(f"{bracket}: {share * 100:.2f}% of samples")
    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# (CSV path, display label) pairs to report on
csv_files_with_labels = [('/content/nih_age_vindr_peds_test.csv', 'VinDr-PCXR')]

# Print the age-group distribution of every listed file
for file_path, label in csv_files_with_labels:
    evaluate_age_csv(csv_file_path=file_path, label_name=label)


=== Age Distribution for VinDr-PCXR ===
0-20 yo: 95.84% of samples
21-40 yo: 2.63% of samples
41-60 yo: 1.01% of samples
61-80 yo: 0.53% of samples


In [None]:
import pandas as pd
import numpy as np

def bootstrap_age_prevalence(csv_file_path, label_name, n_iterations=1000, random_state=2025):
    """Print bootstrapped 95% confidence intervals for each age group's prevalence.

    Rows whose ``Age_Tensor_Id`` is not one of the known group IDs (missing
    values included) are dropped. The remaining IDs are resampled with
    replacement ``n_iterations`` times, and the 2.5th and 97.5th percentiles
    of each group's share across the resamples are printed as percentages.

    Args:
        csv_file_path: Path of the CSV file to read; it must contain an
            ``Age_Tensor_Id`` column.
        label_name: Name shown in the report header.
        n_iterations: Number of bootstrap resamples; must be at least 1.
        random_state: Seed passed to ``np.random.default_rng``.

    Returns:
        None. The report is written to stdout. A ``FileNotFoundError``,
        ``KeyError`` or ``ValueError`` (including the ones raised here for
        an invalid ``n_iterations`` or a file without any known age ID) is
        caught and printed as an error line instead of propagating.
    """
    # Age group mapping
    age_map = {
        0: '0-20 yo',
        1: '21-40 yo',
        2: '41-60 yo',
        3: '61-80 yo',
        4: '81+ yo'
    }
    # One bin per possible ID so that bin index == age ID.
    n_bins = max(age_map) + 1

    try:
        if n_iterations < 1:
            raise ValueError("n_iterations must be at least 1")

        df = pd.read_csv(csv_file_path)
        known_ids = df.loc[df['Age_Tensor_Id'].isin(list(age_map)), 'Age_Tensor_Id']
        if known_ids.empty:
            # Without this guard the prevalence would be computed as 0 / 0
            # and the report would show nan intervals.
            raise ValueError("no rows with a recognised Age_Tensor_Id")

        # A single missing value makes pandas load the column as float64, and
        # np.bincount rejects float input with a TypeError. After the filter
        # above only whole-number IDs remain, so the cast is lossless.
        y = known_ids.to_numpy().astype(np.int64)
        rng = np.random.default_rng(seed=random_state)

        # prevalence[b, g] = share of group g in bootstrap resample b
        prevalence = np.empty((n_iterations, n_bins))
        for b in range(n_iterations):
            sample = rng.choice(y, size=len(y), replace=True)
            prevalence[b] = np.bincount(sample, minlength=n_bins) / len(sample)

        print(f"\n{label_name} - Bootstrapped Age Prevalence (95% CI):")
        for age_id, age_label in age_map.items():
            ci_low, ci_high = np.percentile(prevalence[:, age_id], [2.5, 97.5])
            print(f"{age_label:<10}: 95% CI: [{ci_low*100:.2f}, {ci_high*100:.2f}]")

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"Error processing {csv_file_path}: {e}")

# (CSV path, display label) pairs to report on
csv_files_with_labels = [
    # Add more files as needed
    ('/content/nih_age_vindr_peds_test.csv', 'VinDr-PCXR'),
]

# Print bootstrapped prevalence intervals for every listed file
for file_path, label in csv_files_with_labels:
    bootstrap_age_prevalence(csv_file_path=file_path, label_name=label)



VinDr-PCXR - Bootstrapped Age Prevalence (95% CI):
0-20 yo   : 95% CI: [95.43, 96.25]
21-40 yo  : 95% CI: [2.32, 2.96]
41-60 yo  : 95% CI: [0.81, 1.22]
61-80 yo  : 95% CI: [0.39, 0.68]
81+ yo    : 95% CI: [0.00, 0.00]
