In [None]:
import pandas as pd
from fastai.vision.all import *
from sklearn.model_selection import train_test_split
import os

In [None]:
# Colab only: mount Google Drive at /content/drive; the split CSVs written later in this
# notebook are saved under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
#Read labels csv and keep only frontal images
path = "..." #Replace with user path
# Stack the train and valid label files into a single frame with a fresh 0..n-1 index;
# new per-patient train/val/test splits are drawn further down in this notebook.
df = pd.concat(
    [pd.read_csv(os.path.join(path, "train.csv")),
     pd.read_csv(os.path.join(path, "valid.csv"))],
    axis=0,
    ignore_index=True,
)
df = df[df["Frontal/Lateral"] == "Frontal"]
# Keep only the first three columns (the later cells use 'Path', 'Sex' and 'Age' from them).
# .copy() makes df_cleaned an independent frame, so adding the PATIENT column afterwards
# does not write into a slice of df (SettingWithCopyWarning).
df_cleaned = df.iloc[:, 0:3].copy()

In [None]:
#Create a PATIENT column with patient ids
# The patient id is the third '/'-separated component of the image path
# (assumes paths look like <root>/<split>/<patient>/... - confirm against the label files).
df_cleaned['PATIENT'] = df_cleaned['Path'].str.split('/').str[2]
# Fail fast on malformed paths instead of silently carrying a missing patient id forward.
assert df_cleaned['PATIENT'].notna().all(), "Some image paths have fewer than three '/'-separated parts"

In [None]:
path = "..." #Replace with user path
# Load the per-patient demographics spreadsheet and order it by patient id.
demo_file = os.path.join(path, "CHEXPERT DEMO.xlsx")
df_demo = pd.read_excel(demo_file).sort_values(by=['PATIENT'])

In [None]:
#Create new dataframe (df_merged) with the labels and demographic info for each image
# Inner join on PATIENT: images whose patient has no demographics row are dropped.
# NOTE(review): if df_demo ever holds more than one row per PATIENT this join duplicates
# image rows - consider validate='many_to_one' once that has been checked on the data.
df_merged = df_cleaned.merge(df_demo, on='PATIENT')

**Remove Inconsistent Labels**

In [None]:
#Some patient sex/gender entries are inconsistent; exclude these by setting GENDER as Unknown
# compare() returns only the rows where 'Sex' (label file) and 'GENDER' (demographics file) differ.
gender_mismatch = df_merged['Sex'].compare(df_merged['GENDER'])
print(gender_mismatch)
df_merged.loc[gender_mismatch.index, 'GENDER'] = 'Unknown'

          self   other
172345  Female    Male
176168    Male  Female
176788    Male  Female


In [None]:
#Some patient age entries are inconsistent; exclude these by setting AGE_AT_CXR to the sentinel value 0
# (rows with AGE_AT_CXR == 0 get a missing 'Patient Age Category' in the age clean-up cell).
# compare() returns only the rows where 'Age' (label file) and 'AGE_AT_CXR' (demographics file) differ.
age_mismatch = df_merged['Age'].compare(df_merged['AGE_AT_CXR'])
print(age_mismatch)
df_merged.loc[age_mismatch.index, 'AGE_AT_CXR'] = 0

        self  other
2       83.0   87.0
16      19.0   22.0
17      19.0   22.0
18      19.0   22.0
19      19.0   22.0
...      ...    ...
190479  77.0   71.0
190482  77.0   76.0
190491  49.0   50.0
190492  87.0   85.0
190498  90.0   94.0

[46583 rows x 2 columns]


**Clean Up Age Labels**

In [None]:
# Age bands and their integer codes as stored in 'Patient Age Category'.
# (Not referenced by the code in this cell; kept as the legend for the codes.)
agedict = {
    "0 to 20": 0,
    "21 to 40": 1,
    "41 to 60": 2,
    "61 to 80": 3,
    ">=81": 4
}

#Using the AGE_AT_CXR column create Age Category column stratify patients by age
age = df_merged["AGE_AT_CXR"]

# The category code is the number of band lower bounds (21, 41, 61, 81) the age has reached,
# i.e. 0 for ages up to 20 and 4 for ages of 81 and above.
# This replaces a starting term written as `~age >= 0`: unary ~ binds tighter than >=, so it
# bit-inverted the ages before comparing. That evaluated to 0 for non-negative integer ages
# only by accident, gave 1 for negative ages, and raises TypeError on a float column.
age_category = sum((age >= lower_bound).astype(int) for lower_bound in (21, 41, 61, 81))

# Ages without a usable value get no category: 0 is the sentinel written for inconsistent
# ages, and missing or negative ages are treated the same way instead of landing in band 0.
age_unknown = age.isna() | (age <= 0)
df_merged["Patient Age Category"] = age_category.mask(age_unknown)

**Clean up Race Labels**

In [None]:
# Missing race entries become the explicit category 'Unknown'.
df_merged['PRIMARY_RACE'] = df_merged['PRIMARY_RACE'].fillna(value = 'Unknown')

#Clean up PRIMARY_RACE column
# Keep only the text before the first comma of each entry (vectorized, no Python loop).
df_merged['PRIMARY_RACE'] = df_merged['PRIMARY_RACE'].str.split(",").str[0]
df_merged['PRIMARY_RACE'].unique()

array(['Other', 'White', 'Black or African American',
       'Native Hawaiian or Other Pacific Islander', 'Asian', 'Unknown',
       'Native American', 'Race and Ethnicity Unknown', 'Black',
       'American Indian or Alaska Native', 'Patient Refused',
       'Pacific Islander', 'White or Caucasian',
       'Asian - Historical Conv'], dtype=object)

In [None]:
#Merge redundant race labels. While prior studies generally remove NHPI individuals from analysis, we will code them as Asian to generate an AAPI label used in prior literature.
# One lookup table from every redundant spelling to its final category. Both Pacific Islander
# spellings map straight to 'Asian', which is where the two-step renaming ended up.
race_aliases = {
    'Race and Ethnicity Unknown': 'Unknown',
    'Patient Refused': 'Unknown',
    'White or Caucasian': 'White',
    'Asian - Historical Conv': 'Asian',
    'Pacific Islander': 'Asian',
    'Native Hawaiian or Other Pacific Islander': 'Asian',
    'American Indian or Alaska Native': 'Native American',
    'Black or African American': 'Black',
}
df_merged['PRIMARY_RACE'] = df_merged['PRIMARY_RACE'].replace(race_aliases)

In [None]:
# Sanity check: list the race categories left after merging and the raw ETHNICITY values
# (the ethnicity strings are matched literally when the race/ethnicity cohort is built).
print(df_merged['PRIMARY_RACE'].unique())
print(df_merged['ETHNICITY'].unique())

['Other' 'White' 'Black' 'Asian' 'Unknown' 'Native American']
['Non-Hispanic/Non-Latino' 'Hispanic/Latino' 'Unknown' nan
 'Patient Refused' 'Not Hispanic' 'Hispanic']


In [None]:
# Rename GENDER to 'Gender' and shorten its values to one-letter codes; 'Unknown' is left as is.
df_merged = df_merged.rename(columns={"GENDER": "Gender"})
df_merged['Gender'] = df_merged['Gender'].replace({'Female': 'F', 'Male': 'M'})
df_merged['Gender'].unique()

array(['F', 'M', 'Unknown'], dtype=object)

**Create Separate Dataframes for Race, Sex, Age To Generate Train/Val/Test Splits**

In [None]:
#Formulation 1
#Race/Ethnicity labels will be set as Non-Hispanic White, Non-Hispanic Black, Non-Hispanic Asian
#(Native American, Other and Unknown race entries are excluded from this cohort)
#Training/Validation/Test Sets will be made from patients of the above labels
race_ethnicity_labels = {
    'Asian': 'Non-Hispanic Asian',
    'Black': 'Non-Hispanic Black',
    'White': 'Non-Hispanic White',
}

df1 = df_merged[df_merged['ETHNICITY']=='Non-Hispanic/Non-Latino']
# Pre-filter retained so the positional index stored by reset_index() below is unchanged.
df1 = df1[df1['PRIMARY_RACE'] != 'Native American']
df1 = pd.concat([df1, df_merged[df_merged['ETHNICITY']=='Not Hispanic']], axis=0, ignore_index=True)

# Keep only races that have a label. Applying this after the concat also removes
# Native American rows coming from the 'Not Hispanic' subset, which previously were
# only filtered out of the 'Non-Hispanic/Non-Latino' subset and would have been left
# in the cohort with a missing Race/Ethnicity label.
df1 = df1[df1['PRIMARY_RACE'].isin(list(race_ethnicity_labels))].copy()
df1['Race/Ethnicity'] = df1['PRIMARY_RACE'].map(race_ethnicity_labels)

# reset_index() keeps the previous (post-concat) index as an 'index' column.
df1 = df1.reset_index()
df1['Race/Ethnicity'].unique()
#We have created df1 dataframe to include people fitting our inclusion criteria

array(['Non-Hispanic White', 'Non-Hispanic Black', 'Non-Hispanic Asian'],
      dtype=object)

In [None]:
#Create df_gender Dataframe including people with known Gender labeled
# reset_index() keeps the df_merged row label of each image as an 'index' column.
has_known_gender = df_merged['Gender'] != 'Unknown'
df_gender = df_merged[has_known_gender].reset_index()

In [None]:
#Create df_age Dataframe including people with known Age Category labeled
# reset_index() keeps the df_merged row label of each image as an 'index' column.
has_age_category = df_merged['Patient Age Category'].notna()
df_age = df_merged[has_age_category].reset_index()

**Create Train/Val/Test Splits**

In [None]:
#Check that the three splits do not share any patient
def no_leakage(train, val, test):
    """Return True when no PATIENT id occurs in more than one of the given frames.

    Each argument must provide a "PATIENT" column. The three pairwise
    intersections (train/val, train/test, val/test) are computed and the
    result is True only if all of them are empty.
    """
    train_ids, val_ids, test_ids = (frame["PATIENT"] for frame in (train, val, test))
    shared_ids = [
        np.intersect1d(train_ids, val_ids),
        np.intersect1d(train_ids, test_ids),
        np.intersect1d(val_ids, test_ids),
    ]
    return all(overlap.size == 0 for overlap in shared_ids)


In [None]:
#Create train, validation, test splits for age group dataframe
# The split is drawn over unique patient ids, so every image of a patient gets the same label:
# 20% of patients go to test, then 12.5% of the remaining 80% (about 10% overall) to validation.
patient_ids = df_age["PATIENT"].unique()
train_id, test_id = train_test_split(patient_ids, test_size=0.2, random_state=2022)
train_id, val_id = train_test_split(train_id, test_size=0.125, random_state=2022)

for split_name, split_ids in (('train', train_id), ('val', val_id), ('test', test_id)):
    df_age.loc[df_age["PATIENT"].isin(split_ids), 'split'] = split_name

# Share of images (rows) per split.
df_age["split"].value_counts(normalize=True)

train    0.698602
test     0.200214
val      0.101184
Name: split, dtype: float64

In [None]:
train_df = df_age[df_age.split == 'train']
val_df = df_age[df_age.split == 'val']
test_df = df_age[df_age.split == 'test']

# Stop loudly if a patient appears in more than one split. Previously the export was
# skipped without any message, leaving whatever files were already on disk.
if not no_leakage(train_df, val_df, test_df):
    raise ValueError("Patient leakage detected between the age train/val/test splits; no files were written.")

# Fraction (0-1) of images in each split that carry a given demographic value.
# Rows are images, so the figures are per image, not per patient.
data = []
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    n_images = len(split_df)
    row = [split_name]
    row += [int((split_df["Gender"] == gender).sum()) / n_images for gender in ('M', 'F')]
    row += [int((split_df["Patient Age Category"] == category).sum()) / n_images for category in range(5)]
    row += [int((split_df["PRIMARY_RACE"] == race).sum()) / n_images
            for race in ('Asian', 'Black', 'Native American', 'White', 'Other', 'Unknown')]
    # Both spellings of Hispanic ethnicity found in the data count towards one column.
    row.append(int(split_df["ETHNICITY"].isin(['Hispanic/Latino', 'Hispanic']).sum()) / n_images)
    data.append(row)

# 'Data Set' stays a regular column so that to_csv(index=False) below still writes it
# (the earlier set_index call discarded its result and had no effect).
demographics_df = pd.DataFrame(data, columns=["Data Set", "Male Patients", "Female Patients", "0 to 20 years", "21 to 40 years",
                                              "41 to 60 years", "61 to 80 years", ">= 81 years", "Asian",
                                              "Black","Native American", "White", "Other",
                                              "Unknown", "Hispanic/Latino"])

# Writes csv files
path = '/content/drive/MyDrive/'
df_age.to_csv(os.path.join(path, 'chexpert_age_split.csv'), encoding='utf-8', index=False)
demographics_df.to_csv(os.path.join(path, 'chexpert_age_demographic_distribution.csv'), encoding='utf-8', index=False)

In [None]:
#Create train, validation, test splits for gender group dataframe
# The split is drawn over unique patient ids, so every image of a patient gets the same label:
# 20% of patients go to test, then 12.5% of the remaining 80% (about 10% overall) to validation.
patient_ids = df_gender["PATIENT"].unique()
train_id, test_id = train_test_split(patient_ids, test_size=0.2, random_state=2022)
train_id, val_id = train_test_split(train_id, test_size=0.125, random_state=2022)

for split_name, split_ids in (('train', train_id), ('val', val_id), ('test', test_id)):
    df_gender.loc[df_gender["PATIENT"].isin(split_ids), 'split'] = split_name

# Share of images (rows) per split.
df_gender["split"].value_counts(normalize=True)

train    0.703268
test     0.198756
val      0.097976
Name: split, dtype: float64

In [None]:
train_df = df_gender[df_gender.split == 'train']
val_df = df_gender[df_gender.split == 'val']
test_df = df_gender[df_gender.split == 'test']

# Stop loudly if a patient appears in more than one split. Previously the export was
# skipped without any message, leaving whatever files were already on disk.
if not no_leakage(train_df, val_df, test_df):
    raise ValueError("Patient leakage detected between the gender train/val/test splits; no files were written.")

# Fraction (0-1) of images in each split that carry a given demographic value.
# Rows are images, so the figures are per image, not per patient.
data = []
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    n_images = len(split_df)
    row = [split_name]
    row += [int((split_df["Gender"] == gender).sum()) / n_images for gender in ('M', 'F')]
    row += [int((split_df["Patient Age Category"] == category).sum()) / n_images for category in range(5)]
    row += [int((split_df["PRIMARY_RACE"] == race).sum()) / n_images
            for race in ('Asian', 'Black', 'Native American', 'White', 'Other', 'Unknown')]
    # Both spellings of Hispanic ethnicity found in the data count towards one column.
    row.append(int(split_df["ETHNICITY"].isin(['Hispanic/Latino', 'Hispanic']).sum()) / n_images)
    data.append(row)

# 'Data Set' stays a regular column so that to_csv(index=False) below still writes it
# (the earlier set_index call discarded its result and had no effect).
demographics_df = pd.DataFrame(data, columns=["Data Set", "Male Patients", "Female Patients", "0 to 20 years", "21 to 40 years",
                                              "41 to 60 years", "61 to 80 years", ">= 81 years", "Asian",
                                              "Black","Native American", "White", "Other",
                                              "Unknown", "Hispanic/Latino"])

# Writes csv files
path = '/content/drive/MyDrive/'
df_gender.to_csv(os.path.join(path, 'chexpert_gender_split.csv'), encoding='utf-8', index=False)
demographics_df.to_csv(os.path.join(path, 'chexpert_gender_demographic_distribution.csv'), encoding='utf-8', index=False)

In [None]:
#Create train, validation, test splits for race/ethnicity formulation 1
# The split is drawn over unique patient ids, so every image of a patient gets the same label:
# 20% of patients go to test, then 12.5% of the remaining 80% (about 10% overall) to validation.
patient_ids = df1["PATIENT"].unique()
train_id, test_id = train_test_split(patient_ids, test_size=0.2, random_state=2022)
train_id, val_id = train_test_split(train_id, test_size=0.125, random_state=2022)

for split_name, split_ids in (('train', train_id), ('val', val_id), ('test', test_id)):
    df1.loc[df1["PATIENT"].isin(split_ids), 'split'] = split_name

# Share of images (rows) per split.
df1["split"].value_counts(normalize=True)

train    0.701789
test     0.201422
val      0.096789
Name: split, dtype: float64