In [None]:
import os

from fastai.vision.all import *
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# force_remount=True is passed so an existing mount is replaced rather than reused.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
path = '...' #Replace with user path

#Load the CXR metadata table and the record list, then attach the record-list
#columns to the metadata by dicom_id. subject_id and study_id are removed from
#the record list first (presumably because the metadata already carries them).
df = pd.read_csv(os.path.join(path, 'mimic-cxr-2.0.0-metadata.csv'))
path_df = pd.read_csv(os.path.join(path, 'cxr-record-list.csv')).drop(columns=['subject_id', 'study_id'])
df = df.merge(path_df, on="dicom_id", how='left')

In [None]:
#Report which view positions exist, then keep only the AP and PA images
print(df.ViewPosition.unique())
print('# of total CXRs: ', len(df))
is_ap = df['ViewPosition'] == 'AP'
is_pa = df['ViewPosition'] == 'PA'
print('# of total PA and AP CXRs: ', is_ap.sum() + is_pa.sum())
ap_df = df[is_ap]
pa_df = df[is_pa]
#AP rows come first, followed by PA rows; the index is rebuilt as 0..n-1
df = pd.concat([ap_df, pa_df], ignore_index=True)

['PA' 'LATERAL' 'AP' 'LL' nan 'LAO' 'RAO' 'AP AXIAL' 'SWIMMERS' 'PA LLD'
 'AP LLD' 'XTABLE LATERAL' 'AP RLD' 'PA RLD' 'LPO']
# of total CXRs:  377110
# of total PA and AP CXRs:  243334


In [None]:
#Sanity check: (rows, columns) after restricting to AP and PA views
df.shape

(243334, 13)

**Match Demographic Information to Patients and Remove Inconsistent Records**

In [None]:
#Exclude individuals with inconsistently documented race
df_demo = pd.read_csv(os.path.join(path, 'admissions.csv'))
#One row per distinct (subject_id, ethnicity) pair
ethnicity_df = df_demo[['subject_id', 'ethnicity']].drop_duplicates()
#A subject that still occurs more than once has more than one distinct ethnicity value
v = ethnicity_df['subject_id'].value_counts()
subject_id_more_than_once = v[v > 1].index
ambiguous_ethnicity_df = ethnicity_df[ethnicity_df['subject_id'].isin(subject_id_more_than_once)]
inconsistent_ethnicity = ambiguous_ethnicity_df['subject_id'].unique()

In [None]:
#Attach ethnicity to every image by subject_id, excluding subjects whose
#ethnicity is documented inconsistently.
#The inconsistent subjects are removed from both frames BEFORE merging: every
#remaining subject then has at most one ethnicity row, so the merge cannot
#duplicate image rows (validate='many_to_one' raises if that ever stops holding).
consistent_ethnicity_df = ethnicity_df[~ethnicity_df.subject_id.isin(inconsistent_ethnicity)]
df = df[~df.subject_id.isin(inconsistent_ethnicity)]
df = pd.merge(df, consistent_ethnicity_df, on="subject_id", how='left', validate='many_to_one')

In [None]:
#Sanity check: (rows, columns) after merging ethnicity and removing inconsistent subjects
df.shape

(224689, 14)

In [None]:
#Add the patients-table columns to every admissions row
df_patients = pd.read_csv(os.path.join(path, 'patients.csv'))
df_demo = df_demo.merge(df_patients, on='subject_id', how='left')
#Keep only the first four characters of admittime (the year) as an integer
admit_year = df_demo['admittime'].str[0:4]
df_demo['admittime'] = admit_year.astype(int)

In [None]:
#Use subject_id to get age and gender information and merge onto df
df_demo['anchor_age'] = df_demo['anchor_age'].astype(int)
#age = admission year - anchor_year + anchor_age (admittime was reduced to its year above)
df_demo['age'] = df_demo['admittime'] - df_demo['anchor_year'] + df_demo['anchor_age'].astype(int)
#Age effectively unknown if anchor_age is 0
df_demo.loc[df_demo['anchor_age'] == 0, 'age'] = np.nan
#Label -> code mapping for the age bins.
#NOTE(review): agedict is not referenced by any other visible cell - confirm whether it is still needed.
agedict = {
    "0 to 20": 0,
    "21 to 40": 1,
    "41 to 60": 2,
    "61 to 80": 3,
    ">=81": 4
}
df_demo = df_demo[["subject_id", "gender", "age"]]
#Removes only fully identical (subject_id, gender, age) rows; a subject whose
#rows carry different ages still has several rows after this call.
df_demo = df_demo.drop_duplicates()
#The left merge therefore repeats an image once per remaining df_demo row of its subject
df = df.merge(df_demo, on="subject_id", how = 'left')
#Keep the first row per dicom_id, i.e. the first matching df_demo row for that subject.
#NOTE(review): the retained age is tied to that admission row, not to the date of the
#image itself - confirm this is the intended definition of age.
df = df.drop_duplicates(subset=['dicom_id'])

**Clean Up Age, Sex and Race Labels**

In [None]:
#Bin age into the "Patient Age Category" code used to stratify patients by age:
#0 = 0 to 20, 1 = 21 to 40, 2 = 41 to 60, 3 = 61 to 80, 4 = >= 81
age = df["age"]
#Each threshold that is reached adds one to the code. Comparisons against NaN
#are False, so an unknown age would evaluate to 0 here and must be masked below.
age_category = (
    (age >= 21).astype(int)
    + (age >= 41).astype(int)
    + (age >= 61).astype(int)
    + (age >= 81).astype(int)
).astype(float)
#Only strictly positive ages are valid. Missing ages (NaN), age 0 and negative
#ages get NaN so that the later isna() filter removes them. Previously a missing
#age was silently coded as 1 ("21 to 40").
df["Patient Age Category"] = age_category.where(age > 0)


In [None]:
#Inspect the distinct gender values present after the merge (including missing)
df['gender'].unique()

array(['F', 'M', nan], dtype=object)

In [None]:
#Cleaning up race labels: raw value -> display label
race_label_map = {
    'UNABLE TO OBTAIN': 'Unknown',
    'UNKNOWN': 'Unknown',
    'OTHER': 'Other',
    'WHITE': 'White',
    'BLACK/AFRICAN AMERICAN': 'Black',
    'ASIAN': 'Asian',
    'HISPANIC/LATINO': 'Hispanic/Latino',
    'AMERICAN INDIAN/ALASKA NATIVE': 'Native American',
}
df['ethnicity'] = df['ethnicity'].replace(race_label_map)
#Note that gender is reported solely as M or F or na
df = df.rename(columns={"ethnicity": "Race/Ethnicity", "gender": "Gender"})

**Create Separate Dataframes for Race, Sex, Age To Generate Train/Val/Test Splits**

In [None]:
#Create age, gender and race/ethnicity dataframes excluding NaN values for each demographic variable.
#reset_index() is called without drop=True, so the previous index is kept as an 'index' column.
df_age = df[df['Patient Age Category'].notna()].reset_index()
df_gender = df[df['Gender'].notna()].reset_index()
#For race, rows labelled 'Unknown' or 'Native American' are excluded as well as missing values
race_labels = df['Race/Ethnicity']
keep_race = (race_labels != 'Unknown') & race_labels.notna() & (race_labels != 'Native American')
df_race = df[keep_race].reset_index()

**Create Train/Val/Test Splits**

In [None]:
#Method to ensure no leakage occurred; True indicates no leakage
def no_leakage(train, val, test):
    """Return True when no subject_id is shared between any two of the splits.

    Args:
        train, val, test: objects whose "subject_id" entry is an iterable of
            patient identifiers (e.g. DataFrames with a subject_id column).

    Returns:
        bool: True if the three subject_id collections are pairwise disjoint,
        False if any identifier occurs in more than one of them.
    """
    train_ids = set(train["subject_id"])
    val_ids = set(val["subject_id"])
    test_ids = set(test["subject_id"])
    if not train_ids.isdisjoint(val_ids):
        return False
    if not train_ids.isdisjoint(test_ids):
        return False
    return val_ids.isdisjoint(test_ids)

In [None]:
#Create train, validation, test splits for age
#The split is drawn over unique subject_id values, so all images of one patient share a partition
train_id, test_id = train_test_split(df_age['subject_id'].unique(), test_size=0.2, random_state=2022)
#0.125 of the remaining 80% of patients = 10% of all patients for validation
train_id, val_id = train_test_split(train_id, test_size=0.125, random_state=2022)

for split_name, split_ids in (('train', train_id), ('val', val_id), ('test', test_id)):
    df_age.loc[df_age['subject_id'].isin(split_ids), 'split'] = split_name

#Fraction of images (not patients) that ended up in each partition
df_age['split'].value_counts(normalize=True)

train    0.702504
test     0.198951
val      0.098545
Name: split, dtype: float64

In [None]:
train_df = df_age[df_age.split == 'train']
val_df = df_age[df_age.split == 'val']
test_df = df_age[df_age.split == 'test']

#Fail loudly instead of silently skipping the export when a patient appears in more than one split
if not no_leakage(train_df, val_df, test_df):
    raise ValueError("Patient leakage detected between the age train/val/test splits")

#Summary-table column name -> "Patient Age Category" code; insertion order is the column order
age_category_columns = {"0 to 20 years": 0, "21 to 40 years": 1, "41 to 60 years": 2,
                        "61 to 80 years": 3, ">= 81 years": 4}
#Race/Ethnicity labels reported, in column order
race_columns = ["Asian", "Black", "Native American", "Hispanic/Latino", "White", "Other", "Unknown"]

#Calculates the fraction (0-1) of images in each split that has a given feature
data = []
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    n_images = len(split_df)
    row = [split_name, len(split_df[split_df["Gender"] == 'F']) / n_images]
    row += [len(split_df[split_df["Patient Age Category"] == code]) / n_images
            for code in age_category_columns.values()]
    row += [len(split_df[split_df["Race/Ethnicity"] == label]) / n_images for label in race_columns]
    data.append(row)

#"Data Set" stays a regular column so that it is written out with index=False below
demographics_df = pd.DataFrame(data, columns=["Data Set", "Female Patients", *age_category_columns, *race_columns])

# Writes csv files
#NOTE(review): this rebinds the global `path` that the loading cells read from
path = '/content/drive/MyDrive/'
df_age.to_csv(os.path.join(path, 'mimic_age_split.csv'), encoding='utf-8', index=False)
demographics_df.to_csv(os.path.join(path, 'mimic_age_demographic_distribution.csv'),  encoding='utf-8', index=False)


In [None]:
#Create train, validation, test splits for gender
#The split is drawn over unique subject_id values, so all images of one patient share a partition
train_id, test_id = train_test_split(df_gender['subject_id'].unique(), test_size=0.2, random_state=2022)
#0.125 of the remaining 80% of patients = 10% of all patients for validation
train_id, val_id = train_test_split(train_id, test_size=0.125, random_state=2022)

for split_name, split_ids in (('train', train_id), ('val', val_id), ('test', test_id)):
    df_gender.loc[df_gender['subject_id'].isin(split_ids), 'split'] = split_name

#Fraction of images (not patients) that ended up in each partition
df_gender['split'].value_counts(normalize=True)

train    0.702730
test     0.198207
val      0.099063
Name: split, dtype: float64

In [None]:
train_df = df_gender[df_gender.split == 'train']
val_df = df_gender[df_gender.split == 'val']
test_df = df_gender[df_gender.split == 'test']

#Fail loudly instead of silently skipping the export when a patient appears in more than one split
if not no_leakage(train_df, val_df, test_df):
    raise ValueError("Patient leakage detected between the gender train/val/test splits")

#Summary-table column name -> "Patient Age Category" code; insertion order is the column order
age_category_columns = {"0 to 20 years": 0, "21 to 40 years": 1, "41 to 60 years": 2,
                        "61 to 80 years": 3, ">= 81 years": 4}
#Race/Ethnicity labels reported, in column order
race_columns = ["Asian", "Black", "Native American", "Hispanic/Latino", "White", "Other", "Unknown"]

#Calculates the fraction (0-1) of images in each split that has a given feature
data = []
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    n_images = len(split_df)
    row = [split_name, len(split_df[split_df["Gender"] == 'F']) / n_images]
    row += [len(split_df[split_df["Patient Age Category"] == code]) / n_images
            for code in age_category_columns.values()]
    row += [len(split_df[split_df["Race/Ethnicity"] == label]) / n_images for label in race_columns]
    data.append(row)

#"Data Set" stays a regular column so that it is written out with index=False below
demographics_df = pd.DataFrame(data, columns=["Data Set", "Female Patients", *age_category_columns, *race_columns])

# Writes csv files
#NOTE(review): this rebinds the global `path` that the loading cells read from
path = '/content/drive/MyDrive/'
df_gender.to_csv(os.path.join(path, 'mimic_gender_split.csv'), encoding='utf-8', index=False)
demographics_df.to_csv(os.path.join(path, 'mimic_gender_demographic_distribution.csv'),  encoding='utf-8', index=False)

In [None]:
#Create train, validation, test splits for race/ethnicity
#The split is drawn over unique subject_id values, so all images of one patient share a partition
train_id, test_id = train_test_split(df_race["subject_id"].unique(), test_size=0.2, random_state=2022)
#0.125 of the remaining 80% of patients = 10% of all patients for validation
train_id, val_id = train_test_split(train_id, test_size=0.125, random_state=2022)

#Snapshots taken before the 'split' column is written below
in_train = df_race["subject_id"].isin(train_id)
in_val = df_race["subject_id"].isin(val_id)
in_test = df_race["subject_id"].isin(test_id)
train_df = df_race[in_train]
val_df = df_race[in_val]
test_df = df_race[in_test]

for split_name, split_ids in (('train', train_id), ('val', val_id), ('test', test_id)):
    df_race.loc[df_race['subject_id'].isin(split_ids), 'split'] = split_name

#Fraction of images (not patients) that ended up in each partition
df_race['split'].value_counts(normalize=True)

train    0.70062
test     0.20272
val      0.09666
Name: split, dtype: float64

In [None]:
train_df = df_race[df_race.split == 'train']
val_df = df_race[df_race.split == 'val']
test_df = df_race[df_race.split == 'test']

#Fail loudly instead of silently skipping the export when a patient appears in more than one split
if not no_leakage(train_df, val_df, test_df):
    raise ValueError("Patient leakage detected between the race train/val/test splits")

#Summary-table column name -> "Patient Age Category" code; insertion order is the column order
age_category_columns = {"0 to 20 years": 0, "21 to 40 years": 1, "41 to 60 years": 2,
                        "61 to 80 years": 3, ">= 81 years": 4}
#Race/Ethnicity labels reported, in column order
race_columns = ["Asian", "Black", "Native American", "Hispanic/Latino", "White", "Other", "Unknown"]

#Calculates the fraction (0-1) of images in each split that has a given feature
data = []
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    n_images = len(split_df)
    row = [split_name, len(split_df[split_df["Gender"] == 'F']) / n_images]
    row += [len(split_df[split_df["Patient Age Category"] == code]) / n_images
            for code in age_category_columns.values()]
    row += [len(split_df[split_df["Race/Ethnicity"] == label]) / n_images for label in race_columns]
    data.append(row)

#"Data Set" stays a regular column so that it is written out with index=False below
demographics_df = pd.DataFrame(data, columns=["Data Set", "Female Patients", *age_category_columns, *race_columns])

# Writes csv files
#NOTE(review): this rebinds the global `path` that the loading cells read from
path = '/content/drive/MyDrive/'
df_race.to_csv(os.path.join(path, 'mimic_race_split.csv'), encoding='utf-8', index=False)
demographics_df.to_csv(os.path.join(path, 'mimic_race_demographic_distribution.csv'),  encoding='utf-8', index=False)

In [None]:
#Create train, validation, test splits for race including only White, Black, Asian individuals
#.copy() makes df_reading_race an independent frame: the 'split' assignments below
#then write to it directly instead of to a slice of df_race (no SettingWithCopyWarning),
#and df_race keeps its own 'split' column untouched.
df_reading_race = df_race[~df_race['Race/Ethnicity'].isin(['Hispanic/Latino', 'Other'])].copy()
#The split is drawn over unique subject_id values, so all images of one patient share a partition
train_id, test_id = train_test_split(df_reading_race["subject_id"].unique(), test_size=0.2, random_state=2022)
#0.125 of the remaining 80% of patients = 10% of all patients for validation
train_id, val_id = train_test_split(train_id, test_size=0.125, random_state=2022)

#Every subject_id belongs to exactly one of the three id arrays, so any 'split'
#value inherited from df_race is overwritten for every row.
for split_name, split_ids in (('train', train_id), ('val', val_id), ('test', test_id)):
    df_reading_race.loc[df_reading_race['subject_id'].isin(split_ids), 'split'] = split_name

#Taken after the assignment so the partitions carry the reading-race 'split' values
train_df = df_reading_race[df_reading_race["subject_id"].isin(train_id)]
val_df = df_reading_race[df_reading_race["subject_id"].isin(val_id)]
test_df = df_reading_race[df_reading_race["subject_id"].isin(test_id)]

#Fraction of images (not patients) that ended up in each partition
df_reading_race['split'].value_counts(normalize=True)

train    0.700426
test     0.201215
val      0.098359
Name: split, dtype: float64

In [None]:
#Use the reading-race cohort (df_reading_race), not df_race: df_race still contains
#the Hispanic/Latino and Other rows and carries the split of the full race cohort.
train_df = df_reading_race[df_reading_race.split == 'train']
val_df = df_reading_race[df_reading_race.split == 'val']
test_df = df_reading_race[df_reading_race.split == 'test']

#Fail loudly instead of silently skipping the export when a patient appears in more than one split
if not no_leakage(train_df, val_df, test_df):
    raise ValueError("Patient leakage detected between the reading-race train/val/test splits")

#Summary-table column name -> "Patient Age Category" code; insertion order is the column order
age_category_columns = {"0 to 20 years": 0, "21 to 40 years": 1, "41 to 60 years": 2,
                        "61 to 80 years": 3, ">= 81 years": 4}
#Race/Ethnicity labels reported, in column order (same layout as the other
#distribution files; labels excluded from this cohort simply evaluate to 0)
race_columns = ["Asian", "Black", "Native American", "Hispanic/Latino", "White", "Other", "Unknown"]

#Calculates the fraction (0-1) of images in each split that has a given feature
data = []
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    n_images = len(split_df)
    row = [split_name, len(split_df[split_df["Gender"] == 'F']) / n_images]
    row += [len(split_df[split_df["Patient Age Category"] == code]) / n_images
            for code in age_category_columns.values()]
    row += [len(split_df[split_df["Race/Ethnicity"] == label]) / n_images for label in race_columns]
    data.append(row)

#"Data Set" stays a regular column so that it is written out with index=False below
demographics_df = pd.DataFrame(data, columns=["Data Set", "Female Patients", *age_category_columns, *race_columns])

# Writes csv files
#NOTE(review): this rebinds the global `path` that the loading cells read from
path = '/content/drive/MyDrive/'
df_reading_race.to_csv(os.path.join(path, 'mimic_reading_race_split.csv'), encoding='utf-8', index=False)
demographics_df.to_csv(os.path.join(path, 'mimic_reading_race_demographic_distribution.csv'),  encoding='utf-8', index=False)