Note that although we split the BRAX dataset into 70/10/20 train/val/test sets, our model testing pipeline used the entire dataset to validate our US-trained models. Future experiments will involve training models on the BRAX dataset using these splits.

In [None]:
# Restrict the s3fs credentials file to owner read/write only; s3fs refuses
# credential files that other users can access (see the s3fs-fuse README).
# NOTE(review): no visible cell creates ~/.passwd-s3fs - it has to exist before this runs.
!chmod 600 ~/.passwd-s3fs

In [None]:
!apt install s3fs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
s3fs is already the newest version (1.82-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


In [None]:
!mkdir /content/s3
!s3fs um2ii-datasets /content/s3

mkdir: cannot create directory ‘/content/s3’: File exists


In [None]:
import pandas as pd
from fastai.vision.all import *
from sklearn.model_selection import train_test_split
import os

In [None]:
# Directory of the BRAX 1.1.0 files inside the s3fs mount at /content/s3.
# NOTE: `path` is reassigned to the output directory in the final cell.
path = '/content/s3/BRAX/brax/1.1.0'
# Load the master spreadsheet; the cells below filter it and add the
# "Patient Age Category" and "split" columns.
df = pd.read_csv(os.path.join(path, 'master_spreadsheet_update.csv'))

In [None]:
#Isolate only frontal view CXRs
# Show every view label present before filtering
print(df['ViewPosition'].unique())
# Rows with a missing ViewPosition match neither label and are dropped.
ap_df = df.loc[df['ViewPosition'].eq('AP')]
pa_df = df.loc[df['ViewPosition'].eq('PA')]
# AP rows first, then PA rows: this row order determines the order of
# df["PatientID"].unique(), which is what the patient split is drawn from.
df = pd.concat((ap_df, pa_df), ignore_index=True)

['L' nan 'PA' 'AP' 'RL' 'LT-DECUB' 'AP LLD' 'RLO']


In [None]:
# The oldest bucket is stored as the text '85 or more'; map it to '85' so the
# whole column can be cast to int.
df['PatientAge'] = df['PatientAge'].replace('85 or more', '85').astype(int)
df['PatientAge'].unique() #No missing age values

array([85,  0,  5, 10, 70, 80, 55, 45, 40, 65, 60, 50, 35, 75, 20, 30, 15,
       25])

In [None]:
# Number of rows whose PatientAge is 0
(df['PatientAge'] == 0).sum()

(19309, 26)

In [None]:
#Stratify patients based on age group in Patient Age Category column
# Legend for the integer codes stored in "Patient Age Category"
agedict = {
    "0 to 20": 0,
    "21 to 40": 1,
    "41 to 60": 2,
    "61 to 80": 3,
    ">=81": 4
}
# Left-inclusive bin edges matching agedict:
# [0, 21) -> 0, [21, 41) -> 1, [41, 61) -> 2, [61, 81) -> 3, [81, inf) -> 4
age_bin_edges = [0, 21, 41, 61, 81, float("inf")]
# The previous `(~df["PatientAge"]>=0)` term parsed as `(~age) >= 0` and would have
# silently coded a negative age as 1 ("21 to 40"); fail loudly on invalid ages instead.
if (df["PatientAge"] < 0).any():
    raise ValueError("PatientAge contains negative values; cannot assign an age category")
df["Patient Age Category"] = pd.cut(
    df["PatientAge"], bins=age_bin_edges, right=False, labels=False
).astype(int)
df['PatientSex'].unique() #Sex reported as M, F or O but only M and F labels are present

array(['M', 'F'], dtype=object)

In [None]:
#Create train, validation, test splits
# Split unique PatientIDs (not rows) so all images of a patient land in one split.
# 20% of patients -> test; 0.125 of the remaining 80% (= 10% overall) -> val.
patient_ids = df["PatientID"].unique()
train_id, test_id = train_test_split(patient_ids, test_size=0.2, random_state=2022)
train_id, val_id = train_test_split(train_id, test_size=0.125, random_state=2022)
# Label every image row with the split its patient was assigned to
for split_name, split_ids in (('train', train_id), ('val', val_id), ('test', test_id)):
    df.loc[df["PatientID"].isin(split_ids), 'split'] = split_name

# Share of image rows (not patients) in each split
df.split.value_counts(normalize=True)

train    0.694754
test     0.201616
val      0.103630
Name: split, dtype: float64

In [None]:
#Method to ensure no leakage occurred
def no_leakage(train, val, test):
    """Check that the three splits share no patients.

    Args:
        train, val, test: objects indexable by "PatientID" (the split DataFrames).

    Returns:
        True when no PatientID value appears in more than one of the splits,
        False otherwise.
    """
    split_pairs = ((train, val), (train, test), (val, test))
    # Compute every pairwise overlap first, then test them all for emptiness
    overlaps = [np.intersect1d(first["PatientID"], second["PatientID"])
                for first, second in split_pairs]
    return all(shared_ids.size == 0 for shared_ids in overlaps)

In [None]:
# Per-split views of the labelled image table
train_df = df[df.split == 'train']
val_df = df[df.split == 'val']
test_df = df[df.split == 'test']

# Fail loudly instead of silently skipping the export when a patient is shared between splits
if not no_leakage(train_df, val_df, test_df):
    raise ValueError("Patient leakage detected: a PatientID appears in more than one split")

# Fraction (0-1) of images in each split that are from female patients / fall in each age category.
# Column labels for "Patient Age Category" codes 0..4, listed in code order.
age_columns = ["0 to 20 years", "21 to 40 years", "41 to 60 years", "61 to 80 years", ">= 81 years"]
data = []
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    n_images = len(split_df)
    row = [split_name, len(split_df[split_df["PatientSex"] == 'F']) / n_images]
    row.extend(len(split_df[split_df["Patient Age Category"] == code]) / n_images
               for code in range(len(age_columns)))
    data.append(row)
# 'Data Set' stays a regular column so that to_csv(index=False) below still writes it
# (the former set_index call discarded its result and had no effect).
demographics_df = pd.DataFrame(data, columns=["Data Set", "Female Patients"] + age_columns)

# Writes csv files
path = '/content/sample_data'
df.to_csv(os.path.join(path, 'brax_split.csv'), encoding='utf-8', index=False)
demographics_df.to_csv(os.path.join(path, 'brax_demographic_distribution.csv'),  encoding='utf-8', index=False)