Note that while we split the VinDr dataset into 70/10/20 train/val/test splits, our model testing pipeline used the entire dataset to validate our US-trained models. Future experiments will involve training models on the VinDr dataset using these splits.

In [None]:
import pandas as pd
from fastai.vision.all import *
from sklearn.model_selection import train_test_split
import os

In [None]:
# Load the VinDr label table and turn the raw 'Age' strings into numeric years.
df = pd.read_csv('/content/VinDr_Full/VinDr_labels.csv')
# VinDr has 18000 images but most do not have a valid age associated; those
# rows end up with Age = NaN below so they can be excluded later.
age_raw = df['Age']
print(age_raw.unique())

# Missing values plus the placeholder strings that carry no usable age.
invalid_age_mask = age_raw.isna() | age_raw.isin(['Y', '000D', '000Y'])
print('# Files with invalid age: ' + str(invalid_age_mask.sum()))

# A handful of entries lack the trailing unit letter ('39' instead of '39Y'),
# and a bare 'Y' has no digits at all. Normalise them so that stripping the
# final character always leaves a parseable number ('Y' -> '0Y' -> 0).
# The replacement is done on the Series and assigned back as a whole column:
# chained writes such as df['Age'][mask] = ... act on a temporary object and
# are not guaranteed to reach df.
age_fixups = {'39': '39Y', '65': '65Y', '60': '60Y', '58': '58Y', 'Y': '0Y'}
age_years = age_raw.replace(age_fixups).str[:-1].astype(float)

# An age of 0 comes only from the placeholders ('Y', '000Y', '000D'): treat as missing.
# NOTE(review): implausibly large raw values seen in the recorded output
# (e.g. '238Y') are not filtered here - confirm that is intended.
df['Age'] = age_years.replace(0, np.nan)
print(df['Age'].unique())

[nan '028Y' 'Y' '080Y' '058Y' '056Y' '044Y' '000Y' '063Y' '088Y' '025Y'
 '057Y' '078Y' '033Y' '069Y' '066Y' '079Y' '070Y' '050Y' '076Y' '075Y'
 '043Y' '064Y' '037Y' '040Y' '012Y' '062Y' '073Y' '027Y' '060Y' '085Y'
 '065Y' '053Y' '021Y' '030Y' '061Y' '059Y' '032Y' '031Y' '048Y' '082Y'
 '054Y' '077Y' '055Y' '039Y' '068Y' '024Y' '052Y' '022Y' '046Y' '038Y'
 '018Y' '035Y' '083Y' '006Y' '036Y' '000D' '051Y' '041Y' '023Y' '007Y'
 '071Y' '086Y' '067Y' '049Y' '026Y' '005Y' '084Y' '029Y' '034Y' '045Y'
 '042Y' '081Y' '074Y' '047Y' '087Y' '009Y' '004Y' '015Y' '118Y' '010Y'
 '016Y' '119Y' '011Y' '072Y' '090Y' '013Y' '019Y' '020Y' '017Y' '089Y'
 '014Y' '238Y' '008Y' '124Y' '002Y' '003Y' '39' '65' '60' '58']
# Files with invalid age: 13766
[ nan  28.  80.  58.  56.  44.  63.  88.  25.  57.  78.  33.  69.  66.
  79.  70.  50.  76.  75.  43.  64.  37.  40.  12.  62.  73.  27.  60.
  85.  65.  53.  21.  30.  61.  59.  32.  31.  48.  82.  54.  77.  55.
  39.  68.  24.  52.  22.  46.  38.  18.  35.  83. 

In [None]:
# Stratify patients by age group: store an ordinal code (0-4) per row in the
# "Patient Age Category" column. agedict lists the label that goes with each code.
agedict = {
    "1 to 20": 0,
    "21 to 40": 1,
    "41 to 60": 2,
    "61 to 80": 3,
    ">=81": 4
}
# The code equals how many of these lower bounds the age meets or exceeds.
age_lower_bounds = (21, 41, 61, 81)
bounds_reached = sum((df["Age"] >= bound).astype(int) for bound in age_lower_bounds)
# Comparisons against NaN are False, which would look like category 0, so rows
# without an age are set to NaN instead.
df["Patient Age Category"] = bounds_reached.where(df["Age"].notna())

In [None]:
# The 'Sex' column is renamed to 'Gender' for convenience.
# Rows recorded as 'O' are excluded from training/testing by blanking them to NaN.
# NOTE(review): the stated rationale is a low number of 'O' rows, yet the
# recorded run of this cell printed 6388 of them - confirm the rationale.
df = df.rename(columns={"Sex": "Gender"})
is_other_gender = df['Gender'] == 'O'
# Show the affected rows and the rows that were already missing before blanking.
print(df.loc[is_other_gender, 'Gender'])
print(df.loc[df['Gender'].isna(), 'Gender'])
df.loc[is_other_gender, 'Gender'] = np.nan

0        O
1        O
5        O
13       O
15       O
        ..
17982    O
17983    O
17991    O
17993    O
17994    O
Name: Gender, Length: 6388, dtype: object
8        NaN
20       NaN
32       NaN
38       NaN
40       NaN
        ... 
17948    NaN
17953    NaN
17980    NaN
17989    NaN
17997    NaN
Name: Gender, Length: 3004, dtype: object


In [None]:
# Two working subsets: rows with a usable gender label and rows with a usable
# age category. reset_index() keeps the old row labels as an 'index' column.
df_gender = df.loc[df['Gender'].notna()].reset_index()
print(df_gender.shape)
df_age = df.loc[df['Patient Age Category'].notna()].reset_index()
print(df_age.shape)

(8608, 5)
(4234, 5)


In [None]:
#As far as I can tell, each VinDr radiograph belongs to a unique patient, so the split below is done per image path

In [None]:
# Create train / validation / test splits for the gender-labelled subset and
# write the split assignment plus a demographic summary to CSV.


def _demographic_row(label, frame):
    """Return one summary row: [label, frac M, frac F, frac age category 0..4].

    Every fraction is (matching rows) / (all rows in ``frame``), so rows with a
    missing "Patient Age Category" still count toward the denominator.
    Raises ZeroDivisionError when ``frame`` is empty.
    """
    total = len(frame)
    gender_fracs = [int((frame["Gender"] == code).sum()) / total for code in ("M", "F")]
    age_fracs = [int((frame["Patient Age Category"] == cat).sum()) / total for cat in range(5)]
    return [label, *gender_fracs, *age_fracs]


# Hold out 20% as test, then 12.5% of the remaining 80% (= 10% overall) as
# validation, leaving 70% for training. Splitting is done on unique paths.
train_id, test_id = train_test_split(df_gender['Path'].unique(), test_size=0.2, random_state=2022)
train_id, val_id = train_test_split(train_id, test_size=0.125, random_state=2022)

for split_name, split_paths in (("train", train_id), ("val", val_id), ("test", test_id)):
    df_gender.loc[df_gender["Path"].isin(split_paths), "split"] = split_name

print(df_gender["split"].value_counts(normalize=True))

train_df = df_gender[df_gender["split"] == "train"]
val_df = df_gender[df_gender["split"] == "val"]
test_df = df_gender[df_gender["split"] == "test"]

# Fraction of images with each demographic feature, one row per split.
data = [
    _demographic_row("train", train_df),
    _demographic_row("val", val_df),
    _demographic_row("test", test_df),
]
demographics_df = pd.DataFrame(data, columns=["Data Set", "Male Patients", "Female Patients", "1 to 20 years", "21 to 40 years",
                                                  "41 to 60 years", "61 to 80 years", ">= 81 years"])
# "Data Set" stays a regular column: the CSV is written with index=False, so
# moving it into the index would drop it from the file.

# Writes csv files
path = '/content/sample_data'
df_gender.to_csv(os.path.join(path, 'vindr_gender_split.csv'), encoding='utf-8', index=False)
demographics_df.to_csv(os.path.join(path, 'vindr_gender_demographic_distribution.csv'), encoding='utf-8', index=False)

train    0.699930
test     0.200046
val      0.100023
Name: split, dtype: float64


In [None]:
# Due to low sample size, df_age is used solely for testing: every row is
# labelled 'test' and a one-row demographic summary is written next to it.
df_age['split'] = 'test'

# Each fraction is (matching rows) / (all rows in df_age); rows with a missing
# Gender therefore still count toward the denominator.
n_age_rows = len(df_age)
gender_fracs = [int((df_age["Gender"] == code).sum()) / n_age_rows for code in ("M", "F")]
age_fracs = [int((df_age["Patient Age Category"] == cat).sum()) / n_age_rows for cat in range(5)]
data = [["test", *gender_fracs, *age_fracs]]
demographics_df = pd.DataFrame(data, columns=["Data Set", "Male Patients", "Female Patients", "1 to 20 years", "21 to 40 years",
                                                  "41 to 60 years", "61 to 80 years", ">= 81 years"])
# "Data Set" stays a regular column: the CSV is written with index=False, so
# moving it into the index would drop it from the file.

path = '/content/sample_data'
df_age.to_csv(os.path.join(path, 'vindr_age_split.csv'), encoding='utf-8', index=False)
demographics_df.to_csv(os.path.join(path, 'vindr_age_demographic_distribution.csv'), encoding='utf-8', index=False)