In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
datapath = "../dataset/mimiciv/mit_pretrained"

# Preprocess Data

**Splitting**
- https://glassboxmedicine.com/2019/09/15/best-use-of-train-val-test-splits-with-tips-for-medical-data/

## Read Data

In [15]:
embs = pd.read_csv(f'{datapath}/cxr_ic_fusion_1103.csv', on_bad_lines='warn')

Skipping line 45052: expected 6405 fields, saw 7173
Skipping line 45053: expected 6405 fields, saw 7173



In [11]:
# # read line 45052 from a csv file
# with open('../dataset/mimiciv/mit_pretrained/cxr_ic_fusion_1103.csv', 'r') as f:
#     lines = f.read().splitlines()

In [12]:
# # write the erring lines to a new file
# with open('err_lines.txt', 'w') as f:
#     for line in lines[45050:45055]:
#         f.write(line + '\n\n')

In [32]:
len(embs), len(embs.columns)

(45050, 6405)

In [33]:
embs.split.unique()

array(['train', 'validate', 'test'], dtype=object)

In [34]:
embs.haim_id.nunique()

8655

### Columns

In [41]:
x_cols = [c for c in embs.columns if c.startswith(('haim_id', 'de_', 'vd', 'vmd', 'ts_ce', 'ts_le', 'ts_pe', 'n_ecg', 'n_ech', 'split'))]

In [37]:
# slice multiple groups of columns from embs
x = embs.loc[:, x_cols]
y = embs.loc[:, 'split': 'Pneumothorax']

In [38]:
len(x.columns), len(y.columns)

(4042, 14)

## Split as Sefined in Dataset

In [10]:
print("Number of samples in each split:")
print(f"train: {len(embs[embs.split == 'train']):,}")
print(f"val: {len(embs[embs.split == 'validate']):,}")
print(f"test: {len(embs[embs.split == 'test']):,}")

Number of samples in each split:
train: 43,738
val: 321
test: 991


In [9]:
print("Percentages of train, validate, test: ")
print(f"train: {len(embs[embs.split == 'train'])/len(embs)*100:.3f}%")
print(f"val: {len(embs[embs.split == 'validate'])/len(embs)*100:.3f}%")
print(f"test: {len(embs[embs.split == 'test'])/len(embs)*100:.3f}%")

Percentages of train, validate, test: 
train: 97.088%
val: 0.713%
test: 2.200%


In [25]:
# x_train = all rows where split == 'train' and all columns except 'split'
x_train = x.loc[x.split == 'train', x.columns != 'split']
y_train = y.loc[x.split == 'train', y.columns != 'split']

x_val = x.loc[x.split == 'validate', x.columns != 'split']
y_val = y.loc[x.split == 'validate', y.columns != 'split']

x_test = x.loc[x.split == 'test', x.columns != 'split']
y_test = y.loc[x.split == 'test', y.columns != 'split']

## Random Split on HAIM_IDs

**BUT**
- Multiple HAIM IDs are used for a single patient (look in HAIM code for details)
- HAIM_ID is a unique combination of `subject_id`, `hadm_id` and `stay_id`
- **So need to revisit splitting strategy**

In [20]:
haim_ids = pd.DataFrame(embs.haim_id.unique())
len(haim_ids)

8655

In [21]:
def split_patients(patients, valid_pct=0.2, test_pct=0.2, random_state=1234):
    '''Split the patients dataframe'''
    train_pct = 1 - (valid_pct + test_pct)
    print(f'Splits:: train: {train_pct}, valid: {valid_pct}, test: {test_pct}')
    patients = patients.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return np.split(patients, [int(train_pct*len(patients)), int((train_pct+valid_pct)*len(patients))])

In [30]:
train_haim_ids, val_haim_ids, test_haim_ids = split_patients(haim_ids, valid_pct=0.1, test_pct=0.1, random_state=1234)

Splits:: train: 0.8, valid: 0.1, test: 0.1


In [31]:
len(train_haim_ids), len(val_haim_ids), len(test_haim_ids)

(6924, 865, 866)

In [39]:
# x_train = all rows with haim_id in train_haim_ids and all columns except 'split'
x_train = x.loc[x.haim_id.isin(train_haim_ids[0].values.flatten()), x.columns != 'split']
y_train = y.loc[x.haim_id.isin(train_haim_ids[0].values.flatten()), y.columns != 'split']



AttributeError: 'DataFrame' object has no attribute 'haim_id'

## Write to Disk

In [None]:
print("Confirming shapes of train, val, test splits:")
print(f"x_train: {x_train.shape}, y_train: {y_train.shape}")
print(f"x_val: {x_val.shape}, y_val: {y_val.shape}")
print(f"x_test: {x_test.shape}, y_test: {y_test.shape}")


Confirming shapes of train, val, test splits:
x_train: (43738, 4041), y_train: (43738, 13)
x_val: (321, 4041), y_val: (321, 13)
x_test: (991, 4041), y_test: (991, 13)


In [None]:
print(y_train.columns)

Index(['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
       'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity',
       'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia',
       'Pneumothorax'],
      dtype='object')


In [None]:
# Write to files into a new folder called splits
if not os.path.exists(f'{datapath}/splits'):
    os.makedirs(f'{datapath}/splits')
    
x_train.to_csv(f'{datapath}/splits/x_train.csv', index=False)
y_train.to_csv(f'{datapath}/splits/y_train.csv', index=False)

x_val.to_csv(f'{datapath}/splits/x_val.csv', index=False)
y_val.to_csv(f'{datapath}/splits/y_val.csv', index=False)

x_test.to_csv(f'{datapath}/splits/x_test.csv', index=False)
y_test.to_csv(f'{datapath}/splits/y_test.csv', index=False)

# Read splits

In [3]:
x_train = pd.read_csv(f'{datapath}/splits/x_train.csv')
y_train = pd.read_csv(f'{datapath}/splits/y_train.csv')

x_val = pd.read_csv(f'{datapath}/splits/x_val.csv')
y_val = pd.read_csv(f'{datapath}/splits/y_val.csv')

x_test = pd.read_csv(f'{datapath}/splits/x_test.csv')
y_test = pd.read_csv(f'{datapath}/splits/y_test.csv')

In [4]:
x_train.head(5)

Unnamed: 0,de_0,de_1,de_2,de_3,de_4,de_5,vd_0,vd_1,vd_2,vd_3,...,n_ech_758,n_ech_759,n_ech_760,n_ech_761,n_ech_762,n_ech_763,n_ech_764,n_ech_765,n_ech_766,n_ech_767
0,53.0,0,7,2,1,1,0.000185,0.120269,0.427006,0.004908,...,0.043556,-0.141896,0.041412,-0.061159,-0.126256,-0.234913,-0.014596,0.997139,-0.325668,0.999876
1,53.0,0,4,2,1,1,0.0,0.024619,0.479709,0.006753,...,-0.035256,-0.12845,0.231663,-0.123437,-0.073406,-0.28757,0.059177,0.988906,-0.340464,0.999796
2,69.0,1,7,1,1,1,0.0,0.020457,0.253198,0.016274,...,0.085233,-0.195788,0.066648,-0.064522,-0.054913,-0.214809,-0.019572,0.995771,-0.26019,0.99982
3,53.0,0,7,2,1,1,0.00306,0.107806,0.385208,0.002662,...,0.043556,-0.141896,0.041412,-0.061159,-0.126256,-0.234913,-0.014596,0.997139,-0.325668,0.999876
4,53.0,0,7,2,1,1,0.0,0.06878,0.281578,0.018962,...,0.043556,-0.141896,0.041412,-0.061159,-0.126256,-0.234913,-0.014596,0.997139,-0.325668,0.999876


In [5]:
y_train.head(5)

Unnamed: 0,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax
0,1.0,1.0,,1.0,,,,1.0,,,,,
1,,,,,,,,,1.0,,,,0.0
2,,0.0,1.0,,,,,,,0.0,,,
3,1.0,1.0,,1.0,,,,1.0,,,,,
4,,,,,,,,1.0,,1.0,,-1.0,


## Label Counts

### Before Fixing `-1`s

In [6]:
for col in y_train.columns:
    print(f"{col} --> {y_train[col].unique()}")

Atelectasis --> [ 1. nan  0. -1.]
Cardiomegaly --> [ 1. nan  0. -1.]
Consolidation --> [nan  1.  0. -1.]
Edema --> [ 1. nan  0. -1.]
Enlarged Cardiomediastinum --> [nan  1.  0. -1.]
Fracture --> [nan  1. -1.  0.]
Lung Lesion --> [nan  1. -1.  0.]
Lung Opacity --> [ 1. nan -1.  0.]
No Finding --> [nan  1.]
Pleural Effusion --> [nan  0.  1. -1.]
Pleural Other --> [nan  1. -1.  0.]
Pneumonia --> [nan -1.  0.  1.]
Pneumothorax --> [nan  0.  1. -1.]


In [7]:
y_train.sum(axis=0).to_dict()

{'Atelectasis': 12231.0,
 'Cardiomegaly': 13285.0,
 'Consolidation': 2463.0,
 'Edema': 6614.0,
 'Enlarged Cardiomediastinum': -1133.0,
 'Fracture': 474.0,
 'Lung Lesion': 743.0,
 'Lung Opacity': 12344.0,
 'No Finding': 5013.0,
 'Pleural Effusion': 19011.0,
 'Pleural Other': 220.0,
 'Pneumonia': 397.0,
 'Pneumothorax': 2519.0}

### After Fixing

In [8]:
new_y_train = y_train.fillna(0).replace(-1, 0)
for col in new_y_train.columns:
    print(f"{col} --> {new_y_train[col].unique()}")

Atelectasis --> [1. 0.]
Cardiomegaly --> [1. 0.]
Consolidation --> [0. 1.]
Edema --> [1. 0.]
Enlarged Cardiomediastinum --> [0. 1.]
Fracture --> [0. 1.]
Lung Lesion --> [0. 1.]
Lung Opacity --> [1. 0.]
No Finding --> [0. 1.]
Pleural Effusion --> [0. 1.]
Pleural Other --> [0. 1.]
Pneumonia --> [0. 1.]
Pneumothorax --> [0. 1.]


In [9]:
new_y_train.sum(axis=0).to_dict()

{'Atelectasis': 14336.0,
 'Cardiomegaly': 15279.0,
 'Consolidation': 3558.0,
 'Edema': 10310.0,
 'Enlarged Cardiomediastinum': 2309.0,
 'Fracture': 508.0,
 'Lung Lesion': 860.0,
 'Lung Opacity': 13235.0,
 'No Finding': 5013.0,
 'Pleural Effusion': 20322.0,
 'Pleural Other': 305.0,
 'Pneumonia': 3730.0,
 'Pneumothorax': 2902.0}