In [1]:
import cv2
import gc
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pydicom
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm_notebook

In [2]:
# Root folder of the RSNA data; the cached metadata parquet files are kept
# in its "gzip/" subfolder.
data_dir = "/mnt/DATA/rsna/"
meta_data_dir = f"{data_dir}gzip/"

In [3]:
# Metadata columns removed from the merged train/test tables further down.
# Indexed DICOM fields are generated from their index ranges; the resulting
# list keeps the same names in the same order.
dropped_cols = (
    ["BitsAllocated", "BitsStored", "Columns", "HighBit"]
    + [f"ImageOrientationPatient_{i}" for i in range(6)]
    + [f"ImagePositionPatient_{i}" for i in range(2)]
    + ["Modality", "PhotometricInterpretation", "PixelRepresentation"]
    + [f"PixelSpacing_{i}" for i in range(2)]
    + [
        "RescaleIntercept", "RescaleSlope", "Rows", "SOPInstanceUID",
        "SamplesPerPixel", "SeriesInstanceUID", "StudyID",
    ]
)

## Prepare the labels & metadata

In [None]:
def get_metadata(image_dir):
    """Collect the DICOM header fields of every file in ``image_dir``.

    Each file is read with ``pydicom.dcmread`` and every element listed by
    ``ds.dir()`` except ``PixelData`` is recorded:

    * multi-valued elements are expanded into ``<name>_<i>`` columns, one
      per component;
    * ``WindowCenter`` / ``WindowWidth`` are the exception: when they are
      multi-valued only their first component is kept, under the plain name;
    * every other element is stored as returned by pydicom.

    One record is built per file, so a file that lacks a field yields a
    missing value in that column instead of shifting the values of the
    following files. Fields that are not part of the fixed column list
    below are ignored.

    Parameters
    ----------
    image_dir : str
        Directory whose entries are all read as DICOM files.

    Returns
    -------
    pandas.DataFrame
        One row per file, indexed by ``"Image"`` (the file name without its
        extension), with the columns listed in ``labels``.
    """
    labels = [
        'BitsAllocated', 'BitsStored', 'Columns', 'HighBit',
        'ImageOrientationPatient_0', 'ImageOrientationPatient_1', 'ImageOrientationPatient_2',
        'ImageOrientationPatient_3', 'ImageOrientationPatient_4', 'ImageOrientationPatient_5',
        'ImagePositionPatient_0', 'ImagePositionPatient_1', 'ImagePositionPatient_2',
        'Modality', 'PatientID', 'PhotometricInterpretation', 'PixelRepresentation',
        'PixelSpacing_0', 'PixelSpacing_1', 'RescaleIntercept', 'RescaleSlope', 'Rows', 'SOPInstanceUID',
        'SamplesPerPixel', 'SeriesInstanceUID', 'StudyID', 'StudyInstanceUID',
        'WindowCenter', 'WindowWidth', 'Image',
    ]

    # Multi-valued fields that are collapsed to their first component.
    first_value_only = ("WindowCenter", "WindowWidth")

    records = []
    # sorted() makes the row order independent of the filesystem's listing order.
    for image in tqdm_notebook(sorted(os.listdir(image_dir))):
        ds = pydicom.dcmread(os.path.join(image_dir, image))

        # Same as image[:-4] for "*.dcm" names, without assuming the extension length.
        record = {"Image": os.path.splitext(image)[0]}

        for metadata in ds.dir():
            if metadata == "PixelData":
                continue

            metadata_values = getattr(ds, metadata)
            if isinstance(metadata_values, pydicom.multival.MultiValue):
                if metadata in first_value_only:
                    record[metadata] = metadata_values[0]
                else:
                    for i, v in enumerate(metadata_values):
                        record[f"{metadata}_{i}"] = v
            else:
                record[metadata] = metadata_values

        records.append(record)

    # columns=labels fixes the schema: absent keys become missing values and
    # keys outside `labels` are left out.
    return pd.DataFrame(records, columns=labels).set_index("Image")

In [None]:
# Generate the metadata dataframes (reads every file in both image folders)
# and cache them as gzip-compressed parquet files under meta_data_dir.
train_metadata = get_metadata(os.path.join(data_dir, "stage_1_train_images"))
test_metadata = get_metadata(os.path.join(data_dir, "stage_1_test_images"))

# to_parquet does not create missing directories.
os.makedirs(meta_data_dir, exist_ok=True)

# meta_data_dir already ends with "/", so os.path.join is used rather than
# inserting another separator; the result matches the paths the cache is
# read back from later in the notebook.
train_metadata.to_parquet(os.path.join(meta_data_dir, "train_metadata.parquet.gzip"), compression="gzip")
test_metadata.to_parquet(os.path.join(meta_data_dir, "test_metadata.parquet.gzip"), compression="gzip")

### Note: `ImagePositionPatient_2` is the z-axis coordinate; it is used below to order the slices within each study.

In [4]:
# Load the long-format labels (one row per image/diagnosis pair). Each ID is
# split by position: the first 12 characters are the image id, character 13
# is skipped, and the remainder is the diagnosis name.
train_df = (
    pd.read_csv(data_dir + "stage_1_train.csv")
    .drop_duplicates()
    .assign(
        image=lambda df: df["ID"].str.slice(stop=12),
        Diagnosis=lambda df: df["ID"].str.slice(start=13),
    )
)

# Wide format: one row per image, one column per diagnosis.
train_labels = train_df.pivot(index="image", columns="Diagnosis", values="Label")

In [5]:
# The sample submission lists the test IDs; the first 12 characters of each
# ID are the image id.
test_df = (
    pd.read_csv(data_dir + "stage_1_sample_submission.csv")
    .drop_duplicates()
    .assign(image=lambda df: df["ID"].str.slice(stop=12))
)

In [6]:
# One row per image with the six label columns in a fixed order, and "image"
# turned back into a regular column. `axis` is passed by keyword: pandas 2.x
# no longer accepts it positionally in pd.concat.
train = pd.concat(
    [
        train_labels[col]
        for col in (
            "any",
            "epidural", "intraparenchymal",
            "intraventricular", "subarachnoid",
            "subdural",
        )
    ],
    axis=1,
).reset_index()

In [7]:
# Unique image ids referenced by the sample submission (kept as a named Series).
test = test_df.loc[:, "image"].drop_duplicates()

In [8]:
# Reload the cached metadata and copy the index into an explicit "image"
# column so it can be used as the merge key.
train_parq = (
    pd.read_parquet(meta_data_dir + "train_metadata.parquet.gzip")
    .assign(image=lambda df: df.index)
)
test_parq = (
    pd.read_parquet(meta_data_dir + "test_metadata.parquet.gzip")
    .assign(image=lambda df: df.index)
)

In [9]:
# Attach the DICOM metadata to the labels (train) / image ids (test) and
# drop the columns that are not needed downstream.
merged_train = pd.merge(train, train_parq, how="inner", on="image").drop(columns=dropped_cols)
merged_test = pd.merge(test, test_parq, how="inner", on="image").drop(columns=dropped_cols)

# The merge inputs are no longer needed; release their memory.
del train, test, train_parq, test_parq
gc.collect()

# Order rows by study, then by z position within the study. A two-key sort
# gives the ordering that groupby("StudyInstanceUID").apply(sort_values) was
# used for, without the per-group Python call and without depending on
# groupby.apply handing the grouping column to the callback (deprecated in
# recent pandas).
# NOTE(review): unlike groupby, this keeps rows whose StudyInstanceUID is
# missing (sorted last) - presumably there are none; confirm.
slice_order = ["StudyInstanceUID", "ImagePositionPatient_2"]
merged_train = merged_train.sort_values(slice_order).reset_index(drop=True)
merged_test = merged_test.sort_values(slice_order).reset_index(drop=True)

# Later cells read train_metadata.csv / test_metadata.csv from data_dir.
# Write them only when they are missing, so a fresh run works and an
# existing export is never overwritten.
for _frame, _csv_name in ((merged_train, "train_metadata.csv"),
                          (merged_test, "test_metadata.csv")):
    _csv_path = data_dir + _csv_name
    if not os.path.exists(_csv_path):
        _frame.to_csv(_csv_path, index=False)

In [10]:
# Preview the merged training table: labels plus the retained metadata
# columns, ordered by study and z position.
merged_train

Unnamed: 0,image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,ImagePositionPatient_2,PatientID,StudyInstanceUID,WindowCenter,WindowWidth
0,ID_3a422b8d7,0,0,0,0,0,0,28.2225,ID_e0d2de32,ID_00047d6503,30.0,80.0
1,ID_490b10d5a,0,0,0,0,0,0,33.2225,ID_e0d2de32,ID_00047d6503,30.0,80.0
2,ID_be2a0ca1c,0,0,0,0,0,0,38.2225,ID_e0d2de32,ID_00047d6503,30.0,80.0
3,ID_af42e31f3,0,0,0,0,0,0,43.2225,ID_e0d2de32,ID_00047d6503,30.0,80.0
4,ID_3131664ab,0,0,0,0,0,0,48.2225,ID_e0d2de32,ID_00047d6503,30.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...
674253,ID_7b0c5edc0,0,0,0,0,0,0,1310.0500,ID_822276a2,ID_fffdba8d7b,40.0,80.0
674254,ID_e8e195f90,0,0,0,0,0,0,1315.0500,ID_822276a2,ID_fffdba8d7b,40.0,80.0
674255,ID_d64ef9ea6,0,0,0,0,0,0,1320.0500,ID_822276a2,ID_fffdba8d7b,40.0,80.0
674256,ID_5838a09b1,0,0,0,0,0,0,1325.0500,ID_822276a2,ID_fffdba8d7b,40.0,80.0


## Split by StudyInstanceUID

In [None]:
# Sum of the "any" label over the slices of each study.
study_id_counts = merged_train.groupby("StudyInstanceUID")["any"].sum()
study_ids = study_id_counts.index.values
slice_counts = study_id_counts.values

In [None]:
# Stratification target: True for studies whose "any" sum is above zero.
binarized_slice_counts = np.greater(slice_counts, 0.)

In [None]:
# Unshuffled 5-fold stratified split. random_state is deliberately not
# passed: it has no effect unless shuffle=True, and recent scikit-learn
# versions raise a ValueError when it is set while shuffle is False.
# For a seeded shuffle use
#     StratifiedKFold(n_splits=5, shuffle=True, random_state=2709)
# instead - note that this changes which studies land in which fold.
skf = StratifiedKFold(n_splits=5, shuffle=False)

In [None]:
# Split the study ids into folds, keep each (train, valid) pair in memory
# and store both id arrays as .npy files in data_dir.
folds = {}
for i, (train_idx, val_idx) in enumerate(skf.split(study_ids, binarized_slice_counts)):
    train_ids, valid_ids = study_ids[train_idx], study_ids[val_idx]
    folds[i] = (train_ids, valid_ids)

    np.save(f"{data_dir}train_fold{i}.npy", train_ids)
    np.save(f"{data_dir}valid_fold{i}.npy", valid_ids)

In [None]:
# Per fold: print, for every label, the number of positive training slices
# and the alpha derived from its ratio to the epidural count.
# NOTE(review): compute_alpha is not defined in the visible part of this
# notebook - confirm it is defined or imported before running this cell.
for fold in range(5):
    print("\n================FOLD {}================".format(str(fold)))

    # The training subset depends only on the fold, so it is built once per
    # fold rather than once per label. A dedicated name is used so the
    # labels frame `train_df` loaded earlier is not overwritten.
    fold_train = merged_train[merged_train["StudyInstanceUID"].isin(folds[fold][0])]
    num_epidural = fold_train["epidural"].sum()

    for col in [
        "any",
        "intraparenchymal", "intraventricular",
        "subarachnoid", "subdural", "epidural"
    ]:
        num_train_strat_sid = fold_train[col].sum()
        print(col,
              "train samples: {:.1f} ===".format(num_train_strat_sid),
              "alpha: {:.3f}".format(compute_alpha(num_train_strat_sid / num_epidural)),
             )

## Split by PatientID

In [None]:
# Sum of the "any" label over all slices of each patient.
patient_id_counts = merged_train.groupby("PatientID")["any"].sum()
patient_ids = patient_id_counts.index.values
slice_counts = patient_id_counts.values

In [None]:
# Inspect the patient ids (index of the per-patient "any" sums).
patient_ids

In [None]:
# Inspect the per-patient "any" sums.
slice_counts

In [None]:
# Stratification target: True for patients whose "any" sum is above zero.
binarized_slice_counts = np.greater(slice_counts, 0.)

In [None]:
# Sanity check: one flag per patient expected (compare with patient_ids.shape).
binarized_slice_counts.shape

In [None]:
# Number of distinct patients in the training table.
patient_ids.shape

In [None]:
# Fraction of patients with a positive "any" sum. The denominator is taken
# from the data rather than a hard-coded patient count, so the ratio stays
# correct if the input table changes.
np.sum(binarized_slice_counts) / len(patient_ids)

In [None]:
# Unshuffled 5-fold stratified split over patients. random_state is
# deliberately not passed: it has no effect unless shuffle=True, and recent
# scikit-learn versions raise a ValueError when it is set while shuffle is
# False. For a seeded shuffle use
#     StratifiedKFold(n_splits=5, shuffle=True, random_state=2709)
# instead - note that this changes which patients land in which fold.
skf = StratifiedKFold(n_splits=5, shuffle=False)

# Keep each (train, valid) pair of patient-id arrays in memory and store
# both arrays as .npy files in data_dir.
folds = {}
for i, (train_idx, val_idx) in enumerate(skf.split(patient_ids, binarized_slice_counts)):
    folds[i] = (patient_ids[train_idx], patient_ids[val_idx])
    np.save(data_dir + "train_patients_fold" + str(i) + ".npy", patient_ids[train_idx])
    np.save(data_dir + "valid_patients_fold" + str(i) + ".npy", patient_ids[val_idx])

In [None]:
# Per fold: print the number of positive training slices for every label.
for fold in range(5):
    print("\n================FOLD {}================".format(str(fold)))

    # The training subset depends only on the fold, so it is built once per
    # fold rather than once per label. A dedicated name is used so the
    # labels frame `train_df` loaded earlier is not overwritten.
    fold_train = merged_train[merged_train["PatientID"].isin(folds[fold][0])]

    for col in [
        "any",
        "intraparenchymal", "intraventricular",
        "subarachnoid", "subdural", "epidural"
    ]:
        num_train_strat_sid = fold_train[col].sum()
        print(col,
              "train samples: {:.1f} ===".format(num_train_strat_sid),
             )

In [3]:
# Load the exported metadata tables from the configured data_dir (resolves
# to the same files as the previously hard-coded absolute paths).
train_meta = pd.read_csv(data_dir + "train_metadata.csv")
test_meta = pd.read_csv(data_dir + "test_metadata.csv")

# Distinct patient ids of each split, computed once and reused below.
train_patients = train_meta["PatientID"].unique()
test_patients = test_meta["PatientID"].unique()

# check patient id overlap
print(np.intersect1d(train_patients, test_patients).shape)
# check train patient id
print(train_patients.shape)
# check test patient id
print(test_patients.shape)

(285,)
(17079,)
(2144,)


In [4]:
# Patient ids that occur in both the train and the test metadata.
train_ids_unique = train_meta["PatientID"].unique()
test_ids_unique = test_meta["PatientID"].unique()
overlap_patient_ids = np.intersect1d(train_ids_unique, test_ids_unique)

In [6]:
# Test rows that belong to patients also present in the training set.
test_meta.loc[test_meta["PatientID"].isin(overlap_patient_ids)]

Unnamed: 0,image,ImagePositionPatient_2,PatientID,StudyInstanceUID,WindowCenter,WindowWidth
306,ID_38eff6d7a,43.957554,ID_10c07909,ID_0136ebaa38,30.0,80.0
307,ID_2c0775e47,46.571781,ID_10c07909,ID_0136ebaa38,30.0,80.0
308,ID_c172014ce,49.186012,ID_10c07909,ID_0136ebaa38,30.0,80.0
309,ID_942888df3,51.800240,ID_10c07909,ID_0136ebaa38,30.0,80.0
310,ID_4c7153a14,54.412552,ID_10c07909,ID_0136ebaa38,30.0,80.0
...,...,...,...,...,...,...
78436,ID_fb7d67d85,113.664000,ID_4f7414e4,ID_ffb2e70ba3,30.0,80.0
78437,ID_2232eadb1,118.733000,ID_4f7414e4,ID_ffb2e70ba3,30.0,80.0
78438,ID_dcf40f4d7,123.803000,ID_4f7414e4,ID_ffb2e70ba3,30.0,80.0
78439,ID_8878be83a,128.872000,ID_4f7414e4,ID_ffb2e70ba3,30.0,80.0
