In [1]:
import pathlib
import pickle
import re
import sys

import nibabel as nib
import numpy as np
import pandas as pd

# 1. Data preprocessing

In [2]:
# Path to the data folder. This may be different between users
location = 'data/secure/'

data_path = pathlib.Path(location)

### 1.1 Rename folders
We ran into some issues where the extremely deep paths and long filenames were causing errors with our ability to load the data. To correct this issue, we renamed folders by changing the second long folder name to `feature_masks`. For example,

`data/SECURE_KEY/2036311/DPm.1.2.840.113681.2863050713.1318230214.3060.1227/DPm.1.2.840.113681.2863050713.1318230214.3060`

becomes 

`data/SECURE_KEY/2036311/DPm.1.2.840.113681.2863050713.1318230214.3060.1227/feature_masks`

This does not cause any confusion or ambiguity, as both the filenames themselves, as well as the parent folders contain the same information. The directory structure within the directory given above by `location` is then:

```
location
├── 2036311
|   └── DPm.1.2.840.113681.2863050713.1318230214.3060.1227
|       └── feature_masks
|           └── feature
|               └── DPm ... .nii.gz
|               └── ...
|               └── DPm ... .nii.gz
|           └── mask
|               └── DPm ... 0_mask_win_97_sliding_97_mean.nii.gz
|               └── DPm ... _mask.nii.gz
└── ...
```

In [3]:
# Rename folders
for feature_dir in data_path.glob('*/*/*/feature/'):
    parent_dir = feature_dir.parent
    parent_dir.rename(parent_dir.parent / 'feature_masks')

### 1.2 Train/test splitting
We have been given data on roughly 570 patients. There is some ambiguity in the number, though, as we have case/control status on 575 patients, while we have feature maps for 569 patients. Of the 569 patients with features, 114 were cases and the remaining 455 were controls. In the context of breast cancer prediction, this means 114 of the patients eventually developed breast cancer while the others did not.

Among the patients with extracted feature maps, 533 patients had two images (corresponding to left and right breast), while the remaining 36 had an image for one side only. In consultation with Dr. Aimilia Gastounioti, we decided the most sensible approach would be to treat each image as a separate sample. Using this approach, we have 1102 total samples.

We opted for an 80/20 train/test split, a standard split fraction. This means that 455 patients are assigned to the traing set and 114 are assigned to the test set. We took care to ensure that the case/control ratio within both groups reflected the overall distributions. This can be seen below, where 20.2% of the patients assigned to the training set were cases, and 19.3% of the test set were cases. These numbers are not exactly 20%, though they make sense in light of the fact that the number of patients is not evenly divisible in the fraction we desire.

To eliminate a source of bias in our model, we did not allow patients with multiple images to have their data split between training and testing data. This means that patients with two images always had both images together, and we split the data by patients rather than by sample.

In [4]:
# Train/test split
# Get list of patients with feature maps
patients_list = [subdir.name for subdir in data_path.glob('*/') if subdir.is_dir()]

# Read in case/control information
case_control_df = pd.read_excel('controlcase.xlsx')

# Create a dictionary mapping patient_id to case/control status
patient_id_to_case = case_control_df[['DummyID', 'Class']].set_index('DummyID')['Class'].to_dict()

# Set random seed so that split can be done reproducibly
np.random.seed(0)

# Pick patients whose images will be in train/test sets
training_patients = np.random.choice(patients_list, replace=False, size=455)
testing_patients = [patient for patient in patients_list if patient not in training_patients]

In [5]:
# Verify the train/test split sizes
print(f'Training patients: {len(training_patients)}\n'
      f'Testing patients: {len(testing_patients)}\n')

# Verify the relative numbers of cases and controls between training and testing
num_training_cases = sum([patient_id_to_case[int(patient_id)] for patient_id in training_patients])
num_testing_cases = sum([patient_id_to_case[int(patient_id)] for patient_id in testing_patients])

print(f'Percent cases in training data: {num_training_cases / len(training_patients)}\n'
      f'Percent cases in testing data: {num_testing_cases / len(testing_patients)}')

Training patients: 455
Testing patients: 114

Percent cases in training data: 0.2021978021978022
Percent cases in testing data: 0.19298245614035087


In [6]:
# Write the ids of patients in training/testing to a file
# so that our methods can be replicated exactly
with open('data/training_patients.txt', 'w') as train_file:
    train_file.write('patient_id,case_status\n')
    for patient_id in training_patients:
        case_status = patient_id_to_case[int(patient_id)]
        train_file.write(f'{patient_id},{case_status}\n')
        
with open('data/testing_patients.txt', 'w') as test_file:
    test_file.write('patient_id,case_status\n')
    for patient_id in testing_patients:
        case_status = patient_id_to_case[int(patient_id)]
        test_file.write(f'{patient_id},{case_status}\n')

### 1.3 Load and process feature maps
Below, we extract all feature maps, apply the mask, sort, and combine features into 4D arrays. Then, we normalize features first across samples then within samples, just as was performed in the code provided for us.

Throughout the process, we are very careful to ensure that features are always in correspondence with their patient_id or case/control status. 

#### 1.3.1 Load data into lists of feature dictionaries

In [7]:
# Prepare data for CNN
feature_masks = data_path.glob('*/*/feature_masks/')

train_features = list()
train_classes = list()
test_features = list()
test_classes = list()

for feature_mask_path in feature_masks:
    # Get patient's dummy id
    patient_id = feature_mask_path.parent.parent.name

    # Get patient's case/control status
    patient_class = patient_id_to_case[int(patient_id)]

    # Load the sample's mask
    mask_path = list((feature_mask_path / 'mask').glob('*_mean.nii.gz'))[0].as_posix()
    mask = nib.load(mask_path).get_data().T

    # Iterate through all feature maps. Load and apply mask to each.
    patient_features = dict()
    features_paths = (feature_mask_path / 'feature').glob('*.nii.gz')
    for feature_path in features_paths:

        # Load feature map and apply mask
        feature_map = np.nan_to_num(nib.load(feature_path.as_posix()).get_data().T)
        masked_feature_map = np.multiply(feature_map, mask)

        # Extract the feature name from its filename. Eg: norm_win_97_sliding_97_box_counting from
        # DPm.1.2.840.113681.2863050709.1375427076.3328_norm_win_97_sliding_97_box_counting.nii.gz
        feature_name = re.search('(?<=_).+(?=\.nii\.gz)', feature_path.name).group()  # noqa: W605
        patient_features[feature_name] = masked_feature_map

    # Get patient's train/test category and add the data in the corresponding lists
    is_test = patient_id in testing_patients
    is_train = patient_id in training_patients
    assert not (is_test and is_train)
    if is_test:
        test_features.append(patient_features)
        test_classes.append(patient_class)
    elif is_train:
        train_features.append(patient_features)
        train_classes.append(patient_class)
    else:
        raise ValueError('Patient ID not found!')

#### 1.3.2 Combine the data into 4D arrays
Very importantly, ensure that the features are always ordered the same way for every sample.

In [8]:
# Save the data in 4D arrays

# Create an ordered list of feature names to ensure they are in the same
# order for every sample in the training and testing data
ordered_feature_names = sorted(train_features[0].keys())

# Save the data in 4D arrays
train_data = np.zeros((len(train_features), 34, 26, 29))
test_data = np.zeros((len(test_features), 34, 26, 29))

for sample_number, sample_dict in enumerate(train_features):
    for feature_number, feature_name in enumerate(ordered_feature_names):
        # Crop images to all be 34 x 26. Some are originally larger at 42 x 37
        train_data[sample_number, :, :, feature_number] = sample_dict[feature_name][0:34, 0:26]

for sample_number, sample_dict in enumerate(test_features):
    for feature_number, feature_name in enumerate(ordered_feature_names):
        # Crop images to all be 34 x 26. Some are originally larger at 42 x 37
        test_data[sample_number, :, :, feature_number] = sample_dict[feature_name][0:34, 0:26]

# Convert label lists to numpy arrays
train_classes = np.asarray(train_classes)
test_classes = np.asarray(test_classes)

#### 1.3.3 Normalize the feature maps
As was done in the preprocessing code from the 2016 paper, we first normalize across samples, then normalize features within samples. Note that we add a term, `epsilon` to the divisors below. This is because some features are zero across all samples or across all feature_maps within sample. In these cases, we would be dividing by zero, which would introduce unwanted `nan` terms into the data.

In [9]:
epsilon = 1e-8

# Normalize the data across samples
# Combine the data and find the largest magnitude values for each feature
full_data = np.concatenate((train_data, test_data))
max_image = np.abs(full_data).max(axis=0)

train_data = np.divide(train_data, max_image + epsilon)
test_data = np.divide(test_data, max_image + epsilon)

# Normalize feature maps so that the maximum value in each is 1.
# # This is the within-sample normalization that was performed
# # in the preprocessing code we received from the 2016 paper
for data_source in (train_data, test_data):
    for sample_number, sample in enumerate(data_source):
        for feature_number in range(29):
            feature_map = sample[:, :, feature_number]
            max_val = np.abs(feature_map).max()
            data_source[sample_number, :, :, feature_number] = np.divide(feature_map, max_val + epsilon)

# Save the data as pickled tuples of data, labels
training_set = (train_data, train_classes)
testing_set = (test_data, test_classes)

train_data_path = data_path.parent.joinpath('train_data.pkl')
test_data_path = data_path.parent.joinpath('test_data.pkl')

with open(train_data_path, 'wb') as f:
    pickle.dump(training_set, f)

with open(test_data_path, 'wb') as f:
    pickle.dump(testing_set, f)

# 2. Create and train CNN model

In [10]:
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Conv2D, Activation, Flatten, MaxPooling2D, Dropout, SpatialDropout2D
from keras.models import Sequential
from keras.optimizers import SGD
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import to_categorical
from sklearn.metrics import roc_auc_score
from tensorflow import set_random_seed

Using TensorFlow backend.


In [11]:
# Set numpy and TensorFlow random seeds in the hopes of making
# results reproducible. This will not be possible when using a GPU,
# as there may be asynchronous processing for which no random seed
# could account.
set_random_seed(2)
np.random.seed(1)

In [12]:
train_classes = to_categorical(train_classes)
test_classes = to_categorical(test_classes)

In [14]:
datagen = ImageDataGenerator()
datagen.fit(train_data)

val_datagen = ImageDataGenerator()
val_datagen.fit(test_data)

model = Sequential([
    Conv2D(10, kernel_size=(5, 5), activation='tanh',
           data_format='channels_last', input_shape=(34, 26, 29)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(10, kernel_size=(4, 3), activation='tanh'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(5, activation='tanh'),
    Dense(2, activation='sigmoid')
])

sgd = SGD(lr=0.01)
model.compile(optimizer=sgd, loss='binary_crossentropy',
              metrics=['binary_accuracy'])

# callback = EarlyStopping(monitor='val_loss', min_delta=0, patience=3,
#                          verbose=0, mode='auto', baseline=0.7)

class_weights = {0: 1, 1: 4}
model.fit_generator(datagen.flow(train_data, train_classes, batch_size=1, shuffle=True),
#                     callbacks=[callback],
                    steps_per_epoch=len(train_data), epochs=100,
                    class_weight=class_weights,
                    validation_data=val_datagen.flow(test_data, test_classes),
                    nb_val_samples=test_data.shape[0])

score = model.evaluate(test_data, test_classes)

print("Weighted test accuracy: ", score[1])
preds = model.predict(test_data)
auc = roc_auc_score(test_classes, preds)
print(model.summary())
print(f"AUROC: {auc}")

model.save('model/most_recent.h5')



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100


Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Weighted test accuracy:  0.6396396380287033
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 30, 22, 10)        7260      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 15, 11, 10)        0         
_