In [1]:
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.12.2-cp36-cp36m-manylinux1_x86_64.whl (9.5 MB)
[K     |████████████████████████████████| 9.5 MB 6.0 MB/s eta 0:00:01
[?25hCollecting patsy>=0.5
  Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
[K     |████████████████████████████████| 231 kB 31.5 MB/s eta 0:00:01
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.12.2
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import numpy as np
import tensorflow as tf
print(tf.__version__)
import shutil
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import sklearn
import tempfile
from statsmodels.stats.proportion import proportion_confint
from functions.auc_delong_xu import auc_ci_Delong

from tensorflow import keras
from tensorflow.keras import layers
from keras.utils import to_categorical
from tensorflow.keras import regularizers
from keras import backend as K

from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import Session
import gc

from Functions.data_augmentation import *

In [2]:
from Functions.data_augmentation import *
from Functions.data_import import *

### Import data from the HDF5 file

In [3]:
# Path of the HDF5 container holding the datasets "X", "Y_pat" and "pat".
PATH_3D_H5 = 'Images/preprocessed_data_128_112_40.hdf5'

with h5py.File(PATH_3D_H5, mode='r') as h5:
    dataset_names = list(h5.keys())
    print('H5-file: ', dataset_names)

    # Read every dataset completely while the file is still open.
    X, Y_pat, pat = (h5[key][:] for key in ("X", "Y_pat", "pat"))

# Sanity check: array shapes plus min / max / mean / std of the image array.
summary = (X.shape, X.min(), X.max(), X.mean(), X.std(), Y_pat.shape, pat.shape)
print(*summary)

H5-file:  ['X', 'Y_pat', 'pat']
(508, 128, 112, 40, 1) -1.5907471988589987 26.031565772467758 2.1324024910545048e-18 0.9999999999999966 (508,) (508,)


In [9]:
@tf.function
def augment_3d_two(volume):
    """Apply two distinct, randomly chosen augmentations to one volume.

    The random draw and the augmentation helpers run as plain Python/NumPy code
    wrapped in ``tf.numpy_function``.

    Args:
        volume: Image tensor to augment. Its static shape is re-applied to the
            result after the NumPy round trip.

    Returns:
        The augmented volume as a float64 tensor reshaped to ``volume.shape``.
    """

    def augment(volume):
        # Draw WITHOUT replacement: the previous randint(0, 5, size=2) could return
        # the same id twice, in which case only one augmentation was applied
        # although two are promised.
        rand = np.random.choice(5, size=2, replace=False)

        if 0 in rand:
            volume = random_zoom3d(volume, 0.8, 1.3)
        if 1 in rand:
            # NOTE(review): the last four arguments are all -5, unlike the
            # (low, high) pairs passed to random_shift3d below -- confirm this is
            # intended and not a typo for (-5, 5).
            volume = random_rotate3d(volume, -20, 20, -5, -5, -5, -5)
        if 2 in rand:
            volume = random_shift3d(volume, -20, 20, -20, 20, 0, 0)  # do not shift in z direction
        if 3 in rand:
            volume = random_flip3d(volume)
        if 4 in rand:
            volume = random_gaussianfilter3d(volume, 0.2)

        return volume

    volume_shape = volume.shape
    augmented_volume = tf.numpy_function(augment, [volume], np.float64)
    # Re-apply the input's static shape to the result of the NumPy call.
    augmented_volume = tf.reshape(augmented_volume, volume_shape)
    return augmented_volume

In [10]:
def train_preprocessing(volume, label):
    """Augment the volume of one training example; the label passes through unchanged."""
    return augment_3d_two(volume), label

In [11]:
def get_dataset(X_train, Y_train, X_valid, Y_valid):
    """Build a class-balanced, augmented training pipeline and a validation pipeline.

    Args:
        X_train: Training volumes; must support boolean-mask indexing on axis 0.
        Y_train: Integer class labels (0 or 1) belonging to ``X_train``.
        X_valid: Validation volumes.
        Y_valid: Integer class labels (0 or 1) belonging to ``X_valid``.

    Returns:
        A tuple ``(train_dataset, validation_dataset, class_weight, batch_size,
        resampled_steps_per_epoch)`` where

        * ``train_dataset`` is an endlessly repeating dataset that draws from the
          two classes with equal probability, augments each volume and batches it,
        * ``validation_dataset`` is the shuffled, batched validation data,
        * ``class_weight`` maps class id to a weight inversely proportional to the
          class frequency in ``Y_train``,
        * ``batch_size`` is the batch size used for both datasets,
        * ``resampled_steps_per_epoch`` is ``ceil(2 * n_positive / batch_size)``
          as a Python ``int``.
    """
    # Fix the one-hot width at two columns. Without num_classes the width is inferred
    # from the largest label present, so a subset containing only class 0 would get a
    # single column and break the [:, 1] lookup below and the 2-unit output layer.
    Y_train = to_categorical(Y_train, num_classes=2)
    Y_valid = to_categorical(Y_valid, num_classes=2)

    bool_train_labels = Y_train[:,1] != 0
    pos_features = X_train[bool_train_labels]
    neg_features = X_train[~bool_train_labels]
    pos_labels = Y_train[bool_train_labels]
    neg_labels = Y_train[~bool_train_labels]

    def make_ds(features, labels):
        """Return an endlessly repeating, shuffled dataset for one class."""
        ds = tf.data.Dataset.from_tensor_slices((features, labels))#.cache()
        ds = ds.shuffle(len(pos_features)*2).repeat()
        return ds

    pos_ds = make_ds(pos_features, pos_labels)
    neg_ds = make_ds(neg_features, neg_labels)

    # Draw from both classes with equal probability.
    resampled_ds = tf.data.experimental.sample_from_datasets([pos_ds, neg_ds], weights=[0.5, 0.5])
    validation_loader = tf.data.Dataset.from_tensor_slices((X_valid, Y_valid))

    batch_size = 2
    # Augment the data on the fly during training.
    train_dataset = (
        resampled_ds.shuffle(buffer_size = (len(pos_features)*2), reshuffle_each_iteration=True)
        .map(train_preprocessing)
        .batch(batch_size)
        .prefetch(2))

    validation_dataset = (
        validation_loader.shuffle(len(X_valid))
        .batch(batch_size)
        .prefetch(2))

    pos = len(pos_features)
    neg = len(neg_features)
    total = pos + neg
    # The training stream repeats forever, so an "epoch" is defined as the number of
    # batches needed to see roughly 2 * pos examples. Cast to int because np.ceil
    # returns a float and a step count is an integer quantity.
    resampled_steps_per_epoch = int(np.ceil(2.0*pos/batch_size))

    # NOTE(review): the training stream is already resampled 50/50; passing these
    # weights to fit() as well corrects for the imbalance a second time -- confirm
    # that this is intended.
    weight_for_0 = (1 / neg)*(total)/2.0
    weight_for_1 = (1 / pos)*(total)/2.0

    class_weight = {0: weight_for_0, 1: weight_for_1}

    print('Weight for class 0: {:.2f}'.format(weight_for_0))
    print('Weight for class 1: {:.2f}'.format(weight_for_1))

    return train_dataset, validation_dataset, class_weight, batch_size, resampled_steps_per_epoch

In [13]:
def get_model(IMAGE_DIMENSION):
    """Assemble the (uncompiled) 3D CNN classifier named "3dcnn".

    Five stages of Conv3D -> AveragePooling3D -> BatchNormalization are followed by
    a flatten, an L2-regularised dense layer with light dropout, and a two-way
    softmax output.

    Args:
        IMAGE_DIMENSION: Shape of a single input volume, passed to ``keras.Input``.

    Returns:
        The ``keras.Model`` mapping an input volume to two class probabilities.
    """
    # (number of conv filters, padding mode of the pooling layer) per stage, in order.
    conv_stages = (
        (8, 'valid'),
        (16, 'valid'),
        (32, 'valid'),
        (32, 'valid'),
        (64, 'same'),
    )

    inputs = keras.Input(IMAGE_DIMENSION)

    x = inputs
    for n_filters, pool_padding in conv_stages:
        x = layers.Conv3D(filters=n_filters, kernel_size=3, activation="relu", padding='same')(x)
        x = layers.AveragePooling3D(pool_size=2, padding=pool_padding)(x)
        x = layers.BatchNormalization()(x)

    x = layers.Flatten()(x)

    # Classification head.
    x = layers.Dense(units=64, activation="relu", kernel_regularizer=regularizers.l2(0.001))(x)
    x = layers.Dropout(0.01)(x)
    outputs = layers.Dense(units=2, activation="softmax")(x)

    return keras.Model(inputs, outputs, name="3dcnn")

In [None]:
N_FOLDS = 5

# NOTE(review): `os` and `pd` are used in this cell but are not imported by name in
# the import cell; presumably they are provided by the star imports from
# Functions.* -- confirm, or import them explicitly.

## get stroke and tia indices
stroke_idx = np.where(Y_pat == 1)
tia_idx = np.where(Y_pat == 0)

## shuffle indices in place (the fixed seed keeps the fold assignment reproducible)
np.random.seed(2021)
np.random.shuffle(stroke_idx[0])
np.random.shuffle(tia_idx[0])

## split indices into 5 parts
splits_stroke = np.array_split(stroke_idx[0],N_FOLDS)
## the tia cut points are hard-coded rather than derived from N_FOLDS
splits_tia = np.array_split(tia_idx[0], [31,62,93,125])

## define chosen splits for each fold
test_folds = [0, 1, 2, 3, 4]
valid_folds = [1, 2, 3, 4, 0]
train_folds = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 4]] ## remove these splits for training data

for fold in range(N_FOLDS):

    ## define train, test and validation splits
    test_idx = np.concatenate((splits_stroke[test_folds[fold]], splits_tia[test_folds[fold]]), axis = None)
    valid_idx = np.concatenate((splits_stroke[valid_folds[fold]], splits_tia[valid_folds[fold]]), axis = None)

    ## Training data = every split that is neither the test nor the validation split.
    ## The splits have unequal lengths, so they are filtered and concatenated directly;
    ## np.delete on the list would first have to build a ragged array, which NumPy
    ## deprecated and later turned into an error.
    held_out = set(train_folds[fold])
    train_stroke = np.concatenate(
        [part for i, part in enumerate(splits_stroke) if i not in held_out])
    train_tia = np.concatenate(
        [part for i, part in enumerate(splits_tia) if i not in held_out])

    train_idx = np.concatenate((train_stroke, train_tia), axis = None)

    X_train = X[train_idx]
    X_test = X[test_idx]
    X_valid = X[valid_idx]

    Y_train = Y_pat[train_idx]
    Y_test = Y_pat[test_idx]
    Y_valid = Y_pat[valid_idx]

    pat_train = pat[train_idx]
    pat_test = pat[test_idx]
    pat_valid = pat[valid_idx]

    #get resampled dataset
    train_dataset, validation_dataset, class_weight, batch_size, resampled_steps_per_epoch = get_dataset(X_train, Y_train, X_valid, Y_valid)

    #get model
    model = get_model(X_train[0].shape)
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    ### define metrics
    metrics = [
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.CategoricalCrossentropy(name="categorical_crossentropy")]

    ### prepare files for logging (start every fold from a clean state)
    results_filepath = 'results'+str(fold)+'.csv'
    if os.path.exists(results_filepath):
        os.remove(results_filepath)

    history_filepath = 'History'+str(fold)
    if os.path.isdir(history_filepath):
        shutil.rmtree(history_filepath )
    os.makedirs(history_filepath )

    epochs_filepath = history_filepath+'/model.epoch{epoch:02d}.hdf5'

    ### define callback_list: one checkpoint per epoch plus a per-epoch CSV log
    callback_list = [
        keras.callbacks.ModelCheckpoint(filepath=epochs_filepath, save_freq='epoch', verbose=1),
        keras.callbacks.CSVLogger(results_filepath)]

    ### compile model
    model.compile(loss = "categorical_crossentropy",
                  optimizer=keras.optimizers.Adam(learning_rate =  0.00001),
                  metrics = metrics)

    ###train model
    epochs = 150
    hist = model.fit(
        train_dataset,
        validation_data=validation_dataset,
        epochs=epochs,
        verbose=1, callbacks=callback_list,
        steps_per_epoch = resampled_steps_per_epoch,
        class_weight = class_weight)

    ###use epoch with minimal validation loss from the tenth epoch
    dat = pd.read_csv(results_filepath, index_col='epoch')
    # Search only the rows from position 10 onwards. Comparing the minimum against the
    # whole column could select an earlier epoch that happens to have the same value.
    best_model = int(dat.val_loss.iloc[10:].idxmin())
    # The logged epoch numbers start at 0 whereas the checkpoint file names start at 1.
    best_model = best_model + 1
    # Reuse the checkpoint template so the file name is padded exactly as when saved.
    model.load_weights(epochs_filepath.format(epoch=best_model))

    y_prob = model.predict(X_test, batch_size=batch_size)
    # Built-in int replaces np.int, an alias that newer NumPy releases removed.
    y_pred = (y_prob[:,1] > 0.5).astype(int)

    #calculate categorical crossentropy
    Y_test_cat = to_categorical(Y_test)
    m = tf.keras.metrics.CategoricalCrossentropy()
    m.update_state(Y_test_cat, y_prob)
    catcrossentropy = m.result().numpy()

    # One row per test patient; the fold-level loss and the fold id are repeated on every row.
    df = pd.DataFrame()
    df.loc[:,"pat_id"] = list(pat_test)
    df.loc[:,"y_test"] = Y_test
    df.loc[:,"y_pred"] = y_pred
    df.loc[:,"y_prob"] = y_prob[:,1]
    df.loc[:,"cat_cross"] = list(np.repeat(catcrossentropy, len(y_pred)))
    df.loc[:,"fold"] = list(np.repeat(fold, len(y_pred)))

    df.to_csv("predictions"+str(fold)+".csv", index = False)

  return array(a, dtype, copy=False, order=order)


Weight for class 0: 1.61
Weight for class 1: 0.73
Model: "3dcnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128, 112, 40, 1)] 0         
_________________________________________________________________
conv3d (Conv3D)              (None, 128, 112, 40, 32)  896       
_________________________________________________________________
average_pooling3d (AveragePo (None, 64, 56, 20, 32)    0         
_________________________________________________________________
batch_normalization (BatchNo (None, 64, 56, 20, 32)    128       
_________________________________________________________________
conv3d_1 (Conv3D)            (None, 64, 56, 20, 32)    27680     
_________________________________________________________________
average_pooling3d_1 (Average (None, 32, 28, 10, 32)    0         
_________________________________________________________________
batch_norma

In [16]:
### merge the predictions of the 5 folds into one file
prediction_files = (
    'predictions0.csv',
    'predictions1.csv',
    'predictions2.csv',
    'predictions3.csv',
    'predictions4.csv',
)
pred0, pred1, pred2, pred3, pred4 = (
    pd.read_csv(path, index_col = False) for path in prediction_files
)

# Stack the folds row-wise. reset_index() without drop=True keeps the per-fold row
# numbers as an extra "index" column in the written file.
# NOTE(review): confirm that this column is wanted downstream.
merged = pd.concat([pred0, pred1, pred2, pred3, pred4], axis=0).reset_index()
merged.to_csv('pred5fold.csv', index = False)