In [1]:
import os
import pickle
import time
from functools import partial

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras

In [2]:
# Set TensorFlow's native log-level environment variable.
# NOTE(review): TensorFlow has already been imported when this runs — confirm the
# setting still takes effect, or move it ahead of the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Enable on-demand memory growth for every GPU TensorFlow can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Report the problem instead of aborting the notebook.
    print(err)

In [3]:
# Load the score table, report the missing-value count per column,
# then keep only the rows without any missing value.
scores_raw = pd.read_csv("train_scores.csv")
print(scores_raw.isna().sum())
df = scores_raw.dropna()
df.head(5)

Id                0
age               0
domain1_var1    438
domain1_var2    438
domain2_var1     39
domain2_var2     39
dtype: int64


Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
0,10001,57.436077,30.571975,62.553736,53.32513,51.427998
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641
4,10007,38.617381,49.197021,65.674285,40.151376,34.096421
5,10008,35.326582,15.769168,65.782269,44.643805,50.448485


In [4]:
# Show the dtype of each column of the filtered frame.
print(df.dtypes)

Id                int64
age             float64
domain1_var1    float64
domain1_var2    float64
domain2_var1    float64
domain2_var2    float64
dtype: object


In [5]:
# Summary statistics of the rows that remain after dropping missing values.
df.describe()

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
count,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0
mean,15929.972396,49.61521,51.474438,59.246408,47.244966,51.91631
std,3414.559796,13.441014,10.192038,11.387837,11.082251,11.794651
min,10001.0,14.257265,15.769168,1.021874,0.991172,0.815285
25%,12975.75,40.129361,44.780129,52.397196,40.072138,44.532715
50%,15982.5,48.948756,51.84806,60.054213,47.760527,52.542651
75%,18904.75,59.580851,58.498056,67.143291,55.003507,59.832945
max,21754.0,84.491113,81.32558,94.702874,80.834495,94.509903


In [7]:
DATA_PATH = "fMRI_train_pk"

# One pickle path per row of `df`, in row order: <DATA_PATH>/<Id>.pk
file_ls = [
    os.path.join(DATA_PATH, str(int(subject_id)) + ".pk")
    for subject_id in df["Id"]
]

# Every column after the leading "Id" column is a regression target.
# Vectorized on purpose: the previous row loop relied on Series.iteritems(),
# which no longer exists in pandas >= 2.0.
y_ls = df.iloc[:, 1:].to_numpy(dtype=np.float32)
print(y_ls.shape)

(5434, 5)


In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 30% of the samples (seed 42).
first_split = train_test_split(file_ls, y_ls, test_size=0.3, random_state=42)
train_f, test_f, train_label, test_label = first_split

# Stage 2: cut the hold-out in half -> validation and evaluation subsets (seed 42).
second_split = train_test_split(test_f, test_label, test_size=0.5, random_state=42)
val_f, evl_f, val_label, evl_label = second_split



In [None]:
# For each statistic (shape, per-column min, per-column max) print the value
# for the train, validation and evaluation labels, in that order.
label_splits = (train_label, val_label, evl_label)
statistics = (
    lambda labels: labels.shape,
    lambda labels: labels.min(axis=0),
    lambda labels: labels.max(axis=0),
)
for statistic in statistics:
    for labels in label_splits:
        print(statistic(labels))

In [None]:
bins = np.linspace(0, 100, 50)
fig, ax = plt.subplots(3, 2)
fig.set_size_inches(18.5, 15.5)

# Overlay the three splits for each of the five targets on a 3x2 grid
# (five panels are drawn; the sixth is left untouched).
named_splits = (
    ("Train", train_label),
    ("Validation", val_label),
    ("Evaluation", evl_label),
)
for i in range(5):
    panel = ax[i // 2, i % 2]
    for split_name, labels in named_splits:
        panel.hist(labels[:, i], bins, alpha=0.5, label=split_name)
    panel.legend(["Train", "Validation", "Evaluation"])
    # Column 0 of `df` is "Id", so target i lives in column i + 1.
    panel.set_title(df.columns[i + 1])

In [None]:
def normalize(img):
    """Standardize an array over all of its elements and reverse its axis order.

    Args:
        img: Numeric NumPy array of any shape.

    Returns:
        ``(img - mean) / std`` with mean and standard deviation computed over
        every element, followed by ``transpose()`` (which reverses the axes).
        A constant input (``std == 0``) yields all zeros instead of NaN/inf.
    """
    mean = np.mean(img)
    std = np.std(img)
    if std == 0:
        # Constant volume: dividing by zero would produce NaN/inf values.
        std = 1.0
    img = (img - mean) / std
    return img.transpose()

def DataGenerator(file_list, y_list):
    """Build a zero-argument generator function over (image, label) pairs.

    Args:
        file_list: Paths of the pickle files, one per sample.
        y_list: Labels aligned index-by-index with ``file_list``.

    Returns:
        A callable that returns a fresh generator on every call. The generator
        walks ``file_list``/``y_list`` in order, unpickles each file, passes the
        result through ``normalize`` and yields ``(img, y)``.
    """
    def generator():
        for file, y in zip(file_list, y_list):
            # SECURITY: unpickling can execute arbitrary code — only point this
            # at files from a trusted source.
            with open(file, "rb") as f:
                img = pickle.load(f)
            yield normalize(img), y

    return generator
            
def DatasetReader(file_list, y_list, shuffle_size, batch_size):
    """Create an endlessly repeating, shuffled and batched ``tf.data.Dataset``.

    Args:
        file_list: Paths of the pickled samples.
        y_list: Labels aligned with ``file_list`` (five values per sample).
        shuffle_size: Size of the shuffle buffer, counted in individual samples.
        batch_size: Number of samples per batch.

    Returns:
        A prefetching dataset yielding ``(images, labels)`` batches with
        per-sample shapes ``(53, 63, 52, 53)`` and ``(5,)``.
    """
    generator = DataGenerator(file_list, y_list)
    dataset = tf.data.Dataset.from_generator(
        generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=(tf.TensorShape((53, 63, 52, 53)), tf.TensorShape((5,))),
    )

    # Shuffle single samples *before* batching. Shuffling after `batch` only
    # reorders whole batches (their contents never change) and makes the
    # shuffle buffer hold `shuffle_size` complete batches of large volumes.
    dataset = dataset.shuffle(shuffle_size).repeat().batch(batch_size)

    return dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
BATCH_SIZE = 12

# Identical pipeline for every split; only the shuffle buffer size differs.
train_set = DatasetReader(train_f, train_label, shuffle_size=24, batch_size=BATCH_SIZE)
val_set = DatasetReader(val_f, val_label, shuffle_size=12, batch_size=BATCH_SIZE)
evl_set = DatasetReader(evl_f, evl_label, shuffle_size=12, batch_size=BATCH_SIZE)

In [None]:
# Smoke-test the pipeline: print the image/label batch shapes of two batches.
for images, targets in train_set.take(2):
    print(images.shape, targets.shape)

In [None]:
# Conv3D factory carrying the defaults shared by the network's convolutions:
# kernel size 3, unit strides, "SAME" padding, bias, L2(0.01) kernel regularizer.
# Any of these can be overridden per call.
DefaultConv3D = partial(
    keras.layers.Conv3D,
    kernel_size=3,
    strides=(1, 1, 1),
    padding="SAME",
    use_bias=True,
    kernel_regularizer=keras.regularizers.l2(0.01),
)

class ResidualUnit(keras.layers.Layer):
    """Residual block of two 3D convolutions (each followed by batch norm) plus a skip path.

    The main path is conv -> BN -> activation -> conv -> BN. The skip path is the
    identity unless the product of ``strides`` exceeds 1, in which case it is a
    1x1x1 strided convolution followed by batch norm. Both paths are summed and
    passed through the activation.

    Args:
        filters: Number of filters of every convolution in the block.
        strides: Per-axis strides of the first convolution (3-tuple).
        activation: Activation identifier or callable, resolved with
            ``keras.activations.get``.
        **kwargs: Forwarded to ``keras.layers.Layer``.

    NOTE: with unit strides the skip path is the raw input, so the addition in
    ``call`` needs the input to already have ``filters`` channels.
    """

    def __init__(self, filters, strides=(1,)*3, activation="relu", **kwargs):
        super().__init__(**kwargs)
        self.activation = keras.activations.get(activation)
        self.filters = filters
        self.strides = strides

        # Layers of the main path, applied in order inside call().
        self.main_layers = [
            DefaultConv3D(self.filters, strides=self.strides, kernel_initializer="he_normal"),
            keras.layers.BatchNormalization(),
            self.activation,
            DefaultConv3D(self.filters, strides=(1,)*3, kernel_initializer="he_normal"),
            keras.layers.BatchNormalization(),
        ]

        # Identity skip by default; a strided 1x1x1 projection when the block downsamples.
        self.skip_layers = []
        if np.prod(self.strides) > 1:
            self.skip_layers = [
                DefaultConv3D(self.filters, kernel_size=1, strides=self.strides,
                              kernel_initializer="he_normal"),
                keras.layers.BatchNormalization(),
            ]

    def call(self, inputs, **kwargs):
        """Return ``activation(main_path(inputs) + skip_path(inputs))``."""
        x = inputs
        orig_x = inputs

        for layer in self.main_layers:
            x = layer(x)

        for layer in self.skip_layers:
            orig_x = layer(orig_x)

        return self.activation(x + orig_x)

    def get_config(self):
        """Return the constructor arguments needed to rebuild this layer."""
        config = super().get_config()
        config.update({
            'filters': self.filters,
            'strides': self.strides,
            # Without this entry a rebuilt layer silently falls back to "relu".
            'activation': keras.activations.serialize(self.activation),
        })

        return config

# Filters per stage and the stride used when entering each stage.
# The first entry of each tuple belongs to the stem convolution below.
filters = (16, 32, 64)
strides = (1, 2, 2)

model = keras.models.Sequential()

# Stem: conv -> batch norm -> ReLU -> 2x2x2 max pooling.
model.add(DefaultConv3D(filters[0], kernel_size=3, strides=(1,)*3,
        input_shape=[53, 63, 52, 53], kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation("relu"))
model.add(keras.layers.MaxPool3D(pool_size=(2,)*3, padding="SAME"))

# Two residual units per stage; the first one applies the stage's stride.
# (`n_filters` rather than `filter`, so the builtin is not shadowed notebook-wide.)
for n_filters, stride in zip(filters[1:], strides[1:]):
    model.add(ResidualUnit(n_filters, strides=(stride,)*3))
    model.add(ResidualUnit(n_filters, strides=(1,)*3))

# Regression head producing the five targets.
model.add(keras.layers.GlobalAvgPool3D())
# Kept so the layer stack stays unchanged; the last stage has filters[-1] = 64 filters.
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(16, activation="relu", kernel_regularizer = keras.regularizers.l2(0.002)))
model.add(keras.layers.Dense(5))

optimizer = keras.optimizers.RMSprop(0.001)
model.compile(loss="mse",
        optimizer=optimizer,
        metrics=["mse", "mae"])

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

In [None]:
# Save to the checkpoint file only when `val_loss` reaches a new minimum.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath="./my_logs/First_try.h5",
    monitor="val_loss",
    mode="min",
    save_best_only=True,
)

class PrintValTrainRatioCallback(keras.callbacks.Callback):
    """Print the ratio of validation loss to training loss after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print ``val_loss / loss`` when both values are available.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dictionary passed by Keras; may be ``None``.
        """
        logs = logs or {}
        train_loss = logs.get("loss")
        val_loss = logs.get("val_loss")
        # Nothing to report without validation metrics or with a zero training loss.
        if train_loss is None or val_loss is None or train_loss == 0:
            return
        print("\nval/train: {:.2f} \n".format(val_loss / train_loss))

# Root directory holding one sub-directory per TensorBoard run.
root_logdir = os.path.join(os.curdir, "my_logs", "First_try")


def get_run_logdir(comment=None):
    """Return a timestamped run directory below ``root_logdir``.

    Args:
        comment: Optional suffix appended verbatim to the run id. When it is
            ``None`` or empty, nothing is appended.

    Returns:
        ``<root_logdir>/run_YYYY_MM_DD-HH_MM_SS<comment>``.
    """
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    if comment:
        # Appended after formatting so '%' in the comment is not read as a directive.
        run_id += str(comment)
    return os.path.join(root_logdir, run_id)

# Log this run to its own timestamped TensorBoard directory.
run_logdir = get_run_logdir()
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=run_logdir)

In [None]:
# The datasets repeat forever, so an "epoch" is a fixed number of batches
# rather than one full pass over the data.
history = model.fit(
    train_set,
    epochs=500,
    steps_per_epoch=1024 // BATCH_SIZE,
    validation_data=val_set,
    validation_steps=800 // BATCH_SIZE,
    callbacks=[checkpoint_cb, PrintValTrainRatioCallback(), tensorboard_cb],
)