# Libraries

In [None]:
import random
import numpy as np
import time
import copy
import scipy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
from tqdm.notebook import tqdm, tnrange

In [2]:
from DME.Analysis.train import *

# Helper functions

In [None]:
class OneHotEncoder(SklearnOneHotEncoder):
    def __init__(self, **kwargs):
        super(OneHotEncoder, self).__init__(**kwargs)
        self.fit_flag = False

    def fit(self, X, **kwargs):
        out = super().fit(X)
        self.fit_flag = True
        return out

    def transform(self, X, **kwargs):
        sparse_matrix = super(OneHotEncoder, self).transform(X)
        new_columns = self.get_new_columns(X=X)
        d_out = pd.DataFrame(sparse_matrix.toarray(), columns=new_columns, index=X.index)
        return d_out

    def fit_transform(self, X, **kwargs):
        self.fit(X)
        return self.transform(X)

    def get_new_columns(self, X):
        new_columns = []
        for i, column in enumerate(X.columns):
            j = 0
            while j < len(self.categories_[i]):
                new_columns.append(f'{column}_<{self.categories_[i][j]}>')
                j += 1
        return new_columns

In [None]:
def encode(df):
    s = (df.dtypes == 'object')
    object_cols = list(s[s].index)
    OH_encoder = OneHotEncoder(dtype=int)
    OH_cols = pd.DataFrame(OH_encoder.fit_transform(df[object_cols]))
    num_data = df.drop(object_cols, axis=1)
    data_new = pd.concat([num_data, OH_cols], axis=1)
    return data_new

# Data pre-processing

In [None]:
data_train_full_raw = pd.read_csv('full data/data_train_full.csv')
test_interpolation_raw = pd.read_csv('full data/test_interpolation.csv')
test_extrapolation_raw = pd.read_csv('full data/test_extrapolation.csv')
data_train_raw = pd.read_csv('full data/data_train.csv')
data_val_raw = pd.read_csv('full data/data_val.csv')
n_samples_chosen_per_group = pd.read_csv('full data/n_samples_chosen_per_group_series.csv').to_numpy().reshape(-1)

# One-hot encode categorical features
data_train_full_raw2 = encode(data_train_full_raw)
test_interpolation_raw2 = encode(test_interpolation_raw)
test_extrapolation = encode(test_extrapolation_raw)
data_train = encode(data_train_raw)
data_val_raw2 = encode(data_val_raw)

# Combine train and interpolation set for Deep ME model
data_train_full = pd.concat([data_train_full_raw2, test_interpolation_raw2], ignore_index=True).reset_index(drop=True)
data_train_full.sort_values(['ID', 'times'], ignore_index=True, inplace=True)

# Fill in missing column for data_val
missing_column = pd.DataFrame(np.zeros(len(data_val_raw2), dtype=int), columns=['disease_<Vascular>'])
data_val = data_val_raw2.copy()
data_val.insert(data_val_raw2.shape[1], 'disease_<Vascular>', value=missing_column)

In [None]:
dataframe_val = [data_train, data_val]
dataframe = [data_train_full, test_extrapolation]

# Hyperparameter search and testing

## All features

### MLP

In [None]:
model_config = {}
model_config['input_dim'] = data_train.shape[1] - 3
model_config['hidden_dim'] = 1000
model_config['output_dim'] = 1

train_config = {}
train_config['n_epochs'] = 20
train_config['lr'] = 1e-3
train_config['n_adapt'] = 1
train_config['inner_lr'] = 0.01
train_config['l2_penalty'] = 0.001

# _, best_val_err = train_and_evaluate_model(model_config, train_config, dataframe_val, 
#                                            random_effects_column_names=['times'],
#                                            group_column_name='ID', y_column_name='egfr',
#                                            n_samples_chosen_per_group=n_samples_chosen_per_group,
#                                            model_type='MLP', random_state=1)
# print(best_val_err)

In [None]:
# print('Best val RMSE: ', best_val_err)

Best val RMSE:  15.111668283500233


In [None]:
rmse_test1, rmse_test2 = train_and_test_model(dataframe, model_config, train_config, random_effects_column_names=['times'],
                                              group_column_name='ID', y_column_name='egfr',
                                              n_samples_chosen_per_group=n_samples_chosen_per_group,
                                              model_type='MLP', random_state=1)

In [None]:
print('Extrapolation: ', rmse_test1)
print('Interpolation: ', rmse_test2)

Extrapolation:  17.450256275820045
Interpolation:  15.196058083411437


### RNN

In [None]:
# Learning rate seems to have the biggest influence on validation performance
# Stop training as soon as validation RMSE increases
model_config = {}
model_config['input_dim'] = data_train.shape[1] - 3
model_config['hidden_dim'] = 1000
model_config['output_dim'] = 1

train_config = {}
train_config['n_epochs'] = 20
train_config['lr'] = 1e-3
train_config['n_adapt'] = 1
train_config['inner_lr'] = 0.01
train_config['l2_penalty'] = 0.0001

# _, best_val_err = train_and_evaluate_model(model_config, train_config, dataframe_val, 
#                                            random_effects_column_names=['times'],
#                                            group_column_name='ID', y_column_name='egfr',
#                                            n_samples_chosen_per_group=n_samples_chosen_per_group,
#                                            model_type='LSTM', random_state=1)
# print(best_val_err)

In [None]:
print(best_val_err)

16.306481431318097


In [None]:
rmse_test1, rmse_test2 = train_and_test_model(dataframe, model_config, train_config, random_effects_column_names=['times'],
                                              group_column_name='ID', y_column_name='egfr',
                                              n_samples_chosen_per_group=n_samples_chosen_per_group,
                                              model_type='LSTM', random_state=1)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

In [None]:
print('Extrapolation: ', rmse_test1)
print('Interpolation: ', rmse_test2)

Extrapolation:  18.24463146101991
Interpolation:  16.273086815112475


## Selected features

In [None]:
data_train2 = data_train.drop(columns=['height', 'weight', 'bp.sys'])
data_val2 = data_val.drop(columns=['height', 'weight', 'bp.sys'])
data_train_full2 = data_train_full.drop(columns=['height', 'weight', 'bp.sys'])
test_extrapolation2 = test_extrapolation.drop(columns=['height', 'weight', 'bp.sys'])

In [None]:
dataframe_val2 = [data_train2, data_val2]
dataframe2 = [data_train_full2, test_extrapolation2]

### MLP

In [None]:
model_config = {}
model_config['input_dim'] = data_train2.shape[1] - 3
model_config['hidden_dim'] = 500
model_config['output_dim'] = 1

train_config = {}
train_config['n_epochs'] = 20
train_config['lr'] = 5e-3
train_config['n_adapt'] = 1
train_config['inner_lr'] = 0.01
train_config['l2_penalty'] = 0.001

_, best_val_err = train_and_evaluate_model(model_config, train_config, dataframe_val2, 
                                           random_effects_column_names=['times'],
                                           group_column_name='ID', y_column_name='egfr',
                                           n_samples_chosen_per_group=n_samples_chosen_per_group,
                                           model_type='MLP', random_state=1)
print(best_val_err)

In [None]:
print('Best val RMSE: ', best_val_err)

Best val RMSE:  14.523430670890003


In [None]:
rmse_test1, rmse_test2 = train_and_test_model(dataframe2, model_config, train_config, random_effects_column_names=['times'],
                                              group_column_name='ID', y_column_name='egfr',
                                              n_samples_chosen_per_group=n_samples_chosen_per_group,
                                              model_type='MLP', random_state=1)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


[Test (Extrapolation)] RMSE=17.928 and loss=38.945
[Test (Interpolation)] RMSE=15.173 and loss=44.984


In [None]:
print('Extrapolation: ', rmse_test1)
print('Interpolation: ', rmse_test2)

Extrapolation:  17.92766362488082
Interpolation:  15.172783145773614


### RNN

In [None]:
model_config = {}
model_config['input_dim'] = data_train.shape[1] - 3
model_config['hidden_dim'] = 1000
model_config['output_dim'] = 1

train_config = {}
train_config['n_epochs'] = 20
train_config['lr'] = 1e-3
train_config['n_adapt'] = 1
train_config['inner_lr'] = 0.01
train_config['l2_penalty'] = 0.0001

_, best_val_err = train_and_evaluate_model(model_config, train_config, dataframe_val, 
                                           random_effects_column_names=['times'],
                                           group_column_name='ID', y_column_name='egfr',
                                           n_samples_chosen_per_group=n_samples_chosen_per_group,
                                           model_type='LSTM', random_state=1)
print(best_val_err)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Best validation error:  16.306481431318097
16.306481431318097


In [None]:
print(best_val_err)

16.306481431318097


In [None]:
rmse_test1, rmse_test2 = train_and_test_model(dataframe, model_config, train_config, random_effects_column_names=['times'],
                                              group_column_name='ID', y_column_name='egfr',
                                              n_samples_chosen_per_group=n_samples_chosen_per_group,
                                              model_type='LSTM', random_state=1)

In [None]:
print('Extrapolation: ', rmse_test1)
print('Interpolation: ', rmse_test2)

Extrapolation:  18.24463146101991
Interpolation:  16.273086815112475
