# Libraries

In [3]:
import random
import numpy as np
import time
import copy
import scipy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
from tqdm.notebook import tqdm, tnrange

In [4]:
from DME.Analysis.train import *

# Helper functions

In [5]:
class OneHotEncoder(SklearnOneHotEncoder):
    """One-hot encoder whose ``transform`` returns a labelled ``pandas.DataFrame``.

    Thin wrapper around scikit-learn's ``OneHotEncoder``. The encoded output
    keeps the index of the input frame and names every indicator column
    ``<source column>_<<category>>`` (for example ``gender_<Male>``).

    All constructor keyword arguments are forwarded to the scikit-learn
    encoder unchanged.

    NOTE(review): scikit-learn discovers estimator parameters from explicit
    ``__init__`` arguments, so ``get_params``/``clone`` probably do not see
    options passed through ``**kwargs`` -- confirm before using this class in
    model-selection utilities.
    NOTE(review): column names are built from every entry of ``categories_``;
    this presumably only matches the encoded width when no category is
    dropped or merged by the encoder options -- confirm.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Set to True by fit(); not read anywhere else in this class.
        self.fit_flag = False

    def fit(self, X, y=None, **kwargs):
        """Fit the encoder on ``X`` and return the fitted encoder.

        ``y`` and any extra keyword arguments are accepted for call
        compatibility and ignored.
        """
        out = super().fit(X)
        self.fit_flag = True
        return out

    def transform(self, X, **kwargs):
        """Encode ``X`` (a DataFrame) and return the result as a DataFrame.

        The returned frame shares ``X.index`` and uses the names produced by
        :meth:`get_new_columns`.
        """
        encoded = super().transform(X)
        # The parent returns a sparse matrix by default but a plain ndarray
        # when dense output is requested; only densify when there is
        # something to densify.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()
        return pd.DataFrame(encoded, columns=self.get_new_columns(X=X), index=X.index)

    def fit_transform(self, X, y=None, **kwargs):
        """Fit on ``X`` and return its encoded DataFrame; ``y`` is ignored."""
        self.fit(X)
        return self.transform(X)

    def get_new_columns(self, X):
        """Return the indicator column names for the columns of ``X``.

        Names follow ``<column>_<<category>>``, ordered by column and then by
        the fitted category order in ``self.categories_``.
        """
        return [
            f'{column}_<{category}>'
            for i, column in enumerate(X.columns)
            for category in self.categories_[i]
        ]

In [6]:
def encode(df):
    """One-hot encode the object-dtype columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Columns whose dtype is ``object`` are treated as
        categorical; all other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the non-object columns followed by the indicator
        columns produced by ``OneHotEncoder(dtype=int)``. When ``df`` has no
        object columns a copy of ``df`` is returned.
    """
    is_object = df.dtypes == 'object'
    object_cols = list(is_object[is_object].index)
    if not object_cols:
        # Nothing to encode: skip the encoder rather than depend on how it
        # treats a frame with zero columns.
        return df.copy()

    # fit_transform already returns a DataFrame aligned on df's index.
    OH_cols = OneHotEncoder(dtype=int).fit_transform(df[object_cols])
    num_data = df.drop(object_cols, axis=1)
    return pd.concat([num_data, OH_cols], axis=1)

# Data pre-processing

In [7]:
# Load the Patras splits; the 'site' column is dropped from every table.
data_train_full_raw = pd.read_csv('Patras/data_train_full_patras.csv').drop(columns=['site'])
test_interpolation_raw = pd.read_csv('Patras/test_interpolation_patras.csv').drop(columns=['site'])
test_extrapolation_raw = pd.read_csv('Patras/test_extrapolation_patras.csv').drop(columns=['site'])
data_train_raw = pd.read_csv('Patras/data_train_patras.csv').drop(columns=['site'])
data_val_raw = pd.read_csv('Patras/data_val_patras.csv').drop(columns=['site'])
# Flattened to a 1-D array.
n_samples_chosen_per_group = pd.read_csv('Patras/n_samples_chosen_per_group_series_patras.csv').to_numpy().reshape(-1)

# One-hot encode categorical features
data_train_full_raw2 = encode(data_train_full_raw)
test_interpolation_raw2 = encode(test_interpolation_raw)
test_extrapolation = encode(test_extrapolation_raw)
data_train_raw2 = encode(data_train_raw)
data_val = encode(data_val_raw)


# Fix missing columns
# Every table is encoded on its own, so an indicator column can be absent from
# one table while present in another. The absent ones are added as all-zero
# columns; a column that already exists is left alone so real indicator values
# are never overwritten.
def _add_zero_columns(frame, columns):
    """Add each name in ``columns`` to ``frame`` (in place) as an all-zero column if absent."""
    for column in columns:
        if column not in frame.columns:
            frame[column] = 0


_add_zero_columns(data_train_raw2, ['disease_<Obstructive>'])
# Align the training columns (and their order) with the validation table.
data_train = data_train_raw2[list(data_val.columns.values)]

_add_zero_columns(test_extrapolation, ['disease_<Obstructive>'])

# test_extrapolation defines the column layout for the two tables below.
_add_zero_columns(data_train_full_raw2, ['gender_<Unknown>', 'ethnicity_<Asian>', 'disease_<Vascular>'])
data_train_full_raw3 = data_train_full_raw2[list(test_extrapolation.columns.values)]

_add_zero_columns(test_interpolation_raw2, ['gender_<Unknown>', 'ethnicity_<Asian>', 'disease_<Vascular>'])
test_interpolation_raw3 = test_interpolation_raw2[list(test_extrapolation.columns.values)]

# Combine train and interpolation set for Deep ME model
# ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
data_train_full = pd.concat([data_train_full_raw3, test_interpolation_raw3], ignore_index=True)
data_train_full.sort_values(['ID', 'times'], ignore_index=True, inplace=True)

In [8]:
# Table pairs handed to the training helpers below.
# Final run: combined train + interpolation table, paired with the extrapolation table.
dataframe = [data_train_full, test_extrapolation]
# Validation run: training table paired with the validation table.
dataframe_val = [data_train, data_val]

# Hyperparameter search and testing

### MLP

In [None]:
# MLP validation run on the (data_train, data_val) pair.
model_config = {
    # NOTE(review): the 3 presumably accounts for the ID, times and egfr columns -- confirm.
    'input_dim': data_train.shape[1] - 3,
    'hidden_dim': 1000,
    'output_dim': 1,
}

train_config = {
    'n_epochs': 20,
    'lr': 1e-3,
    'n_adapt': 1,
    'inner_lr': 0.01,
    'l2_penalty': 0.0001,
}

# Only the second returned value (the best validation error) is kept.
_, best_val_err = train_and_evaluate_model(
    model_config,
    train_config,
    dataframe_val,
    random_effects_column_names=['times'],
    group_column_name='ID',
    y_column_name='egfr',
    n_samples_chosen_per_group=n_samples_chosen_per_group,
    model_type='MLP',
    random_state=1,
)
print(best_val_err)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Best validation error:  16.82539366761778
16.82539366761778


In [None]:
# Show the best validation error kept in best_val_err, labelled as an RMSE.
print('Best val RMSE: ', best_val_err)

Best val RMSE:  16.82539366761778


In [16]:
# MLP configuration for the final run on the combined training table.
model_config = {
    # NOTE(review): the 3 presumably accounts for the ID, times and egfr columns -- confirm.
    'input_dim': data_train_full.shape[1] - 3,
    'hidden_dim': 1000,
    'output_dim': 1,
}

train_config = {
    'n_epochs': 20,
    'lr': 1e-3,
    'n_adapt': 1,
    'inner_lr': 0.01,
    'l2_penalty': 0.0001,
}

In [17]:
# Train the MLP on the `dataframe` pair and collect the two test RMSE values
# (reported below as extrapolation and interpolation, in that order).
rmse_test1, rmse_test2 = train_and_test_model(
    dataframe,
    model_config,
    train_config,
    random_effects_column_names=['times'],
    group_column_name='ID',
    y_column_name='egfr',
    n_samples_chosen_per_group=n_samples_chosen_per_group,
    model_type='MLP',
    random_state=1,
)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


[Test (Extrapolation)] RMSE=17.529 and loss=362.353
[Test (Interpolation)] RMSE=14.835 and loss=424.639


In [18]:
# Report the two values returned by train_and_test_model at full precision.
print('Extrapolation: ', rmse_test1)
print('Interpolation: ', rmse_test2)

Extrapolation:  17.529392518019023
Interpolation:  14.835458724666692


### RNN

In [19]:
# LSTM validation run on the (data_train, data_val) pair.
model_config = {
    # NOTE(review): the 3 presumably accounts for the ID, times and egfr columns -- confirm.
    'input_dim': data_train.shape[1] - 3,
    'hidden_dim': 500,
    'output_dim': 1,
}

train_config = {
    'n_epochs': 20,
    'lr': 1e-3,
    'n_adapt': 1,
    'inner_lr': 0.01,
    'l2_penalty': 0.0001,
}

# Only the second returned value (the best validation error) is kept.
_, best_val_err = train_and_evaluate_model(
    model_config,
    train_config,
    dataframe_val,
    random_effects_column_names=['times'],
    group_column_name='ID',
    y_column_name='egfr',
    n_samples_chosen_per_group=n_samples_chosen_per_group,
    model_type='LSTM',
    random_state=1,
)
print(best_val_err)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Best validation error:  17.496500591803617
17.496500591803617


In [20]:
# Show the best validation error currently held in best_val_err.
print(best_val_err)

17.496500591803617


In [21]:
# LSTM configuration for the final run on the combined training table.
model_config = {
    # NOTE(review): the 3 presumably accounts for the ID, times and egfr columns -- confirm.
    'input_dim': data_train_full.shape[1] - 3,
    'hidden_dim': 500,
    'output_dim': 1,
}

train_config = {
    'n_epochs': 20,
    'lr': 1e-3,
    'n_adapt': 1,
    'inner_lr': 0.01,
    'l2_penalty': 0.0001,
}

In [22]:
# Train the LSTM on the `dataframe` pair and collect the two test RMSE values
# (reported below as extrapolation and interpolation, in that order).
rmse_test1, rmse_test2 = train_and_test_model(
    dataframe,
    model_config,
    train_config,
    random_effects_column_names=['times'],
    group_column_name='ID',
    y_column_name='egfr',
    n_samples_chosen_per_group=n_samples_chosen_per_group,
    model_type='LSTM',
    random_state=1,
)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


[Test (Extrapolation)] RMSE=17.221 and loss=362.353
[Test (Interpolation)] RMSE=15.332 and loss=424.639


In [23]:
# Report the two values returned by train_and_test_model at full precision.
print('Extrapolation: ', rmse_test1)
print('Interpolation: ', rmse_test2)

Extrapolation:  17.22057984974165
Interpolation:  15.331955115256827
