# Libraries

In [3]:
import random
import numpy as np
import time
import copy
import scipy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
from tqdm.notebook import tqdm, tnrange

In [4]:
from DME.Analysis.train import *

# Helper functions

In [5]:
class OneHotEncoder(SklearnOneHotEncoder):
    """One-hot encoder whose ``transform`` returns a labelled ``pandas.DataFrame``.

    Thin wrapper around scikit-learn's ``OneHotEncoder``. The encoded matrix is
    converted to a dense DataFrame that keeps the index of the input frame and
    names each output column ``<column>_<<category>>``.

    All constructor keyword arguments are forwarded to the scikit-learn encoder.

    NOTE(review): column names are built from ``categories_`` only, one per
    category, so options that change the number of output columns (e.g.
    ``drop``) would presumably produce a name/column mismatch - confirm before
    using them.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Set to True once ``fit`` has completed.
        self.fit_flag = False

    def fit(self, X, y=None, **kwargs):
        """Fit the encoder on the DataFrame ``X`` and return the fitted encoder.

        ``y`` and any extra keyword arguments are accepted only so the method
        can be called with the usual ``fit(X, y)`` convention; they are ignored.
        """
        out = super().fit(X)
        self.fit_flag = True
        return out

    def transform(self, X, **kwargs):
        """Encode the DataFrame ``X`` and return a dense DataFrame.

        The result has one column per learned category and the same index as ``X``.
        """
        encoded = super().transform(X)
        # The parent returns a sparse matrix by default but a plain ndarray when
        # configured for dense output; only densify when it is actually sparse.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()
        return pd.DataFrame(encoded, columns=self.get_new_columns(X=X), index=X.index)

    def fit_transform(self, X, y=None, **kwargs):
        """Fit on ``X`` and return its encoded DataFrame (``y`` and kwargs are ignored)."""
        self.fit(X)
        return self.transform(X)

    def get_new_columns(self, X):
        """Return the output column names, ``'<column>_<<category>>'``, in encoder order."""
        return [
            f'{column}_<{category}>'
            for column, categories in zip(X.columns, self.categories_)
            for category in categories
        ]

In [6]:
def encode(df):
    """Return ``df`` with its object-dtype columns replaced by one-hot columns.

    The non-object columns come first, followed by the integer indicator
    columns produced by an ``OneHotEncoder`` fitted on this frame alone.
    """
    is_object = df.dtypes == 'object'
    categorical_cols = is_object[is_object].index.tolist()
    # A fresh encoder is fitted on every call, so the indicator columns
    # reflect only the categories present in ``df``.
    indicators = OneHotEncoder(dtype=int).fit_transform(df[categorical_cols])
    remaining = df.drop(columns=categorical_cols)
    return pd.concat([remaining, indicators], axis=1)

# Data pre-processing

In [13]:
# Load the pre-split Sheffield CSV files; the 'site' column is dropped from every split.
data_train_full_raw = pd.read_csv('Sheffield/data_train_full_sheffield.csv').drop(columns=['site'])
test_interpolation_raw = pd.read_csv('Sheffield/test_interpolation_sheffield.csv').drop(columns=['site'])
test_extrapolation_raw = pd.read_csv('Sheffield/test_extrapolation_sheffield.csv').drop(columns=['site'])
data_train_raw = pd.read_csv('Sheffield/data_train_sheffield.csv').drop(columns=['site'])
data_val_raw = pd.read_csv('Sheffield/data_val_sheffield.csv').drop(columns=['site'])
# Flattened to a 1-D array before being passed to the training helpers.
n_samples_chosen_per_group = pd.read_csv('Sheffield/n_samples_chosen_per_group_series_sheffield.csv').to_numpy().reshape(-1)

# One-hot encode categorical features.
# NOTE: every split is encoded independently, so a category that does not occur
# in a split produces no indicator column for it there (hence the fix-up below).
data_train_full_raw2 = encode(data_train_full_raw)
test_interpolation = encode(test_interpolation_raw)
test_extrapolation_raw2 = encode(test_extrapolation_raw)
data_train = encode(data_train_raw)
data_val_raw2 = encode(data_val_raw)

# Fix missing columns: add an all-zero 'gender_<Unknown>' indicator only when the
# split lacks it (an existing column must not be overwritten with zeros), then
# select the columns in the same order as the reference frame.
if 'gender_<Unknown>' not in data_val_raw2.columns:
    data_val_raw2['gender_<Unknown>'] = 0
data_val = data_val_raw2[list(data_train.columns.values)]

if 'gender_<Unknown>' not in test_extrapolation_raw2.columns:
    test_extrapolation_raw2['gender_<Unknown>'] = 0
test_extrapolation = test_extrapolation_raw2[list(test_interpolation.columns.values)]


# Combine train and interpolation set for Deep ME model.
# ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
data_train_full = pd.concat([data_train_full_raw2, test_interpolation], ignore_index=True)
data_train_full = data_train_full.sort_values(['ID', 'times'], ignore_index=True)

In [14]:
# Frame pairs handed to the training helpers in the cells below.
# [training split, validation split] - used for the hyperparameter search.
dataframe_val = [data_train, data_val]
# [full training data incl. interpolation rows, extrapolation test split] - used for the final test runs.
dataframe = [data_train_full, test_extrapolation]

# Hyperparameter search and testing

## All features

### MLP

In [20]:
# MLP hyperparameter search: train on the training split, report the best validation error.
model_config = {
    # presumably all columns except ID, times and egfr - TODO confirm
    'input_dim': data_train.shape[1] - 3,
    'hidden_dim': 1000,
    'output_dim': 1,
}

train_config = {
    'n_epochs': 20,
    'lr': 1e-3,
    'n_adapt': 1,
    'inner_lr': 0.01,
    'l2_penalty': 0.0001,
}

_, best_val_err = train_and_evaluate_model(
    model_config,
    train_config,
    dataframe_val,
    random_effects_column_names=['times'],
    group_column_name='ID',
    y_column_name='egfr',
    n_samples_chosen_per_group=n_samples_chosen_per_group,
    model_type='MLP',
    random_state=1,
)
print(best_val_err)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Best validation error:  14.590505235171882
14.590505235171882


In [21]:
# Report the best validation error returned by the MLP search above.
print('Best val RMSE: ', best_val_err)

Best val RMSE:  14.590505235171882


In [29]:
# MLP configuration for the final train-and-test run on the full training data.
model_config = {
    # presumably all columns except ID, times and egfr - TODO confirm
    'input_dim': data_train_full.shape[1] - 3,
    'hidden_dim': 1000,
    'output_dim': 1,
}

train_config = {
    'n_epochs': 25,
    'lr': 1e-3,
    'n_adapt': 1,
    'inner_lr': 0.01,
    'l2_penalty': 0.0001,
}

In [30]:
# Train the MLP on `dataframe` and evaluate it on the test sets.
# The cell that prints these labels rmse_test1 as the extrapolation RMSE and
# rmse_test2 as the interpolation RMSE.
rmse_test1, rmse_test2 = train_and_test_model(dataframe, model_config, train_config, random_effects_column_names=['times'],
                                              group_column_name='ID', y_column_name='egfr',
                                              n_samples_chosen_per_group=n_samples_chosen_per_group,
                                              model_type='MLP', random_state=1)

HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))


[Test (Extrapolation)] RMSE=15.464 and loss=64.396
[Test (Interpolation)] RMSE=13.888 and loss=77.551


In [31]:
# Report the MLP test RMSEs at full precision.
print('Extrapolation: ', rmse_test1)
print('Interpolation: ', rmse_test2)

Extrapolation:  15.463990251769946
Interpolation:  13.888157559955758


### RNN (LSTM)

In [None]:
# LSTM hyperparameter search: train on the training split, report the best validation error.
model_config = {
    # presumably all columns except ID, times and egfr - TODO confirm
    'input_dim': data_train.shape[1] - 3,
    'hidden_dim': 500,
    'output_dim': 1,
}

train_config = {
    'n_epochs': 20,
    'lr': 1e-3,
    'n_adapt': 1,
    'inner_lr': 0.01,
    'l2_penalty': 0.0001,
}

_, best_val_err = train_and_evaluate_model(
    model_config,
    train_config,
    dataframe_val,
    random_effects_column_names=['times'],
    group_column_name='ID',
    y_column_name='egfr',
    n_samples_chosen_per_group=n_samples_chosen_per_group,
    model_type='LSTM',
    random_state=1,
)
print(best_val_err)

In [33]:
# Report the best validation error returned by the LSTM search above.
print(best_val_err)

14.855477994343692


In [40]:
# LSTM configuration for the final train-and-test run on the full training data.
model_config = {
    # presumably all columns except ID, times and egfr - TODO confirm
    'input_dim': data_train_full.shape[1] - 3,
    'hidden_dim': 500,
    'output_dim': 1,
}

train_config = {
    'n_epochs': 25,
    'lr': 1e-3,
    'n_adapt': 1,
    'inner_lr': 0.01,
    'l2_penalty': 0.0001,
}

In [41]:
# Train the LSTM on `dataframe` and evaluate it on the test sets.
# The cell that prints these labels rmse_test1 as the extrapolation RMSE and
# rmse_test2 as the interpolation RMSE.
rmse_test1, rmse_test2 = train_and_test_model(dataframe, model_config, train_config, random_effects_column_names=['times'],
                                              group_column_name='ID', y_column_name='egfr',
                                              n_samples_chosen_per_group=n_samples_chosen_per_group,
                                              model_type='LSTM', random_state=1)

HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))


[Test (Extrapolation)] RMSE=15.577 and loss=64.396
[Test (Interpolation)] RMSE=14.096 and loss=77.551


In [42]:
# Report the LSTM test RMSEs at full precision.
print('Extrapolation: ', rmse_test1)
print('Interpolation: ', rmse_test2)

Extrapolation:  15.576650291438073
Interpolation:  14.096012043907745
