In [1]:
import tensorflow as tf
from tensorflow import keras

In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
import xgboost as xb

#Hyperparameter tuning modules
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
%matplotlib inline

from tensorflow.keras.callbacks import EarlyStopping

  import pandas.util.testing as tm


In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load the three training feature tables from CSV files in the working directory.
frequency, heart_rate, time_domain = (
    pd.read_csv(csv_path)
    for csv_path in (
        'frequency_domain_features_train.csv',
        'heart_rate_non_linear_features_train.csv',
        'time_domain_features_train.csv',
    )
)

In [5]:
# Join the three feature tables on the shared 'uuid' key. Inner joins keep
# only the uuids that appear in every table.
train = (
    heart_rate
    .merge(frequency, on='uuid', how='inner')
    .merge(time_domain, on='uuid', how='inner')
)

In [6]:
# Preview the first five rows of the merged training table.
train.head(n=5)

Unnamed: 0,uuid,SD1,SD2,sampen,higuci,datasetId,condition,VLF,VLF_PCT,LF,...,KURT,SKEW,MEAN_REL_RR,MEDIAN_REL_RR,SDRR_REL_RR,RMSSD_REL_RR,SDSD_REL_RR,SDRR_RMSSD_REL_RR,KURT_REL_RR,SKEW_REL_RR
0,89df2855-56eb-4706-a23b-b39363dd605a,11.001565,199.061782,2.139754,1.163485,2,no stress,2661.894136,72.203287,1009.249419,...,-0.856554,0.335218,-0.000203,-0.000179,0.01708,0.007969,0.007969,2.143342,-0.856554,0.335218
1,80c795e4-aa56-4cc0-939c-19634b89cbb2,9.170129,114.634458,2.174499,1.084711,2,interruption,2314.26545,76.975728,690.113275,...,-0.40819,-0.155286,-5.9e-05,0.000611,0.013978,0.004769,0.004769,2.930855,-0.40819,-0.155286
2,c2d5d102-967c-487d-88f2-8b005a449f3e,11.533417,118.939253,2.13535,1.176315,2,interruption,1373.887112,51.152225,1298.222619,...,0.351789,-0.656813,-1.1e-05,-0.000263,0.018539,0.008716,0.008716,2.127053,0.351789,-0.656813
3,37eabc44-1349-4040-8896-0d113ad4811f,11.119476,127.318597,2.178341,1.179688,2,no stress,2410.357408,70.180308,1005.981659,...,-0.504947,-0.386138,0.000112,0.000494,0.017761,0.00866,0.00866,2.050988,-0.504947,-0.386138
4,aa777a6a-7aa3-4f6e-aced-70f8691dd2b7,13.590641,87.718281,2.221121,1.249612,2,no stress,1151.17733,43.918366,1421.782051,...,-0.548408,-0.154252,-0.0001,-0.002736,0.023715,0.013055,0.013055,1.816544,-0.548408,-0.154252


In [7]:
# Remove the row identifier ('uuid') and the string-valued 'condition' label.
# Reassigning instead of using inplace=True follows the pandas recommendation.
# Re-running this cell after it has succeeded still raises KeyError because the
# columns are already gone; re-run the merge cell first.
train = train.drop(columns=['uuid', 'condition'])

In [8]:
# Preview the first five rows again to confirm the two columns are gone.
train.head(n=5)

Unnamed: 0,SD1,SD2,sampen,higuci,datasetId,VLF,VLF_PCT,LF,LF_PCT,LF_NU,...,KURT,SKEW,MEAN_REL_RR,MEDIAN_REL_RR,SDRR_REL_RR,RMSSD_REL_RR,SDSD_REL_RR,SDRR_RMSSD_REL_RR,KURT_REL_RR,SKEW_REL_RR
0,11.001565,199.061782,2.139754,1.163485,2,2661.894136,72.203287,1009.249419,27.375666,98.485263,...,-0.856554,0.335218,-0.000203,-0.000179,0.01708,0.007969,0.007969,2.143342,-0.856554,0.335218
1,9.170129,114.634458,2.174499,1.084711,2,2314.26545,76.975728,690.113275,22.954139,99.695397,...,-0.40819,-0.155286,-5.9e-05,0.000611,0.013978,0.004769,0.004769,2.930855,-0.40819,-0.155286
2,11.533417,118.939253,2.13535,1.176315,2,1373.887112,51.152225,1298.222619,48.335104,98.950472,...,0.351789,-0.656813,-1.1e-05,-0.000263,0.018539,0.008716,0.008716,2.127053,0.351789,-0.656813
3,11.119476,127.318597,2.178341,1.179688,2,2410.357408,70.180308,1005.981659,29.290305,98.224706,...,-0.504947,-0.386138,0.000112,0.000494,0.017761,0.00866,0.00866,2.050988,-0.504947,-0.386138
4,13.590641,87.718281,2.221121,1.249612,2,1151.17733,43.918366,1421.782051,54.24216,96.720007,...,-0.548408,-0.154252,-0.0001,-0.002736,0.023715,0.013055,0.013055,1.816544,-0.548408,-0.154252


In [9]:
# (rows, columns) of the training table at this point in the notebook.
train.shape

(369289, 35)

In [10]:
# Dropping columns Joe suggested.
# NOTE(review): the rationale for this selection is not recorded in the
# notebook — confirm and document it.
cols = [
    'datasetId', 'SD2', 'SDRR_RMSSD_REL_RR', 'SDRR_REL_RR', 'MEAN_REL_RR',
    'RMSSD_REL_RR', 'MEDIAN_REL_RR', 'RMSSD', 'LF_PCT', 'SKEW_REL_RR',
    'HF', 'LF_NU', 'HF_NU', 'SD1', 'KURT_REL_RR',
]
# Reassign rather than mutate in place (inplace=True is discouraged in pandas).
train = train.drop(columns=cols)

In [11]:
# Separate the regression target ('HR') from the feature columns.
y = train['HR']
X = train.drop(columns='HR')

In [12]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [13]:
# Creating function to design model

def design_model(X=X_train, learning_rate=5):
    """Build and compile a dense feed-forward regression network.

    Architecture: input -> three ReLU hidden layers (128, 64, 32 units), each
    followed by dropout (rates 0.1, 0.2, 0.3) -> a single linear output unit.

    Args:
        X: Two-dimensional feature container; only ``X.shape[1]`` is read, to
            size the input layer. The default is bound to ``X_train`` when this
            cell is executed.
        learning_rate: Learning rate handed to the Adam optimizer.
            NOTE(review): the default of 5 is very large for Adam — confirm
            it is intentional.

    Returns:
        The compiled ``Sequential`` model (loss ``'mse'``, metric ``'mae'``).
    """
    n_features = X.shape[1]
    net = Sequential(name='model')
    net.add(tf.keras.Input(shape=(n_features,)))
    # Hidden stack: one (units, dropout rate) pair per stage.
    for units, drop_rate in ((128, 0.1), (64, 0.2), (32, 0.3)):
        net.add(layers.Dense(units, activation='relu'))
        net.add(layers.Dropout(drop_rate))
    # Single linear unit for the regression output.
    net.add(layers.Dense(1))
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    net.compile(loss='mse', metrics=['mae'], optimizer=optimizer)
    return net

In [14]:
# Convert both target vectors to NumPy arrays.
y_train, y_test = np.asarray(y_train), np.asarray(y_test)

In [15]:
def fit_model(model, X_train, y_train, num_epochs,
              batch_size=30, validation_split=0.3, patience=20):
    """Train ``model`` with early stopping on the validation loss.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X_train: Training features, forwarded to ``model.fit`` unchanged.
        y_train: Training targets, forwarded to ``model.fit`` unchanged.
        num_epochs: Upper bound on the number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit`` (default 30, the
            value that used to be hard-coded).
        validation_split: Fraction of the training data ``model.fit`` holds
            out for validation (default 0.3, as before).
        patience: Number of epochs without ``val_loss`` improvement before
            training stops (default 20, as before).

    Returns:
        Whatever ``model.fit`` returns (the training history for Keras models).
    """
    early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)
    return model.fit(
        X_train,
        y_train,
        epochs=num_epochs,
        batch_size=batch_size,
        verbose=1,
        validation_split=validation_split,
        callbacks=[early_stop],
    )

In [16]:
# Build the network sized to the training features and print its layer summary.
model = design_model(X=X_train, learning_rate=0.5)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               2560      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33    

In [17]:
# Training call is disabled. As saved, `model` is never fitted before it is evaluated below.
#fit_model(model, X_train, y_train, num_epochs=30)

In [18]:
# Alternative direct fit call, also disabled.
#model.fit(X_train, y_train, validation_split=0.3, verbose=1)

In [19]:
# Score the network on the held-out test split. The two returned values are
# the compiled loss and metric, in that order.
# NOTE(review): despite the `val_` prefix these are test-set numbers, and both
# fit calls in the preceding cells are commented out, so as saved the model is
# evaluated unfitted.
val_mse, val_mae = model.evaluate(x=X_test, y=y_test, verbose=0)

In [20]:
# Report the second value returned by model.evaluate (the 'mae' metric).
print("MAE: ", val_mae)

MAE:  280.3956604003906


In [21]:
# Rebuild a fresh network with a different learning rate; this replaces the previous `model`.
model = design_model(X=X_train, learning_rate=2)

In [22]:
# Echo the model object; only its repr is shown.
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x1f705d5eac8>

In [23]:
# Grid Search
def _print_cv_scores(cv_results, split):
    """Print ``mean (std) with: params`` for every candidate in the grid.

    Args:
        cv_results: The ``cv_results_`` mapping of a fitted search object.
        split: ``'test'`` or ``'train'``; selects which
            ``mean_<split>_score`` / ``std_<split>_score`` entries are read.
    """
    means = cv_results['mean_%s_score' % split]
    stds = cv_results['std_%s_score' % split]
    for mean, stdev, param in zip(means, stds, cv_results['params']):
        print("%f (%f) with: %r" % (mean, stdev, param))


def do_grid_search():
    """Grid-search batch size and epoch count for the Keras regressor.

    Reads the notebook-level ``X_train`` / ``y_train`` and builds every
    candidate with ``design_model`` using its default arguments. Candidates
    are scored with mean absolute error wrapped by
    ``make_scorer(..., greater_is_better=False)``, so the reported scores are
    negated errors: values closer to zero are better.

    Prints the fitted search object, the best score and parameters, then the
    per-candidate test-fold scores followed by the training-fold scores.
    Returns ``None``.
    """
    param_grid = dict(batch_size=[6, 64], epochs=[10, 40])
    estimator = KerasRegressor(build_fn=design_model)
    grid = GridSearchCV(
        estimator=estimator,
        verbose=1,
        n_jobs=-1,
        param_grid=param_grid,
        scoring=make_scorer(mean_absolute_error, greater_is_better=False),
        return_train_score=True,
    )
    # The extra verbose=0 is a fit parameter forwarded to the estimator.
    grid_result = grid.fit(X_train, y_train, verbose=0)
    print(grid_result)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    _print_cv_scores(grid_result.cv_results_, 'test')

    print("Training")

    _print_cv_scores(grid_result.cv_results_, 'train')

In [25]:
# Grid-search run is disabled; uncomment to execute it.
#do_grid_search()

In [None]:
# ----------------- Automated stacking (vecstack) -----------------

In [26]:
from vecstack import stacking

In [27]:
# First-level learners whose predictions are stacked below.
models = [
    LinearRegression(n_jobs=-1),
    #RandomForestRegressor(random_state=7, max_depth=3),
    xb.XGBRegressor(
        n_estimators=50,
        max_depth=3,
        learning_rate=0.1,
        random_state=7,
    ),
]

In [28]:
# Run 5-fold stacking over the first-level models; s_train / s_test hold their
# predictions for the training and test rows respectively.
# NOTE(review): stratified=True is passed together with regression=True —
# check the vecstack documentation on whether stratification applies here.
s_train, s_test = stacking(
    models,
    X_train,
    y_train,
    X_test,
    regression=True,
    n_folds=5,
    stratified=True,
    random_state=7,
    verbose=1,
)

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [2]

model  0:     [LinearRegression]
    ----
    MEAN:     [0.97043693] + [0.00559822]
    FULL:     [0.97043693]

model  1:     [XGBRegressor]
    ----
    MEAN:     [0.59874771] + [0.00631591]
    FULL:     [0.59874770]



In [62]:
# Second-level learner: fit XGBoost on the stacked training predictions, then
# score its output on the stacked test predictions with mean absolute error.
# Note that `model` is rebound here; it no longer refers to the Keras network.
model = xb.XGBRegressor(random_state=7).fit(s_train, y_train)
y_pred = model.predict(s_test)
print("Final MAE score: [%.8f]" % mean_absolute_error(y_test, y_pred))

Final MAE score: [0.30016436]


In [63]:
# Load the three test feature tables from CSV files in the working directory.
frequency_test, heart_rate_test, time_domain_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'frequency_domain_features_test.csv',
        'heart_rate_non_linear_features_test.csv',
        'time_domain_features_test.csv',
    )
)

In [64]:
# Join the three test tables on 'uuid', mirroring how `train` was assembled.
test = (
    heart_rate_test
    .merge(frequency_test, on='uuid', how='inner')
    .merge(time_domain_test, on='uuid', how='inner')
)

In [66]:
# Wrap the stacked test-set predictions in a DataFrame, one column per first-level model.
pred_vs_actual = pd.DataFrame(data=s_test)

In [67]:
# Name the two first-level prediction columns; the order follows the `models` list
# (LinearRegression, then XGBRegressor).
pred_vs_actual.columns = ['LR', 'XGB']

# Append the second-level (stacked) model's predictions.
pred_vs_actual['Predicted HR'] = y_pred

In [69]:
# y_test was converted to a NumPy array earlier in the notebook, so this
# assignment is positional rather than index-aligned.
pred_vs_actual['Actual HR'] = y_test

In [70]:
# Display first-level predictions, the stacked prediction and the true HR side by side.
pred_vs_actual

Unnamed: 0,LR,XGB,Predicted HR,Actual HR
0,73.827206,74.898750,73.994652,73.980817
1,63.682015,64.610434,64.159210,64.062723
2,67.912623,67.607881,67.494293,67.593559
3,80.217862,79.904066,80.007057,79.994453
4,55.034054,54.729639,54.605839,54.921336
...,...,...,...,...
110782,89.002267,86.335959,87.135750,86.813935
110783,78.540342,72.815196,76.918633,78.122775
110784,66.189631,65.862241,66.262794,66.340979
110785,68.944726,68.401019,68.664864,68.910415
