In [1]:
import tensorflow as tf
from tensorflow import keras

In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

#Hyperparameter tuning modules
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Reading training datasets
frequency = pd.read_csv('frequency_domain_features_train.csv')
heart_rate = pd.read_csv('heart_rate_non_linear_features_train.csv')
time_domain = pd.read_csv('time_domain_features_train.csv')

In [5]:
train = heart_rate.merge(frequency, how = 'inner', on = 'uuid')
train = train.merge(time_domain, how = 'inner', on = 'uuid')

In [6]:
train.head()

Unnamed: 0,uuid,SD1,SD2,sampen,higuci,datasetId,condition,VLF,VLF_PCT,LF,...,KURT,SKEW,MEAN_REL_RR,MEDIAN_REL_RR,SDRR_REL_RR,RMSSD_REL_RR,SDSD_REL_RR,SDRR_RMSSD_REL_RR,KURT_REL_RR,SKEW_REL_RR
0,89df2855-56eb-4706-a23b-b39363dd605a,11.001565,199.061782,2.139754,1.163485,2,no stress,2661.894136,72.203287,1009.249419,...,-0.856554,0.335218,-0.000203,-0.000179,0.01708,0.007969,0.007969,2.143342,-0.856554,0.335218
1,80c795e4-aa56-4cc0-939c-19634b89cbb2,9.170129,114.634458,2.174499,1.084711,2,interruption,2314.26545,76.975728,690.113275,...,-0.40819,-0.155286,-5.9e-05,0.000611,0.013978,0.004769,0.004769,2.930855,-0.40819,-0.155286
2,c2d5d102-967c-487d-88f2-8b005a449f3e,11.533417,118.939253,2.13535,1.176315,2,interruption,1373.887112,51.152225,1298.222619,...,0.351789,-0.656813,-1.1e-05,-0.000263,0.018539,0.008716,0.008716,2.127053,0.351789,-0.656813
3,37eabc44-1349-4040-8896-0d113ad4811f,11.119476,127.318597,2.178341,1.179688,2,no stress,2410.357408,70.180308,1005.981659,...,-0.504947,-0.386138,0.000112,0.000494,0.017761,0.00866,0.00866,2.050988,-0.504947,-0.386138
4,aa777a6a-7aa3-4f6e-aced-70f8691dd2b7,13.590641,87.718281,2.221121,1.249612,2,no stress,1151.17733,43.918366,1421.782051,...,-0.548408,-0.154252,-0.0001,-0.002736,0.023715,0.013055,0.013055,1.816544,-0.548408,-0.154252


In [7]:
train.drop(['uuid', 'condition'], inplace=True, axis=1)

In [8]:
train.head()

Unnamed: 0,SD1,SD2,sampen,higuci,datasetId,VLF,VLF_PCT,LF,LF_PCT,LF_NU,...,KURT,SKEW,MEAN_REL_RR,MEDIAN_REL_RR,SDRR_REL_RR,RMSSD_REL_RR,SDSD_REL_RR,SDRR_RMSSD_REL_RR,KURT_REL_RR,SKEW_REL_RR
0,11.001565,199.061782,2.139754,1.163485,2,2661.894136,72.203287,1009.249419,27.375666,98.485263,...,-0.856554,0.335218,-0.000203,-0.000179,0.01708,0.007969,0.007969,2.143342,-0.856554,0.335218
1,9.170129,114.634458,2.174499,1.084711,2,2314.26545,76.975728,690.113275,22.954139,99.695397,...,-0.40819,-0.155286,-5.9e-05,0.000611,0.013978,0.004769,0.004769,2.930855,-0.40819,-0.155286
2,11.533417,118.939253,2.13535,1.176315,2,1373.887112,51.152225,1298.222619,48.335104,98.950472,...,0.351789,-0.656813,-1.1e-05,-0.000263,0.018539,0.008716,0.008716,2.127053,0.351789,-0.656813
3,11.119476,127.318597,2.178341,1.179688,2,2410.357408,70.180308,1005.981659,29.290305,98.224706,...,-0.504947,-0.386138,0.000112,0.000494,0.017761,0.00866,0.00866,2.050988,-0.504947,-0.386138
4,13.590641,87.718281,2.221121,1.249612,2,1151.17733,43.918366,1421.782051,54.24216,96.720007,...,-0.548408,-0.154252,-0.0001,-0.002736,0.023715,0.013055,0.013055,1.816544,-0.548408,-0.154252


In [9]:
train.shape

(369289, 35)

In [None]:
# Dropping columns Joe suggested
cols = ['datasetId', 'SD2', 'SDRR_RMSSD_REL_RR', 'SDRR_REL_RR', 'MEAN_REL_RR', 'RMSSD_REL_RR', 'MEDIAN_REL_RR', 'RMSSD', 'LF_PCT', 'SKEW_REL_RR', 'HF', 'LF_NU', 'HF_NU', 'SD1', 'KURT_REL_RR']
train.drop(cols, inplace=True, axis=1)

In [12]:
train = train.sample(1000)

train.shape

(1000, 35)

In [13]:
from scipy import stats

z = np.abs(stats.zscore(train))
print(z)

[[9.65580247e-01 1.79301516e-01 6.74662650e-01 ... 4.85861697e-01
  4.84489798e-01 3.80888988e-01]
 [9.08456150e-01 6.72969710e-01 5.74924830e-01 ... 8.45790821e-01
  2.17479746e-01 2.92993787e-03]
 [1.25784184e-01 1.93345873e+00 1.06419498e-01 ... 4.25793251e+00
  4.53451272e-02 4.02820661e-02]
 ...
 [4.20293332e-01 6.58633871e-01 4.24457107e-01 ... 3.56309488e-01
  1.80742909e-01 2.07561635e-01]
 [8.95561029e-01 1.93238364e+00 1.21315655e+00 ... 1.12900556e+00
  1.36572502e-01 1.22455707e+00]
 [8.16372021e-01 5.93237167e-01 2.24316738e-01 ... 3.54122983e-01
  7.00429343e-02 2.02878208e-01]]


In [14]:
threshold = 3
np.where(z > threshold)

(array([  2,   2,   5,  14,  15,  28,  28,  28,  28,  34,  34,  34,  35,
         42,  42,  43,  43,  43,  43,  43,  43,  43,  43,  52,  59,  73,
         82,  82,  82,  82, 101, 101, 104, 104, 111, 111, 111, 111, 111,
        111, 111, 116, 118, 119, 119, 125, 127, 127, 132, 133, 133, 133,
        133, 143, 144, 144, 144, 144, 144, 144, 144, 149, 149, 166, 175,
        175, 183, 183, 183, 183, 188, 200, 200, 200, 202, 202, 202, 202,
        202, 202, 202, 202, 205, 205, 215, 215, 215, 222, 222, 225, 225,
        225, 256, 256, 256, 258, 258, 258, 258, 259, 263, 263, 273, 273,
        280, 281, 281, 281, 281, 281, 282, 283, 283, 283, 283, 286, 286,
        286, 286, 291, 297, 312, 312, 312, 312, 312, 312, 312, 312, 313,
        314, 326, 335, 339, 339, 339, 349, 349, 349, 349, 349, 351, 351,
        361, 372, 375, 375, 375, 375, 375, 380, 380, 380, 380, 391, 391,
        391, 395, 419, 419, 426, 426, 426, 426, 426, 426, 426, 426, 427,
        433, 433, 441, 441, 441, 441, 443, 444, 444

In [15]:
# Replacing outliers

#for i,j in zip(np.where(z > threshold)[0], np.where(z > threshold)[1]):
#    train.iloc[i,j] = train.iloc[:,j].median()

In [16]:
# Train/valid/test split
X = train.drop('HR', axis=1)
y = train['HR']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

In [35]:
# Creating function to design model

def design_model(X=X_train, learning_rate=5):
    model = Sequential(name='model')
    input = tf.keras.Input(shape=(X.shape[1],))
    model.add(input)
    # Adding one hidden layer
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(24, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(1))
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='mse', metrics=['mae'], optimizer=opt)
    return model

In [36]:
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [37]:
# Standardize

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [38]:
# Column transformer returns numpy arrays; converting features to dataframes

X_train_scaled = pd.DataFrame(X_train_scaled)
X_test_scaled = pd.DataFrame(X_test_scaled)

In [39]:
X_train_scaled.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,...,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,4.817823e-16,-4.1791970000000006e-17,-3.727177e-16,-8.762832e-16,0.0,-1.081674e-16,-4.989659e-16,1.464701e-16,1.617754e-16,4.694657e-16,...,1.522592e-17,-1.45915e-17,1.617754e-17,-4.032489e-17,-2.591974e-16,1.263275e-16,1.103482e-16,-1.544796e-16,1.522592e-17,-1.45915e-17
std,1.000715,1.000715,1.000715,1.000715,0.0,1.000715,1.000715,1.000715,1.000715,1.000715,...,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715
min,-2.310417,-1.085831,-6.474495,-2.2223,0.0,-1.10111,-2.439536,-1.561105,-1.903756,-5.416768,...,-1.301597,-3.209802,-4.817871,-4.378502,-1.674843,-1.484638,-1.484635,-1.959566,-1.301597,-3.209802
25%,-0.7929216,-0.6183198,-0.1735056,-0.6483145,0.0,-0.6702856,-0.6298489,-0.6954512,-0.7164405,-0.3920302,...,-0.4761138,-0.568523,-0.4698945,-0.4937308,-0.8031669,-0.6953692,-0.6953736,-0.6735278,-0.4761138,-0.568523
50%,-0.1285083,-0.3344047,0.3509286,-0.09850064,0.0,-0.2877183,0.1664685,-0.2472124,-0.1438805,0.3057146,...,-0.2631829,-0.1459372,-0.009305591,0.1427472,-0.2302087,-0.2767337,-0.2767348,-0.1760203,-0.2631829,-0.1459372
75%,0.7543392,0.1715171,0.5904244,0.6551254,0.0,0.1685498,0.7220447,0.4153356,0.5753914,0.717925,...,0.07783912,0.3431827,0.4936829,0.6946046,0.6191716,0.4727599,0.4727636,0.5401128,0.07783912,0.3431827
max,2.642962,5.698896,0.8362369,2.7621,0.0,5.053007,1.912718,3.986618,2.460534,0.9876584,...,8.691206,4.884951,5.191782,2.624281,3.242947,4.053587,4.053591,4.14389,8.691206,4.884951


In [40]:
X_test_scaled.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,...,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,-0.02798,-0.060106,0.015275,0.064233,0.0,-0.037336,0.003722,-0.005084,-0.003586,-0.018082,...,-0.001251,0.020835,-0.051807,-0.017658,-0.043974,-0.036577,-0.036578,-0.043287,-0.001251,0.020835
std,0.969299,0.944212,1.076182,0.973451,0.0,0.93726,0.949207,0.979287,0.95101,0.959527,...,0.872222,1.012775,0.972916,1.019688,0.937186,0.904832,0.904832,0.919325,0.872222,1.012775
min,-1.635925,-1.00768,-6.421572,-2.12715,0.0,-1.046999,-2.319064,-1.482622,-1.839144,-4.687307,...,-1.347651,-2.65033,-4.796024,-4.230136,-1.520248,-1.331784,-1.331782,-1.739094,-1.347651,-2.65033
25%,-0.81872,-0.627508,-0.037618,-0.604536,0.0,-0.656765,-0.538232,-0.708844,-0.686264,-0.497372,...,-0.471881,-0.594556,-0.487491,-0.489956,-0.777661,-0.627092,-0.627091,-0.695316,-0.471881,-0.594556
50%,-0.154885,-0.377442,0.409214,-0.084554,0.0,-0.290894,0.088641,-0.254063,-0.081876,0.262465,...,-0.246886,-0.151997,0.021527,0.203373,-0.262009,-0.266542,-0.266541,-0.111422,-0.246886,-0.151997
75%,0.6386,0.099968,0.580178,0.691285,0.0,0.21858,0.688534,0.454796,0.510355,0.69309,...,0.090103,0.323643,0.342244,0.687993,0.479213,0.341071,0.341068,0.509441,0.090103,0.323643
max,2.744062,5.820626,0.789943,2.80251,0.0,4.823809,1.846918,3.930295,2.307352,0.983088,...,5.117406,3.820132,4.870722,2.126404,2.872971,3.450813,3.450806,3.228908,5.117406,3.820132


In [41]:
model = design_model(X_train, learning_rate=0.01)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 64)                2240      
_________________________________________________________________
dropout_8 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 24)                1560      
_________________________________________________________________
dropout_9 (Dropout)          (None, 24)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 25        
Total params: 3,825
Trainable params: 3,825
Non-trainable params: 0
_________________________________________________________________


In [42]:
model.fit(X_train, y_train, epochs=40, batch_size=1, verbose=1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x25895c16888>

In [43]:
val_mse, val_mae = model.evaluate(X_test, y_test, verbose=0)

In [44]:
print("MAE: ", val_mae)

MAE:  5.751535415649414


In [45]:
model = design_model(X_train, learning_rate=2)

In [46]:
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x2589632be08>

In [47]:
#import xgboost as xb

In [48]:
# Grid Search
def do_grid_search():
    batch_size = [6, 64]
    epochs = [10, 40]
    model = KerasRegressor(build_fn=design_model)
    param_grid = dict(batch_size=batch_size, epochs=epochs)
    grid = GridSearchCV(estimator=model, verbose=1, n_jobs=-1, param_grid=param_grid, scoring = make_scorer(mean_absolute_error, greater_is_better=False), return_train_score=True)
    grid_result = grid.fit(X_train, y_train, verbose=0)
    grid_result.best_estimator_
    print(grid_result)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
        
    print("Training")
    
    means = grid_result.cv_results_['mean_train_score']
    stds = grid_result.cv_results_['std_train_score']
    params = grid_result.cv_results_['params']
    
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

In [49]:
do_grid_search()

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   18.6s finished


GridSearchCV(estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x00000258963526C8>,
             n_jobs=-1, param_grid={'batch_size': [6, 64], 'epochs': [10, 40]},
             return_train_score=True,
             scoring=make_scorer(mean_absolute_error, greater_is_better=False),
             verbose=1)
Best: -38.962161 using {'batch_size': 6, 'epochs': 40}
-50.609530 (21.714310) with: {'batch_size': 6, 'epochs': 10}
-38.962161 (29.037847) with: {'batch_size': 6, 'epochs': 40}
-103.089765 (43.256972) with: {'batch_size': 64, 'epochs': 10}
-66.175366 (38.521007) with: {'batch_size': 64, 'epochs': 40}
Training
-51.150155 (21.937360) with: {'batch_size': 6, 'epochs': 10}
-38.628625 (28.361195) with: {'batch_size': 6, 'epochs': 40}
-102.375125 (41.401266) with: {'batch_size': 64, 'epochs': 10}
-65.390576 (38.082157) with: {'batch_size': 64, 'epochs': 40}
