In [1]:
import json
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import biosppy
import operator
import re
#import import_ipynb
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, f1_score, accuracy_score
import seaborn as sns
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import tensorflow as tf
from sklearn.multiclass import OneVsRestClassifier
from sklearn.utils import resample

In [31]:
predicted_variable = 'DBP' # 
correlation_threshold = 0.95
compute_pvalue = False

In [32]:
tf.random.set_seed(42)

# User Defined Functions

In [33]:
def drop_correlation(df, labels, threshold = 0.95, plotcorr = False):
    corr = df.loc[:, ~df.columns.isin(labels)].corr()
    if plotcorr: 
        f, ax = plt.subplots(figsize=(15, 15))
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        sns.heatmap(corr, cmap = cmap,
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values)

    # Select upper triangle of correlation matrix
    upper = corr.abs().where(np.triu(np.ones(corr.shape), k=1).astype(np.bool))
    # Find features with correlation greater than 0.95
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    len(to_drop)
    # Drop features 
    print("New Dataframe Shape" + str(df.shape))
    return(df.drop(columns = to_drop))

In [34]:
df = pd.read_csv("../intermediate_data/bp_features.csv")

In [35]:
df.shape

(152, 45)

In [36]:
df.drop(df.loc[(df['SBP'] == 0)|(df['DBP'] == 0)].index, inplace = True)
df.shape

(149, 45)

In [37]:
df = drop_correlation(df, ['SBP', 'DBP'], correlation_threshold, plotcorr = False)
print(df.shape)

New Dataframe Shape(149, 45)
(149, 30)


In [38]:
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dropout(0.2, input_shape=(27,)))
    model.add(Dense(50, input_dim=27, kernel_initializer='normal', kernel_regularizer=regularizers.l2(0.01), activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(20, kernel_initializer='normal', kernel_regularizer=regularizers.l2(0.01), activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(5, kernel_initializer='normal', kernel_regularizer=regularizers.l2(0.01), activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [39]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [40]:
#num_features = df_train.loc[:, df_train.columns != predicted_variable].shape[1]
patient_ids = np.unique(df['patientid'])
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)

estimators_lr = []
estimators_lr.append(('standardize', StandardScaler()))
estimators_lr.append(('lr',  ElasticNet(alpha=1, l1_ratio=0.5, random_state = 42)))
pipeline_lr = Pipeline(estimators_lr)

estimators_gbm = []
estimators_gbm.append(('standardize', StandardScaler()))
estimators_gbm.append(('gbm',  GradientBoostingRegressor(learning_rate=0.01, n_estimators=100, random_state = 42)))
pipeline_gbm = Pipeline(estimators_gbm)




RMSE_NN = []
R2_NN = []
MAPE_NN = []

RMSE_LR = []
R2_LR = []
MAPE_LR = []

RMSE_GBM = []
R2_GBM = []
MAPE_GBM = []

i = 0

while len(patient_ids) > 1:
    
    i= i + 1
    random.seed(42)
    patient_test_ids = random.choices(patient_ids, k = 3)
    patient_ids = [e for e in patient_ids if e not in patient_test_ids]
    df_test = df.loc[df['patientid'].isin(patient_test_ids)].dropna()
    df_train = df[~df['patientid'].isin(patient_test_ids)].dropna()
    print("running fold" + str(i))
    
    cols_dropped = ['patientid']

    if predicted_variable == 'SBP':
        cols_dropped.append('DBP')
    elif predicted_variable == 'DBP':
        cols_dropped.append('SBP')
    df_train = df_train.drop(columns = cols_dropped)
    df_test = df_test.drop(columns = cols_dropped)
    
    ##nn
    pipeline.fit(X = df_train.loc[:, df_train.columns != predicted_variable].values, y = df_train[predicted_variable].values)
    predicted_labels = pipeline.predict(df_test.loc[:, df_test.columns != predicted_variable].values)
    
    RMSE_NN.append(np.sqrt(mean_squared_error(df_test[predicted_variable], predicted_labels)))  
    R2_NN.append(r2_score(df_test[predicted_variable], predicted_labels))
    MAPE_NN.append(mean_absolute_percentage_error(df_test[predicted_variable], predicted_labels))
    
    ##lr
    pipeline_lr.fit(X = df_train.loc[:, df_train.columns != predicted_variable].values, y = df_train[predicted_variable].values)
    predicted_labels = pipeline_lr.predict(df_test.loc[:, df_test.columns != predicted_variable].values)
    
    RMSE_LR.append(np.sqrt(mean_squared_error(df_test[predicted_variable], predicted_labels)))  
    R2_LR.append(r2_score(df_test[predicted_variable], predicted_labels))
    MAPE_LR.append(mean_absolute_percentage_error(df_test[predicted_variable], predicted_labels))
    
    #gbm 
    
    pipeline_gbm.fit(X = df_train.loc[:, df_train.columns != predicted_variable].values, y = df_train[predicted_variable].values)
    predicted_labels = pipeline_gbm.predict(df_test.loc[:, df_test.columns != predicted_variable].values)
    
    RMSE_GBM.append(np.sqrt(mean_squared_error(df_test[predicted_variable], predicted_labels)))  
    R2_GBM.append(r2_score(df_test[predicted_variable], predicted_labels))
    MAPE_GBM.append(mean_absolute_percentage_error(df_test[predicted_variable], predicted_labels))
    
    
    
    
    # evaluate model with standardized dataset
    
print("average RMSE for the NN for " + predicted_variable + " is " + str(np.mean(np.array(RMSE_NN)))+ " sd " + str(np.std(np.array(RMSE_NN)))) 
print("average R2 for the NN for " + predicted_variable + " is " + str(np.mean(np.array(R2_NN)))+ " sd " + str(np.std(np.array(R2_NN))))
print("average MAPE for the NN for " + predicted_variable + " is " + str(np.mean(np.array(MAPE_NN)))+ " sd " + str(np.std(np.array(MAPE_NN))))

print("average RMSE for the LR for " + predicted_variable + " is " + str(np.mean(np.array(RMSE_LR)))+ " sd " + str(np.std(np.array(RMSE_LR)))) 
print("average R2 for the LR for " + predicted_variable + " is " + str(np.mean(np.array(R2_LR)))+ " sd " + str(np.std(np.array(R2_LR))))
print("average MAPE for the LR for " + predicted_variable + " is " + str(np.mean(np.array(MAPE_LR)))+ " sd " + str(np.std(np.array(MAPE_LR))))

print("average RMSE for the GBM for " + predicted_variable + " is " + str(np.mean(np.array(RMSE_GBM)))+ " sd " + str(np.std(np.array(RMSE_GBM)))) 
print("average R2 for the GBM for " + predicted_variable + " is " + str(np.mean(np.array(R2_GBM)))+ " sd " + str(np.std(np.array(R2_GBM))))
print("average MAPE for the GBM for " + predicted_variable + " is " + str(np.mean(np.array(MAPE_GBM)))+ " sd " + str(np.std(np.array(MAPE_GBM))))

running fold1
running fold2
running fold3
running fold4
running fold5
running fold6
running fold7
running fold8
running fold9
average RMSE for the NN for DBP is 12.136126654342045 sd 4.594369297274952
average R2 for the NN for DBP is -2.473892881856523 sd 1.9373829172551815
average MAPE for the NN for DBP is 13.438211127030472 sd 5.1722108442806904
average RMSE for the LR for DBP is 8.614078165201596 sd 2.830945459727786
average R2 for the LR for DBP is -0.7720423104814619 sd 1.3651175070524055
average MAPE for the LR for DBP is 10.502411715456965 sd 4.6144672454089415
average RMSE for the GBM for DBP is 8.78383523162713 sd 3.114621797934496
average R2 for the GBM for DBP is -0.9348632630076601 sd 1.7094361959200173
average MAPE for the GBM for DBP is 10.724531859277501 sd 5.136914611099583


In [41]:
## classification

df['BP_Category'] = 'normal'
#df.loc[(df['SBP'] > 120) & (df['SBP'] < 130) & (df['DBP'] < 80), 'BP_Category'] = 'elivated'
df.loc[(df['SBP'] > 130) | (df['DBP'] > 80), 'BP_Category'] = 'hypertension'

In [42]:
df['BP_Category'].value_counts()

hypertension    95
normal          54
Name: BP_Category, dtype: int64

In [43]:
patient_ids = np.unique(df['patientid'])

estimators_lr = []
estimators_lr.append(('standardize', StandardScaler()))
estimators_lr.append(('lr',  LogisticRegression(penalty='l2', C=0.2, random_state = 42, solver = 'lbfgs', multi_class = 'ovr')))
pipeline_lr = Pipeline(estimators_lr)

estimators_gbc = []
estimators_gbc.append(('standardize', StandardScaler()))
estimators_gbc.append(('gbc',  GradientBoostingClassifier(learning_rate = 0.1, subsample = 0.5, random_state = 42)))
pipeline_gbc = Pipeline(estimators_gbc)

ACC_LR = []
F1_LR = []

ACC_GBC = []
F1_GBC = []

i = 0

while len(patient_ids) > 1:
    
    i= i + 1
    random.seed(42)
    patient_test_ids = random.choices(patient_ids, k = 3)
    patient_ids = [e for e in patient_ids if e not in patient_test_ids]
    df_test = df.loc[df['patientid'].isin(patient_test_ids)].dropna()
    df_train = df[~df['patientid'].isin(patient_test_ids)].dropna()
    print("running fold" + str(i))
    
    cols_dropped = ['patientid', 'SBP', 'DBP']
    df_train = df_train.drop(columns = cols_dropped)
    df_test = df_test.drop(columns = cols_dropped)
    
    df_majority = df_train[df_train.BP_Category == 'hypertension']
    df_minority = df_train[(df_train.BP_Category == 'elivated') | (df_train.BP_Category == 'normal')]
    
    df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=150,    # to match majority class
                                 random_state=123) # reproducible results
    df_train = pd.concat([df_majority, df_minority_upsampled])
    print(df_train['BP_Category'].value_counts())
 
    
    ##lr
    pipeline_lr.fit(X = df_train.loc[:, df_train.columns != 'BP_Category'].values, y = df_train['BP_Category'].values)
    predicted_labels = pipeline_lr.predict(df_test.loc[:, df_test.columns != 'BP_Category'].values)
    print(pd.Series(predicted_labels).value_counts())
    F1_LR.append(f1_score(df_test['BP_Category'], predicted_labels, average='weighted'))
    ACC_LR.append(accuracy_score(df_test['BP_Category'], predicted_labels))
    
    ##gbc
    
    pipeline_gbc.fit(X = df_train.loc[:, df_train.columns != 'BP_Category'].values, y = df_train['BP_Category'].values)
    predicted_labels = pipeline_gbc.predict(df_test.loc[:, df_test.columns != 'BP_Category'].values)
    
    F1_GBC.append(f1_score(df_test['BP_Category'], predicted_labels, average='weighted'))
    ACC_GBC.append(accuracy_score(df_test['BP_Category'], predicted_labels))
    

print("average ACC for the LR for " + predicted_variable + " is " + str(np.mean(np.array(ACC_LR)))+ " sd " + str(np.std(np.array(ACC_LR)))) 
print("average F1 for the LR for " + predicted_variable + " is " + str(np.mean(np.array(F1_LR)))+ " sd " + str(np.std(np.array(F1_LR))))


print("average ACC for the GBC for " + predicted_variable + " is " + str(np.mean(np.array(ACC_GBC)))+ " sd " + str(np.std(np.array(ACC_GBC)))) 
print("average F1 for the GBC for " + predicted_variable + " is " + str(np.mean(np.array(F1_GBC)))+ " sd " + str(np.std(np.array(F1_GBC))))

running fold1
normal          150
hypertension     84
Name: BP_Category, dtype: int64
normal          12
hypertension     6
dtype: int64
running fold2
normal          150
hypertension     87
Name: BP_Category, dtype: int64
normal          14
hypertension     2
dtype: int64
running fold3
normal          150
hypertension     82
Name: BP_Category, dtype: int64
hypertension    13
normal           4
dtype: int64
running fold4
normal          150
hypertension     81
Name: BP_Category, dtype: int64
normal          10
hypertension     8
dtype: int64
running fold5
normal          150
hypertension     82
Name: BP_Category, dtype: int64
hypertension    11
normal           7
dtype: int64
running fold6
normal          150
hypertension     83
Name: BP_Category, dtype: int64
hypertension    15
normal           4
dtype: int64
running fold7
normal          150
hypertension     83
Name: BP_Category, dtype: int64
normal          13
hypertension     4
dtype: int64
running fold8
normal          150
hyperte