In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, f1_score, accuracy_score
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import tensorflow as tf
from sklearn.multiclass import OneVsRestClassifier
from sklearn.utils import resample
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dropout
from sklearn.svm import SVR, SVC
from sklearn.feature_selection import RFE
from sklearn.dummy import DummyRegressor

In [2]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
predicted_variable = 'dbp' 

In [4]:
df = pd.read_pickle("../intermediate_data/scalogram_resnet_representation.pkl")

In [5]:
df.head()

Unnamed: 0,representation,patientid,sbp,dbp
0,"[1.0031806, 1.2474066, 0.04806124, 0.35932362,...",20,178,89
1,"[0.520468, 1.3503281, 0.010736622, 0.15120158,...",22,112,58
2,"[0.5581223, 0.84562373, 0.0, 0.33805272, 0.169...",2,143,69
3,"[0.709408, 0.99573827, 0.03802151, 0.28552535,...",12,160,78
4,"[1.0288619, 1.0717208, 0.005405174, 0.31130534...",16,116,70


In [6]:
df['patientid'] = pd.to_numeric(df['patientid'])
df['sbp'] = pd.to_numeric(df['sbp'])
df['dbp'] = pd.to_numeric(df['dbp'])

In [7]:
df.drop(df.loc[(df['sbp'] == 0)|(df['dbp'] == 0)].index, inplace = True)
df.shape

(148, 4)

In [8]:
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(50, input_dim=512, kernel_initializer='normal', kernel_regularizer=regularizers.l2(0.1), activation='relu'))
    model.add(Dense(20, kernel_initializer='normal', kernel_regularizer=regularizers.l2(0.1), activation='relu'))
    #model.add(Dense(5, kernel_initializer='normal', kernel_regularizer=regularizers.l2(0.1), activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [9]:
patient_ids = np.unique(df['patientid'])

estimator_base = SVR(kernel="linear")
selector = RFE(estimator_base, 40, step=1)

estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)

estimators_lr = []
#estimators_lr.append(('standardize', StandardScaler()))
estimators_lr.append(('selector', selector))
estimators_lr.append(('lr',  ElasticNet(alpha=0.1, l1_ratio=0.5, random_state = 42)))
pipeline_lr = Pipeline(estimators_lr)

estimators_gbm = []
#estimators_gbm.append(('standardize', StandardScaler()))
estimators_gbm.append(('selector', selector))
estimators_gbm.append(('gbm',  GradientBoostingRegressor(learning_rate=0.01, n_estimators=50, random_state = 42)))
pipeline_gbm = Pipeline(estimators_gbm)

dummy_mean = DummyRegressor(strategy='mean')




RMSE_NN = []
R2_NN = []
MAPE_NN = []

RMSE_LR = []
R2_LR = []
MAPE_LR = []

RMSE_GBM = []
R2_GBM = []
MAPE_GBM = []

RMSE_Dummy = []
R2_Dummy = []
MAPE_Dummy = []

i = 0

while len(patient_ids) > 1:
    
    i= i + 1
    random.seed(42)
    patient_test_ids = random.choices(patient_ids, k = 2)
    patient_ids = [e for e in patient_ids if e not in patient_test_ids]
    df_test = df.loc[df['patientid'].isin(patient_test_ids)].dropna()
    df_train = df[~df['patientid'].isin(patient_test_ids)].dropna()
    print("running fold" + str(i))
    
    cols_dropped = ['patientid']

    if predicted_variable == 'sbp':
        cols_dropped.append('dbp')
    elif predicted_variable == 'dbp':
        cols_dropped.append('sbp')
    df_train = df_train.drop(columns = cols_dropped)
    df_test = df_test.drop(columns = cols_dropped)
    
    ##nn
    #pipeline.fit(X = np.stack(df_train["representation"]), y = df_train[predicted_variable].values)
    #predicted_labels = pipeline.predict(np.stack(df_test["representation"]))
    
    #RMSE_NN.append(np.sqrt(mean_squared_error(df_test[predicted_variable], predicted_labels)))  
    #R2_NN.append(r2_score(df_test[predicted_variable], predicted_labels))
    #MAPE_NN.append(mean_absolute_percentage_error(df_test[predicted_variable], predicted_labels))
    
    ##lr
    pipeline_lr.fit(X = np.stack(df_train["representation"]), y = df_train[predicted_variable].values)
    predicted_labels = pipeline_lr.predict(np.stack(df_test["representation"]))
    
    RMSE_LR.append(np.sqrt(mean_squared_error(df_test[predicted_variable], predicted_labels)))  
    R2_LR.append(r2_score(df_test[predicted_variable], predicted_labels))
    MAPE_LR.append(mean_absolute_percentage_error(df_test[predicted_variable], predicted_labels))
    
    #gbm 
    
    pipeline_gbm.fit(X = np.stack(df_train["representation"]), y = df_train[predicted_variable].values)
    predicted_labels = pipeline_gbm.predict(np.stack(df_test["representation"]))
    
    RMSE_GBM.append(np.sqrt(mean_squared_error(df_test[predicted_variable], predicted_labels)))  
    R2_GBM.append(r2_score(df_test[predicted_variable], predicted_labels))
    MAPE_GBM.append(mean_absolute_percentage_error(df_test[predicted_variable], predicted_labels))
    
    
    ##dummy: predicting mean
    
    dummy_mean.fit(X = np.stack(df_train["representation"]), y = df_train[predicted_variable].values)
    predicted_labels = dummy_mean.predict(np.stack(df_test["representation"]))
    
    RMSE_Dummy.append(np.sqrt(mean_squared_error(df_test[predicted_variable], predicted_labels)))  
    R2_Dummy.append(r2_score(df_test[predicted_variable], predicted_labels))
    MAPE_Dummy.append(mean_absolute_percentage_error(df_test[predicted_variable], predicted_labels))
    
    
    
    # evaluate model with standardized dataset
    
#print("average RMSE for the NN for " + predicted_variable + " is " + str(np.mean(np.array(RMSE_NN)))+ " sd " + str(np.std(np.array(RMSE_NN)))) 
#print("average R2 for the NN for " + predicted_variable + " is " + str(np.mean(np.array(R2_NN)))+ " sd " + str(np.std(np.array(R2_NN))))
#print("average MAPE for the NN for " + predicted_variable + " is " + str(np.mean(np.array(MAPE_NN)))+ " sd " + str(np.std(np.array(MAPE_NN))))

print("average RMSE for the LR for " + predicted_variable + " is " + str(np.mean(np.array(RMSE_LR)))+ " sd " + str(np.std(np.array(RMSE_LR)))) 
print("average R2 for the LR for " + predicted_variable + " is " + str(np.mean(np.array(R2_LR)))+ " sd " + str(np.std(np.array(R2_LR))))
print("average MAPE for the LR for " + predicted_variable + " is " + str(np.mean(np.array(MAPE_LR)))+ " sd " + str(np.std(np.array(MAPE_LR))))

print("average RMSE for the GBM for " + predicted_variable + " is " + str(np.mean(np.array(RMSE_GBM)))+ " sd " + str(np.std(np.array(RMSE_GBM)))) 
print("average R2 for the GBM for " + predicted_variable + " is " + str(np.mean(np.array(R2_GBM)))+ " sd " + str(np.std(np.array(R2_GBM))))
print("average MAPE for the GBM for " + predicted_variable + " is " + str(np.mean(np.array(MAPE_GBM)))+ " sd " + str(np.std(np.array(MAPE_GBM))))

print("average RMSE for the Dummy Predictor for " + predicted_variable + " is " + str(np.mean(np.array(RMSE_Dummy)))+ " sd " + str(np.std(np.array(RMSE_Dummy)))) 
print("average R2 for the Dummy Predictor for " + predicted_variable + " is " + str(np.mean(np.array(R2_Dummy)))+ " sd " + str(np.std(np.array(R2_Dummy))))
print("average MAPE for the Dummy Predictor for " + predicted_variable + " is " + str(np.mean(np.array(MAPE_Dummy)))+ " sd " + str(np.std(np.array(MAPE_Dummy))))


running fold1
running fold2
running fold3
running fold4
running fold5
running fold6
running fold7
running fold8
running fold9
running fold10
running fold11
running fold12
running fold13
average RMSE for the LR for dbp is 8.602114749317613 sd 2.2489694844680717
average R2 for the LR for dbp is -1.1324873259808346 sd 1.1999999348000359
average MAPE for the LR for dbp is 10.228215249214834 sd 3.7694908567674967
average RMSE for the GBM for dbp is 8.693514336249187 sd 2.4312386222119344
average R2 for the GBM for dbp is -1.230051481655818 sd 1.5057023367957585
average MAPE for the GBM for dbp is 10.76127893723761 sd 3.6896581767413386
average RMSE for the Dummy Predictor for dbp is 8.526479600790639 sd 2.558311151559337
average R2 for the Dummy Predictor for dbp is -1.1515115217440466 sd 1.544290237263158
average MAPE for the Dummy Predictor for dbp is 10.65367172913035 sd 3.815336180522183


In [10]:
df['BP_Category'] = 'normal'
#df.loc[(df['sbp'] > 120) & (df['sbp'] < 130) & (df['dbp'] < 80), 'BP_Category'] = 'elivated'
df.loc[(df['sbp'] > 130) | (df['dbp'] > 80), 'BP_Category'] = 'hypertension'

In [11]:
df['BP_Category'].value_counts()

hypertension    95
normal          53
Name: BP_Category, dtype: int64

In [12]:
patient_ids = np.unique(df['patientid'])

estimator_base = SVC(kernel="linear")
selector = RFE(estimator_base, 40, step=1)

estimators_lr = []
estimators_lr.append(('standardize', StandardScaler()))
estimators_lr.append(('selector', selector))
estimators_lr.append(('lr',  LogisticRegression(penalty='l2', C=0.1, random_state = 42, solver = 'lbfgs', multi_class = 'ovr', class_weight='balanced')))
pipeline_lr = Pipeline(estimators_lr)

estimators_gbc = []
#estimators_gbc.append(('standardize', StandardScaler()))
estimators_gbc.append(('selector', selector))
estimators_gbc.append(('gbc',  GradientBoostingClassifier(learning_rate = 0.1, subsample = 0.5, random_state = 42)))
pipeline_gbc = Pipeline(estimators_gbc)

ACC_LR = []
F1_LR = []

ACC_GBC = []
F1_GBC = []

i = 0

while len(patient_ids) > 1:
    
    i= i + 1
    random.seed(42)
    patient_test_ids = random.choices(patient_ids, k = 3)
    patient_ids = [e for e in patient_ids if e not in patient_test_ids]
    df_test = df.loc[df['patientid'].isin(patient_test_ids)].dropna()
    df_train = df[~df['patientid'].isin(patient_test_ids)].dropna()
    print("running fold" + str(i))
    
    cols_dropped = ['patientid', 'sbp', 'dbp']
    df_train = df_train.drop(columns = cols_dropped)
    df_test = df_test.drop(columns = cols_dropped)
    
    #df_majority = df_train[df_train.BP_Category == 'hypertension']
    #df_minority = df_train[(df_train.BP_Category == 'elivated') | (df_train.BP_Category == 'normal')]
    
    #df_minority_upsampled = resample(df_minority, 
    #                             replace=True,     # sample with replacement
    #                             n_samples=150,    # to match majority class
    #                             random_state=123) # reproducible results
    #df_train = pd.concat([df_majority, df_minority_upsampled])
    print(df_train['BP_Category'].value_counts())
 
    
    ##lr
    pipeline_lr.fit(X = np.stack(df_train["representation"]), y = df_train['BP_Category'].values)
    predicted_labels = pipeline_lr.predict(np.stack(df_test["representation"]))
    print(pd.Series(predicted_labels).value_counts())
    F1_LR.append(f1_score(df_test['BP_Category'], predicted_labels, average='weighted'))
    ACC_LR.append(accuracy_score(df_test['BP_Category'], predicted_labels))
    
    ##gbc
    
    pipeline_gbc.fit(X = np.stack(df_train["representation"]), y = df_train['BP_Category'].values)
    predicted_labels = pipeline_gbc.predict(np.stack(df_test["representation"]))
    
    F1_GBC.append(f1_score(df_test['BP_Category'], predicted_labels, average='weighted'))
    ACC_GBC.append(accuracy_score(df_test['BP_Category'], predicted_labels))
    

print("average ACC for the LR for " + predicted_variable + " is " + str(np.mean(np.array(ACC_LR)))+ " sd " + str(np.std(np.array(ACC_LR)))) 
print("average F1 for the LR for " + predicted_variable + " is " + str(np.mean(np.array(F1_LR)))+ " sd " + str(np.std(np.array(F1_LR))))


print("average ACC for the GBC for " + predicted_variable + " is " + str(np.mean(np.array(ACC_GBC)))+ " sd " + str(np.std(np.array(ACC_GBC)))) 
print("average F1 for the GBC for " + predicted_variable + " is " + str(np.mean(np.array(F1_GBC)))+ " sd " + str(np.std(np.array(F1_GBC))))

running fold1
hypertension    84
normal          47
Name: BP_Category, dtype: int64
normal          12
hypertension     5
dtype: int64
running fold2
hypertension    87
normal          45
Name: BP_Category, dtype: int64
normal          9
hypertension    7
dtype: int64
running fold3
hypertension    82
normal          49
Name: BP_Category, dtype: int64
hypertension    10
normal           7
dtype: int64
running fold4
hypertension    81
normal          49
Name: BP_Category, dtype: int64
hypertension    12
normal           6
dtype: int64
running fold5
hypertension    82
normal          48
Name: BP_Category, dtype: int64
hypertension    12
normal           6
dtype: int64
running fold6
hypertension    83
normal          46
Name: BP_Category, dtype: int64
hypertension    15
normal           4
dtype: int64
running fold7
hypertension    83
normal          48
Name: BP_Category, dtype: int64
hypertension    9
normal          8
dtype: int64
running fold8
hypertension    89
normal          44
Name: B