# Train feedback estimators using Logistic and Linear Regression

This script trains the following feedback estimators on the ACCORD_BPClass_v2.csv dataset

* CVDRiskEstr: logistic regression, obtains accuracy around 0.88
* SBPEstr: linear regression, poor fitting, R^2 = 0.03
* A1CEstr: linear regression, poor fitting, R^2 = 0.09

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Activation, Dense
import tensorflow as tf
import pickle
import numpy as np

2023-04-17 23:58:26.126217: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Input CSV; the "_merged" suffix means the sbp_discrete levels are merged.
DATA = '../../data/ACCORD_BPClass_v2_merged.csv'

# Context feature columns prepended to the feature list of every estimator
# trained in this file.
# Disabled alternatives: 'race_black', 'baseline_BMI_discrete'.
context_fea = [
    'baseline_age',
    'female',
    'race_whiteother',
    'edu_baseline',
    'cvd_hx_baseline',
    'baseline_BMI',
    'cigarett_baseline',
]

## Preprocess

In [15]:
# Discretize baseline_BMI into four ordinal levels, stored in a new
# 'baseline_BMI_discrete' column:
#   0: BMI < 18.5
#   1: 18.5 <= BMI < 25
#   2: 25 <= BMI < 30
#   3: BMI >= 30 (a missing/NaN BMI also lands here)

fn = DATA
df = pd.read_csv(fn)

# np.digitize returns, for each value, the index of the bin it falls in
# (lower edge inclusive, upper edge exclusive), i.e. exactly the levels listed
# above. It classifies the whole column in one vectorized call instead of
# visiting every row through df.iloc in a Python loop.
bmi_bin_edges = [18.5, 25, 30]
baseline_BMI_discrete = np.digitize(df['baseline_BMI'].to_numpy(), bmi_bin_edges)

df['baseline_BMI_discrete'] = baseline_BMI_discrete

# save to csv
# NOTE: fn_out is the same path as fn, so the input file is overwritten in
# place with the extra column. Use the commented-out line below to write to a
# separate file instead.
# fn_out = fn.replace('.csv', '_contextual.csv')
fn_out = fn
df.to_csv(fn_out, index=False)

## CVDRisk

In [38]:
def train_CVDRisk_estimator(fn, flag):
    """Fit a logistic-regression CVD-risk classifier on the whole dataset.

    The features are standardized, the model is fitted on every row of the
    CSV, and in-sample metrics are printed: accuracy of the thresholded
    probabilities against ``CVDRisk_feedback_binary``, and MSE / MAE / RMSE /
    R2 of the raw probabilities against the numeric ``CVDRisk_feedback``.
    No held-out evaluation is performed.

    Args:
        fn: Path of the CSV file to load with ``pd.read_csv``.
        flag: Feature-set selector: ``'BP'``, ``'BG'`` or ``'BPBG'``.

    Returns:
        Tuple ``(estimator, scaler, fea_cols)``: the fitted
        ``LogisticRegression``, the ``StandardScaler`` fitted on the same
        rows, and the feature column names in model input order.

    Raises:
        ValueError: If ``flag`` is not one of the supported values.
    """
    # Top 4 most frequently used BP med classes (per the original author).
    bp_top4 = ['Diur', 'ACE', 'Beta-blocker', 'CCB']
    clinical = ['BMI', 'sbp', 'hba1c', 'TC', 'hdl']

    # flag -> (state_cols, medclass_cols). Only 'BP' has a separate state
    # vector; the other two must still define an (empty) one, otherwise the
    # concatenation below fails with UnboundLocalError.
    feature_sets = {
        'BP': (['sbp_discrete_merged'], bp_top4),
        'BG': ([], clinical + ['bgclass_none']),
        'BPBG': ([], clinical + ['bpclass_none'] + bp_top4
                 + ['ARB', 'Alpha-Beta-blocker', 'Alpha-blocker', 'Sympath', 'Vasod',
                    'bgclass_none']),
    }
    # Validate before any I/O so a bad flag fails fast, and raise instead of
    # calling exit(), which would terminate the interpreter/kernel.
    if flag not in feature_sets:
        raise ValueError(f"flag must be 'BP', 'BG' or 'BPBG', got {flag!r}")
    state_cols, medclass_cols = feature_sets[flag]

    from sklearn.metrics import (accuracy_score, mean_absolute_error,
                                 mean_squared_error, r2_score)
    from sklearn.preprocessing import StandardScaler

    # get the features and labels
    df = pd.read_csv(fn)

    fea_cols = context_fea + state_cols + medclass_cols
    print('fea_cols = ', fea_cols)

    X = df[fea_cols].values
    # Original author's note: 0.2 is used as the threshold to make the
    # binarized classes balanced, as only 3% of the data has
    # CVDRisk_feedback >= 0.5.
    y = df['CVDRisk_feedback_binary'].values

    # train the model on the whole dataset
    scaler_all = StandardScaler()
    X = scaler_all.fit_transform(X)
    estimator_all = LogisticRegression(max_iter=400).fit(X, y)
    probs = estimator_all.predict_proba(X)[:, 1]
    threshold = 0.2
    y_pred = (probs >= threshold).astype(int)

    # in-sample accuracy of the thresholded predictions
    acc = accuracy_score(y, y_pred)
    print('acc = ', acc)

    # compare the predicted probabilities with the numeric risk score
    y_numeric = df['CVDRisk_feedback'].values
    mse = mean_squared_error(y_numeric, probs)
    mae = mean_absolute_error(y_numeric, probs)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_numeric, probs)
    print('MSE = ', mse)
    print('MAE = ', mae)
    print('RMSE = ', rmse)
    print('R2 = ', r2)

    return (estimator_all, scaler_all, fea_cols)

# Fit the logistic-regression CVD-risk estimator with the 'BP' feature set and
# keep the fitted model, its scaler and the feature-column order.
CVDRisk_estimator_BP, CVDRisk_scaler_BP, CVDRisk_fea_BP = train_CVDRisk_estimator(DATA, 'BP')

fea_cols =  ['baseline_age', 'female', 'race_whiteother', 'edu_baseline', 'cvd_hx_baseline', 'baseline_BMI_discrete', 'cigarett_baseline', 'sbp_discrete_merged', 'Diur', 'ACE', 'Beta-blocker', 'CCB']
acc =  0.8256418663018779
MSE =  0.1641520307058241
MAE =  0.315682863349527
RMSE =  0.40515679767939733
R2 =  -8.507924848563299


In [46]:
# Linear-regression estimator for the logit-transformed CVD risk
def train_CVDRisk_estimator_linear(fn):
    """Fit a linear regression on the logit of ``CVDRisk_feedback``.

    Prints the R^2 score on an 80/20 train/test split, then refits on every
    row and prints that in-sample score as well.

    Args:
        fn: Path of the CSV file to load with ``pd.read_csv``.

    Returns:
        The ``LinearRegression`` fitted on the full dataset. Its predictions
        are on the logit scale, not the original risk scale.
    """
    frame = pd.read_csv(fn)

    # context columns + state vector + top 4 most frequently used BP med
    # classes (per the original author)
    sbp_state = ['sbp_discrete_merged']
    bp_meds = ['Diur', 'ACE', 'Beta-blocker', 'CCB']
    feature_names = context_fea + sbp_state + bp_meds

    features = frame[feature_names].values
    risk = frame['CVDRisk_feedback'].values

    # -log(1/y - 1) is the logit of y; it maps the risk onto a linear scale.
    target = -np.log(1/risk-1)

    print('X.shape = ', features.shape)
    print('y.shape = ', target.shape)

    # hold out 20% of the rows to check generalization
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=100)

    holdout_model = LinearRegression()
    holdout_model.fit(X_train, y_train)
    train_score = holdout_model.score(X_train, y_train)
    test_score = holdout_model.score(X_test, y_test)
    print('train_score = ', train_score)
    print('test_score = ', test_score)

    # final model: refit on the whole dataset
    estimator_all = LinearRegression()
    estimator_all.fit(features, target)
    print('train_score using all data = ', estimator_all.score(features, target))

    return estimator_all

estimator = train_CVDRisk_estimator_linear(DATA)

X.shape =  (176906, 12)
y.shape =  (176906,)
train_score =  0.7914448950585644
test_score =  0.7917677348536329
train_score using all data =  0.7915120237782078


## SBP

In [61]:
# Degree-2 polynomial linear regression for the SBP feedback
def train_SBP_estimator(fn):
    """Fit a linear regression for ``sbp_feedback`` on degree-2 polynomial features.

    Prints a statsmodels OLS summary (for p-values), the R^2 score on an
    80/20 train/test split, and the in-sample score of the final model.

    Args:
        fn: Path of the CSV file to load with ``pd.read_csv``.

    Returns:
        The ``LinearRegression`` fitted on the expanded feature matrix of the
        full dataset.
    """
    # NOTE(review): the fitted PolynomialFeatures transformer is not returned,
    # so a caller has to rebuild the same expansion before calling predict().
    from sklearn.preprocessing import PolynomialFeatures
    import statsmodels.api as sm

    frame = pd.read_csv(fn)

    # context columns + state vector + top 4 most frequently used BP med
    # classes (per the original author)
    clinical_state = ['BMI', 'hba1c', 'TC', 'hdl']
    bp_meds = ['Diur', 'ACE', 'Beta-blocker', 'CCB']
    feature_names = context_fea + clinical_state + bp_meds

    raw_features = frame[feature_names].values
    target = frame['sbp_feedback'].values

    # expand to all degree-2 polynomial terms
    expander = PolynomialFeatures(degree=2)
    features = expander.fit_transform(raw_features)

    # check for p values
    features = sm.add_constant(features)
    ols_fit = sm.OLS(target, features).fit()
    print(ols_fit.summary())

    print('X.shape = ', features.shape)
    print('y.shape = ', target.shape)

    # hold out 20% of the rows to check generalization
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=100)

    # linear regression predicting the SBP feedback
    holdout_model = LinearRegression()
    holdout_model.fit(X_train, y_train)
    train_score = holdout_model.score(X_train, y_train)
    test_score = holdout_model.score(X_test, y_test)
    print('train_score = ', train_score)
    print('test_score = ', test_score)

    # final model: refit on the whole dataset
    estimator_all = LinearRegression()
    estimator_all.fit(features, target)
    print('train_score using all data = ', estimator_all.score(features, target))

    return estimator_all

SBP_feedback_estimator = train_SBP_estimator(DATA)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.053
Model:                            OLS   Adj. R-squared:                  0.052
Method:                 Least Squares   F-statistic:                     100.7
Date:                Mon, 17 Apr 2023   Prob (F-statistic):               0.00
Time:                        22:59:42   Log-Likelihood:            -7.2665e+05
No. Observations:              176906   AIC:                         1.453e+06
Df Residuals:                  176807   BIC:                         1.454e+06
Df Model:                          98                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        109.2334      6.599     16.554      0.0

## A1C

In [28]:
# Linear-regression estimator for the A1C feedback
def train_A1C_estimator(fn):
    """Fit a linear regression for ``hba1c_feedback``.

    Prints the R^2 score on an 80/20 train/test split, then refits on every
    row and prints that in-sample score as well.

    Args:
        fn: Path of the CSV file to load with ``pd.read_csv``.

    Returns:
        The ``LinearRegression`` fitted on the full dataset.
    """
    frame = pd.read_csv(fn)

    # context columns + state vector + top 4 most frequently used BP med
    # classes (per the original author)
    sbp_state = ['sbp_discrete']
    bp_meds = ['Diur', 'ACE', 'Beta-blocker', 'CCB']
    feature_names = context_fea + sbp_state + bp_meds

    features = frame[feature_names].values
    target = frame['hba1c_feedback'].values

    print('X.shape = ', features.shape)
    print('y.shape = ', target.shape)

    # hold out 20% of the rows to check generalization
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=100)

    # linear regression predicting the A1C feedback
    holdout_model = LinearRegression()
    holdout_model.fit(X_train, y_train)
    train_score = holdout_model.score(X_train, y_train)
    test_score = holdout_model.score(X_test, y_test)
    print('train_score = ', train_score)
    print('test_score = ', test_score)

    # final model: refit on the whole dataset
    estimator = LinearRegression()
    estimator.fit(features, target)
    print('train_score using all data = ', estimator.score(features, target))

    return estimator

# NOTE(review): despite its name, SBP_estimator holds the A1C estimator; the
# name is kept as-is so that existing references keep working.
SBP_estimator = train_A1C_estimator(DATA)

X.shape =  (176906, 12)
y.shape =  (176906,)
train_score =  0.06409643687454869
test_score =  0.06762149278475882
train_score using all data =  0.06481888991998697


# Train Estimators using ANN

## ANN model class

In [5]:
# A class that builds an ANN model for SBP feedback, using the TensorFlow framework
   
class ANN:
    """Small fully-connected Keras regression network.

    Args:
        input_shape: Number of input features (an int) or a shape tuple.
        output_shape: Number of units in the output layer.
        hidden_layers: Widths of the hidden Dense layers, in order.
        output_activation: Activation of the output layer.
        hidden_activation: Activation of every hidden layer. The default
            ``'linear'`` reproduces the previous hard-coded behaviour; with
            linear hidden layers and a linear output the whole network can
            only represent a linear map, so pass e.g. ``'relu'`` for a
            non-linear model.
    """

    def __init__(self, input_shape, output_shape, hidden_layers, output_activation='linear',
                 hidden_activation='linear'):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.hidden_layers = hidden_layers
        self.hidden_activation = hidden_activation

        # Callers pass a bare feature count (X.shape[1]); the Keras Input
        # layer documents `shape` as a tuple, so wrap integers explicitly.
        if isinstance(input_shape, (int, np.integer)):
            layer_input_shape = (int(input_shape),)
        else:
            layer_input_shape = input_shape

        # Define the model architecture
        self.model = tf.keras.Sequential()
        self.model.add(tf.keras.layers.Input(shape=layer_input_shape))
        for units in hidden_layers:
            self.model.add(tf.keras.layers.Dense(units, activation=hidden_activation))
        self.model.add(tf.keras.layers.Dense(output_shape, activation=output_activation))

    def train(self, x_train, y_train, x_val, y_val, epochs=10, batch_size=16):
        """Compile the model (MSE loss, Adam optimizer) and fit it.

        Args:
            x_train, y_train: Training inputs and targets.
            x_val, y_val: Validation inputs and targets evaluated each epoch.
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
        """
        # Compile the model with appropriate loss function and optimizer
        self.model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

        # Train the model
        self.model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=epochs, batch_size=batch_size, verbose=1)

    def evaluate(self, x, y):
        """Return the result of ``model.evaluate`` (loss and metrics) on ``x``, ``y``."""
        return self.model.evaluate(x, y)

    def predict(self, x):
        """Return the model's predictions for ``x``."""
        return self.model.predict(x)

## CVDRisk

In [41]:
# train an ANN model for CVDRisk
def train_CVDRisk_estimator_ANN(fn, flag, train_onwhole=False):
    """Train an ANN regressor for the numeric ``CVDRisk_feedback``.

    Args:
        fn: Path of the CSV file to load with ``pd.read_csv``.
        flag: Feature-set selector: ``'BP'``, ``'BG'`` or ``'BPBG'``.
        train_onwhole: If False (hyperparameter exploration), train on a 70%
            split, validate on 15%, print the score on the remaining 15% and
            return no model. If True, train on every row (the validation data
            is then the training data itself) and return the model.

    Returns:
        Tuple ``(model, fea_cols)``; ``model`` is ``None`` when
        ``train_onwhole`` is False.

    Raises:
        ValueError: If ``flag`` is not one of the supported values.
    """
    # Top 4 most frequently used BP med classes (per the original author).
    bp_top4 = ['Diur', 'ACE', 'Beta-blocker', 'CCB']
    clinical = ['BMI', 'sbp', 'hba1c', 'TC', 'hdl']

    # flag -> (state_cols, medclass_cols). Only 'BP' has a separate state
    # vector; the other two must still define an (empty) one, otherwise the
    # concatenation below fails with UnboundLocalError.
    feature_sets = {
        'BP': (['sbp_discrete_merged'], bp_top4),
        'BG': ([], clinical + ['bgclass_none']),
        'BPBG': ([], clinical + ['bpclass_none'] + bp_top4
                 + ['ARB', 'Alpha-Beta-blocker', 'Alpha-blocker', 'Sympath', 'Vasod',
                    'bgclass_none']),
    }
    # Validate before any I/O so a bad flag fails fast, and raise instead of
    # calling exit(), which would terminate the interpreter/kernel.
    if flag not in feature_sets:
        raise ValueError(f"flag must be 'BP', 'BG' or 'BPBG', got {flag!r}")
    state_cols, medclass_cols = feature_sets[flag]

    # get the features and labels
    df = pd.read_csv(fn)

    fea_cols = context_fea + state_cols + medclass_cols
    X = df[fea_cols].values
    y = df['CVDRisk_feedback'].values
    print('X.shape = ', X.shape)
    print('y.shape = ', y.shape)

    # Split the data into training (70%), validation (15%) and testing (15%) sets
    X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, test_size=0.3, random_state=100)
    X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, test_size=0.5, random_state=100)

    # model = ANN(X.shape[1], 1, [16, 8], 'sigmoid')
    model = ANN(X.shape[1], 1, [16, 8], 'linear')
    if not train_onwhole:  # to explore hyperparameters
        model.train(X_train, y_train, X_val, y_val, epochs=20, batch_size=32)
        test_score = model.evaluate(X_test, y_test)
        print('test_score = ', test_score)
        return (None, fea_cols)

    # retrain the model on the whole dataset
    model.train(X, y, X, y, epochs=20, batch_size=32)
    return (model, fea_cols)

# none, CVDRisk_fea = train_CVDRisk_estimator_ANN('../../../Codes/Accord/data/ACCORD_BPClass_v2.csv', 'BP')    
# Train on the whole dataset (train_onwhole=True) so that the fitted model is
# returned. This also rebinds CVDRisk_fea_BP, which the logistic-regression
# cell above assigns as well.
CVDRisk_estimator_ANN_BP, CVDRisk_fea_BP = train_CVDRisk_estimator_ANN(DATA, 'BP', True)

X.shape =  (176906, 12)
y.shape =  (176906,)


2023-04-17 20:52:03.328981: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## SBP

In [6]:
# ANN estimator for the SBP feedback
def train_SBP_estimator_ANN(fn, train_onwhole=False):
    """Train an ANN regressor for ``sbp_feedback``.

    Args:
        fn: Path of the CSV file to load with ``pd.read_csv``.
        train_onwhole: If False (hyperparameter exploration), train on a 70%
            split, validate on 15%, print the score on the remaining 15% and
            return no model. If True, train on every row (the validation data
            is then the training data itself) and return the model.

    Returns:
        Tuple ``(model, fea_cols)``; ``model`` is ``None`` when
        ``train_onwhole`` is False.
    """
    frame = pd.read_csv(fn)

    # context columns followed by the four BP medication-class columns
    fea_cols = context_fea + ['Diur', 'ACE', 'Beta-blocker', 'CCB']
    features = frame[fea_cols].values
    target = frame['sbp_feedback'].values

    print('X.shape = ', features.shape)
    print('y.shape = ', target.shape)

    # 70% train; the remaining 30% is halved into validation and test
    X_train, X_rest, y_train, y_rest = train_test_split(
        features, target, test_size=0.3, random_state=100)
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=100)

    net = ANN(features.shape[1], 1, [16, 8])

    if train_onwhole:
        # final model: fit on the whole dataset
        net.train(features, target, features, target, epochs=50, batch_size=16)
        return (net, fea_cols)

    # exploration run: report the held-out score, keep no model
    net.train(X_train, y_train, X_val, y_val, epochs=20, batch_size=16)
    test_score = net.evaluate(X_test, y_test)
    print('test_score = ', test_score)
    return (None, fea_cols)

# none, SBP_fea = train_SBP_estimator_ANN('../../../Codes/Accord/data/ACCORD_BPClass_v2.csv')
SBP_estimator_ANN, SBP_fea = train_SBP_estimator_ANN(DATA, True)


X.shape =  (176906, 11)
y.shape =  (176906,)
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

## A1C 

In [10]:
# ANN estimator for A1C
def train_A1C_estimator_ANN(fn, train_onwhole=False):
    """Train an ANN regressor for the ``hba1c`` column.

    Args:
        fn: Path of the CSV file to load with ``pd.read_csv``.
        train_onwhole: If False (hyperparameter exploration), train on a 70%
            split, validate on 15%, print the score on the remaining 15% and
            return no model. If True, train on every row (the validation data
            is then the training data itself) and return the model.

    Returns:
        Tuple ``(model, fea_cols)``; ``model`` is ``None`` when
        ``train_onwhole`` is False.
    """
    frame = pd.read_csv(fn)

    # clinical measurements followed by the BG medication-class columns
    bg_cols = [
        'BMI', 'sbp', 'TC', 'hdl',
        'bgclass_none', 'Bingu', 'Thiaz', 'Sulfon', 'Meglit',
        'Alpha-gluc',
    ]
    fea_cols = context_fea + bg_cols
    features = frame[fea_cols].values
    target = frame['hba1c'].values

    print('X.shape = ', features.shape)
    print('y.shape = ', target.shape)

    # 70% train; the remaining 30% is halved into validation and test
    X_train, X_rest, y_train, y_rest = train_test_split(
        features, target, test_size=0.3, random_state=100)
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=100)

    net = ANN(features.shape[1], 1, [16, 8])

    if train_onwhole:
        # final model: fit on the whole dataset
        net.train(features, target, features, target, epochs=15, batch_size=32)
        return (net, fea_cols)

    # exploration run: report the held-out score, keep no model
    net.train(X_train, y_train, X_val, y_val, epochs=20, batch_size=32)
    test_score = net.evaluate(X_test, y_test)
    print('test_score = ', test_score)
    return (None, fea_cols)

# none, A1C_fea = train_A1C_estimator_ANN('../../../Codes/Accord/data/ACCORD_BGClass_v2.csv')
# This estimator reads the BGClass dataset from its own path rather than DATA.
A1C_estimator_ANN, A1C_fea = train_A1C_estimator_ANN('../../../Codes/Accord/data/ACCORD_BGClass_v2.csv', True)

X.shape =  (290467, 17)
y.shape =  (290467,)
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Save trained model for RL feedback

In [18]:
# Trained ANN models, keyed by the feedback signal they estimate.
estimators = {
    'CVDRisk-BP': CVDRisk_estimator_ANN_BP,
    'SBP': SBP_estimator_ANN,
    'A1C': A1C_estimator_ANN,
}
# Input column order of each model, under the same keys.
feature_cols = {
    'CVDRisk-BP': CVDRisk_fea_BP,
    'SBP': SBP_fea,
    'A1C': A1C_fea,
}

# Persist both dictionaries with pickle into the current working directory.
# NOTE: unpickling executes code, so only load these files from a trusted source.
with open('estimators.pkl', 'wb') as f:
    pickle.dump(estimators, f)

with open('feature_cols.pkl', 'wb') as f:
    pickle.dump(feature_cols, f)

INFO:tensorflow:Assets written to: ram://ef1c0d68-e16a-48f8-8607-ae7c517c88cc/assets
INFO:tensorflow:Assets written to: ram://9afb876a-7a58-40e2-b0f0-0b64595f98d0/assets
INFO:tensorflow:Assets written to: ram://20326107-bf74-4531-a314-ed37f7657990/assets
