# Train feedback estimators using Logistic and Linear Regression

This script trains the following feedback estimators on the ACCORD_BPClass_v2.csv dataset

* CVDRiskEstr: logistic regression, obtain accuracy arounnd 0.88
* SBPEstr: linear regression, poor fitting, R^2 = 0.03
* A1CEstr: linear regression, poor fitting, R^2 = 0.09

In [13]:
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Activation, Dense
import tensorflow as tf
import pickle

In [2]:
context_fea = ['baseline_age', 'female', 'race_whiteother', #'race_black', 
                'edu_baseline',
                'cvd_hx_baseline', 'baseline_BMI', 'cigarett_baseline']

## CVDRisk

In [28]:
def train_CVDRisk_estimator(fn, flag):

    # flag = 'BP' or 'BG' or 'BPBG'
    
    # get the features and labels
    df = pd.read_csv(fn)

    if flag == 'BP':
        medclass_cols = ['BMI', 'sbp', 'hba1c', 'TC', 'hdl', 
                        'bpclass_none', 'Diur', 'ACE', 'Beta-blocker', 'CCB', 
                        'ARB', 'Alpha-Beta-blocker', 'Alpha-blocker', 'Sympath', 'Vasod']
    elif flag == 'BG':
        medclass_cols = ['BMI', 'sbp', 'hba1c', 'TC', 'hdl', 
                        'bgclass_none', ]
    elif flag == 'BPBG':
        medclass_cols = ['BMI', 'sbp', 'hba1c', 'TC', 'hdl', 
                        'bpclass_none', 'Diur', 'ACE', 'Beta-blocker', 'CCB', 
                        'ARB', 'Alpha-Beta-blocker', 'Alpha-blocker', 'Sympath', 'Vasod',
                        'bgclass_none', ]
    else:
        print('Error: flag must be BP, BG or BPBG')
        exit()

    fea_cols = context_fea + medclass_cols
    # remove the race_black column to avoid multicollinearity, but performance is roughly the same
    # fea_cols.remove('race_black')

    X = df[fea_cols].values 
    y = df['CVDRisk_feedback_binary'].values

    print('X.shape = ', X.shape)
    print('y.shape = ', y.shape)
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

    # Train a logistic regression model to predict the risk of CVD
    lr = LogisticRegression(max_iter=400).fit(X_train, y_train)
    train_score = lr.score(X_train, y_train)
    test_score =  lr.score(X_test, y_test)
    print('train_score = ', train_score)
    print('test_score = ', test_score)

    # retrain the model on the whole dataset
    estimator = LogisticRegression(max_iter=200).fit(X, y)
    train_score2 = estimator.score(X, y)
    print('train_score2 = ', train_score2)

    return (estimator, fea_cols)

CVDRisk_estimator_BP, CVDRisk_fea_BP = train_CVDRisk_estimator('../../../Codes/Accord/data/ACCORD_BPClass_v2.csv', 'BP')

X.shape =  (176906, 22)
y.shape =  (176906,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


train_score =  0.8791865690624912
test_score =  0.8792606410039003
train_score2 =  0.8646908527692673


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## SBP

In [25]:
# train a linear regression model for SBP feedback
def train_SBP_estimator(fn):
    # get the features and labels
    df = pd.read_csv(fn)

    medclass_cols = ['BMI',  
                    'bpclass_none', 'Diur', 'ACE', 'Beta-blocker', 'CCB', 
                    'ARB', 'Alpha-Beta-blocker', 'Alpha-blocker', 'Sympath', 'Vasod']

    fea_cols = context_fea + medclass_cols
    # remove race_black from fea_cols list, to avoid multicollinearity. But performance is still very poor!
    # fea_cols.remove('race_black')

    X = df[fea_cols].values 
    y = df['sbp'].values

    print('X.shape = ', X.shape)
    print('y.shape = ', y.shape)
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

    # Train a linear regression model to predict the risk of CVD
    lr = LinearRegression().fit(X_train, y_train)
    train_score = lr.score(X_train, y_train)
    test_score =  lr.score(X_test, y_test)
    print('train_score = ', train_score)
    print('test_score = ', test_score)

    # retrain the model on the whole dataset
    estimator = LinearRegression().fit(X, y)
    train_score2 = estimator.score(X, y)
    print('train_score2 = ', train_score2)

    return estimator        

SBP_estimator = train_SBP_estimator('../../../Codes/Accord/data/ACCORD_BPClass_v2.csv')

X.shape =  (176906, 18)
y.shape =  (176906,)
train_score =  0.03102635614224325
test_score =  0.028663873790212402
train_score2 =  0.030605337454086223


## A1C

In [26]:
# train a linear regression model for A1C feedback
def train_A1C_estimator(fn):
    # get the features and labels
    df = pd.read_csv(fn)

    medclass_cols = ['BMI', 'sbp', 'TC', 'hdl',   
                    'bgclass_none', 'Bingu', 'Thiaz', 'Sulfon', 'Meglit', 
                    'Alpha-gluc']

    fea_cols = context_fea + medclass_cols
    # fea_cols.remove('race_black')
    X = df[fea_cols].values 
    y = df['hba1c'].values

    print('X.shape = ', X.shape)
    print('y.shape = ', y.shape)
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

    # Train a linear regression model to predict the risk of CVD
    lr = LinearRegression().fit(X_train, y_train)
    train_score = lr.score(X_train, y_train)
    test_score =  lr.score(X_test, y_test)
    print('train_score = ', train_score)
    print('test_score = ', test_score)

    # retrain the model on the whole dataset
    estimator = LinearRegression().fit(X, y)
    train_score2 = estimator.score(X, y)
    print('train_score2 = ', train_score2)

    return estimator        

SBP_estimator = train_A1C_estimator('../../../Codes/Accord/data/ACCORD_BGClass_v2.csv')

X.shape =  (290467, 17)
y.shape =  (290467,)
train_score =  0.09026242567547482
test_score =  0.08718707308778972
train_score2 =  0.08966819902124046


# Train Estimators using ANN

## ANN model class

In [3]:
# write a class which build a ANN model for SBP feedback, using tensorflow framwork
   
class ANN:
    def __init__(self, input_shape, output_shape, hidden_layers, output_activation='linear'):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.hidden_layers = hidden_layers        
        
        # Define the model architecture
        self.model = tf.keras.Sequential()
        self.model.add(tf.keras.layers.Input(shape=input_shape))
        for i in range(len(hidden_layers)):
            self.model.add(tf.keras.layers.Dense(hidden_layers[i], activation='relu'))
        self.model.add(tf.keras.layers.Dense(output_shape, activation=output_activation))
        
    def train(self, x_train, y_train, x_val, y_val, epochs=10, batch_size=16):
        # Compile the model with appropriate loss function and optimizer
        # self.model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])
        self.model.compile(loss='mean_squared_error', optimizer='adam')
        
        # Train the model
        self.model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=epochs, batch_size=batch_size, verbose=1)
    
    def evaluate(self, x, y):
        # Evaluate the model on the test set
        return self.model.evaluate(x, y)
        
    def predict(self, x):
        # Use the model to make predictions
        return self.model.predict(x)

## CVDRisk

In [17]:
# train a ANN model for CVDRisk 
def train_CVDRisk_estimator_ANN(fn, flag, train_onwhole=False):
    # flag = 'BP' or 'BG' or 'BPBG'
    
    # get the features and labels
    df = pd.read_csv(fn)

    if flag == 'BP':
        medclass_cols = ['BMI', 'sbp', 'hba1c', 'TC', 'hdl', 
                        'bpclass_none', 'Diur', 'ACE', 'Beta-blocker', 'CCB', 
                        'ARB', 'Alpha-Beta-blocker', 'Alpha-blocker', 'Sympath', 'Vasod']
    elif flag == 'BG':
        medclass_cols = ['BMI', 'sbp', 'hba1c', 'TC', 'hdl', 
                        'bgclass_none', ]
    elif flag == 'BPBG':
        medclass_cols = ['BMI', 'sbp', 'hba1c', 'TC', 'hdl', 
                        'bpclass_none', 'Diur', 'ACE', 'Beta-blocker', 'CCB', 
                        'ARB', 'Alpha-Beta-blocker', 'Alpha-blocker', 'Sympath', 'Vasod',
                        'bgclass_none', ]
    else:
        print('Error: flag must be BP, BG or BPBG')
        exit()

    fea_cols = context_fea + medclass_cols
    X = df[fea_cols].values 
    y = df['CVDRisk_feedback_binary'].values
    print('X.shape = ', X.shape)
    print('y.shape = ', y.shape)
    
    # Split the data into training and testing sets
    X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, test_size=0.3, random_state=100)
    X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, test_size=0.5, random_state=100)

    model = ANN(X.shape[1], 1, [16, 8], 'sigmoid')
    if not train_onwhole: # to explore hyperparameters
        model.train(X_train, y_train, X_val, y_val, epochs=20, batch_size=32)
        test_score = model.evaluate(X_test, y_test)
        print('test_score = ', test_score)
        return (None, fea_cols)

    else: # retrain the model on the whole dataset
        model.train(X, y, X, y, epochs=20, batch_size=32)    
        return (model, fea_cols)

# none, CVDRisk_fea = train_CVDRisk_estimator_ANN('../../../Codes/Accord/data/ACCORD_BPClass_v2.csv', 'BP')    
CVDRisk_estimator_ANN_BP, CVDRisk_fea_BP = train_CVDRisk_estimator_ANN('../../../Codes/Accord/data/ACCORD_BPClass_v2.csv', 'BP', True)

X.shape =  (176906, 22)
y.shape =  (176906,)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## SBP

In [15]:
# train SBP estimator using ANN model
def train_SBP_estimator_ANN(fn, train_onwhole=False):
    # get the features and labels
    df = pd.read_csv(fn)

    medclass_cols = ['BMI',  
                    'bpclass_none', 'Diur', 'ACE', 'Beta-blocker', 'CCB', 
                    'ARB', 'Alpha-Beta-blocker', 'Alpha-blocker', 'Sympath', 'Vasod']

    fea_cols = context_fea + medclass_cols
    X = df[fea_cols].values 
    y = df['sbp'].values

    print('X.shape = ', X.shape)
    print('y.shape = ', y.shape)
    
    # Split the data into training, validation and testing sets
    X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, test_size=0.3, random_state=100)
    X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, test_size=0.5, random_state=100)

    model = ANN(X.shape[1], 1, [16, 8])
    if not train_onwhole: # to explore hyperparameters
        model.train(X_train, y_train, X_val, y_val, epochs=20, batch_size=16)
        test_score = model.evaluate(X_test, y_test)
        print('test_score = ', test_score)
        return (None, fea_cols)

    else: # retrain the model on the whole dataset
        model.train(X, y, X, y, epochs=15, batch_size=16)    
        return (model, fea_cols)

# none, SBP_fea = train_SBP_estimator_ANN('../../../Codes/Accord/data/ACCORD_BPClass_v2.csv')
SBP_estimator_ANN, SBP_fea = train_SBP_estimator_ANN('../../../Codes/Accord/data/ACCORD_BPClass_v2.csv', True)


X.shape =  (176906, 18)
y.shape =  (176906,)
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## A1C 

In [10]:
# train a ANN model for A1C feedback
def train_A1C_estimator_ANN(fn, train_onwhole=False):
    # get the features and labels
    df = pd.read_csv(fn)

    medclass_cols = ['BMI', 'sbp', 'TC', 'hdl',   
                    'bgclass_none', 'Bingu', 'Thiaz', 'Sulfon', 'Meglit', 
                    'Alpha-gluc']

    fea_cols = context_fea + medclass_cols
    X = df[fea_cols].values 
    y = df['hba1c'].values

    print('X.shape = ', X.shape)
    print('y.shape = ', y.shape)
    
    # Split the data into training, validation and testing sets
    X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, test_size=0.3, random_state=100)
    X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, test_size=0.5, random_state=100)

    model = ANN(X.shape[1], 1, [16, 8])
    if not train_onwhole: # to explore hyperparameters
        model.train(X_train, y_train, X_val, y_val, epochs=20, batch_size=32)
        test_score = model.evaluate(X_test, y_test)
        print('test_score = ', test_score)
        return (None, fea_cols)
    else:
        model.train(X, y, X, y, epochs=15, batch_size=32)    
        return (model, fea_cols)       

# none, A1C_fea = train_A1C_estimator_ANN('../../../Codes/Accord/data/ACCORD_BGClass_v2.csv')
A1C_estimator_ANN, A1C_fea = train_A1C_estimator_ANN('../../../Codes/Accord/data/ACCORD_BGClass_v2.csv', True)

X.shape =  (290467, 17)
y.shape =  (290467,)
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Save trained model for RL feedback

In [18]:
estimators = {'CVDRisk-BP': CVDRisk_estimator_ANN_BP, 'SBP': SBP_estimator_ANN, 'A1C': A1C_estimator_ANN}
feature_cols = {'CVDRisk-BP': CVDRisk_fea_BP, 'SBP': SBP_fea, 'A1C': A1C_fea}

# save the estimators and feature columns
with open('estimators.pkl', 'wb') as f:
    pickle.dump(estimators, f)

with open('feature_cols.pkl', 'wb') as f:
    pickle.dump(feature_cols, f)

INFO:tensorflow:Assets written to: ram://ef1c0d68-e16a-48f8-8607-ae7c517c88cc/assets
INFO:tensorflow:Assets written to: ram://9afb876a-7a58-40e2-b0f0-0b64595f98d0/assets
INFO:tensorflow:Assets written to: ram://20326107-bf74-4531-a314-ed37f7657990/assets
