# Train feedback estimators 

* using Logistic and Linear Regression

In [2]:
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Activation, Dense
from keras.models import Sequential
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

In [3]:
# CSV file read by every train_* function in this notebook (path is relative
# to the notebook's working directory).
DATA = '../data/ACCORD_BPBGClass_v2_Contextual.csv'

# Context (baseline) feature columns shared by all estimators.
# Candidate columns currently left out of the list:
#   'race_black', 'edu_baseline', 'baseline_BMI_discrete', 'cigarett_baseline'
# NOTE(review): the *_1/_2/_3 names look like indicator columns derived from
# the raw 'edu_baseline' / 'cigarett_baseline' fields -- confirm against the
# data dictionary.
context_fea = [
    'baseline_age',
    'female',
    'race_whiteother',
    'edu_baseline_1',
    'edu_baseline_2',
    'edu_baseline_3',
    'cvd_hx_baseline',
    'baseline_BMI',
    'cigarett_baseline_1',
]

## CVDRisk

In [4]:
# train a transformed linear regression model for CVD risk

def train_CVDRisk_estimator_linear(fn):
    """Fit a linear regression to the logit of the CVD-risk feedback.

    The ``CVDRisk_feedback`` column is mapped through the logit,
    ``-log((1 - y) / y)``, and an ordinary least-squares model is fit on the
    transformed target. For evaluation the predictions are mapped back
    through the sigmoid, so the printed RMSE is on the original risk scale,
    while the printed training score is ``LinearRegression.score`` on the
    logit scale.

    The model is fit and evaluated on the full dataset (no hold-out split),
    so both printed figures are in-sample.

    Parameters
    ----------
    fn : str or path-like
        CSV file readable by ``pandas.read_csv``. It must contain the columns
        listed in the module-level ``context_fea``, the two state columns,
        the eight medication-class columns and ``CVDRisk_feedback``.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        Fitted estimator. It predicts the *logit* of the risk; apply
        ``1 / (1 + exp(-pred))`` to recover a value in (0, 1).

    Raises
    ------
    ValueError
        If any ``CVDRisk_feedback`` value is NaN or not strictly between 0
        and 1, where the logit is undefined or infinite.
    """
    from sklearn.metrics import mean_squared_error

    # get the features and labels
    df = pd.read_csv(fn)
    print(df.shape)

    medclass_cols = ['Diur', 'ACE',  'Beta-blocker', 'CCB',
                    'Bingu', 'Thiaz', 'Sulfon', 'Meglit']

    state_cols = ['sbp_discrete_merged', 'hba1c_discrete_merged']  # state vector

    fea_cols = context_fea + state_cols + medclass_cols
    print('fea_cols = ', fea_cols)

    X = df[fea_cols].values
    y_true = df['CVDRisk_feedback'].values

    # The logit is only finite on the open interval (0, 1). Fail early with a
    # clear message instead of letting inf/NaN targets reach the regressor.
    # NaN compares False on both sides, so it is rejected here as well.
    in_open_unit_interval = (y_true > 0) & (y_true < 1)
    if not in_open_unit_interval.all():
        n_bad = int((~in_open_unit_interval).sum())
        raise ValueError(
            "CVDRisk_feedback must lie strictly between 0 and 1 for the "
            "logit transform; found %d invalid value(s)" % n_bad
        )

    # transform the y to a linear scale
    y = -np.log((1 - y_true) / y_true)

    print('X.shape = ', X.shape)
    print('y.shape = ', y.shape)

    # train the model on the whole dataset
    estimator_all = LinearRegression().fit(X, y)
    train_score_all = estimator_all.score(X, y)
    print('train_score using all data = ', train_score_all)

    # use RMSE as the evaluation metric, computed after mapping the
    # predictions back to the original risk scale with the sigmoid
    y_pred = estimator_all.predict(X)
    y_pred_transformed = 1 / (1 + np.exp(-y_pred))
    mse = mean_squared_error(y_true, y_pred_transformed)
    rmse = np.sqrt(mse)
    print('RMSE = ', rmse)

    # print the coefficients
    print('coef = ', estimator_all.coef_)
    print('intercept = ', estimator_all.intercept_)

    return estimator_all

# Train the CVD-risk estimator on DATA.
CVDRisk_estimator_BG = train_CVDRisk_estimator_linear(DATA)

# save the estimator model to a file; create the output directory first so
# the cell does not fail with FileNotFoundError when it does not exist yet
os.makedirs('output_final', exist_ok=True)
with open('output_final/CVDRisk_estimator_BPBG.pkl', 'wb') as f:
    pickle.dump(CVDRisk_estimator_BG, f)

(139005, 82)
fea_cols =  ['baseline_age', 'female', 'race_whiteother', 'edu_baseline_1', 'edu_baseline_2', 'edu_baseline_3', 'cvd_hx_baseline', 'baseline_BMI', 'cigarett_baseline_1', 'sbp_discrete_merged', 'hba1c_discrete_merged', 'Diur', 'ACE', 'Beta-blocker', 'CCB', 'Bingu', 'Thiaz', 'Sulfon', 'Meglit']
X.shape =  (139005, 19)
y.shape =  (139005,)
train_score using all data =  0.7632080321728918
RMSE =  0.06914175431351219
coef =  [ 8.54603743e-02 -6.87473049e-01 -3.10032591e-01  1.67793558e-02
  4.14499189e-02  3.12983020e-02  4.33194747e-02 -1.45709072e-07
  6.96179713e-01  4.12818524e-01  4.06201986e-02 -4.53682043e-02
 -4.82166893e-02  2.47813254e-02 -1.03417525e-02 -2.61801318e-02
  3.73280753e-02  3.77494266e-02 -2.64449409e-02]
intercept =  -6.427567237746595


## A1C

In [5]:
# train a linear regression model for A1C feedback
def train_A1C_estimator(fn):
    """Fit an ordinary least-squares model for the ``hba1c_feedback`` column.

    Features are the module-level ``context_fea`` columns followed by the
    eight medication-class columns; no state columns are used. The model is
    fit and scored on the full dataset (no hold-out split), so the printed
    R2 and RMSE are in-sample figures.

    Parameters
    ----------
    fn : str or path-like
        CSV file readable by ``pandas.read_csv`` containing the feature
        columns and ``hba1c_feedback``.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The fitted estimator.
    """
    from sklearn.metrics import mean_squared_error

    # get the features and labels
    df = pd.read_csv(fn)

    # No state variables are fed to this estimator.
    state_cols = []

    medclass_cols = ['Diur', 'ACE',  'Beta-blocker', 'CCB',
                    'Bingu', 'Thiaz', 'Sulfon', 'Meglit']

    fea_cols = context_fea + state_cols + medclass_cols

    X = df[fea_cols].values
    y = df['hba1c_feedback'].values

    print('X.shape = ', X.shape)
    print('y.shape = ', y.shape)

    # Plain OLS, without regularisation.
    estimator_all = LinearRegression().fit(X, y)

    train_score_all = estimator_all.score(X, y)
    print('R2 score = ', train_score_all)

    # compute the RMSE score
    y_pred = estimator_all.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    print('RMSE = ', rmse)

    return estimator_all
    

# Train the A1C feedback estimator on DATA.
A1C_feedback_estimator_BG = train_A1C_estimator(DATA)

# Save the fitted estimator; create the output directory first so the cell
# does not fail with FileNotFoundError when it does not exist yet.
os.makedirs('output_final', exist_ok=True)
with open('output_final/A1C_feedback_estimator_BPBG.pkl', 'wb') as f:
    pickle.dump(A1C_feedback_estimator_BG, f)

X.shape =  (139005, 17)
y.shape =  (139005,)
R2 score =  0.10279290505437833
RMSE =  1.0113976206677335


## SBP

In [6]:
# train a linear regression model for SBP feedback

def train_SBP_estimator(fn):
    """Fit an ordinary least-squares model for the ``sbp_feedback`` column.

    Features are the module-level ``context_fea`` columns followed by the
    eight medication-class columns; no state columns are used. The model is
    fit and scored on the full dataset (no hold-out split), so the printed
    R2 and RMSE are in-sample figures.

    Parameters
    ----------
    fn : str or path-like
        CSV file readable by ``pandas.read_csv`` containing the feature
        columns and ``sbp_feedback``.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The fitted estimator.
    """
    from sklearn.metrics import mean_squared_error

    # get the features and labels
    df = pd.read_csv(fn)
    print(df.shape)

    medclass_cols = ['Diur', 'ACE',  'Beta-blocker', 'CCB',
                    'Bingu', 'Thiaz', 'Sulfon', 'Meglit']

    # No state variables are fed to this estimator.
    state_cols = []

    fea_cols = context_fea + state_cols + medclass_cols
    print('fea_cols = ', fea_cols)

    X = df[fea_cols].values
    y = df['sbp_feedback'].values

    print('X.shape = ', X.shape)
    print('y.shape = ', y.shape)

    # Plain OLS on the whole dataset, without regularisation.
    estimator_all = LinearRegression().fit(X, y)

    train_score_all = estimator_all.score(X, y)
    print('R2 score = ', train_score_all)

    # compute the RMSE score
    y_pred = estimator_all.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    print('RMSE = ', rmse)

    return estimator_all
    

# Train the SBP feedback estimator on DATA.
SBP_feedback_estimator = train_SBP_estimator(DATA)

# save the estimator model to a file; create the output directory first so
# the cell does not fail with FileNotFoundError when it does not exist yet
os.makedirs('output_final', exist_ok=True)
with open('output_final/SBP_feedback_estimator_BPBG.pkl', 'wb') as f:
    pickle.dump(SBP_feedback_estimator, f)

(139005, 82)
fea_cols =  ['baseline_age', 'female', 'race_whiteother', 'edu_baseline_1', 'edu_baseline_2', 'edu_baseline_3', 'cvd_hx_baseline', 'baseline_BMI', 'cigarett_baseline_1', 'Diur', 'ACE', 'Beta-blocker', 'CCB', 'Bingu', 'Thiaz', 'Sulfon', 'Meglit']
X.shape =  (139005, 17)
y.shape =  (139005,)
R2 score =  0.034864035772759205
RMSE =  14.983796417115135
