In [None]:
###############################################################
# Purpose :  Run the Baseline model to get the residuals
###############################################################

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression, RidgeCV
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

import xgboost as xgb
from xgboost import XGBRegressor
import pickle
import optuna

from step_2_auxiliary_functions import load_prsice, objective, adj_pheno, bootstrap_prediction_intervals

In [6]:
# Hyperparameters for the baseline-adjustment models, as tuned with Optuna,
# together with the number of boosting rounds paired with each parameter set.

# --- SBP ---
sbp_baseline_params = {
    'max_depth': 99,
    'min_child_weight': 20,
    'subsample': 0.4,
    'colsample_bytree': 0.7000000000000001,
    'lambda': 0,
    'alpha': 49,
    'gamma': 19,
    'eta': 0.01,
    'nthread': -1,
}
sbp_baseline_xgb_num_boost_rounds = 460


# --- DBP ---
dbp_baseline_params = {
    'max_depth': 3,
    'min_child_weight': 100,
    'subsample': 0.6,
    'colsample_bytree': 1.0,
    'lambda': 20,
    'alpha': 14,
    'gamma': 32,
    'eta': 0.04,
    'nthread': -1,
}
dbp_baseline_xgb_num_boost_rounds = 315

In [7]:
# Directory holding the TOPMed baseline train/test CSV files read by the cells below.
# File names are appended with `+`, so the trailing '/' is required.
# NOTE(review): absolute, environment-specific path — adjust before running elsewhere.
baseline_phenotypes_dir = '/2022_BP_ensemble/Data/TOPMed_phenotypes/baseline data/'

In [8]:
# SBP baseline inputs for the train and test splits.
# In every file 'sample.id' is parsed as a string and becomes the row index.

# Feature tables (x).
baseline_phenotype_data_train_sbp = (
    pd.read_csv(
        baseline_phenotypes_dir + 'TOPMed_sbp_x_train_baseline_data.csv',
        dtype={'sample.id': 'str'},
    )
    .set_index('sample.id')
)
baseline_phenotype_data_test_sbp = (
    pd.read_csv(
        baseline_phenotypes_dir + 'TOPMed_sbp_x_test_baseline_data.csv',
        dtype={'sample.id': 'str'},
    )
    .set_index('sample.id')
)

# Targets (y): the first remaining column of each file, taken as a Series.
baseline_y_train_sbp = (
    pd.read_csv(
        baseline_phenotypes_dir + 'TOPMed_sbp_y_train_baseline_data.csv',
        dtype={'sample.id': 'str'},
    )
    .set_index('sample.id')
    .iloc[:, 0]
)
baseline_y_test_sbp = (
    pd.read_csv(
        baseline_phenotypes_dir + 'TOPMed_sbp_y_test_baseline_data.csv',
        dtype={'sample.id': 'str'},
    )
    .set_index('sample.id')
    .iloc[:, 0]
)

In [None]:
# Baseline adjustment of SBP with the XGBoost model.
# Existing weights are loaded (load_model_weights=True); nothing is re-saved.
(
    topmed_no_pc_sbp_load_y_train_adj,
    topmed_no_pc_sbp_load_y_test_adj,
    topmed_xgb_no_pc_sbp_results,
) = adj_pheno(
    baseline_phenotype_data_train_sbp,
    baseline_phenotype_data_test_sbp,
    baseline_y_train_sbp,
    baseline_y_test_sbp,
    sbp_baseline_params,
    sbp_baseline_xgb_num_boost_rounds,
    var='SBP',
    model_type='xgboost',
    ukbb_flag=False,
    output_results=True,
    save_model_weights=False,
    load_model_weights=True,
    model_weights_name='TOPMed_baseline_sbp_xgb_model_weights',
    train_export_data_name='TOPMed_training_sbp_baseline_data_and_xgb_adjustment.csv',
    test_export_data_name='TOPMed_testing_sbp_baseline_data_and_xgb_adjustment.csv',
)
# Show the results object returned for this model.
topmed_xgb_no_pc_sbp_results

In [None]:
# Bootstrap prediction intervals for the SBP XGBoost baseline model,
# evaluated on the SBP test split using the pickled model named below.
xgb_sbp_model_name = 'TOPMed_baseline_sbp_xgb_model_weights_SBP.pkl'
xgb_baseline_model_sbp_prediction_interval = bootstrap_prediction_intervals(
    'sbp',
    baseline_phenotype_data_test_sbp,
    baseline_y_test_sbp,
    'xgboost',
    xgb_sbp_model_name,
)


In [None]:
# Baseline adjustment of SBP with the linear regression model.
# The XGBoost parameter set and boost-round count are still passed positionally,
# exactly as in the XGBoost call.
# NOTE(review): presumably ignored for model_type='linear regression' — confirm in adj_pheno.
(
    topmed_no_pc_sbp_load_y_train_adj_lr,
    topmed_no_pc_sbp_load_y_test_adj_lr,
    topmed_lr_no_pc_sbp_results,
) = adj_pheno(
    baseline_phenotype_data_train_sbp,
    baseline_phenotype_data_test_sbp,
    baseline_y_train_sbp,
    baseline_y_test_sbp,
    sbp_baseline_params,
    sbp_baseline_xgb_num_boost_rounds,
    var='SBP',
    model_type='linear regression',
    ukbb_flag=False,
    output_results=True,
    save_model_weights=False,
    load_model_weights=True,
    model_weights_name='TOPMed_baseline_sbp_linear_regression_model_weights',
    train_export_data_name='TOPMed_training_sbp_baseline_data_and_linear_regression_adjustment.csv',
    test_export_data_name='TOPMed_testing_sbp_baseline_data_and_linear_regression_adjustment.csv',
)
# Show the results object returned for this model.
topmed_lr_no_pc_sbp_results

In [None]:
# Bootstrap prediction intervals for the SBP linear regression baseline model,
# evaluated on the SBP test split using the pickled model named below.
lr_sbp_model_name = 'TOPMed_baseline_sbp_linear_regression_model_weights_lr_SBP.pkl'
lr_baseline_model_sbp_prediction_interval = bootstrap_prediction_intervals(
    'sbp',
    baseline_phenotype_data_test_sbp,
    baseline_y_test_sbp,
    'lr',
    lr_sbp_model_name,
)


In [13]:
# DBP baseline inputs for the train and test splits.
# In every file 'sample.id' is parsed as a string and becomes the row index.

# Feature tables (x).
baseline_phenotype_data_train_dbp = (
    pd.read_csv(
        baseline_phenotypes_dir + 'TOPMed_dbp_x_train_baseline_data.csv',
        dtype={'sample.id': 'str'},
    )
    .set_index('sample.id')
)
baseline_phenotype_data_test_dbp = (
    pd.read_csv(
        baseline_phenotypes_dir + 'TOPMed_dbp_x_test_baseline_data.csv',
        dtype={'sample.id': 'str'},
    )
    .set_index('sample.id')
)

# Targets (y): the first remaining column of each file, taken as a Series.
baseline_y_train_dbp = (
    pd.read_csv(
        baseline_phenotypes_dir + 'TOPMed_dbp_y_train_baseline_data.csv',
        dtype={'sample.id': 'str'},
    )
    .set_index('sample.id')
    .iloc[:, 0]
)
baseline_y_test_dbp = (
    pd.read_csv(
        baseline_phenotypes_dir + 'TOPMed_dbp_y_test_baseline_data.csv',
        dtype={'sample.id': 'str'},
    )
    .set_index('sample.id')
    .iloc[:, 0]
)

In [None]:
# Baseline adjustment of DBP with the XGBoost model.
# Existing weights are loaded (load_model_weights=True); nothing is re-saved.
#
# FIX: model_weights_name previously pointed at the *linear regression* weights
# ('TOPMed_baseline_dbp_linear_regression_model_weights'), a copy-paste slip.
# It now uses the XGBoost weights name, matching the SBP XGBoost call and the
# 'TOPMed_baseline_dbp_xgb_model_weights_DBP.pkl' file used for the DBP XGBoost
# prediction intervals.
# NOTE(review): assumes adj_pheno derives the pickle file name from
# model_weights_name plus a var-specific suffix — confirm in step_2_auxiliary_functions.
(
    topmed_no_pc_dbp_load_y_train_adj,
    topmed_no_pc_dbp_load_y_test_adj,
    topmed_xgb_no_pc_dbp_results,
) = adj_pheno(
    baseline_phenotype_data_train_dbp,
    baseline_phenotype_data_test_dbp,
    baseline_y_train_dbp,
    baseline_y_test_dbp,
    dbp_baseline_params,
    dbp_baseline_xgb_num_boost_rounds,
    var='DBP',
    model_type='xgboost',
    ukbb_flag=False,
    output_results=True,
    save_model_weights=False,
    load_model_weights=True,
    model_weights_name='TOPMed_baseline_dbp_xgb_model_weights',
    train_export_data_name='TOPMed_training_dbp_baseline_data_and_xgb_adjustment.csv',
    test_export_data_name='TOPMed_testing_dbp_baseline_data_and_xgb_adjustment.csv',
)
# Show the results object, as the other adj_pheno cells in this notebook do.
topmed_xgb_no_pc_dbp_results


In [None]:
# Bootstrap prediction intervals for the DBP XGBoost baseline model,
# evaluated on the DBP test split using the pickled model named below.
xgb_dbp_model_name = 'TOPMed_baseline_dbp_xgb_model_weights_DBP.pkl'
xgb_baseline_model_dbp_prediction_interval = bootstrap_prediction_intervals(
    'dbp',
    baseline_phenotype_data_test_dbp,
    baseline_y_test_dbp,
    'xgboost',
    xgb_dbp_model_name,
)
# Show the computed intervals.
xgb_baseline_model_dbp_prediction_interval

In [None]:
# Baseline adjustment of DBP with the linear regression model.
# The XGBoost parameter set and boost-round count are still passed positionally,
# exactly as in the XGBoost call.
# NOTE(review): presumably ignored for model_type='linear regression' — confirm in adj_pheno.
(
    topmed_no_pc_dbp_load_y_train_adj_lr,
    topmed_no_pc_dbp_load_y_test_adj_lr,
    topmed_lr_no_pc_dbp_results,
) = adj_pheno(
    baseline_phenotype_data_train_dbp,
    baseline_phenotype_data_test_dbp,
    baseline_y_train_dbp,
    baseline_y_test_dbp,
    dbp_baseline_params,
    dbp_baseline_xgb_num_boost_rounds,
    var='DBP',
    model_type='linear regression',
    ukbb_flag=False,
    output_results=True,
    save_model_weights=False,
    load_model_weights=True,
    model_weights_name='TOPMed_baseline_dbp_linear_regression_model_weights',
    train_export_data_name='TOPMed_training_dbp_baseline_data_and_linear_regression_adjustment.csv',
    test_export_data_name='TOPMed_testing_dbp_baseline_data_and_linear_regression_adjustment.csv',
)
# Show the results object returned for this model.
topmed_lr_no_pc_dbp_results

In [None]:
# Bootstrap prediction intervals for the DBP linear regression baseline model,
# evaluated on the DBP test split using the pickled model named below.
lr_dbp_model_name = 'TOPMed_baseline_dbp_linear_regression_model_weights_lr_DBP.pkl'
lr_baseline_model_dbp_prediction_interval = bootstrap_prediction_intervals(
    'dbp',
    baseline_phenotype_data_test_dbp,
    baseline_y_test_dbp,
    'lr',
    lr_dbp_model_name,
)
# Show the computed intervals.
lr_baseline_model_dbp_prediction_interval

In [None]:
# Combine the four baseline models into one table and export it.
# For each model, its prediction-interval output is placed side by side (axis=1)
# with the 'test_result' entry of its adj_pheno results; the four per-model
# tables are then stacked (axis=0) in the order SBP-XGB, SBP-LR, DBP-XGB, DBP-LR.
all_prediction_interval_results = pd.concat(
    [
        pd.concat([interval, results['test_result']], axis=1)
        for interval, results in (
            (xgb_baseline_model_sbp_prediction_interval, topmed_xgb_no_pc_sbp_results),
            (lr_baseline_model_sbp_prediction_interval, topmed_lr_no_pc_sbp_results),
            (xgb_baseline_model_dbp_prediction_interval, topmed_xgb_no_pc_dbp_results),
            (lr_baseline_model_dbp_prediction_interval, topmed_lr_no_pc_dbp_results),
        )
    ],
    axis=0,
)
all_prediction_interval_results.to_csv(
    '/2022_BP_ensemble/Results/baseline_model_prediction_intervals.csv'
)
