In [3]:
###############################################################
# Purpose :  load baseline adjusted data, PRS, and run genetic models
###############################################################

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression, RidgeCV
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

import xgboost as xgb
from xgboost import XGBRegressor
import pickle

import optuna

from step_4_auxiliary_functions import objective, run_regr, run_prediction, run_model, bootstrap_prediction_intervals

In [7]:
# Fixed XGBoost hyperparameter dicts and boosting-round counts, one pair per
# genetic model and per phenotype (SBP, then DBP). They are passed to run_model
# as `params` / `xgb_n_estimator` in the modelling cells.
# Values such as 0.30000000000000004 are kept verbatim (presumably float
# artefacts of the tuning output -- do not round without re-validating).
# Every dict sets 'nthread': -1 so that thread usage is configured the same way
# for all models.

# ---- SBP ----
sbp_params_genetic = {
    'max_depth': 1, 'min_child_weight': 57, 'subsample': 0.30000000000000004,
    'colsample_bytree': 0.8, 'lambda': 48, 'alpha': 48, 'gamma': 17,
    'eta': 0.09999999999999999, 'nthread': -1,
}
sbp_num_boost_rounds_genetic = 290

sbp_params_genetic_model_2 = {
    'max_depth': 30, 'min_child_weight': 74, 'subsample': 0.1,
    'colsample_bytree': 0.7000000000000001, 'lambda': 13, 'alpha': 37, 'gamma': 31,
    'eta': 0.01, 'nthread': -1,
}
sbp_num_boost_rounds_genetic_model_2 = 348

sbp_params_genetic_model_3 = {
    'max_depth': 2, 'min_child_weight': 29, 'subsample': 0.9,
    'colsample_bytree': 0.6, 'lambda': 31, 'alpha': 8, 'gamma': 38,
    'eta': 0.02, 'nthread': -1,
}
sbp_num_boost_rounds_genetic_model_3 = 697

# 'nthread' was missing from this dict only; added for consistency with the others.
sbp_csx_model_params = {
    'max_depth': 2, 'min_child_weight': 63, 'subsample': 0.30000000000000004,
    'colsample_bytree': 1.0, 'lambda': 39, 'alpha': 10, 'gamma': 44,
    'eta': 0.02, 'nthread': -1,
}
sbp_num_boost_rounds_csx_model = 255

sbp_local_prs_optuna_params = {
    'max_depth': 87, 'min_child_weight': 83, 'subsample': 0.4,
    'colsample_bytree': 0.7000000000000001, 'lambda': 0, 'alpha': 45, 'gamma': 3,
    'eta': 0.01, 'nthread': -1,
}
sbp_local_prs_num_boost_round = 947


# ---- DBP ----
dbp_params_genetic = {
    'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.8,
    'colsample_bytree': 0.9, 'lambda': 37, 'alpha': 50, 'gamma': 45,
    'eta': 0.05, 'nthread': -1,
}
dbp_num_boost_rounds_genetic = 90

dbp_params_genetic_model_2 = {
    'max_depth': 30, 'min_child_weight': 42, 'subsample': 0.30000000000000004,
    'colsample_bytree': 0.5, 'lambda': 48, 'alpha': 43, 'gamma': 24,
    'eta': 0.02, 'nthread': -1,
}
dbp_num_boost_rounds_genetic_model_2 = 192

dbp_params_genetic_model_3 = {
    'max_depth': 3, 'min_child_weight': 16, 'subsample': 0.9,
    'colsample_bytree': 0.8, 'lambda': 42, 'alpha': 36, 'gamma': 29,
    'eta': 0.04, 'nthread': -1,
}
dbp_num_boost_rounds_genetic_model_3 = 290

dbp_csx_model_params = {
    'max_depth': 2, 'min_child_weight': 53, 'subsample': 0.6,
    'colsample_bytree': 0.5, 'lambda': 9, 'alpha': 22, 'gamma': 4,
    'eta': 0.02, 'nthread': -1,
}
dbp_num_boost_rounds_csx_model = 112

dbp_local_prs_optuna_params = {
    'max_depth': 70, 'min_child_weight': 58, 'subsample': 0.8,
    'colsample_bytree': 0.8, 'lambda': 7, 'alpha': 25, 'gamma': 34,
    'eta': 0.01, 'nthread': -1,
}
dbp_local_prs_num_boost_round = 954

In [8]:
# Directory prefixes used by the data-loading cells. Each ends with '/' because
# file names are appended with plain string concatenation in the pd.read_csv calls.
# NOTE(review): these are absolute paths rooted at /2022_BP_ensemble; adjust them
# if the project data lives elsewhere.
# Prefix for the TOPMED_*_prs_{train,test}.csv PRS tables.
prs_phenotypes_dir = '/2022_BP_ensemble/TOPMed_data/Global PRS Prepared/'
# Prefix for the TOPMed_*_{x,y}_{train,test}_baseline_data.csv tables.
baseline_phenotypes_dir = '/2022_BP_ensemble/Data/TOPMed_phenotypes/baseline data/'
# Prefix for the *_baseline_data_and_xgb_adjustment.csv tables (residual columns).
baseline_adj_phenotypes_dir = '/2022_BP_ensemble/Data/TOPMed_phenotypes/baseline adjusted model/'

In [9]:
# Load the baseline covariate tables, the raw outcomes and the baseline-adjusted
# outcomes (residual columns) for SBP and DBP.
# Every table is read with 'sample.id' forced to string and then used as the index.

# ---- SBP ----
baseline_phenotype_data_train_sbp = (
    pd.read_csv(baseline_phenotypes_dir + 'TOPMed_sbp_x_train_baseline_data.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)
baseline_phenotype_data_test_sbp = (
    pd.read_csv(baseline_phenotypes_dir + 'TOPMed_sbp_x_test_baseline_data.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)

# Outcome: first column left after indexing by sample id, taken as a Series.
baseline_y_train_sbp = (
    pd.read_csv(baseline_phenotypes_dir + 'TOPMed_sbp_y_train_baseline_data.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
    .iloc[:, 0]
)
baseline_y_test_sbp = (
    pd.read_csv(baseline_phenotypes_dir + 'TOPMed_sbp_y_test_baseline_data.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
    .iloc[:, 0]
)

# Adjusted outcome: the 'SBP_V1_residuals' column of the adjustment tables.
baseline_y_train_sbp_adjusted = (
    pd.read_csv(baseline_adj_phenotypes_dir + 'TOPMed_training_sbp_baseline_data_and_xgb_adjustment.csv',
                dtype={'sample.id': 'str'})
    .set_index('sample.id')['SBP_V1_residuals']
)
baseline_y_test_sbp_adjusted = (
    pd.read_csv(baseline_adj_phenotypes_dir + 'TOPMed_testing_sbp_baseline_data_and_xgb_adjustment.csv',
                dtype={'sample.id': 'str'})
    .set_index('sample.id')['SBP_V1_residuals']
)


# ---- DBP ----
baseline_phenotype_data_train_dbp = (
    pd.read_csv(baseline_phenotypes_dir + 'TOPMed_dbp_x_train_baseline_data.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)
baseline_phenotype_data_test_dbp = (
    pd.read_csv(baseline_phenotypes_dir + 'TOPMed_dbp_x_test_baseline_data.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)

baseline_y_train_dbp = (
    pd.read_csv(baseline_phenotypes_dir + 'TOPMed_dbp_y_train_baseline_data.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
    .iloc[:, 0]
)
baseline_y_test_dbp = (
    pd.read_csv(baseline_phenotypes_dir + 'TOPMed_dbp_y_test_baseline_data.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
    .iloc[:, 0]
)

# Adjusted outcome: the 'DBP_V1_residuals' column of the adjustment tables.
baseline_y_train_dbp_adjusted = (
    pd.read_csv(baseline_adj_phenotypes_dir + 'TOPMed_training_dbp_baseline_data_and_xgb_adjustment.csv',
                dtype={'sample.id': 'str'})
    .set_index('sample.id')['DBP_V1_residuals']
)
baseline_y_test_dbp_adjusted = (
    pd.read_csv(baseline_adj_phenotypes_dir + 'TOPMed_testing_dbp_baseline_data_and_xgb_adjustment.csv',
                dtype={'sample.id': 'str'})
    .set_index('sample.id')['DBP_V1_residuals']
)

In [10]:
# Read in PRS files (SBP part).
# Each table is read with 'sample.id' forced to string and then used as the index.

# UKBB+ICBP PRS
ukbb_prs_train_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_UKBB+ICBP_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)
ukbb_prs_test_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_UKBB+ICBP_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)

# BBJ PRS
bbj_prs_train_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_BBJ_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)
bbj_prs_test_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_BBJ_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)

# MVP PRS
mvp_prs_train_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_MVP_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)
mvp_prs_test_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_MVP_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)

# PRS-CsX: only the single 'SBP_PRS-CsX_SBP_PS_std_sum_std' column is kept (a Series).
model_csx_prs_train_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_PRS-CsX_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')['SBP_PRS-CsX_SBP_PS_std_sum_std']
)
model_csx_prs_test_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_PRS-CsX_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')['SBP_PRS-CsX_SBP_PS_std_sum_std']
)

# Local PRS (UKBB)
ukbb_local_prs_train_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_Local PRS UKBB_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)
ukbb_local_prs_test_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_Local PRS UKBB_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)

# DBP PRS files.
# Each table is read with 'sample.id' forced to string and then used as the index.

# UKBB+ICBP PRS
ukbb_prs_train_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_UKBB+ICBP_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)
ukbb_prs_test_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_UKBB+ICBP_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)

# BBJ PRS
bbj_prs_train_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_BBJ_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)
bbj_prs_test_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_BBJ_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)

# MVP PRS
mvp_prs_train_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_MVP_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)
mvp_prs_test_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_MVP_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)

# PRS-CsX: keep the single DBP score column as a Series.
# The column was previously spelled 'DBP_PRS-CsX_SBP_PS_std_sum_std' here, which
# disagrees with the later re-read of the same files in the "PRS features" cell
# ('DBP_PRS-CsX_DBP_PS_std_sum_std'); it is aligned with that spelling, which is
# the one whose value is used by downstream cells.
model_csx_prs_train_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_PRS-CsX_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')['DBP_PRS-CsX_DBP_PS_std_sum_std']
)
model_csx_prs_test_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_PRS-CsX_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')['DBP_PRS-CsX_DBP_PS_std_sum_std']
)

# Local PRS (UKBB)
ukbb_local_prs_train_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_Local PRS UKBB_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)
ukbb_local_prs_test_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_Local PRS UKBB_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')
)

In [12]:
# UKBB local PRS restricted to lasso-selected features.
# For each phenotype: read the lasso coefficient table, keep features whose
# coefficient is strictly positive and whose name starts with 'chr', prefix the
# names so they match the local-PRS column names, then subset train and test.
# NOTE(review): the `> 0` filter drops features with negative lasso coefficients
# as well as zero ones -- confirm that is intended rather than `!= 0`.

# ---- SBP ----
sbp_local_prs_ukbb_lasso_feature_coeffients = pd.read_csv(
    "/2022_BP_ensemble/Local_PRS_files/feature importance/TOPMED SBP_lasso_coefficients 20230705.csv"
)
sbp_local_prs_features_selected = sbp_local_prs_ukbb_lasso_feature_coeffients.loc[
    sbp_local_prs_ukbb_lasso_feature_coeffients['lasso coefficient'] > 0,
    'feature',
]
# Missing feature names count as "does not start with 'chr'" (na=False).
sbp_local_prs_features_selected = sbp_local_prs_features_selected[
    sbp_local_prs_features_selected.str.startswith('chr', na=False)
]
sbp_local_prs_features_selected = ('SBP_Local PRS UKBB_' + sbp_local_prs_features_selected).tolist()
ukbb_local_prs_train_sbp_features_selected = ukbb_local_prs_train_sbp[sbp_local_prs_features_selected]
ukbb_local_prs_test_sbp_features_selected = ukbb_local_prs_test_sbp[sbp_local_prs_features_selected]

# ---- DBP ----
dbp_local_prs_ukbb_lasso_feature_coeffients = pd.read_csv(
    "/2022_BP_ensemble/Local_PRS_files/feature importance/TOPMED DBP_lasso_coefficients 20230705.csv"
)
dbp_local_prs_features_selected = dbp_local_prs_ukbb_lasso_feature_coeffients.loc[
    dbp_local_prs_ukbb_lasso_feature_coeffients['lasso coefficient'] > 0,
    'feature',
]
dbp_local_prs_features_selected = dbp_local_prs_features_selected[
    dbp_local_prs_features_selected.str.startswith('chr', na=False)
]
dbp_local_prs_features_selected = ('DBP_Local PRS UKBB_' + dbp_local_prs_features_selected).tolist()
ukbb_local_prs_train_dbp_features_selected = ukbb_local_prs_train_dbp[dbp_local_prs_features_selected]
ukbb_local_prs_test_dbp_features_selected = ukbb_local_prs_test_dbp[dbp_local_prs_features_selected]


In [13]:
# Assemble the PRS feature sets consumed by each genetic model.
#   model 1   : the single UKBB+ICBP 'Pt_0.01' column
#   model csx : the single PRS-CsX column (re-read from disk here)
#   model 2   : the full UKBB+ICBP table (same object, not a copy)
#   model 3   : UKBB+ICBP, BBJ and MVP tables joined column-wise

# ---- SBP PRS features ----
model_1_prs_train_sbp = ukbb_prs_train_sbp['SBP_UKBB+ICBP_Pt_0.01']
model_1_prs_test_sbp = ukbb_prs_test_sbp['SBP_UKBB+ICBP_Pt_0.01']

model_csx_prs_train_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_PRS-CsX_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')['SBP_PRS-CsX_SBP_PS_std_sum_std']
)
model_csx_prs_test_sbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_SBP_PRS-CsX_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')['SBP_PRS-CsX_SBP_PS_std_sum_std']
)

model_2_prs_train_sbp = ukbb_prs_train_sbp
model_2_prs_test_sbp = ukbb_prs_test_sbp

model_3_prs_train_sbp = pd.concat(
    [ukbb_prs_train_sbp, bbj_prs_train_sbp, mvp_prs_train_sbp],
    axis=1,
)
model_3_prs_test_sbp = pd.concat(
    [ukbb_prs_test_sbp, bbj_prs_test_sbp, mvp_prs_test_sbp],
    axis=1,
)

# ---- DBP PRS features ----
model_1_prs_train_dbp = ukbb_prs_train_dbp['DBP_UKBB+ICBP_Pt_0.01']
model_1_prs_test_dbp = ukbb_prs_test_dbp['DBP_UKBB+ICBP_Pt_0.01']

model_csx_prs_train_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_PRS-CsX_prs_train.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')['DBP_PRS-CsX_DBP_PS_std_sum_std']
)
model_csx_prs_test_dbp = (
    pd.read_csv(prs_phenotypes_dir + 'TOPMED_DBP_PRS-CsX_prs_test.csv', dtype={'sample.id': 'str'})
    .set_index('sample.id')['DBP_PRS-CsX_DBP_PS_std_sum_std']
)

model_2_prs_train_dbp = ukbb_prs_train_dbp
model_2_prs_test_dbp = ukbb_prs_test_dbp

model_3_prs_train_dbp = pd.concat(
    [ukbb_prs_train_dbp, bbj_prs_train_dbp, mvp_prs_train_dbp],
    axis=1,
)
model_3_prs_test_dbp = pd.concat(
    [ukbb_prs_test_dbp, bbj_prs_test_dbp, mvp_prs_test_dbp],
    axis=1,
)

In [14]:
# Name lists for the phenotypes, their short labels and the GWAS sources.
# In the cells visible here only gwas_list is used (third positional argument of
# the SBP model 3 run_model call); the other calls pass literal lists instead.
phenotype_list = ['SBP_V1','DBP_V1'] 
var_list = ['SBP','DBP']
gwas_list = ['BBJ','MVP','UKBB+ICBP']

## SBP XGB Genetic Models

In [None]:
# SBP genetic model 1: run_model on the single UKBB+ICBP Pt_0.01 PRS column.
# Tuning and saving are switched off and model_weights_load is True
# (presumably a previously saved model is loaded -- confirm in step_4_auxiliary_functions).
# Returns train predictions, test predictions, a results table and feature importances.
(
    xgb_genetic_model_1_sbp_prediction_train,
    xgb_genetic_model_1_sbp_prediction_test,
    xgb_model_1_sbp_no_study,
    xgb_model_1_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    prs_train=model_1_prs_train_sbp,
    prs_test=model_1_prs_test_sbp,
    y_train_loaded=baseline_y_train_sbp,
    y_test_loaded=baseline_y_test_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_adjusted,
    adj_y_test_loaded=baseline_y_test_sbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_sbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED sbp_genetic_model_1_xgb_xgb',
    model_weights_load=True,
    params=sbp_params_genetic,
    xgb_n_estimator=sbp_num_boost_rounds_genetic,
    optuna_tuning_save="sbp_model_1_optuna_genetic_tuning.csv",
)
# Display the feature-importance result as the cell output.
xgb_model_1_sbp_feat_importance

In [None]:
# SBP PRS-CsX genetic model: run_model on the single PRS-CsX score column.
# Tuning and saving are switched off and model_weights_load is True
# (presumably a previously saved model is loaded -- confirm in step_4_auxiliary_functions).
# Returns train predictions, test predictions, a results table and feature importances.
(
    xgb_genetic_model_csx_sbp_prediction_train,
    xgb_genetic_model_csx_sbp_prediction_test,
    xgb_model_csx_sbp_no_study,
    xgb_model_csx_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['PRS-CsX'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    prs_train=model_csx_prs_train_sbp,
    prs_test=model_csx_prs_test_sbp,
    y_train_loaded=baseline_y_train_sbp,
    y_test_loaded=baseline_y_test_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_adjusted,
    adj_y_test_loaded=baseline_y_test_sbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_sbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED sbp_genetic_model_csx_xgb_xgb',
    model_weights_load=True,
    params=sbp_csx_model_params,
    xgb_n_estimator=sbp_num_boost_rounds_csx_model,
    optuna_tuning_save="sbp_model_csx_optuna_genetic_tuning.csv",
)
# Display the feature-importance result as the cell output.
xgb_model_csx_sbp_feat_importance

In [None]:
# SBP genetic model 2: run_model on the full UKBB+ICBP PRS table.
# Tuning and saving are switched off and model_weights_load is True
# (presumably a previously saved model is loaded -- confirm in step_4_auxiliary_functions).
# NOTE(review): unlike the model 1 and PRS-CsX calls, remove_na is not passed
# here, so run_model's default applies -- confirm that is intended.
(
    xgb_genetic_model_2_sbp_prediction_train,
    xgb_genetic_model_2_sbp_prediction_test,
    xgb_model_2_sbp_no_study,
    xgb_model_2_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    prs_train=model_2_prs_train_sbp,
    prs_test=model_2_prs_test_sbp,
    y_train_loaded=baseline_y_train_sbp,
    y_test_loaded=baseline_y_test_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_adjusted,
    adj_y_test_loaded=baseline_y_test_sbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_sbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED sbp_genetic_model_2_xgb_xgb',
    model_weights_load=True,
    params=sbp_params_genetic_model_2,
    xgb_n_estimator=sbp_num_boost_rounds_genetic_model_2,
    optuna_tuning_save="sbp_model_2_optuna_genetic_tuning.csv",
)
# Display the feature-importance result as the cell output.
xgb_model_2_sbp_feat_importance

In [None]:
# SBP genetic model 3: run_model on the column-wise join of the UKBB+ICBP, BBJ
# and MVP PRS tables, with gwas_list as the GWAS argument.
# Tuning and saving are switched off and model_weights_load is True
# (presumably a previously saved model is loaded -- confirm in step_4_auxiliary_functions).
# NOTE(review): unlike the model 1 and PRS-CsX calls, remove_na is not passed
# here, so run_model's default applies -- confirm that is intended.
(
    xgb_genetic_model_3_sbp_prediction_train,
    xgb_genetic_model_3_sbp_prediction_test,
    xgb_model_3_sbp_no_study,
    xgb_model_3_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    gwas_list,
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    prs_train=model_3_prs_train_sbp,
    prs_test=model_3_prs_test_sbp,
    y_train_loaded=baseline_y_train_sbp,
    y_test_loaded=baseline_y_test_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_adjusted,
    adj_y_test_loaded=baseline_y_test_sbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_sbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED sbp_genetic_model_3_xgb_xgb',
    model_weights_load=True,
    params=sbp_params_genetic_model_3,
    xgb_n_estimator=sbp_num_boost_rounds_genetic_model_3,
    optuna_tuning_save="sbp_model_3_optuna_genetic_tuning.csv",
)
# Display the feature-importance result as the cell output.
xgb_model_3_sbp_feat_importance

In [None]:
# SBP local-PRS model with prediction_model_type = 'lasso_regression':
# run_model on the lasso-selected UKBB local PRS columns.
# NOTE(review): `params` / `xgb_n_estimator` are the same XGBoost values passed
# to the XGB local-PRS call -- confirm run_model ignores them for the lasso path.
# NOTE(review): remove_na is not passed here, so run_model's default applies.
(
    lasso_genetic_model_local_prs_ukbb_sbp_prediction_train,
    lasso_genetic_model_local_prs_ukbb_sbp_prediction_test,
    lasso_model_local_prs_ukbb_sbp_no_study,
    lasso_model_local_prs_ukbb_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    prs_train=ukbb_local_prs_train_sbp_features_selected,
    prs_test=ukbb_local_prs_test_sbp_features_selected,
    y_train_loaded=baseline_y_train_sbp,
    y_test_loaded=baseline_y_test_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_adjusted,
    adj_y_test_loaded=baseline_y_test_sbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_sbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED sbp_genetic_model_local_prs_ukbb_xgb_lasso',
    model_weights_load=True,
    params=sbp_local_prs_optuna_params,
    xgb_n_estimator=sbp_local_prs_num_boost_round,
    prediction_model_type='lasso_regression',
    optuna_tuning_save="sbp_model_local_prs_ukbb_lasso_optuna_genetic_tuning.csv",
)
# Display the feature-importance result as the cell output.
lasso_model_local_prs_ukbb_sbp_feat_importance

In [None]:
# SBP local-PRS XGB model: run_model on the lasso-selected UKBB local PRS columns.
# Tuning and saving are switched off and model_weights_load is True
# (presumably a previously saved model is loaded -- confirm in step_4_auxiliary_functions).
# NOTE(review): remove_na is not passed here, so run_model's default applies.
(
    xgb_genetic_model_local_prs_ukbb_sbp_prediction_train,
    xgb_genetic_model_local_prs_ukbb_sbp_prediction_test,
    xgb_model_local_prs_ukbb_sbp_no_study,
    xgb_model_local_prs_ukbb_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    prs_train=ukbb_local_prs_train_sbp_features_selected,
    prs_test=ukbb_local_prs_test_sbp_features_selected,
    y_train_loaded=baseline_y_train_sbp,
    y_test_loaded=baseline_y_test_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_adjusted,
    adj_y_test_loaded=baseline_y_test_sbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_sbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED sbp_genetic_model_local_prs_ukbb_xgb_xgb',
    model_weights_load=True,
    params=sbp_local_prs_optuna_params,
    xgb_n_estimator=sbp_local_prs_num_boost_round,
    optuna_tuning_save="sbp_model_local_prs_ukbb_optuna_genetic_tuning.csv",
)
# Display the feature-importance result as the cell output.
xgb_model_local_prs_ukbb_sbp_feat_importance

In [26]:
# Tag each SBP results table with the model it came from, so rows stay
# identifiable once the tables are stacked with pd.concat.
# A scalar is assigned (pandas broadcasts it to every row) instead of a list of
# exactly five labels, so the cell no longer fails if a table has a different
# number of rows.
# assumes each *_no_study object returned by run_model is a pandas DataFrame -- TODO confirm
xgb_model_1_sbp_no_study["model"] = "model_1"
xgb_model_2_sbp_no_study["model"] = "model_2"
xgb_model_3_sbp_no_study["model"] = "model_3"
xgb_model_csx_sbp_no_study["model"] = "model_csx"

In [None]:
# Stack the results tables of the four full-PRS SBP XGB models (model 1, 2, 3
# and PRS-CsX) row-wise; the local-PRS results are not part of this table.
xgb_sbp_no_study_df = pd.concat(
    [
        xgb_model_1_sbp_no_study,
        xgb_model_2_sbp_no_study,
        xgb_model_3_sbp_no_study,
        xgb_model_csx_sbp_no_study,
    ]
)

# Export is disabled; uncomment to write the combined table to disk.
#xgb_sbp_no_study_df.to_csv("/2022_BP_ensemble/Results/sbp_xgb_full_prs_results.csv")


## SBP Linear Regression Genetic Models

In [None]:
# SBP genetic model 1, linear-regression prediction model.
# run_model comes from step_4_auxiliary_functions; its four return values are unpacked
# into train predictions, test predictions, a results table and a feature-importance object
# (roles inferred from the target names - TODO confirm against run_model).
(
    lr_genetic_model_1_sbp_prediction_train,
    lr_genetic_model_1_sbp_prediction_test,
    lr_model_1_sbp_no_study,
    lr_model_1_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    prs_train=model_1_prs_train_sbp,
    prs_test=model_1_prs_test_sbp,
    y_train_loaded=baseline_y_train_sbp,
    y_test_loaded=baseline_y_test_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_adjusted,
    adj_y_test_loaded=baseline_y_test_sbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_sbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED sbp_genetic_model_1_xgb_lr',
    model_weights_load=True,
    prediction_model_type="linear regression",
)

In [None]:
# SBP PRS-CsX genetic model, linear-regression prediction model.
# NOTE(review): remove_na=True here, while the model_1 LR call passes remove_na=False
# and the other LR calls leave it at run_model's default - confirm this is intended.
(
    lr_genetic_model_csx_sbp_prediction_train,
    lr_genetic_model_csx_sbp_prediction_test,
    lr_model_csx_sbp_no_study,
    lr_model_csx_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['PRS-CsX'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=True,
    prs_train=model_csx_prs_train_sbp,
    prs_test=model_csx_prs_test_sbp,
    y_train_loaded=baseline_y_train_sbp,
    y_test_loaded=baseline_y_test_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_adjusted,
    adj_y_test_loaded=baseline_y_test_sbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_sbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED sbp_genetic_model_csx_xgb_lr',
    model_weights_load=True,
    prediction_model_type="linear regression",
)

In [None]:
# SBP genetic model 2, linear-regression prediction model.
# NOTE(review): XGBoost hyper-parameters (params / xgb_n_estimator) are passed even though
# prediction_model_type is "linear regression"; whether run_model uses them in this mode
# cannot be determined from this notebook - confirm.
(
    lr_genetic_model_2_sbp_prediction_train,
    lr_genetic_model_2_sbp_prediction_test,
    lr_model_2_sbp_no_study,
    lr_model_2_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    prs_train=model_2_prs_train_sbp,
    prs_test=model_2_prs_test_sbp,
    y_train_loaded=baseline_y_train_sbp,
    y_test_loaded=baseline_y_test_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_adjusted,
    adj_y_test_loaded=baseline_y_test_sbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_sbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED sbp_genetic_model_2_xgb_lr',
    model_weights_load=True,
    params=sbp_params_genetic_model_2,
    xgb_n_estimator=sbp_num_boost_rounds_genetic_model_2,
    prediction_model_type="linear regression",
)

In [None]:
# SBP genetic model 3, linear-regression prediction model.
# Unlike models 1/2/CsX, the third positional argument is gwas_list (defined earlier
# in the notebook) rather than a single-element list.
(
    lr_genetic_model_3_sbp_prediction_train,
    lr_genetic_model_3_sbp_prediction_test,
    lr_model_3_sbp_no_study,
    lr_model_3_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    gwas_list,
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    prs_train=model_3_prs_train_sbp,
    prs_test=model_3_prs_test_sbp,
    y_train_loaded=baseline_y_train_sbp,
    y_test_loaded=baseline_y_test_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_adjusted,
    adj_y_test_loaded=baseline_y_test_sbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_sbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED sbp_genetic_model_3_xgb_lr',
    model_weights_load=True,
    prediction_model_type="linear regression",
)

In [None]:
# SBP local-PRS (UKBB) genetic model, linear-regression prediction model.
# Uses the feature-selected local PRS frames as the PRS inputs.
(
    lr_genetic_model_local_prs_ukbb_sbp_prediction_train,
    lr_genetic_model_local_prs_ukbb_sbp_prediction_test,
    lr_model_local_prs_ukbb_sbp_no_study,
    lr_model_local_prs_ukbb_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    prs_train=ukbb_local_prs_train_sbp_features_selected,
    prs_test=ukbb_local_prs_test_sbp_features_selected,
    y_train_loaded=baseline_y_train_sbp,
    y_test_loaded=baseline_y_test_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_adjusted,
    adj_y_test_loaded=baseline_y_test_sbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_sbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED sbp_genetic_model_local_prs_ukbb_xgb_lr',
    model_weights_load=True,
    params=sbp_local_prs_optuna_params,
    xgb_n_estimator=sbp_local_prs_num_boost_round,
    prediction_model_type="linear regression",
)

In [33]:
# Tag each SBP linear-regression result table with the genetic model that produced it,
# so the rows remain identifiable once the tables are concatenated.
# A scalar is broadcast to every row, so this does not depend on each table having
# exactly 5 rows (the previous `["model_x"]*5` raised ValueError for any other length).
# Assumes the *_no_study objects are pandas DataFrames (they are later passed to pd.concat) - TODO confirm.
lr_model_1_sbp_no_study["model"] = "model_1"
lr_model_2_sbp_no_study["model"] = "model_2"
lr_model_3_sbp_no_study["model"] = "model_3"
lr_model_csx_sbp_no_study["model"] = "model_csx"
lr_model_local_prs_ukbb_sbp_no_study["model"] = "local_prs"

In [None]:
# Stack the labelled SBP linear-regression result tables row-wise into a single frame.
lr_sbp_no_study_df = pd.concat(
    objs=[
        lr_model_1_sbp_no_study,
        lr_model_2_sbp_no_study,
        lr_model_3_sbp_no_study,
        lr_model_csx_sbp_no_study,
        lr_model_local_prs_ukbb_sbp_no_study,
    ]
)

# Export is currently disabled; uncomment to write the combined table to disk.
# lr_sbp_no_study_df.to_csv("/2022_BP_ensemble/Results/sbp_lr_full_prs_results.csv")
lr_sbp_no_study_df

## DBP XGB Genetic Models

In [None]:
# DBP genetic model 1 with run_model's default prediction model type
# (no prediction_model_type is passed; the saved-model name suggests XGBoost - TODO confirm).
# Hyper-parameters come from dbp_params_genetic / dbp_num_boost_rounds_genetic.
(
    xgb_genetic_model_1_dbp_prediction_train,
    xgb_genetic_model_1_dbp_prediction_test,
    xgb_model_1_dbp_no_study,
    xgb_model_1_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    prs_train=model_1_prs_train_dbp,
    prs_test=model_1_prs_test_dbp,
    y_train_loaded=baseline_y_train_dbp,
    y_test_loaded=baseline_y_test_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_adjusted,
    adj_y_test_loaded=baseline_y_test_dbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_dbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED dbp_genetic_model_1_xgb_xgb',
    model_weights_load=True,
    params=dbp_params_genetic,
    xgb_n_estimator=dbp_num_boost_rounds_genetic,
    optuna_tuning_save="dbp_model_1_optuna_genetic_tuning.csv",
)

# Display the feature-importance output as the cell result.
xgb_model_1_dbp_feat_importance

In [None]:
# DBP PRS-CsX genetic model with run_model's default prediction model type
# (the saved-model name suggests XGBoost - TODO confirm).
# Hyper-parameters come from dbp_csx_model_params / dbp_num_boost_rounds_csx_model.
(
    xgb_genetic_model_csx_dbp_prediction_train,
    xgb_genetic_model_csx_dbp_prediction_test,
    xgb_model_csx_dbp_no_study,
    xgb_model_csx_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['PRS-CsX'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    prs_train=model_csx_prs_train_dbp,
    prs_test=model_csx_prs_test_dbp,
    y_train_loaded=baseline_y_train_dbp,
    y_test_loaded=baseline_y_test_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_adjusted,
    adj_y_test_loaded=baseline_y_test_dbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_dbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED dbp_genetic_model_csx_xgb_xgb',
    model_weights_load=True,
    params=dbp_csx_model_params,
    xgb_n_estimator=dbp_num_boost_rounds_csx_model,
    optuna_tuning_save="dbp_model_csx_optuna_genetic_tuning.csv",
)
# Display the feature-importance output as the cell result.
xgb_model_csx_dbp_feat_importance

In [None]:
# DBP genetic model 2 with run_model's default prediction model type
# (the saved-model name suggests XGBoost - TODO confirm).
# Hyper-parameters come from dbp_params_genetic_model_2 / dbp_num_boost_rounds_genetic_model_2.
(
    xgb_genetic_model_2_dbp_prediction_train,
    xgb_genetic_model_2_dbp_prediction_test,
    xgb_model_2_dbp_no_study,
    xgb_model_2_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    prs_train=model_2_prs_train_dbp,
    prs_test=model_2_prs_test_dbp,
    y_train_loaded=baseline_y_train_dbp,
    y_test_loaded=baseline_y_test_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_adjusted,
    adj_y_test_loaded=baseline_y_test_dbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_dbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED dbp_genetic_model_2_xgb_xgb',
    model_weights_load=True,
    params=dbp_params_genetic_model_2,
    xgb_n_estimator=dbp_num_boost_rounds_genetic_model_2,
    optuna_tuning_save="dbp_model_2_optuna_genetic_tuning.csv",
)
# Display the feature-importance output as the cell result.
xgb_model_2_dbp_feat_importance

In [None]:
# DBP genetic model 3 with run_model's default prediction model type
# (the saved-model name suggests XGBoost - TODO confirm).
# The third positional argument is gwas_list (defined earlier in the notebook);
# hyper-parameters come from dbp_params_genetic_model_3 / dbp_num_boost_rounds_genetic_model_3.
(
    xgb_genetic_model_3_dbp_prediction_train,
    xgb_genetic_model_3_dbp_prediction_test,
    xgb_model_3_dbp_no_study,
    xgb_model_3_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    gwas_list,
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    prs_train=model_3_prs_train_dbp,
    prs_test=model_3_prs_test_dbp,
    y_train_loaded=baseline_y_train_dbp,
    y_test_loaded=baseline_y_test_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_adjusted,
    adj_y_test_loaded=baseline_y_test_dbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_dbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED dbp_genetic_model_3_xgb_xgb',
    model_weights_load=True,
    params=dbp_params_genetic_model_3,
    xgb_n_estimator=dbp_num_boost_rounds_genetic_model_3,
    optuna_tuning_save="dbp_model_3_optuna_genetic_tuning.csv",
)
# Display the feature-importance output as the cell result.
xgb_model_3_dbp_feat_importance

In [None]:
# DBP local-PRS (UKBB) genetic model with prediction_model_type 'lasso_regression'.
# Uses the feature-selected local PRS frames as the PRS inputs.
# NOTE(review): the LR cells spell their model type "linear regression" (with a space),
# this one uses 'lasso_regression' (underscore) - confirm both match what run_model expects.
(
    lasso_genetic_model_local_prs_ukbb_dbp_prediction_train,
    lasso_genetic_model_local_prs_ukbb_dbp_prediction_test,
    lasso_model_local_prs_ukbb_dbp_no_study,
    lasso_model_local_prs_ukbb_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    prs_train=ukbb_local_prs_train_dbp_features_selected,
    prs_test=ukbb_local_prs_test_dbp_features_selected,
    y_train_loaded=baseline_y_train_dbp,
    y_test_loaded=baseline_y_test_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_adjusted,
    adj_y_test_loaded=baseline_y_test_dbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_dbp,
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED dbp_genetic_model_local_prs_ukbb_xgb_lasso',
    model_weights_load=True,
    params=dbp_local_prs_optuna_params,
    xgb_n_estimator=dbp_local_prs_num_boost_round,
    prediction_model_type='lasso_regression',
    optuna_tuning_save="dbp_model_local_prs_ukbb_lasso_optuna_genetic_tuning.csv",
)
# Display the feature-importance output as the cell result.
lasso_model_local_prs_ukbb_dbp_feat_importance

In [None]:
# DBP run on the selected UKBB local-PRS features. run_model (step_4_auxiliary_functions)
# returns four values, unpacked below in order. prediction_model_type is not passed here,
# so run_model's default applies.
(
    xgb_genetic_model_local_prs_ukbb_dbp_prediction_train,
    xgb_genetic_model_local_prs_ukbb_dbp_prediction_test,
    xgb_model_local_prs_ukbb_dbp_no_study,
    xgb_model_local_prs_ukbb_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    # model configuration
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    # PRS features
    prs_train=ukbb_local_prs_train_dbp_features_selected,
    prs_test=ukbb_local_prs_test_dbp_features_selected,
    # baseline outcomes and phenotype tables
    y_train_loaded=baseline_y_train_dbp,
    y_test_loaded=baseline_y_test_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_adjusted,
    adj_y_test_loaded=baseline_y_test_dbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_dbp,
    # no MGB data is supplied for this run
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    # saving / tuning / stored-weights switches
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED dbp_genetic_model_local_prs_ukbb_xgb_xgb',
    model_weights_load=True,
    # XGBoost hyper-parameters and number of boosting rounds for the DBP local-PRS model
    params=dbp_local_prs_optuna_params,
    xgb_n_estimator=dbp_local_prs_num_boost_round,
    optuna_tuning_save="dbp_model_local_prs_ukbb_optuna_genetic_tuning.csv",
)

# Show the feature-importance output of this run.
xgb_model_local_prs_ukbb_dbp_feat_importance

In [41]:
# Tag each DBP results table with the model it came from before the tables are stacked.
# A scalar is broadcast to every row, so this works for any number of result rows.
xgb_model_1_dbp_no_study["model"] = "model_1"
xgb_model_2_dbp_no_study["model"] = "model_2"
xgb_model_3_dbp_no_study["model"] = "model_3"
xgb_model_csx_dbp_no_study["model"] = "model_csx"
lasso_model_local_prs_ukbb_dbp_no_study["model"] = "model_lasso"
xgb_model_local_prs_ukbb_dbp_no_study["model"] = "model_local_prs"

In [None]:
# Stack the six per-model DBP results tables row-wise into a single frame.
xgb_dbp_result_tables = [
    xgb_model_1_dbp_no_study,
    xgb_model_2_dbp_no_study,
    xgb_model_3_dbp_no_study,
    xgb_model_csx_dbp_no_study,
    lasso_model_local_prs_ukbb_dbp_no_study,
    xgb_model_local_prs_ukbb_dbp_no_study,
]
xgb_dbp_no_study_df = pd.concat(xgb_dbp_result_tables)

# xgb_dbp_no_study_df.to_csv("/2022_BP_ensemble/Results/dbp_xgb_full_prs_results.csv")
xgb_dbp_no_study_df

## DBP Linear Regression Genetic Models

In [None]:
# DBP run on the model-1 PRS with prediction_model_type "linear regression".
# run_model (step_4_auxiliary_functions) returns four values, unpacked below in order.
(
    lr_genetic_model_1_dbp_prediction_train,
    lr_genetic_model_1_dbp_prediction_test,
    lr_model_1_dbp_no_study,
    lr_model_1_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    # model configuration
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # PRS features
    prs_train=model_1_prs_train_dbp,
    prs_test=model_1_prs_test_dbp,
    # baseline outcomes and phenotype tables
    y_train_loaded=baseline_y_train_dbp,
    y_test_loaded=baseline_y_test_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_adjusted,
    adj_y_test_loaded=baseline_y_test_dbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_dbp,
    # no MGB data is supplied for this run
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    # tuning / saving / stored-weights switches
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED dbp_genetic_model_1_xgb_lr',
    model_weights_load=True,
    prediction_model_type="linear regression",
)

In [None]:
# DBP run on the PRS-CsX scores with prediction_model_type "linear regression".
# This is the only DBP linear-regression call in this section that passes remove_na=True.
# run_model (step_4_auxiliary_functions) returns four values, unpacked below in order.
(
    lr_genetic_model_csx_dbp_prediction_train,
    lr_genetic_model_csx_dbp_prediction_test,
    lr_model_csx_dbp_no_study,
    lr_model_csx_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['PRS-CsX'],
    # model configuration
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=True,
    # PRS features
    prs_train=model_csx_prs_train_dbp,
    prs_test=model_csx_prs_test_dbp,
    # baseline outcomes and phenotype tables
    y_train_loaded=baseline_y_train_dbp,
    y_test_loaded=baseline_y_test_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_adjusted,
    adj_y_test_loaded=baseline_y_test_dbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_dbp,
    # no MGB data is supplied for this run
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    # tuning / saving / stored-weights switches
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED dbp_genetic_model_csx_xgb_lr',
    model_weights_load=True,
    prediction_model_type="linear regression",
)

In [None]:
# DBP run on the model-2 PRS with prediction_model_type "linear regression".
# remove_na is not passed, so run_model's default applies.
# run_model (step_4_auxiliary_functions) returns four values, unpacked below in order.
(
    lr_genetic_model_2_dbp_prediction_train,
    lr_genetic_model_2_dbp_prediction_test,
    lr_model_2_dbp_no_study,
    lr_model_2_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    # model configuration
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    # PRS features
    prs_train=model_2_prs_train_dbp,
    prs_test=model_2_prs_test_dbp,
    # baseline outcomes and phenotype tables
    y_train_loaded=baseline_y_train_dbp,
    y_test_loaded=baseline_y_test_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_adjusted,
    adj_y_test_loaded=baseline_y_test_dbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_dbp,
    # no MGB data is supplied for this run
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    # saving / tuning / stored-weights switches
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED dbp_genetic_model_2_xgb_lr',
    model_weights_load=True,
    prediction_model_type="linear regression",
)

In [None]:
# DBP run on the model-3 PRS (third positional argument is gwas_list) with
# prediction_model_type "linear regression". remove_na is not passed, so run_model's default applies.
# run_model (step_4_auxiliary_functions) returns four values, unpacked below in order.
(
    lr_genetic_model_3_dbp_prediction_train,
    lr_genetic_model_3_dbp_prediction_test,
    lr_model_3_dbp_no_study,
    lr_model_3_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    gwas_list,
    # model configuration
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    # PRS features
    prs_train=model_3_prs_train_dbp,
    prs_test=model_3_prs_test_dbp,
    # baseline outcomes and phenotype tables
    y_train_loaded=baseline_y_train_dbp,
    y_test_loaded=baseline_y_test_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_adjusted,
    adj_y_test_loaded=baseline_y_test_dbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_dbp,
    # no MGB data is supplied for this run
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    # saving / tuning / stored-weights switches
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED dbp_genetic_model_3_xgb_lr',
    model_weights_load=True,
    prediction_model_type="linear regression",
)

In [None]:
# DBP run on the selected UKBB local-PRS features with prediction_model_type "linear regression".
# remove_na is not passed, so run_model's default applies.
# run_model (step_4_auxiliary_functions) returns four values, unpacked below in order.
(
    lr_genetic_model_local_prs_ukbb_dbp_prediction_train,
    lr_genetic_model_local_prs_ukbb_dbp_prediction_test,
    lr_model_local_prs_ukbb_dbp_no_study,
    lr_model_local_prs_ukbb_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    # model configuration
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    # PRS features
    prs_train=ukbb_local_prs_train_dbp_features_selected,
    prs_test=ukbb_local_prs_test_dbp_features_selected,
    # baseline outcomes and phenotype tables
    y_train_loaded=baseline_y_train_dbp,
    y_test_loaded=baseline_y_test_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_adjusted,
    adj_y_test_loaded=baseline_y_test_dbp_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    phenotype_data_test_loaded=baseline_phenotype_data_test_dbp,
    # no MGB data is supplied for this run
    mgb_y=None,
    mgb_y_adj=None,
    mgb_test=None,
    # saving / tuning / stored-weights switches
    save_model=False,
    tune_xgb=False,
    model_name_saved='TOPMED dbp_genetic_model_local_prs_ukbb_xgb_lr',
    model_weights_load=True,
    prediction_model_type="linear regression",
)

In [48]:
# Tag each DBP linear-regression results table with the model it came from before stacking.
# A scalar is broadcast to every row, so this works for any number of result rows.
lr_model_1_dbp_no_study["model"] = "model_1"
lr_model_2_dbp_no_study["model"] = "model_2"
lr_model_3_dbp_no_study["model"] = "model_3"
lr_model_csx_dbp_no_study["model"] = "model_csx"
# NOTE(review): the XGBoost results table labels its local-PRS rows "model_local_prs";
# this one uses 'local_prs'. Confirm whether the two tables are meant to share labels.
lr_model_local_prs_ukbb_dbp_no_study['model'] = 'local_prs'

In [None]:
# Stack the per-model DBP linear-regression results row-wise into one table.
# Each model's table is listed exactly once so that no model's rows are duplicated.
lr_dbp_no_study_df = pd.concat([
    lr_model_1_dbp_no_study,
    lr_model_2_dbp_no_study,
    lr_model_3_dbp_no_study,
    lr_model_csx_dbp_no_study,
    lr_model_local_prs_ukbb_dbp_no_study,
])

# lr_dbp_no_study_df.to_csv("/2022_BP_ensemble/Results/dbp_lr_xgb_full_prs_results_20231211.csv")
lr_dbp_no_study_df

## Full Dataset and Prediction Intervals

In [None]:
# Display the model-3 SBP test PRS input; it is one of the pieces concatenated into sbp_full_test_data below.
model_3_prs_test_sbp

In [None]:
# Build one wide SBP test-set table: phenotype data, observed and adjusted outcomes, the
# baseline prediction (observed minus adjusted), the PRS inputs, and each model's test
# predictions re-indexed so they line up with the phenotype table.
sbp_test_index = baseline_phenotype_data_test_sbp.index

# The CSx linear-regression predictions are aligned to the non-null CSx PRS entries.
# NOTE(review): if model_csx_prs_test_sbp is a DataFrame (not a Series), masking with a boolean
# DataFrame keeps every row, so this index equals the full index - confirm it matches the
# length of lr_genetic_model_csx_sbp_prediction_test.
sbp_csx_lr_index = model_csx_prs_test_sbp[~model_csx_prs_test_sbp.isnull()].index

# (values, column name, index to attach) for every model prediction, in output column order.
sbp_prediction_specs = [
    (xgb_genetic_model_1_sbp_prediction_test, 'model_1_SBP_V1_xgb_predictions', sbp_test_index),
    (xgb_genetic_model_csx_sbp_prediction_test, 'model_csx_SBP_V1_xgb_predictions', sbp_test_index),
    (xgb_genetic_model_2_sbp_prediction_test, 'model_2_SBP_V1_xgb_predictions', sbp_test_index),
    (xgb_genetic_model_3_sbp_prediction_test, 'model_3_SBP_V1_xgb_predictions', sbp_test_index),
    # lasso and local PRS
    (lasso_genetic_model_local_prs_ukbb_sbp_prediction_test, 'model_1_SBP_V1_local_lasso_predictions', sbp_test_index),
    (xgb_genetic_model_local_prs_ukbb_sbp_prediction_test, 'model_1_SBP_V1_local_xgb_predictions', sbp_test_index),
    # linear regression
    (lr_genetic_model_1_sbp_prediction_test, 'model_1_SBP_V1_lr_predictions', sbp_test_index),
    (lr_genetic_model_csx_sbp_prediction_test, 'model_csx_SBP_V1_lr_predictions', sbp_csx_lr_index),
    (lr_genetic_model_2_sbp_prediction_test, 'model_2_SBP_V1_lr_predictions', sbp_test_index),
    (lr_genetic_model_3_sbp_prediction_test, 'model_3_SBP_V1_lr_predictions', sbp_test_index),
    # linear regression on local PRS
    (lr_genetic_model_local_prs_ukbb_sbp_prediction_test, 'model_1_SBP_V1_local_lr_predictions', sbp_test_index),
]
sbp_prediction_columns = [
    pd.Series(values, name=column_name).set_axis(target_index)
    for values, column_name, target_index in sbp_prediction_specs
]

sbp_baseline_prediction = pd.Series(
    baseline_y_test_sbp - baseline_y_test_sbp_adjusted,
    name='SBP_V1_prediction',
).set_axis(sbp_test_index)

sbp_full_test_data = pd.concat(
    [
        baseline_phenotype_data_test_sbp,
        baseline_y_test_sbp,
        baseline_y_test_sbp_adjusted,
        sbp_baseline_prediction,
        model_3_prs_test_sbp,
        model_csx_prs_test_sbp,
        *sbp_prediction_columns,
    ],
    axis=1,
)

# Bootstrap intervals come from step_4_auxiliary_functions; the column names assigned here
# assume it returns exactly three columns in (lower, upper, PVE) order - TODO confirm.
sbp_prediction_intervals = bootstrap_prediction_intervals('SBP_V1', sbp_full_test_data)
sbp_prediction_intervals.columns = ['lower_bound_2.5th_percentile', 'upper_bound_97.5th_percentile', 'PVE']

# Save and display with PVE placed between the two bounds.
sbp_interval_report = sbp_prediction_intervals[['lower_bound_2.5th_percentile', 'PVE', 'upper_bound_97.5th_percentile']]
sbp_interval_report.to_csv("/2022_BP_ensemble/Results/sbp_prediction_intervals.csv")
sbp_interval_report

In [None]:
# Build one wide DBP test-set table: phenotype data, observed and adjusted outcomes, the
# baseline prediction (observed minus adjusted), the PRS inputs, and each model's test
# predictions re-indexed so they line up with the phenotype table.
dbp_test_index = baseline_phenotype_data_test_dbp.index

# The CSx linear-regression predictions are aligned to the non-null CSx PRS entries.
# NOTE(review): if model_csx_prs_test_dbp is a DataFrame (not a Series), masking with a boolean
# DataFrame keeps every row, so this index equals the full index - confirm it matches the
# length of lr_genetic_model_csx_dbp_prediction_test.
dbp_csx_lr_index = model_csx_prs_test_dbp[~model_csx_prs_test_dbp.isnull()].index

# (values, column name, index to attach) for every model prediction, in output column order.
dbp_prediction_specs = [
    (xgb_genetic_model_1_dbp_prediction_test, 'model_1_DBP_V1_xgb_predictions', dbp_test_index),
    (xgb_genetic_model_csx_dbp_prediction_test, 'model_csx_DBP_V1_xgb_predictions', dbp_test_index),
    (xgb_genetic_model_2_dbp_prediction_test, 'model_2_DBP_V1_xgb_predictions', dbp_test_index),
    (xgb_genetic_model_3_dbp_prediction_test, 'model_3_DBP_V1_xgb_predictions', dbp_test_index),
    # lasso and local PRS
    (lasso_genetic_model_local_prs_ukbb_dbp_prediction_test, 'model_1_DBP_V1_local_lasso_predictions', dbp_test_index),
    (xgb_genetic_model_local_prs_ukbb_dbp_prediction_test, 'model_1_DBP_V1_local_xgb_predictions', dbp_test_index),
    # linear regression
    (lr_genetic_model_1_dbp_prediction_test, 'model_1_DBP_V1_lr_predictions', dbp_test_index),
    (lr_genetic_model_csx_dbp_prediction_test, 'model_csx_DBP_V1_lr_predictions', dbp_csx_lr_index),
    (lr_genetic_model_2_dbp_prediction_test, 'model_2_DBP_V1_lr_predictions', dbp_test_index),
    (lr_genetic_model_3_dbp_prediction_test, 'model_3_DBP_V1_lr_predictions', dbp_test_index),
    # linear regression on local PRS
    (lr_genetic_model_local_prs_ukbb_dbp_prediction_test, 'model_1_DBP_V1_local_lr_predictions', dbp_test_index),
]
dbp_prediction_columns = [
    pd.Series(values, name=column_name).set_axis(target_index)
    for values, column_name, target_index in dbp_prediction_specs
]

dbp_baseline_prediction = pd.Series(
    baseline_y_test_dbp - baseline_y_test_dbp_adjusted,
    name='DBP_V1_prediction',
).set_axis(dbp_test_index)

dbp_full_test_data = pd.concat(
    [
        baseline_phenotype_data_test_dbp,
        baseline_y_test_dbp,
        baseline_y_test_dbp_adjusted,
        dbp_baseline_prediction,
        model_3_prs_test_dbp,
        model_csx_prs_test_dbp,
        *dbp_prediction_columns,
    ],
    axis=1,
)

# Bootstrap intervals come from step_4_auxiliary_functions; the column names assigned here
# assume it returns exactly three columns in (lower, upper, PVE) order - TODO confirm.
dbp_prediction_intervals = bootstrap_prediction_intervals('DBP_V1', dbp_full_test_data)
dbp_prediction_intervals.columns = ['lower_bound_2.5th_percentile', 'upper_bound_97.5th_percentile', 'PVE']

# Save and display with PVE placed between the two bounds.
dbp_interval_report = dbp_prediction_intervals[['lower_bound_2.5th_percentile', 'PVE', 'upper_bound_97.5th_percentile']]
dbp_interval_report.to_csv("/2022_BP_ensemble/Results/dbp_prediction_intervals.csv")
dbp_interval_report