In [3]:
###############################################################
# Purpose: load the baseline-adjusted data and PRS files, then run the genetic models on MGBB using the TOPMed model weights
###############################################################

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression, RidgeCV
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

import xgboost as xgb
from xgboost import XGBRegressor
import pickle

import optuna


from step_6b_auxiliary_functions import run_regr, run_prediction, run_model, bootstrap_prediction_intervals

In [18]:
# Read in baseline data and adjusted baseline data
pheno_path = '/2022_BP_ensemble/MGB_phenotypes/'


def _load_phenotype_csv(file_name):
    """Read ``pheno_path + file_name`` with 'sample.id' parsed as str and set as the index."""
    table = pd.read_csv(pheno_path + file_name, dtype={'sample.id': 'str'})
    return table.set_index('sample.id')


# --- SBP ---
baseline_phenotype_data_train_sbp = _load_phenotype_csv('SBP_MGBB_Train.csv')
# Only the first non-index column is kept, so this is a Series rather than a DataFrame.
baseline_y_train_sbp = _load_phenotype_csv('SBP_MGBB_Y_Train.csv').iloc[:, 0]
baseline_y_train_sbp_adjusted = _load_phenotype_csv('SBP_MGBB_Y_Train_Resid.csv')
# Presumably residuals from the baseline model that used TOPMed weights -- confirm upstream.
baseline_y_train_sbp_topmed_weights_adjusted = _load_phenotype_csv(
    'SBP_MGBB_TOPMED_WEIGHTS_Y_Train_Resid.csv'
)

# --- DBP ---
baseline_phenotype_data_train_dbp = _load_phenotype_csv('DBP_MGBB_Train.csv')
# Only the first non-index column is kept, so this is a Series rather than a DataFrame.
baseline_y_train_dbp = _load_phenotype_csv('DBP_MGBB_Y_Train.csv').iloc[:, 0]
baseline_y_train_dbp_adjusted = _load_phenotype_csv('DBP_MGBB_Y_Train_Resid.csv')
# Presumably residuals from the baseline model that used TOPMed weights -- confirm upstream.
baseline_y_train_dbp_topmed_weights_adjusted = _load_phenotype_csv(
    'DBP_MGBB_TOPMED_WEIGHTS_Y_Train_Resid.csv'
)

In [8]:
# Read in PRS files
prs_path = '/2022_BP_ensemble/MGB_PRS_files/MGB_Biobank/'


def _load_prs_csv(file_name):
    """Read ``prs_path + file_name`` with 'sample.id' parsed as str and set as the index."""
    table = pd.read_csv(prs_path + file_name, dtype={'sample.id': 'str'})
    return table.set_index('sample.id')


# SBP: one PRS table per source file (ukbb, bbj, mvp)
ukbb_prs_train_sbp = _load_prs_csv('mgbb_sbp_ukbb_prs.csv')
bbj_prs_train_sbp = _load_prs_csv('mgbb_sbp_bbj_prs.csv')
mvp_prs_train_sbp = _load_prs_csv('mgbb_sbp_mvp_prs.csv')

# DBP: one PRS table per source file (ukbb, bbj, mvp)
ukbb_prs_train_dbp = _load_prs_csv('mgbb_dbp_ukbb_prs.csv')
bbj_prs_train_dbp = _load_prs_csv('mgbb_dbp_bbj_prs.csv')
mvp_prs_train_dbp = _load_prs_csv('mgbb_dbp_mvp_prs.csv')

In [9]:
# ---- SBP PRS feature sets ----
# Model 1: only the 'SBP_UKBB+ICBP_Pt_0.01' column of the UKBB table (a Series).
model_1_prs_train_sbp = ukbb_prs_train_sbp['SBP_UKBB+ICBP_Pt_0.01']
# Model 2: the UKBB table itself (same object, not a copy).
model_2_prs_train_sbp = ukbb_prs_train_sbp
# Model 3: UKBB, BBJ and MVP tables placed side by side, aligned on the sample.id index.
model_3_prs_train_sbp = pd.concat(
    [ukbb_prs_train_sbp, bbj_prs_train_sbp, mvp_prs_train_sbp],
    axis='columns',
)

# ---- DBP PRS feature sets ----
# Model 1: only the 'DBP_UKBB+ICBP_Pt_0.01' column of the UKBB table (a Series).
model_1_prs_train_dbp = ukbb_prs_train_dbp['DBP_UKBB+ICBP_Pt_0.01']
# Model 2: the UKBB table itself (same object, not a copy).
model_2_prs_train_dbp = ukbb_prs_train_dbp
# Model 3: UKBB, BBJ and MVP tables placed side by side, aligned on the sample.id index.
model_3_prs_train_dbp = pd.concat(
    [ukbb_prs_train_dbp, bbj_prs_train_dbp, mvp_prs_train_dbp],
    axis='columns',
)

In [10]:
# Tuning parameters: XGBoost hyperparameter dictionaries and the matching number of
# boosting rounds, one pair per model. Values are hard-coded here; presumably they are
# the output of an earlier tuning run -- confirm their provenance before changing them.
# Every dictionary sets 'nthread': -1 (sbp_csx_model_params previously omitted it).

# ---------------- SBP ----------------
sbp_params_genetic = {
    'max_depth': 1, 'min_child_weight': 57, 'subsample': 0.30000000000000004,
    'colsample_bytree': 0.8, 'lambda': 48, 'alpha': 48,
    'gamma': 17, 'eta': 0.09999999999999999, 'nthread': -1,
}
sbp_num_boost_rounds_genetic = 290

sbp_params_genetic_model_2 = {
    'max_depth': 30, 'min_child_weight': 74, 'subsample': 0.1,
    'colsample_bytree': 0.7000000000000001, 'lambda': 13, 'alpha': 37,
    'gamma': 31, 'eta': 0.01, 'nthread': -1,
}
sbp_num_boost_rounds_genetic_model_2 = 348

sbp_params_genetic_model_3 = {
    'max_depth': 2, 'min_child_weight': 29, 'subsample': 0.9,
    'colsample_bytree': 0.6, 'lambda': 31, 'alpha': 8,
    'gamma': 38, 'eta': 0.02, 'nthread': -1,
}
sbp_num_boost_rounds_genetic_model_3 = 697

sbp_csx_model_params = {
    'max_depth': 2, 'min_child_weight': 63, 'subsample': 0.30000000000000004,
    'colsample_bytree': 1.0, 'lambda': 39, 'alpha': 10,
    'gamma': 44, 'eta': 0.02, 'nthread': -1,
}
sbp_num_boost_rounds_csx_model = 255

sbp_local_prs_optuna_params = {
    'max_depth': 87, 'min_child_weight': 83, 'subsample': 0.4,
    'colsample_bytree': 0.7000000000000001, 'lambda': 0, 'alpha': 45,
    'gamma': 3, 'eta': 0.01, 'nthread': -1,
}
sbp_local_prs_num_boost_round = 947

# ---------------- DBP ----------------
dbp_params_genetic = {
    'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.8,
    'colsample_bytree': 0.9, 'lambda': 37, 'alpha': 50,
    'gamma': 45, 'eta': 0.05, 'nthread': -1,
}
dbp_num_boost_rounds_genetic = 90

dbp_params_genetic_model_2 = {
    'max_depth': 30, 'min_child_weight': 42, 'subsample': 0.30000000000000004,
    'colsample_bytree': 0.5, 'lambda': 48, 'alpha': 43,
    'gamma': 24, 'eta': 0.02, 'nthread': -1,
}
dbp_num_boost_rounds_genetic_model_2 = 192

dbp_params_genetic_model_3 = {
    'max_depth': 3, 'min_child_weight': 16, 'subsample': 0.9,
    'colsample_bytree': 0.8, 'lambda': 42, 'alpha': 36,
    'gamma': 29, 'eta': 0.04, 'nthread': -1,
}
dbp_num_boost_rounds_genetic_model_3 = 290

dbp_csx_model_params = {
    'max_depth': 2, 'min_child_weight': 53, 'subsample': 0.6,
    'colsample_bytree': 0.5, 'lambda': 9, 'alpha': 22,
    'gamma': 4, 'eta': 0.02, 'nthread': -1,
}
dbp_num_boost_rounds_csx_model = 112

dbp_local_prs_optuna_params = {
    'max_depth': 70, 'min_child_weight': 58, 'subsample': 0.8,
    'colsample_bytree': 0.8, 'lambda': 7, 'alpha': 25,
    'gamma': 34, 'eta': 0.01, 'nthread': -1,
}
dbp_local_prs_num_boost_round = 954

### SBP Predictions XGBoost

In [None]:
# SBP genetic model 1, XGBoost: the PRS input is the single 'SBP_UKBB+ICBP_Pt_0.01' column.
# run_model returns three values; the third is displayed as the cell output.
(
    xgb_genetic_model_1_sbp_prediction_train,
    xgb_model_1_sbp_no_study,
    xgb_model_1_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # Data loaded in the earlier cells of this notebook.
    prs_train=model_1_prs_train_sbp,
    y_train_loaded=baseline_y_train_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    # No tuning, no saving; model_weights_load=True presumably makes run_model load the
    # stored model named below -- confirm in step_6b_auxiliary_functions.
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED sbp_genetic_model_1_xgb_xgb',
    model_weights_load=True,
    params=sbp_params_genetic,
    xgb_n_estimator=sbp_num_boost_rounds_genetic,
    optuna_tuning_save="MGBB_sbp_model_1_optuna_genetic_tuning.csv",
)
xgb_model_1_sbp_feat_importance

In [None]:
# SBP genetic model 2, XGBoost: the PRS input is the full UKBB PRS table.
# run_model returns three values; the third is displayed as the cell output.
(
    xgb_genetic_model_2_sbp_prediction_train,
    xgb_model_2_sbp_no_study,
    xgb_model_2_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # Data loaded in the earlier cells of this notebook.
    prs_train=model_2_prs_train_sbp,
    y_train_loaded=baseline_y_train_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    # No tuning, no saving; model_weights_load=True presumably makes run_model load the
    # stored model named below -- confirm in step_6b_auxiliary_functions.
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED sbp_genetic_model_2_xgb_xgb',
    model_weights_load=True,
    # Use the settings defined for model 2; this call previously passed the model 1
    # settings (sbp_params_genetic / sbp_num_boost_rounds_genetic).
    params=sbp_params_genetic_model_2,
    xgb_n_estimator=sbp_num_boost_rounds_genetic_model_2,
    optuna_tuning_save="MGBB_sbp_model_2_optuna_genetic_tuning.csv",
)
xgb_model_2_sbp_feat_importance

In [None]:
# SBP genetic model 3, XGBoost: the PRS input is the UKBB, BBJ and MVP tables combined.
# run_model returns three values; the third is displayed as the cell output.
(
    xgb_genetic_model_3_sbp_prediction_train,
    xgb_model_3_sbp_no_study,
    xgb_model_3_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # Data loaded in the earlier cells of this notebook.
    prs_train=model_3_prs_train_sbp,
    y_train_loaded=baseline_y_train_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    # No tuning, no saving; model_weights_load=True presumably makes run_model load the
    # stored model named below -- confirm in step_6b_auxiliary_functions.
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED sbp_genetic_model_3_xgb_xgb',
    model_weights_load=True,
    # Use the settings defined for model 3; this call previously passed the model 1
    # settings (sbp_params_genetic / sbp_num_boost_rounds_genetic).
    params=sbp_params_genetic_model_3,
    xgb_n_estimator=sbp_num_boost_rounds_genetic_model_3,
    optuna_tuning_save="MGBB_sbp_model_3_optuna_genetic_tuning.csv",
)
xgb_model_3_sbp_feat_importance

### SBP Predictions Linear Regression

In [None]:
# SBP genetic model 1, linear regression: the PRS input is the single
# 'SBP_UKBB+ICBP_Pt_0.01' column. The third returned value is displayed as the cell output.
(
    lr_genetic_model_1_sbp_prediction_train,
    lr_model_1_sbp_no_study,
    lr_model_1_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # Data loaded in the earlier cells of this notebook.
    prs_train=model_1_prs_train_sbp,
    y_train_loaded=baseline_y_train_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    # No tuning, no saving; model_weights_load=True presumably makes run_model load the
    # stored model named below -- confirm in step_6b_auxiliary_functions.
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED sbp_genetic_model_1_xgb_lr',
    model_weights_load=True,
    params=sbp_params_genetic,
    xgb_n_estimator=sbp_num_boost_rounds_genetic,
    optuna_tuning_save="MGBB_sbp_model_1_optuna_genetic_tuning.csv",
    prediction_model_type="linear regression",
)
lr_model_1_sbp_feat_importance

In [None]:
# SBP genetic model 2, linear regression: the PRS input is the full UKBB PRS table.
# The third returned value is displayed as the cell output.
(
    lr_genetic_model_2_sbp_prediction_train,
    lr_model_2_sbp_no_study,
    lr_model_2_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # Data loaded in the earlier cells of this notebook.
    prs_train=model_2_prs_train_sbp,
    y_train_loaded=baseline_y_train_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    # No tuning, no saving; model_weights_load=True presumably makes run_model load the
    # stored model named below -- confirm in step_6b_auxiliary_functions.
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED sbp_genetic_model_2_xgb_lr',
    model_weights_load=True,
    # NOTE(review): these are the model 1 XGBoost settings; presumably unused when
    # prediction_model_type is "linear regression" -- confirm in run_model.
    params=sbp_params_genetic,
    xgb_n_estimator=sbp_num_boost_rounds_genetic,
    optuna_tuning_save="MGBB_sbp_model_2_optuna_genetic_tuning.csv",
    prediction_model_type="linear regression",
)
lr_model_2_sbp_feat_importance

In [None]:
# SBP genetic model 3, linear regression: the PRS input is the UKBB, BBJ and MVP tables
# combined. The third returned value is displayed as the cell output.
(
    lr_genetic_model_3_sbp_prediction_train,
    lr_model_3_sbp_no_study,
    lr_model_3_sbp_feat_importance,
) = run_model(
    ['SBP_V1'],
    ['SBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # Data loaded in the earlier cells of this notebook.
    prs_train=model_3_prs_train_sbp,
    y_train_loaded=baseline_y_train_sbp,
    adj_y_train_loaded=baseline_y_train_sbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_sbp,
    # No tuning, no saving; model_weights_load=True presumably makes run_model load the
    # stored model named below -- confirm in step_6b_auxiliary_functions.
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED sbp_genetic_model_3_xgb_lr',
    model_weights_load=True,
    # NOTE(review): these are the model 1 XGBoost settings; presumably unused when
    # prediction_model_type is "linear regression" -- confirm in run_model.
    params=sbp_params_genetic,
    xgb_n_estimator=sbp_num_boost_rounds_genetic,
    optuna_tuning_save="MGBB_sbp_model_3_optuna_genetic_tuning.csv",
    prediction_model_type="linear regression",
)
lr_model_3_sbp_feat_importance

### DBP Predictions XGBoost

In [None]:
# DBP genetic model 1, XGBoost: the PRS input is the single 'DBP_UKBB+ICBP_Pt_0.01' column.
# run_model returns three values; the third is displayed as the cell output.
(
    xgb_genetic_model_1_dbp_prediction_train,
    xgb_model_1_dbp_no_study,
    xgb_model_1_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # Data loaded in the earlier cells of this notebook.
    prs_train=model_1_prs_train_dbp,
    y_train_loaded=baseline_y_train_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    # No tuning, no saving; model_weights_load=True presumably makes run_model load the
    # stored model named below -- confirm in step_6b_auxiliary_functions.
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED dbp_genetic_model_1_xgb_xgb',
    model_weights_load=True,
    params=dbp_params_genetic,
    xgb_n_estimator=dbp_num_boost_rounds_genetic,
    optuna_tuning_save="MGBB_dbp_model_1_optuna_genetic_tuning.csv",
)
xgb_model_1_dbp_feat_importance

In [None]:
# DBP genetic model 2, XGBoost: the PRS input is the full UKBB PRS table.
# run_model returns three values; the third is displayed as the cell output.
(
    xgb_genetic_model_2_dbp_prediction_train,
    xgb_model_2_dbp_no_study,
    xgb_model_2_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # Data loaded in the earlier cells of this notebook.
    prs_train=model_2_prs_train_dbp,
    y_train_loaded=baseline_y_train_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    # No tuning, no saving; model_weights_load=True presumably makes run_model load the
    # stored model named below -- confirm in step_6b_auxiliary_functions.
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED dbp_genetic_model_2_xgb_xgb',
    model_weights_load=True,
    # Use the settings defined for model 2; this call previously passed the model 1
    # settings (dbp_params_genetic / dbp_num_boost_rounds_genetic).
    params=dbp_params_genetic_model_2,
    xgb_n_estimator=dbp_num_boost_rounds_genetic_model_2,
    optuna_tuning_save="MGBB_dbp_model_2_optuna_genetic_tuning.csv",
)
# Show the DBP feature importances computed above. The cell used to display
# xgb_model_2_sbp_feat_importance, i.e. the SBP result from an earlier cell.
xgb_model_2_dbp_feat_importance

In [None]:
# DBP genetic model 3, XGBoost: the PRS input is the UKBB, BBJ and MVP tables combined.
# run_model returns three values; the third is displayed as the cell output.
(
    xgb_genetic_model_3_dbp_prediction_train,
    xgb_model_3_dbp_no_study,
    xgb_model_3_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # Data loaded in the earlier cells of this notebook.
    prs_train=model_3_prs_train_dbp,
    y_train_loaded=baseline_y_train_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    # No tuning, no saving; model_weights_load=True presumably makes run_model load the
    # stored model named below -- confirm in step_6b_auxiliary_functions.
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED dbp_genetic_model_3_xgb_xgb',
    model_weights_load=True,
    # Use the settings defined for model 3; this call previously passed the model 1
    # settings (dbp_params_genetic / dbp_num_boost_rounds_genetic).
    params=dbp_params_genetic_model_3,
    xgb_n_estimator=dbp_num_boost_rounds_genetic_model_3,
    optuna_tuning_save="MGBB_dbp_model_3_optuna_genetic_tuning.csv",
)
xgb_model_3_dbp_feat_importance

### DBP Predictions Linear Regression

In [None]:
# DBP genetic model 1, linear regression: the PRS input is the single
# 'DBP_UKBB+ICBP_Pt_0.01' column. The third returned value is displayed as the cell output.
(
    lr_genetic_model_1_dbp_prediction_train,
    lr_model_1_dbp_no_study,
    lr_model_1_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # Data loaded in the earlier cells of this notebook.
    prs_train=model_1_prs_train_dbp,
    y_train_loaded=baseline_y_train_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    # No tuning, no saving; model_weights_load=True presumably makes run_model load the
    # stored model named below -- confirm in step_6b_auxiliary_functions.
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED dbp_genetic_model_1_xgb_lr',
    model_weights_load=True,
    params=dbp_params_genetic,
    xgb_n_estimator=dbp_num_boost_rounds_genetic,
    optuna_tuning_save="MGBB_dbp_model_1_optuna_genetic_tuning.csv",
    prediction_model_type="linear regression",
)
lr_model_1_dbp_feat_importance

In [None]:
# DBP genetic model 2, linear regression: the PRS input is the full UKBB PRS table.
# The third returned value is displayed as the cell output.
(
    lr_genetic_model_2_dbp_prediction_train,
    lr_model_2_dbp_no_study,
    lr_model_2_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    # Data loaded in the earlier cells of this notebook.
    prs_train=model_2_prs_train_dbp,
    y_train_loaded=baseline_y_train_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    # No tuning, no saving; model_weights_load=True presumably makes run_model load the
    # stored model named below -- confirm in step_6b_auxiliary_functions.
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED dbp_genetic_model_2_xgb_lr',
    model_weights_load=True,
    # NOTE(review): these are the model 1 XGBoost settings; presumably unused when
    # prediction_model_type is "linear regression" -- confirm in run_model.
    params=dbp_params_genetic,
    xgb_n_estimator=dbp_num_boost_rounds_genetic,
    optuna_tuning_save="MGBB_dbp_model_2_optuna_genetic_tuning.csv",
    prediction_model_type="linear regression",
)
# Show the DBP feature importances computed above. The cell used to display
# lr_model_2_sbp_feat_importance, i.e. the SBP result from an earlier cell.
lr_model_2_dbp_feat_importance

In [None]:
# DBP genetic model 3, linear-regression prediction stage over the xgboost
# residual model. No tuning and no saving; model_weights_load=True, so
# presumably the stored 'TOPMED dbp_genetic_model_3_xgb_lr' weights are
# reused -- confirm against run_model in step_6b_auxiliary_functions.
(
    lr_genetic_model_3_dbp_prediction_train,
    lr_model_3_dbp_no_study,
    lr_model_3_dbp_feat_importance,
) = run_model(
    ['DBP_V1'],
    ['DBP'],
    ['UKBB+ICBP'],
    base_model="No",
    pc_include=False,
    residuals_model="xgboost",
    remove_study=True,
    remove_na=False,
    prs_train=model_3_prs_train_dbp,
    y_train_loaded=baseline_y_train_dbp,
    adj_y_train_loaded=baseline_y_train_dbp_topmed_weights_adjusted,
    phenotype_data_train_loaded=baseline_phenotype_data_train_dbp,
    tune_xgb=False,
    save_model=False,
    model_name_saved='TOPMED dbp_genetic_model_3_xgb_lr',
    model_weights_load=True,
    params=dbp_params_genetic,
    xgb_n_estimator=dbp_num_boost_rounds_genetic,
    optuna_tuning_save="MGBB_dbp_model_3_optuna_genetic_tuning.csv",
    prediction_model_type="linear regression",
)

# Echo the feature-importance object returned for this model.
lr_model_3_dbp_feat_importance

## Full Dataset and Prediction Intervals

In [25]:
# Assemble a single column-wise SBP training frame, in this order:
#   phenotype covariates, raw SBP, TOPMed-weights residuals, the baseline
#   prediction (raw minus residuals), the model-3 PRS columns, and the six
#   genetic-model prediction vectors (xgb 1-3, then lr 1-3).
# Every derived vector is re-labelled positionally with the phenotype index via
# set_axis -- this assumes it is already in the same row order as
# baseline_phenotype_data_train_sbp (TODO confirm).
sbp_full_train_data = pd.concat(
    [
        baseline_phenotype_data_train_sbp,
        baseline_y_train_sbp,
        pd.Series(
            baseline_y_train_sbp_topmed_weights_adjusted.squeeze(),
            name="SBP_V1_residuals",
        ),
        pd.Series(
            baseline_y_train_sbp - baseline_y_train_sbp_topmed_weights_adjusted.squeeze(),
            name='SBP_V1_prediction',
        ).set_axis(baseline_phenotype_data_train_sbp.index),
        model_3_prs_train_sbp,
        *[
            pd.Series(predictions, name=column_name).set_axis(
                baseline_phenotype_data_train_sbp.index
            )
            for column_name, predictions in [
                ('model_1_SBP_V1_xgb_predictions', xgb_genetic_model_1_sbp_prediction_train),
                ('model_2_SBP_V1_xgb_predictions', xgb_genetic_model_2_sbp_prediction_train),
                ('model_3_SBP_V1_xgb_predictions', xgb_genetic_model_3_sbp_prediction_train),
                ('model_1_SBP_V1_lr_predictions', lr_genetic_model_1_sbp_prediction_train),
                ('model_2_SBP_V1_lr_predictions', lr_genetic_model_2_sbp_prediction_train),
                ('model_3_SBP_V1_lr_predictions', lr_genetic_model_3_sbp_prediction_train),
            ]
        ],
    ],
    axis=1,
)

In [None]:
# Run the bootstrap helper for the 'SBP_V1' phenotype on the assembled frame.
sbp_prediction_intervals = bootstrap_prediction_intervals('SBP_V1', sbp_full_train_data)
# Label the three returned columns. This assumes the helper emits them in the
# order (lower bound, upper bound, PVE) -- TODO confirm in
# step_6b_auxiliary_functions.bootstrap_prediction_intervals.
sbp_prediction_intervals = sbp_prediction_intervals.set_axis(
    ['lower_bound_2.5th_percentile', 'upper_bound_97.5th_percentile', 'PVE'],
    axis=1,
)


In [27]:
# Write the SBP interval table to disk with PVE placed between its lower and
# upper bounds.
sbp_prediction_intervals.loc[
    :, ['lower_bound_2.5th_percentile', 'PVE', 'upper_bound_97.5th_percentile']
].to_csv(
    "/2022_BP_ensemble/Results/MGBB_w_CIs/MGBB_SBP_CI_TOPMed_baseline_weights.csv"
)






In [None]:
# Show the SBP interval table as lower bound, PVE, upper bound.
sbp_prediction_intervals.loc[
    :, ['lower_bound_2.5th_percentile', 'PVE', 'upper_bound_97.5th_percentile']
]

In [128]:
# Assemble a single column-wise DBP training frame, in this order:
#   phenotype covariates, raw DBP, TOPMed-weights residuals, the baseline
#   prediction (raw minus residuals), the model-3 PRS columns, and the six
#   genetic-model prediction vectors (xgb 1-3, then lr 1-3).
# Every derived vector is re-labelled positionally with the phenotype index via
# set_axis -- this assumes it is already in the same row order as
# baseline_phenotype_data_train_dbp (TODO confirm).
dbp_full_train_data = pd.concat(
    [
        baseline_phenotype_data_train_dbp,
        baseline_y_train_dbp,
        pd.Series(
            baseline_y_train_dbp_topmed_weights_adjusted.squeeze(),
            name="DBP_V1_residuals",
        ),
        pd.Series(
            baseline_y_train_dbp - baseline_y_train_dbp_topmed_weights_adjusted.squeeze(),
            name='DBP_V1_prediction',
        ).set_axis(baseline_phenotype_data_train_dbp.index),
        model_3_prs_train_dbp,
        *[
            pd.Series(predictions, name=column_name).set_axis(
                baseline_phenotype_data_train_dbp.index
            )
            for column_name, predictions in [
                ('model_1_DBP_V1_xgb_predictions', xgb_genetic_model_1_dbp_prediction_train),
                ('model_2_DBP_V1_xgb_predictions', xgb_genetic_model_2_dbp_prediction_train),
                ('model_3_DBP_V1_xgb_predictions', xgb_genetic_model_3_dbp_prediction_train),
                ('model_1_DBP_V1_lr_predictions', lr_genetic_model_1_dbp_prediction_train),
                ('model_2_DBP_V1_lr_predictions', lr_genetic_model_2_dbp_prediction_train),
                ('model_3_DBP_V1_lr_predictions', lr_genetic_model_3_dbp_prediction_train),
            ]
        ],
    ],
    axis=1,
)


In [None]:
# Run the bootstrap helper for the 'DBP_V1' phenotype on the assembled frame.
dbp_prediction_intervals = bootstrap_prediction_intervals('DBP_V1', dbp_full_train_data)
# Label the three returned columns. This assumes the helper emits them in the
# order (lower bound, upper bound, PVE) -- TODO confirm in
# step_6b_auxiliary_functions.bootstrap_prediction_intervals.
# The table is written to CSV by a separate cell further down the notebook.
dbp_prediction_intervals = dbp_prediction_intervals.set_axis(
    ['lower_bound_2.5th_percentile', 'upper_bound_97.5th_percentile', 'PVE'],
    axis=1,
)

In [None]:
# Show the DBP interval table as lower bound, PVE, upper bound.
dbp_prediction_intervals.loc[
    :, ['lower_bound_2.5th_percentile', 'PVE', 'upper_bound_97.5th_percentile']
]

In [132]:
# Write the DBP interval table to disk with PVE placed between its lower and
# upper bounds.
dbp_prediction_intervals.loc[
    :, ['lower_bound_2.5th_percentile', 'PVE', 'upper_bound_97.5th_percentile']
].to_csv(
    "/2022_BP_ensemble/Results/MGBB_w_CIs/MGBB_DBP_CI_TOPMed_baseline_weights.csv"
)





