In [1]:
# import module
import os
import sys

# import dowhy
import econml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import shap
import statsmodels.nonparametric.smoothers_lowess as sl
from econml.grf import CausalForest, CausalIVForest, RegressionForest
from econml.iv.dml import DMLIV, NonParamDMLIV, OrthoIV
from econml.iv.dr import DRIV, ForestDRIV, LinearDRIV, SparseLinearDRIV
from econml.sklearn_extensions.linear_model import WeightedLassoCV
from matplotlib import cm
from matplotlib.colors import Normalize
from scipy import special
from scipy.interpolate import interp1d, interpn
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import (LinearRegression, LogisticRegression,
                                  LogisticRegressionCV)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.stats.multitest import multipletests

In [2]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, returns None."""
    return None


# Swap the module attribute so that every later ``warnings.warn(...)`` lookup in
# this kernel resolves to the no-op above and the warning is dropped.
warnings.warn = warn

In [3]:
import pickle
def save_object(obj, filename):
    """Pickle ``obj`` into ``filename`` with the highest available protocol.

    The file is opened in binary write mode, so an existing file at that path
    is replaced.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:

# load dataset and sample
# Single place to change when the data tree lives somewhere else; every path
# below is built from it and resolves to the same file as before.
DATA_ROOT = "/mnt/md0/yujia/project/2023-07-20-individual_MR/dat"
COVAR_DIR = DATA_ROOT + "/04_pheno_covar_data"
PRS_DIR = DATA_ROOT + "/06_PRS_calculation/IGF1/CAD"
OUTCOME_DIR = DATA_ROOT + "/03_outcome_data"

# Covariate tables (tab-separated).
X_set1 = pd.read_csv(COVAR_DIR + "/traits/ukbb.covariate.traits.set1.gz", sep="\t")
X_set2 = pd.read_csv(COVAR_DIR + "/traits/ukbb.covariate.traits.set2.gz", sep="\t")
X_set3 = pd.read_csv(COVAR_DIR + "/IGF1/CAD/ukbb.covariate.IGF1.set3.gz", sep="\t")

# PRS tables (space-separated), one per covariate set.
Z_set1 = pd.read_csv(PRS_DIR + "/set1/5e-05/IGF1_prs.best", sep=" ")  # score_constd
Z_set2 = pd.read_csv(PRS_DIR + "/set2/5e-05/IGF1_prs.best", sep=" ")  # score_constd
Z_set3 = pd.read_csv(PRS_DIR + "/set3/5e-05/IGF1_prs.best", sep=" ")  # score_constd

# Exposure phenotype table (tab-separated).
W = pd.read_csv(COVAR_DIR + "/IGF1/CAD/ukbb.phenotype.IGF1.nmolL", sep="\t")

# Outcome tables (tab-separated), one per date definition.
Y_date1 = pd.read_csv(OUTCOME_DIR + "/CAD_outcome_include_comorbid_date1_james2022.gz", sep="\t")
Y_date2 = pd.read_csv(OUTCOME_DIR + "/CAD_outcome_include_comorbid_date2_james2022.gz", sep="\t")
Y_date3 = pd.read_csv(OUTCOME_DIR + "/CAD_outcome_include_comorbid_date3_james2022.gz", sep="\t")

# harmonize the data
# Keep only individuals (IID) present in every input of a model AND put every
# table in the same ascending-IID row order.  Later cells pair rows of X/Z/W/Y
# purely by position and label results with the sorted ID arrays built here, so
# relying on each file's on-disk order would silently mis-pair rows if any file
# were ordered differently.
def _rows_for_ids(frame, ids):
    """Return the rows of ``frame`` whose IID is in ``ids``, sorted by IID, re-indexed 0..n-1."""
    kept = frame.loc[frame['IID'].isin(ids)]
    return kept.sort_values('IID', kind='stable').reset_index(drop=True)


selected_id_set3 = set.intersection(set(X_set3['IID']), set(Z_set3['IID']), set(W['IID']), set(Y_date3['IID']) )
selected_id_set3 = list(selected_id_set3)
selected_id_set3_arr = np.array(selected_id_set3)
selected_id_set3_arr.sort()

selected_id_set1b = set.intersection(set(X_set1['IID']), set(Z_set1['IID']), set(W['IID']), set(Y_date3['IID']) )
selected_id_set1b = list(selected_id_set1b)
selected_id_set1b_arr = np.array(selected_id_set1b)
selected_id_set1b_arr.sort()

# model 1b
X_model1b = _rows_for_ids(X_set1, selected_id_set1b)
Z_model1b = _rows_for_ids(Z_set1, selected_id_set1b)
W_model1b = _rows_for_ids(W, selected_id_set1b)
Y_model1b = _rows_for_ids(Y_date3, selected_id_set1b)

# model 3
X_model3 = _rows_for_ids(X_set3, selected_id_set3)
Z_model3 = _rows_for_ids(Z_set3, selected_id_set3)
W_model3 = _rows_for_ids(W, selected_id_set3)
Y_model3 = _rows_for_ids(Y_date3, selected_id_set3)

# Fail loudly if any table is not row-for-row aligned with the sorted ID array
# (for example because an input file contains a duplicated IID).
for _label, _frame, _ids in (
    ("X_model1b", X_model1b, selected_id_set1b_arr),
    ("Z_model1b", Z_model1b, selected_id_set1b_arr),
    ("W_model1b", W_model1b, selected_id_set1b_arr),
    ("Y_model1b", Y_model1b, selected_id_set1b_arr),
    ("X_model3", X_model3, selected_id_set3_arr),
    ("Z_model3", Z_model3, selected_id_set3_arr),
    ("W_model3", W_model3, selected_id_set3_arr),
    ("Y_model3", Y_model3, selected_id_set3_arr),
):
    assert np.array_equal(_frame['IID'].to_numpy(), _ids), _label + " rows are not aligned with the sorted ID array"

# Comorbidity columns from the outcome table become extra model-3 covariates
# (joined column-wise on the shared 0..n-1 index); the outcome table keeps only ID and CAD.
X_model3 = pd.concat([X_model3, Y_model3.loc[:, ["htn", "t2dm", "heart_failure", "hemorrhage_stroke", "ischemic_stroke"]]], axis=1)
Y_model3 = Y_model3.loc[:, ["IID", "CAD"]]

# generate mat file for three models
# W: exposure column "30770-0.0"; Y: the "CAD" column; X: every column from the
# third onward (presumably dropping two leading ID columns — verify); Z: the
# fourth column of the PRS table, selected by position.

# model 1b
W_model1b_mat = W_model1b.loc[:, "30770-0.0"]
X_model1b_mat = X_model1b.iloc[:, 2:]
Y_model1b_mat = Y_model1b.loc[:, "CAD"]
Z_model1b_mat = Z_model1b.iloc[:, 3]

# model 3
W_model3_mat = W_model3.loc[:, "30770-0.0"]
X_model3_mat = X_model3.iloc[:, 2:]
Y_model3_mat = Y_model3.loc[:, "CAD"]
Z_model3_mat = Z_model3.iloc[:, 3]

# correct the data type
# Columns are grouped by target dtype; the ten "22009-0.k" columns are generated.
pc_columns = ["22009-0.{}".format(k) for k in range(1, 11)]

model1b_float_columns = ["21022-0.0", "22000-0.0"] + pc_columns
model1b_dtypes = {"22001-0.0": 'int64'}
model1b_dtypes.update({column: 'float64' for column in model1b_float_columns})
X_model1b_mat = X_model1b_mat.astype(model1b_dtypes)

model3_int_columns = [
    "22001-0.0",
    "Blood_pressure_medication", "Cholesterol_lowering_medication", "Insulin",
    "Non_alcohol_drinker", "Previous_alcohol_drinker", "Current_alcohol_drinker",
    "Non_smoker", "Previous_smoker", "Current_smoker",
    "t2dm", "htn", "heart_failure", "hemorrhage_stroke", "ischemic_stroke",
]
model3_float_columns = (
    ["21022-0.0", "4079-0.0", "4080-0.0", "189-0.0"]
    + pc_columns
    + ["22000-0.0", "23099-0.0", "21001-0.0", "21002-0.0"]  # "whr" intentionally left out
    + ["30630-0.0", "30760-0.0", "30870-0.0"]  # lipid-related covariates
    + ["30680-0.0", "30700-0.0", "30710-0.0", "30720-0.0", "30730-0.0"]
    + ["30740-0.0", "30750-0.0", "30650-0.0", "30660-0.0"]
    + ["30670-0.0", "30810-0.0", "30830-0.0", "30850-0.0"]
    + ["30860-0.0", "30880-0.0", "30890-0.0", "30840-0.0"]
)
model3_dtypes = {column: 'int64' for column in model3_int_columns}
model3_dtypes.update({column: 'float64' for column in model3_float_columns})
X_model3_mat = X_model3_mat.astype(model3_dtypes)

# give the covariate columns readable names
# (keys that do not match an existing column are ignored by rename's default)
model1b_labels = {
    "22001-0.0": 'Gender', "21022-0.0": 'Age',
    "22009-0.1": 'PC1', "22009-0.2": 'PC2', "22009-0.3": 'PC3', "22009-0.4": 'PC4', "22009-0.5": 'PC5',
    "22009-0.6": 'PC6', "22009-0.7": 'PC7', "22009-0.8": 'PC8', "22009-0.9": 'PC9', "22009-0.10": 'PC10', "22000-0.0": 'Genotype batch',
}
X_model1b_mat = X_model1b_mat.rename(columns=model1b_labels)

model3_labels = {
    "22001-0.0": 'Gender', "21022-0.0": 'Age',
    "4079-0.0": 'diastolic blood pressure', "4080-0.0": 'systolic blood pressure', "189-0.0": 'Townsend deprivation index',
    "22009-0.1": 'PC1', "22009-0.2": 'PC2', "22009-0.3": 'PC3', "22009-0.4": 'PC4', "22009-0.5": 'PC5',
    "22009-0.6": 'PC6', "22009-0.7": 'PC7', "22009-0.8": 'PC8', "22009-0.9": 'PC9', "22009-0.10": 'PC10', "22000-0.0": 'Genotype batch',
    "23099-0.0": 'Body Fat Percentage', "21001-0.0": 'BMI', "21002-0.0": 'Weight', # "whr": 'Waist-hip-ratio',
    "Blood_pressure_medication": 'Blood pressure medication', "Cholesterol_lowering_medication": 'Cholesterol lowering medication', "Insulin": 'Insulin',
    "Non_alcohol_drinker": 'Non-alcohol drinker' , "Previous_alcohol_drinker": 'Previous alcohol drinker', "Current_alcohol_drinker": 'Current alcohol drinker',
    "Non_smoker": 'Non-smoker' , "Previous_smoker": 'Previous smoker', "Current_smoker": 'Current smoker',
    "30630-0.0": "Apolipoprotein A", "30640-0.0": "Apolipoprotein B", "30690-0.0": "Cholesterol", "30760-0.0": "HDL-C", "30780-0.0": "LDL-C", "30790-0.0": "Lipoprotein A", "30870-0.0": "Triglycerides", # lipid-related covariates
    "30680-0.0": 'Calcium', "30700-0.0": 'Creatinine', "30710-0.0": 'C-reactive protein', "30720-0.0": 'Cystatin C', "30730-0.0": 'Gamma glutamyltransferase',
    "30740-0.0": 'Glucose', "30750-0.0": 'HbA1c', "30650-0.0": 'Aspartate aminotransferase', "30660-0.0": 'Direct bilirubin',
    "30670-0.0": 'Urea', "30810-0.0": 'Phosphate', "30830-0.0": 'SHBG', "30850-0.0": 'Testosterone',
    "30860-0.0": 'Total protein', "30880-0.0": 'Urate', "30890-0.0": 'Vitamin D', "30840-0.0": 'Total bilirubin',
    "t2dm": 'Type 2 diabetes history', "htn": 'Hypertension history', "heart_failure": 'Heart failure history', "hemorrhage_stroke": 'Hemorrhage Stroke history', "ischemic_stroke": 'Ischemic Stroke history'
}
X_model3_mat = X_model3_mat.rename(columns=model3_labels)


In [5]:
# Inspect the harmonized model-3 instrument table (ID columns plus the PRS column).
Z_model3

Unnamed: 0,FID,IID,In_Regression,PRS
0,1000048,1000048,Yes,-0.000462
1,1000055,1000055,Yes,-0.010465
2,1000067,1000067,Yes,-0.008813
3,1000072,1000072,Yes,-0.006466
4,1000099,1000099,Yes,-0.001012
...,...,...,...,...
276049,6026113,6026113,Yes,-0.007059
276050,6026124,6026124,Yes,-0.006450
276051,6026136,6026136,Yes,-0.002716
276052,6026155,6026155,Yes,-0.014700


In [6]:
# Inspect the harmonized model-3 exposure table (ID columns plus "30770-0.0").
W_model3

Unnamed: 0,FID,IID,30770-0.0
0,1000048,1000048,23.993
1,1000055,1000055,20.509
2,1000067,1000067,22.958
3,1000072,1000072,12.205
4,1000099,1000099,22.371
...,...,...,...
276049,6026113,6026113,8.814
276050,6026124,6026124,16.788
276051,6026136,6026136,21.943
276052,6026155,6026155,9.245


In [7]:
# Dichotomize the exposure: 1 where the value is >= 25, otherwise 0 (a missing
# value fails the comparison and is therefore coded 0).  Each result is a Series
# named "30770-0.0" on a fresh 0..n-1 index.
W_model1b_mat_binary = pd.Series(np.where(W_model1b_mat >= 25, 1, 0), name="30770-0.0")
W_model3_mat_binary = pd.Series(np.where(W_model3_mat >= 25, 1, 0), name="30770-0.0")

In [8]:
# Z_model1_mat_binary = np.where(Z_model1_mat <= 0, 1, 0)
# Z_model1_mat_binary = pd.DataFrame({"PRS": Z_model1_mat_binary}).iloc[:, 0]
# Z_model1b_mat_binary = np.where(Z_model1b_mat <= 0, 1, 0)
# Z_model1b_mat_binary = pd.DataFrame({"PRS": Z_model1b_mat_binary}).iloc[:, 0]
# Z_model2_mat_binary = np.where(Z_model2_mat <= 0, 1, 0)
# Z_model2_mat_binary = pd.DataFrame({"PRS": Z_model2_mat_binary}).iloc[:, 0]
# Z_model2b_mat_binary = np.where(Z_model2b_mat <= 0, 1, 0)
# Z_model2b_mat_binary = pd.DataFrame({"PRS": Z_model2b_mat_binary}).iloc[:, 0]
# Z_model3_mat_binary = np.where(Z_model3_mat <= 0, 1, 0)
# Z_model3_mat_binary = pd.DataFrame({"PRS": Z_model3_mat_binary}).iloc[:, 0]

#### DRIV estimator

#### Model 1

In [9]:
# split the dataset into training and testing
# 50/50 split of the model-1b covariate rows, stratified on the outcome, fixed seed.
train, test = train_test_split(
    X_model1b_mat,
    test_size=0.5,
    stratify=Y_model1b_mat,
    random_state=309,
)
# Row labels of each half, in ascending order.
train_set = sorted(train.index.to_list())
test_set = sorted(test.index.to_list())

# Tables pairing each 1-based row index with its individual ID, intended for the
# R analysis.  NOTE(review): nothing is written to disk in this cell.
train_set_pd = pd.DataFrame({
    "Training_index": [row + 1 for row in train_set],
    "Training_id": selected_id_set1b_arr[train_set],
})
test_set_pd = pd.DataFrame({
    "Testing_index": [row + 1 for row in test_set],
    "Testing_id": selected_id_set1b_arr[test_set],
})

In [10]:
# Continuous W (Full Set)
# ForestDRIV for model 1b: outcome Y, continuous exposure W, instrument Z, covariates X.
est_driv_continuousW_model1b = ForestDRIV(
    projection=False,
    discrete_treatment=False,
    discrete_instrument=False,
    n_estimators=5000,
    min_samples_leaf=300,
    max_samples=0.03,
    cv=5,
    prel_cv=5,
    random_state=309,
    cov_clip=0.001,
    n_jobs=40,
)
est_driv_continuousW_model1b.fit(
    Y_model1b_mat,
    W_model1b_mat,
    Z=Z_model1b_mat,
    X=X_model1b_mat,
    cache_values=True,
)
# Effect estimate for every row of the covariate matrix, then summary statistics.
point_driv_continuousW_model1b = est_driv_continuousW_model1b.effect(X_model1b_mat)
print(pd.DataFrame({"dat": point_driv_continuousW_model1b}).describe())

                 dat
count  276054.000000
mean        0.002917
std         0.001286
min        -0.001951
25%         0.002021
50%         0.002900
75%         0.003801
max         0.007801


In [18]:
# Continuous W — fitted on the training rows, effects evaluated on the full set
# NOTE(review): train_set is produced by the split of the model-1b rows; using it
# here assumes model 3 has the same individuals in the same row order — confirm.
est_driv_continuousW_model3 = ForestDRIV(
    projection=False,
    discrete_treatment=False,
    discrete_instrument=False,
    n_estimators=3000,
    min_samples_leaf=500,
    max_samples=0.1,
    cv=5,
    prel_cv=5,
    random_state=309,
    cov_clip=0.0015,
    n_jobs=40,
)
est_driv_continuousW_model3.fit(
    Y_model3_mat[train_set],
    W_model3_mat[train_set],
    Z=Z_model3_mat[train_set],
    X=X_model3_mat.loc[train_set],
    cache_values=True,
)
# Effect estimate for every model-3 row (training and held-out), then summary statistics.
point_driv_continuousW_model3 = est_driv_continuousW_model3.effect(X_model3_mat)
print(pd.DataFrame({"dat": point_driv_continuousW_model3}).describe())

                 dat
count  276054.000000
mean        0.003825
std         0.001075
min        -0.000146
25%         0.003068
50%         0.003813
75%         0.004564
max         0.008318


In [32]:
# Continuous W — fitted on the training rows, effects evaluated on the full set
# Re-binds est_driv_continuousW_model3 / point_driv_continuousW_model3 with a
# different hyper-parameter setting (smaller leaves, smaller subsamples, larger cov_clip).
# NOTE(review): train_set is produced by the split of the model-1b rows; using it
# here assumes model 3 has the same individuals in the same row order — confirm.
est_driv_continuousW_model3 = ForestDRIV(
    projection=False,
    discrete_treatment=False,
    discrete_instrument=False,
    n_estimators=3000,
    min_samples_leaf=200,
    max_samples=0.03,
    cv=5,
    prel_cv=5,
    random_state=309,
    cov_clip=0.002,
    n_jobs=40,
)
est_driv_continuousW_model3.fit(
    Y_model3_mat[train_set],
    W_model3_mat[train_set],
    Z=Z_model3_mat[train_set],
    X=X_model3_mat.loc[train_set],
    cache_values=True,
)
point_driv_continuousW_model3 = est_driv_continuousW_model3.effect(X_model3_mat)
print(pd.DataFrame({"dat": point_driv_continuousW_model3}).describe())

In [19]:
# model 3
# 90% interval (alpha=0.1) for every individual's effect.
point_driv_lb_continuousW_model3, point_driv_ub_continuousW_model3 = est_driv_continuousW_model3.effect_interval(X_model3_mat, alpha=0.1) # type: ignore
# Standard error recovered from the interval width (1.645 is the critical value
# matching alpha=0.1; assumes the interval is symmetric around the estimate).
se_driv_continuousW_model3 = (point_driv_ub_continuousW_model3 - point_driv_lb_continuousW_model3) / (2*1.645)
z_value_driv_continuousW_model3 = point_driv_continuousW_model3 / se_driv_continuousW_model3
# Upper-tail probability of |z| (not doubled).
p_value_driv_continuousW_model3 = scipy.stats.norm.sf(abs(z_value_driv_continuousW_model3)) # * 2
# Benjamini-Hochberg adjustment; element [1] of the result holds the adjusted p-values.
p_value_BH_driv_continuousW_model3 = multipletests(pvals=p_value_driv_continuousW_model3, method="fdr_bh", alpha=0.1)

full_results_continuousW_model3 = pd.DataFrame({
    "IID": selected_id_set3_arr,
    "point": point_driv_continuousW_model3,
    "upper_bound": point_driv_ub_continuousW_model3,
    "lower_bound": point_driv_lb_continuousW_model3,
    "z-value": z_value_driv_continuousW_model3,
    "p_value": p_value_driv_continuousW_model3,
    "p_value_corrected": p_value_BH_driv_continuousW_model3[1],
})


In [20]:
# IDs of the individuals whose (unadjusted) p-value is below 0.05.
print(full_results_continuousW_model3.loc[full_results_continuousW_model3["p_value"] < 0.05, "IID"])

2         1000067
11        1000219
14        1000256
16        1000281
20        1000330
           ...   
276026    6025779
276031    6025837
276045    6026040
276049    6026113
276052    6026155
Name: IID, Length: 81760, dtype: int64


In [21]:
# Number of individuals whose interval lower bound is strictly above zero.
int(np.count_nonzero(point_driv_lb_continuousW_model3 > 0))

81736

In [None]:
# Continuous W (Full Set)
# ForestDRIV for model 3 fitted on every row; re-binds the model-3 estimator and
# point estimates.  The same cell appears three times in a row in this notebook.
est_driv_continuousW_model3 = ForestDRIV(
    projection=False,
    discrete_treatment=False,
    discrete_instrument=False,
    n_estimators=3000,
    min_samples_leaf=500,
    max_samples=0.1,
    cv=5,
    prel_cv=5,
    random_state=309,
    cov_clip=0.003,
    n_jobs=40,
)
est_driv_continuousW_model3.fit(
    Y_model3_mat,
    W_model3_mat,
    Z=Z_model3_mat,
    X=X_model3_mat,
    cache_values=True,
)
point_driv_continuousW_model3 = est_driv_continuousW_model3.effect(X_model3_mat)
print(pd.DataFrame({"dat": point_driv_continuousW_model3}).describe())

In [None]:
# Display the model-3 point estimates returned by the most recent .effect() call.
point_driv_continuousW_model3

In [None]:
# Continuous W (Full Set)
# Repeat of the full-set model-3 fit with identical settings (duplicate cell).
est_driv_continuousW_model3 = ForestDRIV(
    projection=False,
    discrete_treatment=False,
    discrete_instrument=False,
    n_estimators=3000,
    min_samples_leaf=500,
    max_samples=0.1,
    cv=5,
    prel_cv=5,
    random_state=309,
    cov_clip=0.003,
    n_jobs=40,
)
est_driv_continuousW_model3.fit(
    Y_model3_mat,
    W_model3_mat,
    Z=Z_model3_mat,
    X=X_model3_mat,
    cache_values=True,
)
point_driv_continuousW_model3 = est_driv_continuousW_model3.effect(X_model3_mat)
print(pd.DataFrame({"dat": point_driv_continuousW_model3}).describe())

In [None]:
# Continuous W (Full Set)
# Repeat of the full-set model-3 fit with identical settings (duplicate cell).
est_driv_continuousW_model3 = ForestDRIV(
    projection=False,
    discrete_treatment=False,
    discrete_instrument=False,
    n_estimators=3000,
    min_samples_leaf=500,
    max_samples=0.1,
    cv=5,
    prel_cv=5,
    random_state=309,
    cov_clip=0.003,
    n_jobs=40,
)
est_driv_continuousW_model3.fit(
    Y_model3_mat,
    W_model3_mat,
    Z=Z_model3_mat,
    X=X_model3_mat,
    cache_values=True,
)
point_driv_continuousW_model3 = est_driv_continuousW_model3.effect(X_model3_mat)
print(pd.DataFrame({"dat": point_driv_continuousW_model3}).describe())

In [None]:
# Binary W (Full Set)
# ForestDRIV for model 1b using the 0/1 exposure series.
# NOTE(review): the exposure is binary but the estimator is configured with
# discrete_treatment=False — confirm that is intended.
est_driv_binaryW_model1b = ForestDRIV(
    projection=False,
    discrete_treatment=False,
    discrete_instrument=False,
    n_estimators=5000,
    min_samples_leaf=100,
    max_samples=0.02,
    random_state=309,
    cov_clip=0.1,
    n_jobs=15,
)
est_driv_binaryW_model1b.fit(
    Y_model1b_mat,
    W_model1b_mat_binary,
    Z=Z_model1b_mat,
    X=X_model1b_mat,
    cache_values=True,
)
point_driv_binaryW_model1b = est_driv_binaryW_model1b.effect(X_model1b_mat)
print(pd.DataFrame({"dat": point_driv_binaryW_model1b}).describe())

In [None]:
# # Binary W (Full Set)
# est_driv_binaryW_model2 = ForestDRIV(projection=False, discrete_treatment=False, discrete_instrument=False,
#                                      n_estimators=5000, min_samples_leaf=100, max_samples=0.02, 
#                                      random_state=309, cov_clip = 0.1, n_jobs=15) 
# est_driv_binaryW_model2.fit(Y_model2_mat, W_model2_mat_binary, Z=Z_model2_mat, X=X_model2_mat, cache_values=True)
# point_driv_binaryW_model2 = est_driv_binaryW_model2.effect(X_model2_mat)
# print(pd.DataFrame({"dat": point_driv_binaryW_model2}).describe())

In [None]:
# Binary W (Full Set)
# ForestDRIV for model 3 using the 0/1 exposure series.
# NOTE(review): the exposure is binary but the estimator is configured with
# discrete_treatment=False — confirm that is intended.
est_driv_binaryW_model3 = ForestDRIV(
    projection=False,
    discrete_treatment=False,
    discrete_instrument=False,
    n_estimators=5000,
    min_samples_leaf=100,
    max_samples=0.02,
    random_state=309,
    cov_clip=0.1,
    n_jobs=15,
)
est_driv_binaryW_model3.fit(
    Y_model3_mat,
    W_model3_mat_binary,
    Z=Z_model3_mat,
    X=X_model3_mat,
    cache_values=True,
)
point_driv_binaryW_model3 = est_driv_binaryW_model3.effect(X_model3_mat)
print(pd.DataFrame({"dat": point_driv_binaryW_model3}).describe())

In [None]:
# model 1b
# The standard error is recovered from the interval width, so the divisor must be
# the normal critical value of the SAME level passed to effect_interval (about
# 1.645 for alpha=0.1).  Dividing by 2*1.96 — the 95% value — understates the
# standard error and inflates every z-value.  Assumes the interval is symmetric
# around the point estimate.
ci_alpha = 0.1
z_crit = scipy.stats.norm.ppf(1 - ci_alpha / 2)
point_driv_lb_continuousW_model1b, point_driv_ub_continuousW_model1b = est_driv_continuousW_model1b.effect_interval(X_model1b_mat, alpha=ci_alpha) # type: ignore
z_value_driv_continuousW_model1b = point_driv_continuousW_model1b/((point_driv_ub_continuousW_model1b-point_driv_lb_continuousW_model1b)/(2*z_crit))
# Upper-tail probability of |z| (not doubled).
p_value_driv_continuousW_model1b = scipy.stats.norm.sf(abs(z_value_driv_continuousW_model1b)) # * 2
# Benjamini-Hochberg adjustment; element [1] of the result holds the adjusted p-values.
p_value_BH_driv_continuousW_model1b = multipletests(pvals = p_value_driv_continuousW_model1b, method = "fdr_bh", alpha=0.1)

full_results_continuousW_model1b = pd.DataFrame({"IID": selected_id_set1b_arr, "point": point_driv_continuousW_model1b, "upper_bound": point_driv_ub_continuousW_model1b, \
                                "lower_bound": point_driv_lb_continuousW_model1b, "z-value": z_value_driv_continuousW_model1b, \
                                "p_value": p_value_driv_continuousW_model1b, "p_value_corrected": p_value_BH_driv_continuousW_model1b[1]})

# # model 2b
# point_driv_lb_continuousW_model2b, point_driv_ub_continuousW_model2b = est_driv_continuousW_model2b.effect_interval(X_model2b_mat, alpha=0.1) # type: ignore
# z_value_driv_continuousW_model2b = point_driv_continuousW_model2b/((point_driv_ub_continuousW_model2b-point_driv_lb_continuousW_model2b)/(2*1.96))
# p_value_driv_continuousW_model2b = scipy.stats.norm.sf(abs(z_value_driv_continuousW_model2b)) # * 2
# p_value_BH_driv_continuousW_model2b = multipletests(pvals = p_value_driv_continuousW_model2b, method = "fdr_bh", alpha=0.1)

# full_results_continuousW_model2b = pd.DataFrame({"IID": selected_id_set2b_arr, "point": point_driv_continuousW_model2b, "upper_bound": point_driv_ub_continuousW_model2b, \
#                                 "lower_bound": point_driv_lb_continuousW_model2b, "z-value": z_value_driv_continuousW_model2b, \
#                                 "p_value": p_value_driv_continuousW_model2b, "p_value_corrected": p_value_BH_driv_continuousW_model2b[1]})

# model 3
# The standard error is recovered from the interval width, so the divisor must be
# the normal critical value of the SAME level passed to effect_interval (about
# 1.645 for alpha=0.1).  Dividing by 2*1.96 — the 95% value — understates the
# standard error and inflates every z-value.  Assumes the interval is symmetric
# around the point estimate.
ci_alpha = 0.1
z_crit = scipy.stats.norm.ppf(1 - ci_alpha / 2)
point_driv_lb_continuousW_model3, point_driv_ub_continuousW_model3 = est_driv_continuousW_model3.effect_interval(X_model3_mat, alpha=ci_alpha) # type: ignore
z_value_driv_continuousW_model3 = point_driv_continuousW_model3/((point_driv_ub_continuousW_model3-point_driv_lb_continuousW_model3)/(2*z_crit))
# Upper-tail probability of |z| (not doubled).
p_value_driv_continuousW_model3 = scipy.stats.norm.sf(abs(z_value_driv_continuousW_model3)) # * 2
# Benjamini-Hochberg adjustment; element [1] of the result holds the adjusted p-values.
p_value_BH_driv_continuousW_model3 = multipletests(pvals = p_value_driv_continuousW_model3, method = "fdr_bh", alpha=0.1)

full_results_continuousW_model3 = pd.DataFrame({"IID": selected_id_set3_arr, "point": point_driv_continuousW_model3, "upper_bound": point_driv_ub_continuousW_model3, \
                                "lower_bound": point_driv_lb_continuousW_model3, "z-value": z_value_driv_continuousW_model3, \
                                "p_value": p_value_driv_continuousW_model3, "p_value_corrected": p_value_BH_driv_continuousW_model3[1]})


In [None]:
# Only the last expression of a cell is rendered, so the summary statistics are
# printed explicitly instead of being computed and discarded.
print(full_results_continuousW_model3.describe())
# IDs of the individuals whose (unadjusted) p-value is below 0.05.
full_results_continuousW_model3.loc[full_results_continuousW_model3["p_value"] < 0.05, "IID"]

In [None]:
# Only the last expression of a cell is rendered, so the summary statistics are
# printed explicitly instead of being computed and discarded.
print(full_results_continuousW_model1b.describe())
# Result rows whose (unadjusted) p-value is below 0.05.
full_results_continuousW_model1b.loc[full_results_continuousW_model1b["p_value"] < 0.05, :]

In [None]:
# Work on a copy so that adding an ID column does not alter the covariate matrix used for fitting.
X_model3_mat_iid = X_model3_mat.copy()

In [None]:
# Attach the sorted model-3 ID array as column "iid" (paired with rows by position).
X_model3_mat_iid["iid"] = selected_id_set3_arr

In [None]:
# Covariate rows (still carrying "iid") of the individuals whose model-3 p-value is below 0.05.
significant_iids = full_results_continuousW_model3.loc[full_results_continuousW_model3["p_value"] < 0.05, "IID"]
X_model3_mat_iid_sig = X_model3_mat_iid.loc[X_model3_mat_iid["iid"].isin(significant_iids), :]

In [None]:
# Remove the helper "iid" column so only model covariates remain.  Rebinding
# instead of dropping in place avoids mutating a selection of another frame, and
# errors="ignore" lets the cell be re-run without a KeyError.
X_model3_mat_iid_sig = X_model3_mat_iid_sig.drop(columns="iid", errors="ignore")

In [None]:
# Display the covariates of the selected (p < 0.05) individuals.
X_model3_mat_iid_sig

In [None]:
# SHAP values of the continuous-exposure model-3 estimator, computed on the selected (p < 0.05) rows.
shap_values_driv_continuousW_model3 = est_driv_continuousW_model3.shap_values(X_model3_mat_iid_sig)

In [None]:
# model 1b
# The standard error is recovered from the interval width, so the divisor must be
# the normal critical value of the SAME level passed to effect_interval (about
# 1.645 for alpha=0.1).  Dividing by 2*1.96 — the 95% value — understates the
# standard error and inflates every z-value.  Assumes the interval is symmetric
# around the point estimate.
ci_alpha = 0.1
z_crit = scipy.stats.norm.ppf(1 - ci_alpha / 2)
point_driv_lb_binaryW_model1b, point_driv_ub_binaryW_model1b = est_driv_binaryW_model1b.effect_interval(X_model1b_mat, alpha=ci_alpha) # type: ignore
z_value_driv_binaryW_model1b = point_driv_binaryW_model1b/((point_driv_ub_binaryW_model1b-point_driv_lb_binaryW_model1b)/(2*z_crit))
# Upper-tail probability of |z| (not doubled).
p_value_driv_binaryW_model1b = scipy.stats.norm.sf(abs(z_value_driv_binaryW_model1b)) # * 2
# Benjamini-Hochberg adjustment; element [1] of the result holds the adjusted p-values.
p_value_BH_driv_binaryW_model1b = multipletests(pvals = p_value_driv_binaryW_model1b, method = "fdr_bh", alpha=0.1)

full_results_binaryW_model1b = pd.DataFrame({"IID": selected_id_set1b_arr, "point": point_driv_binaryW_model1b, "upper_bound": point_driv_ub_binaryW_model1b, \
                                "lower_bound": point_driv_lb_binaryW_model1b, "z-value": z_value_driv_binaryW_model1b, \
                                "p_value": p_value_driv_binaryW_model1b, "p_value_corrected": p_value_BH_driv_binaryW_model1b[1]})

# # model 2
# point_driv_lb_binaryW_model2b, point_driv_ub_binaryW_model2b = est_driv_binaryW_model2b.effect_interval(X_model2b_mat, alpha=0.05) # type: ignore
# z_value_driv_binaryW_model2b = point_driv_binaryW_model2b/((point_driv_ub_binaryW_model2b-point_driv_lb_binaryW_model2b)/(2*1.96))
# p_value_driv_binaryW_model2b = scipy.stats.norm.sf(abs(z_value_driv_binaryW_model2b)) # * 2
# p_value_BH_driv_binaryW_model2b = multipletests(pvals = p_value_driv_binaryW_model2b, method = "fdr_bh", alpha=0.1)

# full_results_binaryW_model2b = pd.DataFrame({"IID": selected_id_set2b_arr, "point": point_driv_binaryW_model2b, "upper_bound": point_driv_ub_binaryW_model2b, \
#                                 "lower_bound": point_driv_lb_binaryW_model2b, "z-value": z_value_driv_binaryW_model2b, \
#                                 "p_value": p_value_driv_binaryW_model2b, "p_value_corrected": p_value_BH_driv_binaryW_model2b[1]})

# model 3
# The standard error is recovered from the interval width, so the divisor must be
# the normal critical value of the SAME level passed to effect_interval (about
# 1.645 for alpha=0.1).  Dividing by 2*1.96 — the 95% value — understates the
# standard error and inflates every z-value.  Assumes the interval is symmetric
# around the point estimate.
ci_alpha = 0.1
z_crit = scipy.stats.norm.ppf(1 - ci_alpha / 2)
point_driv_lb_binaryW_model3, point_driv_ub_binaryW_model3 = est_driv_binaryW_model3.effect_interval(X_model3_mat, alpha=ci_alpha) # type: ignore
z_value_driv_binaryW_model3 = point_driv_binaryW_model3/((point_driv_ub_binaryW_model3-point_driv_lb_binaryW_model3)/(2*z_crit))
# Upper-tail probability of |z| (not doubled).
p_value_driv_binaryW_model3 = scipy.stats.norm.sf(abs(z_value_driv_binaryW_model3)) # * 2
# Benjamini-Hochberg adjustment; element [1] of the result holds the adjusted p-values.
p_value_BH_driv_binaryW_model3 = multipletests(pvals = p_value_driv_binaryW_model3, method = "fdr_bh", alpha=0.1)

full_results_binaryW_model3 = pd.DataFrame({"IID": selected_id_set3_arr, "point": point_driv_binaryW_model3, "upper_bound": point_driv_ub_binaryW_model3, \
                                "lower_bound": point_driv_lb_binaryW_model3, "z-value": z_value_driv_binaryW_model3, \
                                "p_value": p_value_driv_binaryW_model3, "p_value_corrected": p_value_BH_driv_binaryW_model3[1]})


In [None]:
# Summary statistics of the binary-exposure model-1b results table.
full_results_binaryW_model1b.describe()

In [None]:
# SHAP values of the binary-exposure model-3 estimator for the rows at positions 1..99
# (99 rows; position 0 is not included).
# NOTE(review): if the first 100 rows were intended, the slice should start at 0 — confirm.
shap_values_driv_binaryW_model3 = est_driv_binaryW_model3.shap_values(X_model3_mat.iloc[1:100, :])


In [None]:
# SHAP results are looked up as [outcome name][treatment name].  The treatment
# series fitted in this notebook is named "30770-0.0"; "30780-0.0" is never used
# as a treatment here, so indexing with it cannot find an entry.
pd.DataFrame(shap_values_driv_binaryW_model3['CAD']['30770-0.0'].values)

In [None]:
# Persist the SHAP values of the binary-exposure model 3.
# NOTE(review): this notebook loads IGF1 inputs, yet the destination sits under an
# "LDL" results folder — confirm the location before running, because save_object
# replaces any existing file at that path.
save_object(shap_values_driv_binaryW_model3, "/home/yujia/Project/2023-07-20-individual_MR/res/02_ITE_analysis/01_table/LDL/CAD/03_variable_importance/driv_binaryW_shap_model3.pkl")

In [None]:
def pickle_loader(filename):
    """Yield, one by one, every object pickled back-to-back in ``filename``.

    This is a generator: the file is opened when iteration starts and closed
    once the end of the file is reached.
    NOTE: unpickling can execute arbitrary code; only use on trusted files.
    """
    with open(filename, "rb") as stream:
        more_to_read = True
        while more_to_read:
            try:
                yield pickle.load(stream)
            except EOFError:
                # Reaching the end of the file marks the end of the sequence.
                more_to_read = False

In [None]:
# Creates the generator only; nothing is read from disk until it is iterated.
# NOTE(review): `aaa` is not iterated in any cell shown in this notebook — confirm it is still needed.
aaa = pickle_loader("/home/yujia/Project/2023-07-20-individual_MR/res/02_ITE_analysis/01_table/LDL/CAD/03_variable_importance/driv_binaryW_shap_model3.pkl")

In [None]:
# Read the saved SHAP values back from disk (first pickled object in the file).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
shap_pickle_path = "/home/yujia/Project/2023-07-20-individual_MR/res/02_ITE_analysis/01_table/LDL/CAD/03_variable_importance/driv_binaryW_shap_model3.pkl"
with open(shap_pickle_path, 'rb') as f:  # open the file
    t3 = pickle.load(f)

In [None]:
# Show the reloaded SHAP entry stored under the "CAD" key.
t3["CAD"]