In [1]:
'''
feature selection with Boruta
'''
import os, sys
from datetime import datetime
import argparse
import pandas as pd
import numpy as np
import torch
import optuna
from sklearn.model_selection import KFold
from boruta import BorutaPy
from BorutaShap import BorutaShap
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import plotly.graph_objects as go
import json
import pickle
# Detect the runtime environment
def in_notebook():
    """Report whether the code appears to run inside an IPython kernel.

    The function looks for a ``get_ipython`` callable in this module's
    globals. If one is present it is called, and the result's ``config``
    attribute is checked for the key ``'IPKernelApp'``.

    Returns:
        bool: True when ``'IPKernelApp'`` is found in the shell's config,
        False when ``get_ipython`` is absent, returns ``None``, or the
        returned object has no matching config entry.
    """
    module_globals = globals()
    if 'get_ipython' in module_globals:
        shell = module_globals['get_ipython']()
    else:
        shell = None
    # A missing `config` attribute (e.g. shell is None) is treated as an empty config.
    shell_config = getattr(shell, 'config', {})
    return 'IPKernelApp' in shell_config

if in_notebook():
    # Notebook session: settings are fixed here rather than read from argv.
    from IPython.display import clear_output, display
    notebook_dir = os.getcwd()
    # The parent of the notebook's working directory is used as the import root.
    src_path = os.path.abspath(os.path.join(notebook_dir, '..'))
    # N_TRIAL: number of Boruta feature-selection trials.
    # IMPORTANCE_MEASURE: one of gini, shap, perm.
    N_TRIAL, OUTCOME_IX, IMPORTANCE_MEASURE = 100, 0, 'gini'
else:
    # Script run: the import root is two directory levels above this file.
    src_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
    parser = argparse.ArgumentParser(description='')
    # NOTE(review): `metavar` is given as an int below; argparse documents it as a
    # display name, so a string is presumably what was meant. TODO confirm.
    # NOTE(review): the CLI defaults (1000, 'shap') differ from the notebook
    # values above (100, 'gini').
    # -n: help text reads "number of Optuna optimization trials".
    parser.add_argument(
        '-n',
        metavar=50,
        type=int,
        default=1000,
        help='''optuna优化尝试次数''',
    )
    # -outcome_ix: help text reads "outcome to predict, given as an index into the
    # preset outcomes list returned by `get_ite_features()`".
    parser.add_argument(
        '-outcome_ix',
        metavar=0,
        type=int,
        default=0,
        help='''选择预测结局, 为 `get_ite_features()`返回的预设 outcomes 列表的索引''',
    )
    # -importance: help text reads "feature-importance measure".
    parser.add_argument(
        '-importance',
        metavar='gini',
        type=str,
        default='shap',
        help='''特征重要性度量方式''',
    )
    sys_args = parser.parse_args()
    N_TRIAL, OUTCOME_IX, IMPORTANCE_MEASURE = (
        sys_args.n,
        sys_args.outcome_ix,
        sys_args.importance,
    )

# Make the project root importable (needed for the `src.*` imports below),
# without adding a duplicate entry when it is already on sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)
from src.utils import *
from src.model_utils import *
from src.setup import *

# Use CUDA when torch reports it as available, otherwise the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'current device: {DEVICE}')

# Name of the dataset this notebook works on.
DATASET = 'EXIT_SEP'

# Start time of this run, plus a minute-resolution string form of it.
current_time = datetime.now()
formatted_time = format(current_time, "%Y-%m-%d_%H-%M")

current device: cpu


In [2]:
# Load the imputed table (tab-separated, gzip-compressed) indexed by the `ID` column.
# NOTE(review): 'EXIT_SEP' is spelled out here and in the call below although
# DATASET holds the same value.
df = pd.read_csv(
    f'{DATA}/imputed/EXIT_SEP_clean_imputed.tsv.gz',
    sep='\t',
    index_col='ID',
)
cate_vars, cont_vars, outcomes = get_cleaned_vars('EXIT_SEP')

# Feature matrix: the first returned list followed by the second, as an independent copy.
_feature_columns = [*cate_vars, *cont_vars]
X = df[_feature_columns].copy()

# Target: the outcome column picked by OUTCOME_IX, also copied.
_target_column = outcomes[OUTCOME_IX]
y = df[_target_column].copy()

In [3]:
# Pairwise correlation matrix of the feature columns.
corr_mat = X.corr()

# Print every off-diagonal pair whose coefficient exceeds 0.9. Each pair is
# reported once per direction (A x B and B x A).
# NOTE(review): only positive coefficients are tested; a strongly negative
# correlation (< -0.9) is not reported. Confirm this is intended.
for col in corr_mat.columns:
    for var, corr_coef in corr_mat[col].items():
        if var != col and corr_coef > 0.9:
            print(f'{col} x {var} : {corr_coef:.2f}')

SOFA_renal x Scr : 0.90
Respiratory_Support x MV : 0.96
Respiratory_Support x MV/NIPPV : 0.96
MV x Respiratory_Support : 0.96
MV/NIPPV x Respiratory_Support : 0.96
Scr x SOFA_renal : 0.90


In [4]:
# Remove highly correlated features and composite scores.
# 'SOFA_cns' is commented out of the list, so it stays in X.
_columns_to_drop = [
    'RBC', 'HCT', 'ALT',
    'Respiratory_Support', 'MV',
    'APACHE_II', 'SOFA',
    'SOFA_renal', 'SOFA_cardio', 'SOFA_coagulation', 'SOFA_liver', 'SOFA_respiration',
    # 'SOFA_cns',
]
# X is rebound to the reduced frame, so re-running this cell without rebuilding X
# raises KeyError because the columns are already gone.
X = X.drop(columns=_columns_to_drop)

In [5]:
# Random forest used as the base estimator for Boruta: shallow trees (depth 5),
# class-balanced weights, all CPU cores.
model = RandomForestClassifier(
    max_depth=5,
    class_weight='balanced',
    n_jobs=-1,
)

# Boruta wrapper around the forest. The seed is fixed for repeatability, and
# verbose=2 prints the per-iteration confirmed/tentative/rejected counts.
feature_selector = BorutaPy(
    estimator=model,
    n_estimators='auto',
    random_state=19960816,
    verbose=2,
)

# Fit on the reduced feature matrix; as the last expression, the fitted selector's
# repr is shown as the cell output.
feature_selector.fit(X, y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	12
Tentative: 	13
Rejected: 	31
Iteration: 	9 / 100
Confirmed: 	12
Tentative: 	13
Rejected: 	31
Iteration: 	10 / 100
Confirmed: 	12
Tentative: 	13
Rejected: 	31
Iteration: 	11 / 100
Confirmed: 	12
Tentative: 	13
Rejected: 	31
Iteration: 	12 / 100
Confirmed: 	13
Tentative: 	9
Rejected: 	34
Iteration: 	13 / 100
Confirmed: 	13
Tentative: 	9
Rejected: 	34
Iteration: 	14 / 100
Confirmed: 	13
Tentative: 	9
Rejected: 	34
Iteration: 	15 / 100
Confirmed: 	13
Tentative: 	9
Rejected: 	34
Iteration: 	16 / 100
Confirmed: 	13
Tentative: 	9
Re

BorutaPy(estimator=RandomForestClassifier(class_weight='balanced', max_depth=5,
                                          n_estimators=126, n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x7F78E8580040),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x7F78E8580040, verbose=2)

In [7]:
# Report the Boruta outcome for every feature column: confirmed (`support_`),
# tentative (`support_weak_`), and rejected (in neither of the two masks).
_confirmed_mask = feature_selector.support_
_tentative_mask = feature_selector.support_weak_
_rejected_mask = ~(_tentative_mask | _confirmed_mask)

for _label, _mask in (
    ("selected: ", _confirmed_mask),
    ("tentative : ", _tentative_mask),
    ("Rejected : ", _rejected_mask),
):
    print(f"{_label}{list(X.columns[_mask])}")

selected: ['SOFA_cns', 'MV/NIPPV', 'age', 'heart_rate', 'Lac', 'LYM%', 'PLT', 'BUN', 'Scr', 'K+', 'Na+', 'Fg', 'PT', 'APTT', 'PaO2/FiO2', 'PaO2']
tentative : ['BMI', 'PCT']
Rejected : ['XBJ_intervention', 'sex', 'primary_infection_site_lung', 'primary_infection_site_abdo', 'primary_infection_site_uri', 'primary_infection_site_skin', 'primary_infection_site_brain', 'pathogen_test', 'Gram-_infect', 'Gram+_infect', 'Fungi_infect', 'Gram_neg_resist', 'Gram_pos_resist', 'Fungi_resist', 'multidrug_resist', 'DIC-score', 'septic_shock', 'NIPPV', 'CCRT', 'nutri_support', 'nutri_support_enteral', 'nutri_support_parenteral', 'temperature', 'respiratory_rate', 'SBP', 'DBP', 'MAP', 'Hb', 'WBC', 'NE%', 'AST', 'STB', 'Glu', 'D-Dimer', 'CRP', 'PH', 'HCO3-', 'PaCO2']
