In [1]:
'''
feature selection with Boruta
'''
import os, sys
from datetime import datetime
import argparse
import pandas as pd
import numpy as np
import torch
import optuna
from sklearn.model_selection import KFold
from boruta import BorutaPy
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import plotly.graph_objects as go
import json
import pickle
# Detect the runtime environment (Jupyter kernel vs. plain script)
def in_notebook():
    """Report whether the code is running under an IPython kernel.

    ``get_ipython`` is looked up in this module's globals. When the name is
    missing, a stand-in returning ``None`` is used, so the result is False.

    Returns:
        bool: True if the shell's ``config`` contains the key ``'IPKernelApp'``.
    """
    shell_getter = globals().get('get_ipython', lambda: None)
    shell_config = getattr(shell_getter(), 'config', {})
    return 'IPKernelApp' in shell_config

# Runtime configuration: fixed values inside a notebook kernel, command-line
# flags when the file is executed as a script.
if in_notebook():
    from IPython.display import clear_output, display
    notebook_dir = os.getcwd()
    src_path = os.path.abspath(os.path.join(notebook_dir, '..'))
    N_TRIAL = 100 # number of Boruta feature-selection rounds
    OUTCOME_IX = 0
    IMPORTANCE_MEASURE = 'gini' # gini, shap, perm
else:
    src_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
    # ArgumentDefaultsHelpFormatter prints the real default of every flag in
    # --help, so metavar only has to name the placeholder. Previously metavar
    # carried example values (including ints), and the one for -importance
    # ('gini') contradicted the actual default ('shap').
    parser = argparse.ArgumentParser(
        description='',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-n', metavar='N', type=int, default=50,
                        help='''optuna优化尝试次数''')
    parser.add_argument('-outcome_ix', metavar='IX', type=int, default=0,
                        help='''选择预测结局, 为 `get_ite_features()`返回的预设 outcomes 列表的索引''')
    parser.add_argument('-importance', metavar='MEASURE', type=str, default='shap',
                        help='''特征重要性度量方式''')
    sys_args = parser.parse_args()
    N_TRIAL = sys_args.n
    OUTCOME_IX = sys_args.outcome_ix
    IMPORTANCE_MEASURE = sys_args.importance

# Put src_path on sys.path (once) so the `src.*` imports below can resolve.
if src_path not in sys.path:
    sys.path.append(src_path)
from src.utils import *
from src.model_utils import *
from src.setup import *

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f'current device: {DEVICE}')

current device: cpu


In [2]:
# Read the imputed MIMIC-IV table (gzip-compressed TSV), indexed by the ID column.
df = pd.read_csv(
    f'{DATA}/imputed/MIMIC_IV_clean_imputed.tsv.gz',
    sep='\t',
    index_col='ID',
)
cate_vars, cont_vars, outcomes = get_cleaned_vars('MIMIC_IV')

# Predictors: the cate_vars columns followed by the cont_vars columns
# (presumably categorical and continuous variables, judging by the names).
feature_cols = list(cate_vars) + list(cont_vars)
X = df.loc[:, feature_cols].copy()
# Target: the outcome column selected by OUTCOME_IX.
y = df.loc[:, outcomes[OUTCOME_IX]].copy()

# manual feature selection or integration

In [17]:
# List strongly correlated feature pairs as candidates for manual removal.
# The filter uses the absolute coefficient: a strong negative correlation is
# just as collinear as a strong positive one, and a plain `> 0.8` test would
# silently miss it. The printed coefficient keeps its sign.
CORR_THRESHOLD = 0.8
corr_mat = X.corr()
for col in corr_mat.columns:
    # Drop the feature's correlation with itself (always 1.0).
    col_corr = corr_mat[col].drop(labels=col)
    high_corr = col_corr[col_corr.abs() > CORR_THRESHOLD]
    # Every pair is reported twice, once from each feature's point of view.
    for var, corr_coef in high_corr.items():
        print(f'{col}:{var}: {corr_coef:.2f}')

weight:BMI: 0.88
BMI:weight: 0.88
DBP:MAP: 0.93
MAP:DBP: 0.93
RBC:Hb: 0.90
RBC:HCT: 0.92
Hb:RBC: 0.90
Hb:HCT: 0.97
HCT:RBC: 0.92
HCT:Hb: 0.97
ALT:AST: 0.92
AST:ALT: 0.92


In [3]:
# Aggregate features: each new column is the row-wise maximum of its sources.
severe_disease_cols = ['metastatic_cancer', 'hematologic_cancer', 'AIDS']
ventilation_cols = ['MV', 'NIPPV']
X['server_cancer_AIDS'] = X[severe_disease_cols].max(axis=1)
X['MV/NIMV'] = X[ventilation_cols].max(axis=1) # NIPPV is actually NIMV, correct name here.

# Drop highly correlated features, composite scores, and the original
# features that were aggregated above.
highly_correlated_cols = ['height', 'weight', 'DBP', 'RBC', 'HCT', 'ALT']
score_cols = [
    'OASIS', 'Charlson', 'APS_III', 'SAPS_II', 'SIRS', # existing scores for validation
    'SOFA', 'SOFA_renal', 'SOFA_cardio', 'SOFA_coagulation', 'SOFA_liver', 'SOFA_respiration', 'SOFA_cns',
]
cols_to_drop = highly_correlated_cols + score_cols + severe_disease_cols + ventilation_cols
X = X.drop(columns=cols_to_drop)

In [5]:
# Random forest that Boruta uses to score features. No random_state is set
# here; the recorded output shows BorutaPy assigning its own to the estimator.
# NOTE(review): IMPORTANCE_MEASURE is not consulted in this cell, so the
# ranking always comes from the forest's built-in feature importances,
# whatever value was configured. Confirm this is intended.
model = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# Define the Boruta feature-selection method. max_iter is wired to N_TRIAL so
# the configured number of rounds (notebook constant or the -n flag) takes
# effect; before, it was ignored and BorutaPy's own default was always used.
feature_selector = BorutaPy(
    model,
    n_estimators='auto',
    max_iter=N_TRIAL,
    verbose=2,
    random_state=19960816,
)

feature_selector.fit(X, y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	38
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	38
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	38
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	38
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	38
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	38
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	38
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	35
Tentative: 	3
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	35
Tentative: 	3
Rejected: 	0
Iteration: 	10 / 100
Confirmed: 	35
Tentative: 	2
Rejected: 	1
Iteration: 	11 / 100
Confirmed: 	35
Tentative: 	2
Rejected: 	1
Iteration: 	12 / 100
Confirmed: 	35
Tentative: 	2
Rejected: 	1
Iteration: 	13 / 100
Confirmed: 	35
Tentative: 	2
Rejected: 	1
Iteration: 	14 / 100
Confirmed: 	35
Tentative: 	2
Rejected: 	1
Iteration: 	15 / 100
Confirmed: 	35
Tentative: 	2
Rejected: 	1
Iteration: 	16 / 100
Confirmed: 	36
Tentative: 	1
Rejected: 	1
I

BorutaPy(estimator=RandomForestClassifier(class_weight='balanced', max_depth=5,
                                          n_estimators=172, n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x7FCE77514240),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x7FCE77514240, verbose=2)

In [16]:
# Report the Boruta decision for every feature: confirmed (support_),
# tentative (support_weak_), or rejected (in neither mask).
confirmed_mask = feature_selector.support_
tentative_mask = feature_selector.support_weak_
rejected_mask = ~confirmed_mask & ~tentative_mask

confirmed_features = list(X.columns[confirmed_mask])
tentative_features = list(X.columns[tentative_mask])
rejected_features = list(X.columns[rejected_mask])

print(f"selected: {confirmed_features}")
print(f"tentative : {tentative_features}")
print(f"Rejected : {rejected_features}")

selected: ['cancer', 'CCRT', 'age', 'BMI', 'temperature', 'heart_rate', 'respir_rate', 'SBP', 'MAP', 'SPO2', 'GCS', 'WBC', 'Hb', 'NE#', 'NE%', 'LYM#', 'LYM%', 'PLT', 'AST', 'STB', 'BUN', 'Scr', 'Glu', 'K+', 'Na+', 'Ca2+', 'Fg', 'PT', 'APTT', 'PH', 'PaO2/FiO2', 'HCO3-', 'PaO2', 'Lac', 'PaCO2', 'server_cancer_AIDS']
tentative : []
Rejected : ['sex', 'MV/NIMV']


In [7]:
# Persist the fitted selector. The file name records the outcome and the
# configured importance measure.
feature_selector_path = f'{MODELS}/MIMIC_IV_boruta_risk_model_{outcomes[OUTCOME_IX]}_{IMPORTANCE_MEASURE}.pkl'
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when it is missing, which would throw away the result of the long Boruta fit.
feature_selector_dir = os.path.dirname(feature_selector_path)
if feature_selector_dir:
    os.makedirs(feature_selector_dir, exist_ok=True)
with open(feature_selector_path, 'wb') as file:
    pickle.dump(feature_selector, file)
print(f"Feature selector saved: {feature_selector_path}")

Feature selector saved: /home/xuxu.wei/sepsis-reasearch//models//MIMIC_IV_boruta_risk_model_28d_mortality_gini.pkl
