In [1]:
import sys
import os

# Make the parent of the notebook's current working directory importable.
# os.getcwd() is used to locate the notebook's working directory; project
# modules imported later (presumably `driver` -- TODO confirm its location)
# are expected to be found one level up.
parent_dir = os.path.dirname(os.getcwd())

# Only add the directory when it is not already on the search path, so that
# re-running this cell does not pile up duplicate sys.path entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import pandas as pd
from skopt.space import Real, Integer
from driver import Driver

# Load Cleaned Master MIMIC-IV Dataset

In [3]:
# Read the two pickled MIMIC-IV CKD objects in one pass: the cleaned master
# data set first, then the base data set (paths are relative to the notebook).
df_master, df_base = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../MIMIC_IV/df_ckd_master_clean.pkl',
        '../MIMIC_IV/df_ckd_base.pkl',
    )
)

In [4]:
# Load the pickled feature collection stored for each model tag under
# novel_predictors/ (dct, fcnn, lr, rdmf, rsnt, xgb), keeping one name per tag.
dct_set, fcnn_set, lr_set, rdmf_set, rsnt_set, xgb_set = (
    pd.read_pickle(features_pkl)
    for features_pkl in (
        'novel_predictors/dct_Features.pkl',
        'novel_predictors/fcnn_Features.pkl',
        'novel_predictors/lr_Features.pkl',
        'novel_predictors/rdmf_Features.pkl',
        'novel_predictors/rsnt_Features.pkl',
        'novel_predictors/xgb_Features.pkl',
    )
)

# Combine all six collections into a single set of distinct entries
union_set = set(dct_set).union(
    fcnn_set,
    lr_set,
    rdmf_set,
    rsnt_set,
    xgb_set,
)

# Show the combined set
print(union_set)

{'Total_Protein,_Urine_mean', 'Atrial fibrillation', 'Hemoglobin_max_y', 'Potassium_mean', 'Basophils_first', 'Monocytes_mean', 'Old myocardial infarction', 'Other and unspecified hyperlipidemia', 'MCHC_min', 'Severe sepsis', 'Chloride_max', 'Platelet_Count_median', 'Diaphragmatic hernia without mention of obstruction or gangrene', 'Creatinine_min', 'Acute on chronic diastolic heart failure', 'Protein/Creatinine_Ratio_median', 'Pneumonia, organism unspecified', 'Pure hypercholesterolemia', 'race_WHITE', 'Creatinine,_Urine_mean', 'Acute kidney failure, unspecified', 'WBC_Count_last', 'Urea_Nitrogen_max', 'Protein,_Total_min', 'Bilirubin,_Total_mean', 'Eosinophils_mean', 'Glucose_max', 'Potassium_last', 'Neutrophils_max', 'Creatinine,_Urine_min', 'Uric_Acid_median', 'Sodium_min', 'Cholesterol_Ratio_(Total/HDL)_min', 'Hemoglobin_first_y', 'Asparate_Aminotransferase_(AST)_first', 'Protein_last', 'Creatinine_mean', 'Anemia, unspecified', 'Monocytes_min', 'Protein/Creatinine_Ratio_mean', 'Hy

In [6]:
# Number of distinct entries in the combined set (displayed as the cell output)
len(union_set)

125

# Evaluate XGBoost Pipeline

In [5]:
# Configuration handed to Driver for the XGBoost run. Built with dict(...) so
# each setting reads as a keyword; key order matches the original literal.
XGboost_cfg = dict(
    # Feature selection model long name
    name='XGboost',
    # Feature selection model identifier tag
    tag='xgb',
    # Response variable for binary classification and Cox Proportional Hazards
    response='stage_delta',
    # Duration in days until response or censorship
    duration='duration',
    # Number of ranked novel features included in augmented KFRE dataset
    n_novel=40,
    # Ratio of data reserved for testing
    test_size=0.1,
    # Ratio of data reserved for testing in validation sets
    val_size=0.2,
    # Number of training validation sets
    n_valsets=5,
    # Number of Bayesian hyperparameter optimization iterations
    n_bayesian=10,
    # Number of folds for CoxPH cross validation
    n_folds=5,
    # CoxPH model penalizer
    penalizer=0.0007,
    # Search space for Bayesian hyperparameter optimization
    auc_space=[
        Integer(5, 20, name='max_depth'),
        Integer(1, 10, name='min_child_weight'),
        Real(0.5, 3.0, name='gamma'),
        Real(0.6, 1.0, name='subsample'),
        Real(0.6, 1.0, name='colsample_bytree'),
        Real(0.01, 0.6, name='colsample_bylevel'),
        Real(0.01, 0.3, name='learning_rate'),
        Integer(50, 150, name='n_estimators'),
    ],
    # Seed for reproducibility
    random_state=42,
)

# Build the driver from the config plus the two loaded data sets, then run
# its classical learning pipeline.
driver = Driver(XGboost_cfg, df_master, df_base)
driver.ClassicalLearningPipe()

Testing params: {'max_depth': 17, 'min_child_weight': 3, 'gamma': 2.449227500681924, 'subsample': 0.8387400631785948, 'colsample_bytree': 0.7783331011414365, 'colsample_bylevel': 0.06898520033262172, 'learning_rate': 0.1431821786701015, 'n_estimators': 83}


KeyboardInterrupt: 