# Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from itertools import product
from joblib import Parallel, delayed
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Load Updated CSV into Pandas Dataframe

In [2]:
# Hong et al. dataset, read from the updated CSV
df = pd.read_csv(filepath_or_buffer="hong_et_al/df_updt.csv")

# Iimori et al. dataset, read from the Excel workbook
df_iimori = pd.read_excel(io='iimori_et_al/ROUTE_proteinuria_dataset.xlsx')

  warn(msg)


In [3]:
# Peek at the first three rows of the Hong et al. frame
df.head(n=3)

Unnamed: 0,dep_name,esi,age,gender,ethnicity,race,lang,religion,maritalstatus,employstatus,...,cc_vaginaldischarge,cc_vaginalpain,cc_weakness,cc_wheezing,cc_withdrawal-alcohol,cc_woundcheck,cc_woundinfection,cc_woundre-evaluation,cc_wristinjury,cc_wristpain
0,B,4.0,40.0,Male,Hispanic or Latino,White or Caucasian,English,,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,B,4.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,B,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Preprocess Data

In [4]:
# Drop CKD information columns (every column whose name contains 'chrkidneydisease')
df = df.drop(columns=df.filter(like='chrkidneydisease').columns)

# Exclude columns describing direct parameter to CKD-EPI Creatinine Equation used to calculate eGFR
# NOTE(review): like='creat' is a substring match, so it also removes any other column whose
# name happens to contain 'creat' — confirm that is intended.
df = df.drop(columns=df.filter(like='creat').columns)

# Drop rows with NaN eGFR values.
# dropna() returns a new frame, so the result has to be assigned back; otherwise the rows
# are kept and the median imputation below fills the missing targets with the median eGFR.
df = df.dropna(subset=['egfr_CKD_EPI'])

# Impute median data for any remaining NaN values in numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Cast categorical columns as type category
categorical_cols = ['dep_name', 'gender', 'ethnicity', 'race', 'lang', 'religion',
                    'maritalstatus', 'employstatus', 'insurance_status', 'disposition', 'arrivalmode',
                    'arrivalmonth', 'arrivalday', 'arrivalhour_bin', 'previousdispo']

df[categorical_cols] = df[categorical_cols].astype('category')

# Define Train and Test Data Splits for Hong et al. Set

In [5]:
# Feature frame: every column except those whose name contains 'egfr'
X = df.drop(df.filter(like='egfr').columns, axis=1)

# Regression target: egfr_CKD_EPI, the calculated patient eGFR
y = df['egfr_CKD_EPI']

def create_splits(X, y, test_size=0.1, val_size=0.2, n_splits=5, seed=42):
    """Hold out a test set, then draw repeated random train/validation splits.

    The data is first split into a train+dev part and a test part. The
    train+dev part is then split ``n_splits`` times, each time with a freshly
    drawn random state. The validation sets are therefore independent random
    draws that may overlap with each other; this is not K-fold.

    Parameters
    ----------
    X, y : features and target, forwarded to ``train_test_split``.
    test_size : size of the held-out test set (``train_test_split`` semantics).
    val_size : size of each validation set, taken from the train+dev part.
    n_splits : number of train/validation splits to draw.
    seed : seed for the test split and for the per-split random states.

    Returns
    -------
    tuple
        ``(X_traindev, y_traindev, X_test, y_test, val_sets)`` where
        ``val_sets`` is a list of dicts with the keys 'X_train', 'y_train',
        'X_val' and 'y_val'.
    """
    # Private generator instead of np.random.seed(): it yields the same split
    # seeds for a given `seed` without reseeding NumPy's global RNG.
    rng = np.random.RandomState(seed)

    # Step 1: Split data into test and the remaining data
    X_traindev, X_test, y_traindev, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

    # Step 2: Further split the remaining data into multiple train and validation sets
    val_sets = []
    for _ in range(n_splits):
        # Randomly select validation set from the remaining data
        X_train, X_val, y_train, y_val = train_test_split(
            X_traindev, y_traindev, test_size=val_size, random_state=rng.randint(10000)
        )

        val_sets.append({
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val
        })

    return X_traindev, y_traindev, X_test, y_test, val_sets

# Held-out test set plus the default number of validation splits for the Hong et al. data
X_traindev, y_traindev, X_test, y_test, val_sets = create_splits(X=X, y=y)

# Define Train and Test Data Splits for Iimori et al. Set

In [6]:
# Exclude target variable from features frame
# NOTE(review): the tuned classifier reports an AUC of 1.0 further down; check whether any of
# the remaining columns encodes the outcome (target leakage) — TODO confirm.
X_iimori = df_iimori.drop('CKD progression', axis=1)

# Set target variable to the 'CKD progression' column
y_iimori = df_iimori['CKD progression']

# create_splits is already defined in the Hong et al. split cell above; reuse it here
# instead of redefining an identical copy that silently shadows the first one.
X_traindev_2, y_traindev_2, X_test_2, y_test_2, val_sets_2 = create_splits(X_iimori, y_iimori, n_splits=4)

# Initialize XGBoost Regression and Classification Models

In [7]:
# Regressor: squared-error objective, GPU histogram tree method, categorical columns enabled
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='gpu_hist',
    enable_categorical=True,
    verbosity=0,
)

# Classifier: binary-logistic objective, GPU histogram tree method
model_2 = xgb.XGBClassifier(
    objective='binary:logistic',
    tree_method='gpu_hist',
    verbosity=0,
)

# Implement Bayesian Hyperparameter Optimization

In [8]:
def MSE_validate(model, params, val_sets):
    """Return the mean validation MSE of ``model`` over ``val_sets`` for ``params``.

    Parameters
    ----------
    model : estimator exposing ``set_params``, ``fit`` and ``predict``.
        The same object is configured once and refitted on every split.
    params : dict of hyperparameters. It is not modified; ``tree_method`` is
        forced to 'gpu_hist' on a copy.
    val_sets : non-empty iterable of dicts with the keys 'X_train', 'y_train',
        'X_val' and 'y_val'.

    Returns
    -------
    float
        Mean of the per-split MSE values. A split whose prediction or scoring
        raises contributes ``inf``.

    Raises
    ------
    ValueError
        If ``val_sets`` is empty (the mean would otherwise be NaN).
    """
    val_sets = list(val_sets)
    if not val_sets:
        raise ValueError("val_sets must contain at least one train/validation split")

    # Work on a copy so the caller's dict is left untouched
    fit_params = {**params, 'tree_method': 'gpu_hist'}  # Use GPU for training
    # The parameters are the same for every split, so configure the model once
    model.set_params(**fit_params)

    mse_scores = []
    for val in val_sets:
        # Refit the shared model object on this split's training data
        model.fit(val['X_train'], val['y_train'])

        try:
            # Predict on the validation set and calculate MSE
            preds = model.predict(val['X_val'])
            mse = mean_squared_error(val['y_val'], preds)
        except Exception as e:
            print(f"Error during model prediction: {str(e)}")
            mse = float('inf')  # Consider the worst case if prediction fails

        mse_scores.append(mse)

    # Calculate average MSE across all validation sets
    return np.mean(mse_scores)

# Function to evaluate model on validation sets
def AUC_validate(model, params, val_sets):
    """Return the mean validation ROC AUC of ``model`` over ``val_sets`` for ``params``.

    Parameters
    ----------
    model : classifier exposing ``set_params``, ``fit`` and ``predict_proba``.
        The same object is configured once and refitted on every split.
    params : dict of hyperparameters. It is not modified; ``tree_method`` is
        forced to 'gpu_hist' on a copy.
    val_sets : non-empty iterable of dicts with the keys 'X_train', 'y_train',
        'X_val' and 'y_val'.

    Returns
    -------
    float
        Mean of the per-split AUC values. A split whose prediction or scoring
        raises contributes ``0.0``.

    Raises
    ------
    ValueError
        If ``val_sets`` is empty (the mean would otherwise be NaN).
    """
    val_sets = list(val_sets)
    if not val_sets:
        raise ValueError("val_sets must contain at least one train/validation split")

    # Work on a copy so the caller's dict is left untouched
    fit_params = {**params, 'tree_method': 'gpu_hist'}  # Use GPU for training
    # The parameters are the same for every split, so configure the model once
    model.set_params(**fit_params)

    auc_scores = []
    for val in val_sets:
        # Refit the shared model object on this split's training data
        model.fit(val['X_train'], val['y_train'])

        try:
            # Predict probabilities on the validation set and calculate AUC
            preds_proba = model.predict_proba(val['X_val'])[:, 1]  # Probability of the positive class
            auc = roc_auc_score(val['y_val'], preds_proba)
        except Exception as e:
            print(f"Error during model prediction: {str(e)}")
            auc = 0.0  # Consider the worst case if prediction fails

        auc_scores.append(auc)

    # Calculate average AUC across all validation sets
    return np.mean(auc_scores)

# Tune Regression Model Hyperparameters

In [9]:
# Search space for the regressor: one skopt dimension per tuned hyperparameter
reg_space = [
    Integer(low=5, high=20, name='max_depth'),
    Integer(low=1, high=10, name='min_child_weight'),
    Real(low=0.5, high=5.0, name='gamma'),
    Real(low=0.6, high=1.0, name='subsample'),
    Real(low=0.6, high=1.0, name='colsample_bytree'),
    Real(low=0.01, high=0.6, name='colsample_bylevel'),
    Real(low=0.01, high=0.3, name='learning_rate'),
    Integer(low=50, high=300, name='n_estimators'),
]

# Objective handed to gp_minimize: mean validation MSE of the regressor
@use_named_args(reg_space)
def MSE_objective(**params):
    """Score one hyperparameter set by its mean MSE over the validation splits."""
    print("Testing params:", params)  # Debug: show the candidate being evaluated
    avg_mse = MSE_validate(model, params, val_sets)
    print("MSE for params:", avg_mse)  # Debug: show the score it obtained
    return avg_mse

# Run the Bayesian optimization: 20 evaluations of the objective with a fixed seed
result = gp_minimize(func=MSE_objective, dimensions=reg_space, n_calls=20, random_state=42)

# Pair every dimension name with the best value found; result.fun is the matching score
best_reg_params = dict(zip((dim.name for dim in reg_space), result.x))
best_reg_score = result.fun

print("Best parameters found: ", best_reg_params)
print("Best average MSE across validation sets: ", best_reg_score)

Testing params: {'max_depth': 17, 'min_child_weight': 3, 'gamma': 4.008609501227463, 'subsample': 0.8387400631785948, 'colsample_bytree': 0.7783331011414365, 'colsample_bylevel': 0.06898520033262172, 'learning_rate': 0.1431821786701015, 'n_estimators': 133}


  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes)
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is_categorical_dtype(dtype):
  if is

KeyboardInterrupt: 

# Tune Classification Model Hyperparameters

In [10]:
# Search space for the classifier: one skopt dimension per tuned hyperparameter
auc_space = [
    Integer(low=5, high=20, name='max_depth'),
    Integer(low=1, high=10, name='min_child_weight'),
    Real(low=0.5, high=5.0, name='gamma'),
    Real(low=0.6, high=1.0, name='subsample'),
    Real(low=0.6, high=1.0, name='colsample_bytree'),
    Real(low=0.01, high=0.6, name='colsample_bylevel'),
    Real(low=0.01, high=0.3, name='learning_rate'),
    Integer(low=50, high=300, name='n_estimators'),
]

# Objective handed to gp_minimize: negated mean validation AUC of the classifier
@use_named_args(auc_space)
def AUC_objective(**params):
    """Score one hyperparameter set by the negative of its mean validation AUC."""
    print("Testing params:", params)  # Debug: show the candidate being evaluated
    avg_auc = AUC_validate(model_2, params, val_sets_2)
    print("AUC for params:", avg_auc)  # Debug: show the score it obtained
    # gp_minimize minimizes, so return -AUC: the smallest value is the highest AUC
    return -avg_auc

# Run the Bayesian optimization: 20 evaluations of the objective with a fixed seed
result = gp_minimize(func=AUC_objective, dimensions=auc_space, n_calls=20, random_state=42)

# Pair every dimension name with the best value found
best_auc_params = dict(zip((dim.name for dim in auc_space), result.x))
# The objective returned -AUC, so flip the sign to report the AUC itself
best_auc_score = -result.fun

print("Best parameters found: ", best_auc_params)
print("Best average AUC across validation sets: ", best_auc_score)

Testing params: {'max_depth': 17, 'min_child_weight': 3, 'gamma': 4.008609501227463, 'subsample': 0.8387400631785948, 'colsample_bytree': 0.7783331011414365, 'colsample_bylevel': 0.06898520033262172, 'learning_rate': 0.1431821786701015, 'n_estimators': 133}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


AUC for params: 0.9992498533670335
Testing params: {'max_depth': 7, 'min_child_weight': 7, 'gamma': 0.7538521056219512, 'subsample': 0.88879950890673, 'colsample_bytree': 0.9754210836063002, 'colsample_bylevel': 0.010459471846198455, 'learning_rate': 0.29774135219445313, 'n_estimators': 204}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_c

AUC for params: 0.9962188902065509
Testing params: {'max_depth': 14, 'min_child_weight': 1, 'gamma': 0.6037809126863709, 'subsample': 0.8099098641033557, 'colsample_bytree': 0.7599443886861023, 'colsample_bylevel': 0.0375327412960331, 'learning_rate': 0.2923891004640232, 'n_estimators': 108}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


AUC for params: 0.9970106798309614
Testing params: {'max_depth': 6, 'min_child_weight': 7, 'gamma': 2.2210789607022328, 'subsample': 0.9932923543227153, 'colsample_bytree': 0.786705157299192, 'colsample_bylevel': 0.5173648399744293, 'learning_rate': 0.20728918619045614, 'n_estimators': 163}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


AUC for params: 1.0
Testing params: {'max_depth': 5, 'min_child_weight': 9, 'gamma': 3.034796980304927, 'subsample': 0.7541666010159664, 'colsample_bytree': 0.6063865008880857, 'colsample_bylevel': 0.14622735711706794, 'learning_rate': 0.07989738514754341, 'n_estimators': 221}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


AUC for params: 0.9967354424567292
Testing params: {'max_depth': 14, 'min_child_weight': 8, 'gamma': 1.2801409407849746, 'subsample': 0.7564242430292963, 'colsample_bytree': 0.672894435115225, 'colsample_bylevel': 0.45566323208741505, 'learning_rate': 0.133295203602461, 'n_estimators': 102}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


AUC for params: 0.9993530412388686
Testing params: {'max_depth': 14, 'min_child_weight': 1, 'gamma': 4.290281485677495, 'subsample': 0.7799016533479063, 'colsample_bytree': 0.7580600944007259, 'colsample_bylevel': 0.5567287308183387, 'learning_rate': 0.22090887879836207, 'n_estimators': 132}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


AUC for params: 1.0
Testing params: {'max_depth': 14, 'min_child_weight': 6, 'gamma': 4.825274109572072, 'subsample': 0.9378135394712607, 'colsample_bytree': 0.8989280440549523, 'colsample_bylevel': 0.3284183581095571, 'learning_rate': 0.180157838042516, 'n_estimators': 291}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


AUC for params: 1.0
Testing params: {'max_depth': 14, 'min_child_weight': 3, 'gamma': 1.833230775668371, 'subsample': 0.666106775625201, 'colsample_bytree': 0.6062545626964776, 'colsample_bylevel': 0.25980687361675814, 'learning_rate': 0.12451564027091523, 'n_estimators': 123}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_c

AUC for params: 1.0
Testing params: {'max_depth': 5, 'min_child_weight': 3, 'gamma': 3.701038787368925, 'subsample': 0.9160702162124823, 'colsample_bytree': 0.8423839899124046, 'colsample_bylevel': 0.556517518322876, 'learning_rate': 0.19881233739556392, 'n_estimators': 279}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


AUC for params: 1.0
Testing params: {'max_depth': 8, 'min_child_weight': 7, 'gamma': 3.3788929214251326, 'subsample': 0.6247207892879741, 'colsample_bytree': 0.6632587690672016, 'colsample_bylevel': 0.3791650192341516, 'learning_rate': 0.19365631586130713, 'n_estimators': 75}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


AUC for params: 0.996810924661474
Testing params: {'max_depth': 19, 'min_child_weight': 9, 'gamma': 2.6779938667126575, 'subsample': 0.8957193501254201, 'colsample_bytree': 0.9995141054249435, 'colsample_bylevel': 0.09898231809737199, 'learning_rate': 0.16706937376058442, 'n_estimators': 161}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


AUC for params: 0.996246304438564
Testing params: {'max_depth': 10, 'min_child_weight': 1, 'gamma': 4.995424413489406, 'subsample': 0.7337056993420317, 'colsample_bytree': 0.6549738070651464, 'colsample_bylevel': 0.012488846665288118, 'learning_rate': 0.2751277221984007, 'n_estimators': 63}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


AUC for params: 0.9990868783936873
Testing params: {'max_depth': 14, 'min_child_weight': 8, 'gamma': 1.6821480109691793, 'subsample': 0.8189472892641156, 'colsample_bytree': 0.8367848289141177, 'colsample_bylevel': 0.569066105939107, 'learning_rate': 0.17077612933879502, 'n_estimators': 292}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


AUC for params: 0.9986575978098098
Testing params: {'max_depth': 7, 'min_child_weight': 7, 'gamma': 4.993120868073198, 'subsample': 0.8521215718130768, 'colsample_bytree': 0.7084876867298266, 'colsample_bylevel': 0.5379262692125157, 'learning_rate': 0.04965445578491413, 'n_estimators': 60}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


AUC for params: 1.0
Testing params: {'max_depth': 9, 'min_child_weight': 9, 'gamma': 0.5080392178596013, 'subsample': 0.9783207883148227, 'colsample_bytree': 0.8970027329132035, 'colsample_bylevel': 0.5692612791479094, 'learning_rate': 0.17409712977284567, 'n_estimators': 97}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


AUC for params: 0.9996923707957341
Testing params: {'max_depth': 11, 'min_child_weight': 6, 'gamma': 0.5047791902062033, 'subsample': 0.6085430421352586, 'colsample_bytree': 0.7320629158297, 'colsample_bylevel': 0.3250228702860144, 'learning_rate': 0.07739786251792406, 'n_estimators': 151}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


AUC for params: 0.9986111279898687
Testing params: {'max_depth': 11, 'min_child_weight': 8, 'gamma': 4.95248064704332, 'subsample': 0.6184880291472902, 'colsample_bytree': 0.7501157015500892, 'colsample_bylevel': 0.3107961699198378, 'learning_rate': 0.1586505795910497, 'n_estimators': 156}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


AUC for params: 0.9982065088883778
Testing params: {'max_depth': 16, 'min_child_weight': 1, 'gamma': 0.6841081222946469, 'subsample': 0.9444026425015422, 'colsample_bytree': 0.6947972309934985, 'colsample_bylevel': 0.5711442372906836, 'learning_rate': 0.16132619352072475, 'n_estimators': 94}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


AUC for params: 1.0
Testing params: {'max_depth': 14, 'min_child_weight': 2, 'gamma': 4.455270571118296, 'subsample': 1.0, 'colsample_bytree': 0.6680783131184538, 'colsample_bylevel': 0.534441432134639, 'learning_rate': 0.199996314269117, 'n_estimators': 169}


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


AUC for params: 1.0
Best parameters found:  {'max_depth': 6, 'min_child_weight': 7, 'gamma': 2.2210789607022328, 'subsample': 0.9932923543227153, 'colsample_bytree': 0.786705157299192, 'colsample_bylevel': 0.5173648399744293, 'learning_rate': 0.20728918619045614, 'n_estimators': 163}
Best average AUC across validation sets:  1.0


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


# Train XG Boost Regressor on Full Training Set

In [13]:
# Apply the tuned hyperparameters, then fit the regressor on the whole train+dev set
# (set_params returns the estimator, so the two steps can be chained)
model.set_params(**best_reg_params).fit(X_traindev, y_traindev)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes)
  if is_sparse(data):


# Evaluate Regressor Error on Test Set

In [None]:
# Predict eGFR for the held-out test set
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
# Take the root of the MSE already computed instead of calling
# mean_squared_error(..., squared=False): the `squared` argument is deprecated
# in recent scikit-learn releases and later removed.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R² Score: {r2}")

# Evaluate Feature Importance

In [None]:
# Importance scores reported by the fitted regressor
importances = model.feature_importances_

# Pair each feature column with its score and order from most to least important
feature_importance = pd.DataFrame(
    data={'Feature': X.columns, 'Importance': importances}
).sort_values('Importance', ascending=False)

# Show the 30 highest-ranked features
print(feature_importance.head(n=30))

# Visualize Feature Importance

In [None]:
# Horizontal bar chart of the 30 highest-ranked features, using the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(30), ax=ax)
ax.set_title('Feature Importances in eGFR Prediction')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

# Train XG Boost Classifier on Full Training Set

In [14]:
# Train the XG Boost classifier model on the full training set with tuned hyperparameters.
# The fit has to be called on model_2: fitting `model` here would overwrite the regressor
# with the classification data and leave the classifier fitted only on the last tuning split.
model_2.set_params(**best_auc_params)
model_2.fit(X_traindev_2, y_traindev_2)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes)
  if is_sparse(data):


# Evaluate Classifier AUC on Test Set

In [15]:
# Hard class predictions, kept for any label-based metric
y_pred_2 = model_2.predict(X_test_2)

# ROC AUC ranks samples by score, so feed it the positive-class probability rather than
# thresholded labels; this also matches how AUC_validate scores the validation splits.
y_proba_2 = model_2.predict_proba(X_test_2)[:, 1]
auc = roc_auc_score(y_test_2, y_proba_2)

print(f"Area Under ROC Curve: {auc}")

Area Under ROC Curve: 1.0


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


# Evaluate Feature Importance

In [None]:
# Importance scores reported by the classifier
importances = model_2.feature_importances_

# Pair each Iimori feature column with its score and order from most to least important
feature_importance = pd.DataFrame(
    data={'Feature': X_iimori.columns, 'Importance': importances}
).sort_values('Importance', ascending=False)

# Show the 30 highest-ranked features
print(feature_importance.head(n=30))