In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform, randint

In [2]:
# Datasets
heart_test = pd.read_csv('Data/heart_test.csv')
heart_train = pd.read_csv('Data/heart_train.csv')
diabetes_test = pd.read_csv('Data/diabetes_test.csv')
diabetes_train = pd.read_csv('Data/diabetes_train.csv')
cancer_test = pd.read_csv('Data/cancer_test.csv')
cancer_train = pd.read_csv('Data/cancer_train.csv')
alzheimer_test = pd.read_csv('Data/alzheimer_test.csv')
alzheimer_train = pd.read_csv('Data/alzheimer_train.csv')

datasets = {
    "heart": (heart_train, heart_test),
    "diabetes": (diabetes_train, diabetes_test),
    "cancer": (cancer_train, cancer_test),
    "alzheimer": (alzheimer_train, alzheimer_test)
}

In [3]:
from sklearn.model_selection import train_test_split

# Training sets for 25%, 50%, 75%
heart_train25, _ = train_test_split(heart_train, train_size=0.25, random_state=42, stratify=heart_train.iloc[:, -1])
heart_train50, _ = train_test_split(heart_train, train_size=0.50, random_state=42, stratify=heart_train.iloc[:, -1])
heart_train75, _ = train_test_split(heart_train, train_size=0.75, random_state=42, stratify=heart_train.iloc[:, -1])

diabetes_train25, _ = train_test_split(diabetes_train, train_size=0.25, random_state=42, stratify=diabetes_train.iloc[:, -1])
diabetes_train50, _ = train_test_split(diabetes_train, train_size=0.50, random_state=42, stratify=diabetes_train.iloc[:, -1])
diabetes_train75, _ = train_test_split(diabetes_train, train_size=0.75, random_state=42, stratify=diabetes_train.iloc[:, -1])

cancer_train25, _ = train_test_split(cancer_train, train_size=0.25, random_state=42, stratify=cancer_train.iloc[:, -1])
cancer_train50, _ = train_test_split(cancer_train, train_size=0.50, random_state=42, stratify=cancer_train.iloc[:, -1])
cancer_train75, _ = train_test_split(cancer_train, train_size=0.75, random_state=42, stratify=cancer_train.iloc[:, -1])

alzheimer_train25, _ = train_test_split(alzheimer_train, train_size=0.25, random_state=42, stratify=alzheimer_train.iloc[:, -1])
alzheimer_train50, _ = train_test_split(alzheimer_train, train_size=0.50, random_state=42, stratify=alzheimer_train.iloc[:, -1])
alzheimer_train75, _ = train_test_split(alzheimer_train, train_size=0.75, random_state=42, stratify=alzheimer_train.iloc[:, -1])


datasets25 = {
    "heart": (heart_train25, heart_test),
    "diabetes": (diabetes_train25, diabetes_test),
    "cancer": (cancer_train25, cancer_test),
    "alzheimer": (alzheimer_train25, alzheimer_test)
}

datasets50 = {
    "heart": (heart_train50, heart_test),
    "diabetes": (diabetes_train50, diabetes_test),
    "cancer": (cancer_train50, cancer_test),
    "alzheimer": (alzheimer_train50, alzheimer_test)
}

datasets75 = {
    "heart": (heart_train75, heart_test),
    "diabetes": (diabetes_train75, diabetes_test),
    "cancer": (cancer_train75, cancer_test),
    "alzheimer": (alzheimer_train75, alzheimer_test)
}


# 1. Uniform Random

In [4]:
# Grid of hyperparameters 
param_uniform = {
    'n_estimators': randint(100, 2000),                      
    'learning_rate': [2 ** x for x in np.linspace(-8, 0, 50)],  
    'subsample': np.linspace(0.5, 1, 10),               
    'max_depth': randint(3, 12),                            
    'min_child_weight': [2 ** x for x in np.linspace(0, 5, 30)], 
    'colsample_bytree': np.linspace(0.5, 1, 10),        
    'colsample_bylevel': np.linspace(0.5, 1, 10),       
    'reg_lambda': [2 ** x for x in np.linspace(-5, 5, 40)],    
    'reg_alpha': [2 ** x for x in np.linspace(-5, 5, 40)]      
}


In [5]:
all_results = []

for name, (train, test) in datasets.items():
    print(f"Training: {name}")
    
    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
    
    # XGBoost model
    xgb = XGBClassifier(
        random_state=42,
        eval_metric='auc'  
    )

    
    # Random Search
    random_search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_uniform,
        n_iter=100,                    
        scoring='roc_auc',
        cv=5,                          
        random_state=42,
        n_jobs=-1
    )
    
    # Fit
    random_search.fit(X_train, y_train)
    
    # Result
    cv_results = pd.DataFrame(random_search.cv_results_)
    
    # Testing on test sets
    for i, params in enumerate(random_search.cv_results_['params']):
        # Training again on parameters from random_search.cv_results_ :(
        model = XGBClassifier(
            random_state=42,
            eval_metric='auc',
            **params
        )
        model.fit(X_train, y_train)
        
        y_proba = model.predict_proba(X_test)[:, 1]
        test_auc = roc_auc_score(y_test, y_proba)
        
        all_results.append({
            "dataset": name,
            "params": params,
            "cv_roc_auc": cv_results.loc[i, 'mean_test_score'],
            "test_roc_auc": test_auc
        })
    
#Results
results_df = pd.DataFrame(all_results)

Training: heart
Training: diabetes
Training: cancer
Training: alzheimer


In [6]:
#Summary
for dataset in datasets.keys():
    dataset_results = results_df[results_df['dataset'] == dataset]
    best_idx = dataset_results['test_roc_auc'].idxmax()
    best_result = dataset_results.loc[best_idx]
    
    print(f"\n{dataset.upper()}:")
    print(f"  Best test AUC: {best_result['test_roc_auc']:.4f}")
    print(f"  CV AUC: {best_result['cv_roc_auc']:.4f}")
    print(f"  Parameters: {best_result['params']}")


HEART:
  Best test AUC: 0.8020
  CV AUC: 0.7791
  Parameters: {'colsample_bylevel': 1.0, 'colsample_bytree': 0.6666666666666666, 'learning_rate': 0.08293720650170974, 'max_depth': 10, 'min_child_weight': 28.395417264496608, 'n_estimators': 994, 'reg_alpha': 26.789388470197363, 'reg_lambda': 0.03125, 'subsample': 0.6111111111111112}

DIABETES:
  Best test AUC: 0.8324
  CV AUC: 0.8121
  Parameters: {'colsample_bylevel': 0.9444444444444444, 'colsample_bytree': 1.0, 'learning_rate': 0.7121258270022673, 'max_depth': 11, 'min_child_weight': 12.300880266076668, 'n_estimators': 338, 'reg_alpha': 5.411008231029584, 'reg_lambda': 0.5368400098840593, 'subsample': 1.0}

CANCER:
  Best test AUC: 0.8675
  CV AUC: 0.8640
  Parameters: {'colsample_bylevel': 0.5555555555555556, 'colsample_bytree': 0.8888888888888888, 'learning_rate': 0.3224946067247292, 'max_depth': 3, 'min_child_weight': 3.723222215679941, 'n_estimators': 694, 'reg_alpha': 18.77536736007401, 'reg_lambda': 0.07599515647546923, 'subsam

In [7]:
#Finding the best set of hyperparameters for each dataset 
best_per_dataset = (
    results_df.sort_values(by=["dataset", "test_roc_auc"], ascending=[True, False]).groupby("dataset", as_index=False).first()
)
params_df = best_per_dataset["params"].apply(pd.Series)

# Creating new set of hyperparameters from all datasets
mean_params = params_df.mean()
mean_params_dict = mean_params.to_dict()

for param in ["max_depth", "min_child_weight", "n_estimators"]:
    mean_params_dict[param] = int(round(mean_params_dict[param]))
mean_results = []

#Training with new hyperparameters on all datasets
for name, (train, test) in datasets.items():

    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    model = XGBClassifier(
            random_state=42,
            eval_metric='auc',
            **mean_params_dict
        )
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    mean_auc = roc_auc_score(y_test, y_proba)

    mean_results.append({
        "dataset": name,
        "star_test_roc_auc": mean_auc
    })


In [8]:
# Creating new dataframe with all results

# Star means that this set of hyperparameters is a mean from the best 4 sets of hyperparameters, one for each set
params_df = results_df['params'].apply(pd.Series)
results_df = pd.concat([results_df.drop('params', axis=1), params_df], axis=1)

results_col = results_df[['cv_roc_auc','test_roc_auc']]
results_df = pd.concat([results_df.drop(['cv_roc_auc','test_roc_auc'],axis=1),results_col], axis=1)

mean_df = pd.DataFrame(mean_results)
results_df = results_df.merge(mean_df, on='dataset')
results_df['diff_from_star'] = results_df['star_test_roc_auc'] - results_df['test_roc_auc']
#results_df

In [9]:
results_df.to_csv("Results/xgboost_uniform.csv", index=False)

In [10]:
# Best parameters for each dataset
best_per_dataset = (
    results_df.sort_values(by=["dataset", "test_roc_auc"], ascending=[True, False])
    .groupby("dataset", as_index=False)
    .first()
    .drop(['cv_roc_auc', 'diff_from_star'], axis=1)
    #.drop(['star_test_roc_auc', 'diff_from_star'], axis=1)
)

In [11]:
# Deafault model

default_results = []
for name, (train, test) in datasets.items():

    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    
    model = XGBClassifier(random_state=42,)
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_proba)

    default_results.append({
        "dataset": name,
        "default_test_roc_auc": score
    })

default_df = pd.DataFrame(default_results)
summary_df = best_per_dataset.merge(default_df, on="dataset")

In [12]:
# STAR row
mean_row = {
    "dataset": "STAR",
    **mean_params_dict,
    # "ccp_alpha" :None, "max_depth": None, "min_samples_leaf": None, "min_samples_split": None,
    "test_roc_auc": None,
    "star_test_roc_auc": mean_df["star_test_roc_auc"].mean(),
    #"star_test_roc_auc": None,
    "default_test_roc_auc": None
}

summary_df = pd.concat([summary_df, pd.DataFrame([mean_row])], ignore_index=True)
summary_df

  summary_df = pd.concat([summary_df, pd.DataFrame([mean_row])], ignore_index=True)


Unnamed: 0,dataset,colsample_bylevel,colsample_bytree,learning_rate,max_depth,min_child_weight,n_estimators,reg_alpha,reg_lambda,subsample,test_roc_auc,star_test_roc_auc,default_test_roc_auc
0,alzheimer,0.833333,0.666667,0.012113,10.0,22.35861,500.0,22.427229,1.862752,0.722222,0.864761,0.855354,0.84814
1,cancer,0.555556,0.888889,0.322495,3.0,3.723222,694.0,18.775367,0.075995,0.666667,0.867462,0.862524,0.826089
2,diabetes,0.944444,1.0,0.712126,11.0,12.30088,338.0,5.411008,0.53684,1.0,0.832418,0.806866,0.79403
3,heart,1.0,0.666667,0.082937,10.0,28.395417,994.0,26.789388,0.03125,0.611111,0.802025,0.793336,0.76824
4,STAR,0.833333,0.805556,0.282418,8.0,17.0,632.0,18.350748,0.626709,0.75,,0.82952,


In [13]:
summary_df.to_csv("Results/xgboost_uniform_summary.csv", index=False)

# 2. Bayesian

In [14]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.metrics import roc_auc_score

In [15]:
search_spaces = {
    'n_estimators': Integer(100, 2000),
    'learning_rate': Real(2**-8, 2**0, prior='log-uniform'),
    'subsample': Real(0.5, 1.0, prior='uniform'),
    'max_depth': Integer(3, 12),
    'min_child_weight': Real(2**0, 2**5, prior='log-uniform'),
    'colsample_bytree': Real(0.5, 1.0, prior='uniform'),
    'colsample_bylevel': Real(0.5, 1.0, prior='uniform'),
    'reg_lambda': Real(2**-5, 2**5, prior='log-uniform'),
    'reg_alpha': Real(2**-5, 2**5, prior='log-uniform')
}

In [16]:
all_results_2 = []

for name, (train, test) in datasets.items():
    print(f"Training: {name}")
    
    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
    
    # XGBoost model
    xgb = XGBClassifier(
        random_state=42,
        eval_metric='auc'
    )
    
    # Bayesian Search
    bayes_search = BayesSearchCV(
        estimator=xgb,
        search_spaces=search_spaces,
        n_iter=100,
        scoring='roc_auc',
        cv=5,
        random_state=42,
        n_jobs=-1,
        verbose=0
    )
    
    # Fit
    bayes_search.fit(X_train, y_train)
    
    # Result
    cv_results = pd.DataFrame(bayes_search.cv_results_)
 
    # Testing on test sets
    for i, params in enumerate(bayes_search.cv_results_['params']):
        # Training again on parameters from bayes_search.cv_results_
        model = XGBClassifier(
            random_state=42,
            eval_metric='auc',
            **params
        )
        model.fit(X_train, y_train)
        
        y_proba = model.predict_proba(X_test)[:, 1]
        test_auc = roc_auc_score(y_test, y_proba) 
        
        all_results_2.append({
            "dataset": name,
            "params": params,
            "cv_roc_auc": cv_results.loc[i, 'mean_test_score'],
            "test_roc_auc": test_auc
        })

# Results
results_2_df = pd.DataFrame(all_results_2)

Training: heart
Training: diabetes
Training: cancer
Training: alzheimer


In [17]:
# Summary
for dataset in datasets.keys():
    dataset_results = results_2_df[results_2_df['dataset'] == dataset]
    best_idx = dataset_results['test_roc_auc'].idxmax()
    best_result = dataset_results.loc[best_idx]
    
    print(f"\n{dataset.upper()}:")
    print(f"  Best test AUC: {best_result['test_roc_auc']:.4f}")
    print(f"  CV AUC: {best_result['cv_roc_auc']:.4f}")
    print(f"  Parameters: {best_result['params']}")



HEART:
  Best test AUC: 0.8033
  CV AUC: 0.7855
  Parameters: OrderedDict({'colsample_bylevel': 0.5339572432377074, 'colsample_bytree': 0.5908736884356977, 'learning_rate': 0.21206670957454102, 'max_depth': 9, 'min_child_weight': 18.60651972787815, 'n_estimators': 1865, 'reg_alpha': 31.507085796060498, 'reg_lambda': 32.0, 'subsample': 0.5})

DIABETES:
  Best test AUC: 0.8302
  CV AUC: 0.8277
  Parameters: OrderedDict({'colsample_bylevel': 0.8363361837314895, 'colsample_bytree': 0.8770510820338543, 'learning_rate': 0.09620520296316923, 'max_depth': 3, 'min_child_weight': 1.0, 'n_estimators': 862, 'reg_alpha': 9.027815736193546, 'reg_lambda': 32.0, 'subsample': 0.5032370037838271})

CANCER:
  Best test AUC: 0.8667
  CV AUC: 0.8595
  Parameters: OrderedDict({'colsample_bylevel': 0.9192837880147213, 'colsample_bytree': 1.0, 'learning_rate': 1.0, 'max_depth': 4, 'min_child_weight': 32.0, 'n_estimators': 2000, 'reg_alpha': 32.0, 'reg_lambda': 8.976093015656529, 'subsample': 0.79853161919636

In [18]:
# Finding the best hyperparameters for each dataset
best_per_dataset_2 = (
    results_2_df.sort_values(by=["dataset", "test_roc_auc"], ascending=[True, False]).groupby("dataset", as_index=False).first()
)

params_2_df = best_per_dataset_2["params"].apply(pd.Series)

In [19]:
# Creating a final dataframe for all of the results

# Splitting all hyperparaemters to separate columns
params_2_df = results_2_df['params'].apply(pd.Series)
results_2_df = pd.concat([results_2_df.drop('params', axis=1), params_2_df], axis=1)

results_col_2 = results_2_df[['cv_roc_auc','test_roc_auc']]
results_2_df = pd.concat([results_2_df.drop(['cv_roc_auc','test_roc_auc'],axis=1),results_col_2], axis=1)

In [20]:
results_2_df.to_csv("Results/xgboost_bayes.csv", index=False)

In [21]:
# Best parameters for each dataset
best_per_dataset_2 = (
    results_2_df.sort_values(by=["dataset", "test_roc_auc"], ascending=[True, False])
    .groupby("dataset", as_index=False)
    .first()
    .drop(['cv_roc_auc'], axis=1)
)

summary_2_df = best_per_dataset_2.merge(default_df, on="dataset")

In [22]:
summary_2_df.to_csv("Results/xgboost_bayes_summary.csv", index=False)