## Decision Tree Classifier

In [75]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.metrics import roc_auc_score
from sklearn.tree import plot_tree

from skopt import BayesSearchCV
from skopt.space import Real, Integer

In [76]:
# Datasets
heart_test = pd.read_csv('Data/heart_test.csv')
heart_train = pd.read_csv('Data/heart_train.csv')
diabetes_test = pd.read_csv('Data/diabetes_test.csv')
diabetes_train = pd.read_csv('Data/diabetes_train.csv')
cancer_test = pd.read_csv('Data/cancer_test.csv')
cancer_train = pd.read_csv('Data/cancer_train.csv')
alzheimer_test = pd.read_csv('Data/alzheimer_test.csv')
alzheimer_train = pd.read_csv('Data/alzheimer_train.csv')

datasets = {
    "heart": (heart_train, heart_test),
    "diabetes": (diabetes_train, diabetes_test),
    "cancer": (cancer_train, cancer_test),
    "alzheimer": (alzheimer_train, alzheimer_test)
}

In [77]:
from sklearn.model_selection import train_test_split

# Training sets for 25%, 50%, 75%
heart_train25, _ = train_test_split(heart_train, train_size=0.25, random_state=42, stratify=heart_train.iloc[:, -1])
heart_train50, _ = train_test_split(heart_train, train_size=0.50, random_state=42, stratify=heart_train.iloc[:, -1])
heart_train75, _ = train_test_split(heart_train, train_size=0.75, random_state=42, stratify=heart_train.iloc[:, -1])

diabetes_train25, _ = train_test_split(diabetes_train, train_size=0.25, random_state=42, stratify=diabetes_train.iloc[:, -1])
diabetes_train50, _ = train_test_split(diabetes_train, train_size=0.50, random_state=42, stratify=diabetes_train.iloc[:, -1])
diabetes_train75, _ = train_test_split(diabetes_train, train_size=0.75, random_state=42, stratify=diabetes_train.iloc[:, -1])

cancer_train25, _ = train_test_split(cancer_train, train_size=0.25, random_state=42, stratify=cancer_train.iloc[:, -1])
cancer_train50, _ = train_test_split(cancer_train, train_size=0.50, random_state=42, stratify=cancer_train.iloc[:, -1])
cancer_train75, _ = train_test_split(cancer_train, train_size=0.75, random_state=42, stratify=cancer_train.iloc[:, -1])

alzheimer_train25, _ = train_test_split(alzheimer_train, train_size=0.25, random_state=42, stratify=alzheimer_train.iloc[:, -1])
alzheimer_train50, _ = train_test_split(alzheimer_train, train_size=0.50, random_state=42, stratify=alzheimer_train.iloc[:, -1])
alzheimer_train75, _ = train_test_split(alzheimer_train, train_size=0.75, random_state=42, stratify=alzheimer_train.iloc[:, -1])


datasets25 = {
    "heart": (heart_train25, heart_test),
    "diabetes": (diabetes_train25, diabetes_test),
    "cancer": (cancer_train25, cancer_test),
    "alzheimer": (alzheimer_train25, alzheimer_test)
}

datasets50 = {
    "heart": (heart_train50, heart_test),
    "diabetes": (diabetes_train50, diabetes_test),
    "cancer": (cancer_train50, cancer_test),
    "alzheimer": (alzheimer_train50, alzheimer_test)
}

datasets75 = {
    "heart": (heart_train75, heart_test),
    "diabetes": (diabetes_train75, diabetes_test),
    "cancer": (cancer_train75, cancer_test),
    "alzheimer": (alzheimer_train75, alzheimer_test)
}

## 1. Uniform Random Search

In [78]:
# Grid of hyperparameters
param_dist = {
    'ccp_alpha': uniform(0, 0.1),
    'max_depth': randint(1, 30),
    'min_samples_leaf': randint(1, 60),
    'min_samples_split': randint(2, 60)
}

In [79]:
all_results = []

for name, (train, test) in datasets25.items():
    print(f"Training: {name}")
    
    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    # DecisionTreeClassifier Model
    dt = DecisionTreeClassifier(random_state=42)

    # Random search
    random_search = RandomizedSearchCV(
        estimator=dt,
        param_distributions=param_dist,
        n_iter=100,
        scoring='roc_auc',
        cv=3,
        random_state=42,
        n_jobs=1
    )
    random_search.fit(X_train, y_train)

    # Adding the results to a new data frame
    cv_results = pd.DataFrame(random_search.cv_results_)

    # Training on test sets
    for i, params in enumerate(random_search.cv_results_['params']):
        model = DecisionTreeClassifier(random_state=42, **params)
        model.fit(X_train, y_train)
        y_proba = model.predict_proba(X_test)[:, 1]
        test_auc = roc_auc_score(y_test, y_proba)

        # Adding the results
        all_results.append({
            "dataset": name,
            "params": params,
            "cv_roc_auc": cv_results.loc[i, 'mean_test_score'],
            "test_roc_auc": test_auc
        })

results_df = pd.DataFrame(all_results)

Training: heart
Training: diabetes
Training: cancer
Training: alzheimer


In [80]:
#Summary
for dataset in datasets25.keys():
    dataset_results = results_df[results_df['dataset'] == dataset]
    best_idx = dataset_results['test_roc_auc'].idxmax()
    best_result = dataset_results.loc[best_idx]
    
    print(f"\n{dataset.upper()}:")
    print(f"  Best test AUC: {best_result['test_roc_auc']:.4f}")
    print(f"  CV AUC: {best_result['cv_roc_auc']:.4f}")
    print(f"  Parameters: {best_result['params']}")


HEART:
  Best test AUC: 0.7730
  CV AUC: 0.7388
  Parameters: {'ccp_alpha': 0.006505159298527952, 'max_depth': 4, 'min_samples_leaf': 25, 'min_samples_split': 15}

DIABETES:
  Best test AUC: 0.7517
  CV AUC: 0.7940
  Parameters: {'ccp_alpha': 0.006505159298527952, 'max_depth': 4, 'min_samples_leaf': 25, 'min_samples_split': 15}

CANCER:
  Best test AUC: 0.7782
  CV AUC: 0.8065
  Parameters: {'ccp_alpha': 0.0005061583846218687, 'max_depth': 14, 'min_samples_leaf': 18, 'min_samples_split': 3}

ALZHEIMER:
  Best test AUC: 0.8266
  CV AUC: 0.8255
  Parameters: {'ccp_alpha': 0.009310276780589922, 'max_depth': 12, 'min_samples_leaf': 5, 'min_samples_split': 38}


In [81]:
# Visualizing the decision tree to inspect its structure and ensure that there are no unexpected splits
# test = pd.read_csv("Data/alzheimer_test.csv")
# train = pd.read_csv("Data/alzheimer_train.csv")
# X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
# X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
# model = DecisionTreeClassifier(ccp_alpha=0.003919,
#     max_depth=25,
#     min_samples_leaf=21,
#     min_samples_split = 58)
# model.fit(X_train, y_train)
# y_proba = model.predict_proba(X_test)[:, 1]
# test_auc = roc_auc_score(y_test, y_proba)

# plot_tree(model)

In [82]:
# Visualizing the decision tree to inspect its structure and ensure that there are no unexpected splits
# test = pd.read_csv("Data/alzheimer_test.csv")
# train = pd.read_csv("Data/alzheimer_train.csv")
# X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
# X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
# model = DecisionTreeClassifier(ccp_alpha=0.006505,
#     max_depth=4,
#     min_samples_leaf=25,
#     min_samples_split = 15)
# model.fit(X_train, y_train)
# y_proba = model.predict_proba(X_test)[:, 1]
# test_auc = roc_auc_score(y_test, y_proba)

# plot_tree(model)

In [83]:
results_df

Unnamed: 0,dataset,params,cv_roc_auc,test_roc_auc
0,heart,"{'ccp_alpha': 0.03745401188473625, 'max_depth'...",0.742681,0.771595
1,heart,"{'ccp_alpha': 0.07796910002727693, 'max_depth'...",0.600931,0.725127
2,heart,"{'ccp_alpha': 0.015599452033620266, 'max_depth...",0.746107,0.771595
3,heart,"{'ccp_alpha': 0.03337086111390219, 'max_depth'...",0.742681,0.771595
4,heart,"{'ccp_alpha': 0.0020584494295802446, 'max_dept...",0.742681,0.771595
...,...,...,...,...
395,alzheimer,"{'ccp_alpha': 0.029529058841893874, 'max_depth...",0.755971,0.776618
396,alzheimer,"{'ccp_alpha': 0.0697015740995268, 'max_depth':...",0.500000,0.500000
397,alzheimer,"{'ccp_alpha': 0.05528199769079078, 'max_depth'...",0.650423,0.609982
398,alzheimer,"{'ccp_alpha': 0.08101133946791808, 'max_depth'...",0.500000,0.500000


In [84]:
# Finding the best hyperparameters for each dataset
best_per_dataset = (
    results_df.sort_values(by=["dataset", "test_roc_auc"], ascending=[True, False]).groupby("dataset", as_index=False).first()
)

params_df = best_per_dataset["params"].apply(pd.Series)

# Finding new set of hyperparameters as a mean from the best results for each dataset
mean_params = params_df.mean()
mean_params_dict = mean_params.to_dict()

# Rounding to integer values
for param in ["max_depth", "min_samples_leaf", "min_samples_split"]:
    mean_params_dict[param] = int(round(mean_params_dict[param]))
mean_results = []

# Training with new hyperparameters on all datasets and evaluating on tests
for name, (train, test) in datasets25.items():

    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    model = DecisionTreeClassifier(random_state=42, **mean_params_dict)
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    mean_auc = roc_auc_score(y_test, y_proba)

    mean_results.append({
        "dataset": name,
        # Star is a mean from the best 4 sets of hyperparameters from each dataset
        "star_test_roc_auc": mean_auc
    })

In [85]:
# Creating a final dataframe for all of the results

# Splitting all hyperparaemters to separate columns
params_df = results_df['params'].apply(pd.Series)
results_df = pd.concat([results_df.drop('params', axis=1), params_df], axis=1)

results_col = results_df[['cv_roc_auc','test_roc_auc']]
results_df = pd.concat([results_df.drop(['cv_roc_auc','test_roc_auc'],axis=1),results_col], axis=1)

mean_df = pd.DataFrame(mean_results)
results_df = results_df.merge(mean_df, on="dataset")
results_df["diff_from_star"] = results_df["star_test_roc_auc"] - results_df["test_roc_auc"]

In [86]:
mean_df

Unnamed: 0,dataset,star_test_roc_auc
0,heart,0.767484
1,diabetes,0.725672
2,cancer,0.750324
3,alzheimer,0.78015


In [87]:
results_df

Unnamed: 0,dataset,ccp_alpha,max_depth,min_samples_leaf,min_samples_split,cv_roc_auc,test_roc_auc,star_test_roc_auc,diff_from_star
0,heart,0.037454,29.0,15.0,44.0,0.742681,0.771595,0.767484,-0.004112
1,heart,0.077969,21.0,39.0,59.0,0.600931,0.725127,0.767484,0.042357
2,heart,0.015599,11.0,11.0,25.0,0.746107,0.771595,0.767484,-0.004112
3,heart,0.033371,8.0,24.0,4.0,0.742681,0.771595,0.767484,-0.004112
4,heart,0.002058,2.0,24.0,45.0,0.742681,0.771595,0.767484,-0.004112
...,...,...,...,...,...,...,...,...,...
395,alzheimer,0.029529,26.0,17.0,41.0,0.755971,0.776618,0.780150,0.003531
396,alzheimer,0.069702,11.0,54.0,49.0,0.500000,0.500000,0.780150,0.280150
397,alzheimer,0.055282,10.0,55.0,27.0,0.650423,0.609982,0.780150,0.170168
398,alzheimer,0.081011,25.0,24.0,14.0,0.500000,0.500000,0.780150,0.280150


In [88]:
results_df.to_csv("Results/decisiontree_uniform_25.csv", index=False)

In [89]:
# Creating a short summary dataset
# results_df =  pd.read_csv("Results/decisiontree_uniform.csv")

# Best parameters for each dataset
best_per_dataset = (
    results_df.sort_values(by=["dataset", "test_roc_auc"], ascending=[True, False])
    .groupby("dataset", as_index=False)
    .first()
    .drop(['cv_roc_auc', 'diff_from_star'], axis=1)
)

In [90]:
# Deafault model

default_results = []
for name, (train, test) in datasets25.items():

    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    model = DecisionTreeClassifier(random_state=42, )
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_proba)

    default_results.append({
        "dataset": name,
        "default_test_roc_auc": score
    })

default_df = pd.DataFrame(default_results)
summary_df = best_per_dataset.merge(default_df, on="dataset")

In [91]:
# STAR row
mean_row = {
    "dataset": "STAR",
    **mean_params_dict,
    # "ccp_alpha" :None, "max_depth": None, "min_samples_leaf": None, "min_samples_split": None,
    "test_roc_auc": None,
    "star_test_roc_auc": mean_df["star_test_roc_auc"].mean(),
    #"star_test_roc_auc": None,
    "default_test_roc_auc": None
}

summary_df = pd.concat([summary_df, pd.DataFrame([mean_row])], ignore_index=True)
summary_df

  summary_df = pd.concat([summary_df, pd.DataFrame([mean_row])], ignore_index=True)


Unnamed: 0,dataset,ccp_alpha,max_depth,min_samples_leaf,min_samples_split,test_roc_auc,star_test_roc_auc,default_test_roc_auc
0,alzheimer,0.00931,12.0,5.0,38.0,0.826618,0.78015,0.736267
1,cancer,0.000506,14.0,18.0,3.0,0.778173,0.750324,0.65916
2,diabetes,0.006505,4.0,25.0,15.0,0.751701,0.725672,0.646806
3,heart,0.006505,4.0,25.0,15.0,0.773011,0.767484,0.63667
4,STAR,0.005707,8.0,18.0,18.0,,0.755907,


In [92]:
summary_df.to_csv("Results/decisiontree_uniform_summary_25.csv", index=False)

## 2. Bayesian Optimalization

In [93]:
# Search space for bayesian optimalization
search_spaces = {
    'ccp_alpha': Real(0.0, 0.1, prior='uniform'),
    'max_depth': Integer(1, 30),
    'min_samples_leaf': Integer(1, 60),
    'min_samples_split': Integer(2, 60)
}

In [94]:
all_results_2 = []

for name, (train, test) in datasets25.items():
    print(f"Training: {name}")
    
    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    # DecisionTreeClassifier Model
    dt = DecisionTreeClassifier(random_state=42)

    # Bayesian search
    bayes_search = BayesSearchCV(
        estimator=dt,
        search_spaces=search_spaces,
        n_iter=100,
        scoring='roc_auc',
        cv=3,
        random_state=42,
        n_jobs=1
    )
    bayes_search.fit(X_train, y_train)

    # Adding the results to a new data frame
    cv_results = pd.DataFrame(bayes_search.cv_results_)

    # Training on test sets
    for i, params in enumerate(bayes_search.cv_results_['params']):
        model = DecisionTreeClassifier(random_state=42, **params)
        model.fit(X_train, y_train)
        y_proba = model.predict_proba(X_test)[:, 1]
        test_auc = roc_auc_score(y_test, y_proba)

        # Adding the results
        all_results_2.append({
            "dataset": name,
            "params": params,
            "cv_roc_auc": cv_results.loc[i, 'mean_test_score'],
            "test_roc_auc": test_auc
        })

results_2_df = pd.DataFrame(all_results_2)

Training: heart
Training: diabetes




Training: cancer
Training: alzheimer




In [95]:
# Summary
for dataset in datasets25.keys():
    dataset_results = results_2_df[results_2_df['dataset'] == dataset]
    best_idx = dataset_results['test_roc_auc'].idxmax()
    best_result = dataset_results.loc[best_idx]
    
    print(f"\n{dataset.upper()}:")
    print(f"  Best test AUC: {best_result['test_roc_auc']:.4f}")
    print(f"  CV AUC: {best_result['cv_roc_auc']:.4f}")
    print(f"  Parameters: {best_result['params']}")


HEART:
  Best test AUC: 0.7730
  CV AUC: 0.7415
  Parameters: OrderedDict([('ccp_alpha', 0.0), ('max_depth', 20), ('min_samples_leaf', 22), ('min_samples_split', 60)])

DIABETES:
  Best test AUC: 0.7527
  CV AUC: 0.7901
  Parameters: OrderedDict([('ccp_alpha', 0.0), ('max_depth', 30), ('min_samples_leaf', 27), ('min_samples_split', 2)])

CANCER:
  Best test AUC: 0.8162
  CV AUC: 0.8035
  Parameters: OrderedDict([('ccp_alpha', 0.0), ('max_depth', 25), ('min_samples_leaf', 19), ('min_samples_split', 2)])

ALZHEIMER:
  Best test AUC: 0.8218
  CV AUC: 0.8333
  Parameters: OrderedDict([('ccp_alpha', 0.009182779384957656), ('max_depth', 23), ('min_samples_leaf', 1), ('min_samples_split', 43)])


In [96]:
# Finding the best hyperparameters for each dataset
best_per_dataset_2 = (
    results_2_df.sort_values(by=["dataset", "test_roc_auc"], ascending=[True, False]).groupby("dataset", as_index=False).first()
)

params_2_df = best_per_dataset_2["params"].apply(pd.Series)

In [97]:
# Creating a final dataframe for all of the results

# Splitting all hyperparaemters to separate columns
params_2_df = results_2_df['params'].apply(pd.Series)
results_2_df = pd.concat([results_2_df.drop('params', axis=1), params_2_df], axis=1)

results_col_2 = results_2_df[['cv_roc_auc','test_roc_auc']]
results_2_df = pd.concat([results_2_df.drop(['cv_roc_auc','test_roc_auc'],axis=1),results_col_2], axis=1)

In [98]:
results_2_df.to_csv("Results/decisiontree_bayes_25.csv", index=False)

In [99]:
# Best parameters for each dataset
best_per_dataset_2 = (
    results_2_df.sort_values(by=["dataset", "test_roc_auc"], ascending=[True, False])
    .groupby("dataset", as_index=False)
    .first()
    .drop(['cv_roc_auc'], axis=1)
)

summary_2_df = best_per_dataset_2.merge(default_df, on="dataset")

In [100]:
summary_2_df.to_csv("Results/decisiontree_bayes_summary_25.csv", index=False)

In [101]:
summary_2_df

Unnamed: 0,dataset,ccp_alpha,max_depth,min_samples_leaf,min_samples_split,test_roc_auc,default_test_roc_auc
0,alzheimer,0.009183,23.0,1.0,43.0,0.821809,0.736267
1,cancer,0.0,25.0,19.0,2.0,0.816231,0.65916
2,diabetes,0.0,30.0,27.0,2.0,0.752716,0.646806
3,heart,0.0,20.0,22.0,60.0,0.773011,0.63667
