## Hyperparameter Tuning

### Imports

In [6]:
import joblib
import datetime
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from itertools import product
import time

## Load the training data

In [7]:
# Load split datasets
X_train = joblib.load("../data/splits/X_train.pkl")
X_test = joblib.load("../data/splits/X_test.pkl")
y_train = joblib.load("../data/splits/y_train.pkl")
y_test = joblib.load("../data/splits/y_test.pkl")

print("Train/test split loaded successfully!")

Train/test split loaded successfully!


## Models

In [8]:
# ---- DEFINE BASE MODELS ----

models = {
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True),
    "MLP2": MLPClassifier(hidden_layer_sizes=(2,), max_iter=5000, random_state=42),
    "MLP4": MLPClassifier(hidden_layer_sizes=(4,), max_iter=5000, random_state=42),
    "MLP6": MLPClassifier(hidden_layer_sizes=(6,), max_iter=5000, random_state=42),
    "MLP8": MLPClassifier(hidden_layer_sizes=(8,), max_iter=5000, random_state=42),
    "3NN": KNeighborsClassifier(n_neighbors=3),
    "5NN": KNeighborsClassifier(n_neighbors=5),
    "10NN": KNeighborsClassifier(n_neighbors=10),
    "15NN": KNeighborsClassifier(n_neighbors=15),
}

## Search Parameter Grids

In [9]:
# ---- DEFINE HYPERPARAMETER GRIDS ----

# Feature space dimension
n_features = X_train.shape[1]

k_values = list(range(2, n_features, 2))
print (k_values)

param_grids_anova = {
    "RF": {
        "feature_selection__k": k_values,  # k
        "classifier__n_estimators": [50, 100, 200],  # Number of trees
        "classifier__max_depth": [5, 10, 20],  # Tree depth
        "classifier__min_samples_split": [2, 5, 10],  # Minimum samples per split
        "classifier__min_samples_leaf": [1, 2, 4],  # Minimum leaf nodes
        "classifier__max_features": ["sqrt", "log2"]  # Feature subset size at each split
    },
    "MLP2": {
        "feature_selection__k": k_values,  # k
        "classifier__learning_rate_init": [0.001, 0.003, 0.01, 0.03, 0.1],
        "classifier__batch_size": [16, 32, 64, 128],
        "classifier__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] # L2 penalty (regularization term) parameter
    },
    "MLP4": {
        "feature_selection__k": k_values,  # k
        "classifier__learning_rate_init": [0.001, 0.003, 0.01, 0.03, 0.1],
        "classifier__batch_size": [16, 32, 64, 128],
        "classifier__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] # L2 penalty (regularization term) parameter
    },
    "MLP6": {
        "feature_selection__k": k_values,  # k
        "classifier__learning_rate_init": [0.001, 0.003, 0.01, 0.03, 0.1],
        "classifier__batch_size": [16, 32, 64, 128],
        "classifier__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] # L2 penalty (regularization term) parameter
    },
    "MLP8": {
        "feature_selection__k": k_values,  # k
        "classifier__learning_rate_init": [0.001, 0.003, 0.01, 0.03, 0.1],
        "classifier__batch_size": [16, 32, 64, 128],
        "classifier__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] # L2 penalty (regularization term) parameter
    },
    "SVM": {
        "feature_selection__k": k_values,  # k
        "classifier__C": [0.1, 1, 10, 100],  # Regularization parameter
        "classifier__kernel": ["linear", "rbf", "poly"],  # Kernel types
        "classifier__gamma": ["scale", "auto"]  # Kernel coefficient
    },
    "3NN": {
        "feature_selection__k": k_values,  # k
        "classifier__weights": ["uniform", "distance"],  # Weight function
        "classifier__metric": ["euclidean", "manhattan", "minkowski"]  # Distance metric
    },
    "5NN": {
        "feature_selection__k": k_values,  # k
        "classifier__weights": ["uniform", "distance"],  # Weight function
        "classifier__metric": ["euclidean", "manhattan", "minkowski"]  # Distance metric
    },
    "10NN": {
        "feature_selection__k": k_values,  # k
        "classifier__weights": ["uniform", "distance"],  # Weight function
        "classifier__metric": ["euclidean", "manhattan", "minkowski"]  # Distance metric
    },
    "15NN": {
        "feature_selection__k": k_values,  # k
        "classifier__weights": ["uniform", "distance"],  # Weight function
        "classifier__metric": ["euclidean", "manhattan", "minkowski"]  # Distance metric
    },
}

param_grids_sfs = {
    "RF": {
        "feature_selection__n_features_to_select": k_values,  # n_features_to_select  
        "classifier__n_estimators": [50, 100, 200],  
        "classifier__max_depth": [5, 7, 10, 20],  
        "classifier__min_samples_split": [2, 5, 10],  
        "classifier__min_samples_leaf": [1, 2, 4],
        "classifier__max_features": ["sqrt", "log2", None]  # Feature subset size at each split
    },
    "MLP2": {
        "feature_selection__n_features_to_select": k_values,  # n_features_to_select
        "classifier__batch_size": [16, 32, 64, 128],
        "classifier__learning_rate_init": [0.001, 0.003, 0.01, 0.03, 0.1],
        "classifier__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    },
    "MLP4": {
        "feature_selection__n_features_to_select": k_values,  # n_features_to_select
        "classifier__batch_size": [16, 32, 64, 128],
        "classifier__learning_rate_init": [0.001, 0.003, 0.01, 0.03, 0.1],
        "classifier__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    },
    "MLP6": {
        "feature_selection__n_features_to_select": k_values,  # n_features_to_select
        "classifier__batch_size": [16, 32, 64, 128],
        "classifier__learning_rate_init": [0.001, 0.003, 0.01, 0.03, 0.1],
        "classifier__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    },
    "MLP8": {
        "feature_selection__n_features_to_select": k_values,  # n_features_to_select
        "classifier__batch_size": [16, 32, 64, 128],
        "classifier__learning_rate_init": [0.001, 0.003, 0.01, 0.03, 0.1],
        "classifier__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    },
    "SVM": {
        "feature_selection__n_features_to_select": k_values,  # n_features_to_select  
        "classifier__C": [0.1, 1, 10, 100],  
        "classifier__kernel": ["linear", "rbf", "poly"],  
        "classifier__gamma": ["scale", "auto"]  
    },
    "3NN": {
        "feature_selection__n_features_to_select": k_values,  # n_features_to_select  
        "classifier__weights": ["uniform", "distance"],  
        "classifier__metric": ["euclidean", "manhattan", "minkowski"]  
    },
    "5NN": {
        "feature_selection__n_features_to_select": k_values,  # n_features_to_select  
        "classifier__weights": ["uniform", "distance"],  
        "classifier__metric": ["euclidean", "manhattan", "minkowski"]  
    },
    "10NN": {
        "feature_selection__n_features_to_select": k_values,  # n_features_to_select  
        "classifier__weights": ["uniform", "distance"],  
        "classifier__metric": ["euclidean", "manhattan", "minkowski"]  
    },
    "15NN": {
        "feature_selection__n_features_to_select": k_values,  # n_features_to_select  
        "classifier__weights": ["uniform", "distance"],  
        "classifier__metric": ["euclidean", "manhattan", "minkowski"]  
    },
}

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24]


## Hyperparameter Tuning

In [10]:
# ---- HYPERPARAMETER TUNING ----

best_models = {}
results = []

for name, model in models.items():
    print(f"Tuning hyperparameters for {name}...")

    for fs_method in ["ANOVA", "SFS"]:
        # Define Feature Selection
        if fs_method == "ANOVA":
            selector = SelectKBest(score_func=f_classif)
            param_grid = param_grids_anova[name]
            SearchClass = GridSearchCV
        else:
            selector = SequentialFeatureSelector(model)
            param_grid = param_grids_sfs[name]
            SearchClass = RandomizedSearchCV  # We'll use RandomizedSearch for SFS

        # Create pipeline
        pipeline = Pipeline([
            ("feature_selection", selector),
            ("classifier", model)
        ])

        # Build the search
        if fs_method == "ANOVA":
            # GridSearchCV
            search = SearchClass(pipeline, param_grid, cv=10, scoring="roc_auc", n_jobs=-1)
        else:
            # RandomizedSearchCV
            total_combinations = len(list(product(*param_grid.values())))
            n_iter_adjusted = min(20, total_combinations)
            search = SearchClass(
                pipeline, param_grid, 
                n_iter=n_iter_adjusted, cv=10, scoring="roc_auc", 
                n_jobs=-1, random_state=42
            )
        
        # ---- Measure hyperparameter search time ----
        start_search = time.time()
        search.fit(X_train, y_train)
        end_search = time.time()
        search_time = end_search - start_search

        # Extract best model
        start_refit = time.time()
        best_model = search.best_estimator_
        end_refit = time.time()
        refit_time = end_refit - start_refit
        best_score = search.best_score_
        best_params = search.best_params_
        
        # For MLPs, get number of epochs to convergence
        mlp_iterations = None
        classifier_step = best_model.named_steps["classifier"]
        if isinstance(classifier_step, MLPClassifier):
            mlp_iterations = classifier_step.n_iter_
        

        # Extract selected features
        selected_features = X_train.columns[best_model.named_steps["feature_selection"].get_support()]
        selected_features_str = ", ".join(selected_features)

        # Store results
        results.append({
            "Model": name,
            "Feature Selection": fs_method,
            "ROC AUC Score": best_score,
            "Selected Features": selected_features_str,
            "Best Params": best_params,
            "Refit Time (s)": refit_time,
            "Search Time (s)": search_time,
            "Convergence": mlp_iterations
        })

        # Store best model
        best_models[f"{name} ({fs_method})"] = best_model

        print(f"Best {name} ({fs_method}) ROC_AUC: {best_score:.4f}")
        print(f"Best Params: {best_params}")
        print(f"Selected Features: {selected_features_str}")
        if mlp_iterations is not None:
            print(f"MLP converged in {mlp_iterations} iterations")
        print(f"Final Refit Time: {refit_time:.2f}s")
        print(f"Search Time: {search_time:.2f} s\n")


Tuning hyperparameters for RF...
Best RF (ANOVA) ROC_AUC: 0.9009
Best Params: {'classifier__max_depth': 20, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200, 'feature_selection__k': 24}
Selected Features: HRV_MeanNN, HRV_pNN50, HRV_LF, HRV_HF, HRV_LFHF, HRV_pNN20, HRV_VLF, HRV_maxNN, HRV_minNN, HRV_prc80NN, HRV_madNN, EDA_Tonic, SCR_Amplitude, SCR_Onsets, SCR_Phasic, HRV_pNN50_norm, HRV_LF_norm, HRV_HF_norm, HRV_pNN20_norm, HRV_minNN_norm, EDA_Tonic_norm, SCR_Amplitude_norm, SCR_Onsets_norm, SCR_Phasic_norm
Final Refit Time: 0.00s
Search Time: 517.88 s

Best RF (SFS) ROC_AUC: 0.8723
Best Params: {'feature_selection__n_features_to_select': 14, 'classifier__n_estimators': 100, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_features': None, 'classifier__max_depth': 10}
Selected Features: HRV_MeanNN, HRV_pNN50, HRV_HF, HRV_LFHF, HRV_pNN20, HRV_minNN, EDA_Tonic,

## Save Data

In [11]:
# ---- SAVE BEST MODELS ----

timestamp = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M")

for model_name, model in best_models.items():
    joblib.dump(model, f"../models/{model_name.replace(' ', '_').replace('(', '').replace(')', '')}_{timestamp}.pkl")

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)

filename = f"../data/results/04_hyperparameter_tuning_results_{timestamp}.csv"
results_df.to_csv(filename, index=False)

print("Results and best performing models saved for future evaluation.")

Results and best performing models saved for future evaluation.
