## Hyperparameter Tuning

### Imports

In [11]:
import joblib
import datetime
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

## Load the training data

In [12]:
# Load split datasets
X_train = joblib.load("../data/splits/X_train.pkl")
X_test = joblib.load("../data/splits/X_test.pkl")
y_train = joblib.load("../data/splits/y_train.pkl")
y_test = joblib.load("../data/splits/y_test.pkl")

print("Train/test split loaded successfully!")

Train/test split loaded successfully!


## Models and Search Parameter Grids

In [13]:
# ---- DEFINE BASE MODELS ----

models = {
    "MLP": MLPClassifier(max_iter=5000),
    #"RF": RandomForestClassifier(random_state=42),
}

# ---- DEFINE HYPERPARAMETER GRIDS ----

param_grids_anova = {
    "MLP": {
        "feature_selection__k": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],  # k
        "classifier__hidden_layer_sizes": [(32, 16, 8), (64, 32, 16), (128, 64, 32)],
        "classifier__learning_rate_init": [0.001, 0.003, 0.01, 0.03, 0.1],
        "classifier__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] # L2 penalty (regularization term) parameter
    }
}

param_grids_sfs = {
    "MLP": {
        "feature_selection__n_features_to_select": [2, 4, 7, 10],  # n_features_to_select
        "classifier__hidden_layer_sizes": [(32, 16, 8), (64, 32, 16), (128, 64, 32)],
        "classifier__learning_rate_init": [0.001, 0.003, 0.01, 0.03, 0.1],
        "classifier__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    }
}

## Hyperparameter Tuning

In [14]:
# ---- HYPERPARAMETER TUNING ----

best_models = {}
results = []

for name, model in models.items():
    print(f"Tuning hyperparameters for {name}...")

    for fs_method in ["ANOVA", "SFS"]:
       # Define Feature Selection
        if fs_method == "ANOVA":
            selector = SelectKBest(score_func=f_classif)
            param_grid = param_grids_anova[name]
        else:
            selector = SequentialFeatureSelector(model)
            param_grid = param_grids_sfs[name]

        # Create pipeline
        pipeline = Pipeline([
            ("feature_selection", selector),
            ("classifier", model)
        ])
       
        if fs_method == "ANOVA":
            search = GridSearchCV(pipeline, param_grid, cv=5, scoring="roc_auc", n_jobs=-1)
        else:
            search = RandomizedSearchCV(pipeline, param_grid, n_iter=10, cv=5, scoring="roc_auc", n_jobs=-1, random_state=42)  
           

        search.fit(X_train, y_train)
       
        # Extract best model
        best_model = search.best_estimator_
        best_score = search.best_score_
        best_params = search.best_params_

        selected_features = X_train.columns[best_model.named_steps["feature_selection"].get_support()]
        selected_features_str = ", ".join(selected_features)  # Convert to string
       
        results.append({
            "Model": name,
            "Feature Selection": fs_method,
            "ROC AUC Score": best_score,
            "Selected Features": selected_features_str,
            "Best Params": best_params
        })

        # Store best model
        best_models[f"{name} ({fs_method})"] = search.best_estimator_

        print(f"Best {name} ({fs_method}) ROC_AUC: {best_score:.4f}")
        print(f"Best Params: {best_params}")
        print(f"Selected Features: {selected_features_str}\n")



Tuning hyperparameters for MLP...
Best MLP (ANOVA) ROC_AUC: 0.8786
Best Params: {'classifier__alpha': 0.001, 'classifier__hidden_layer_sizes': (32, 16, 8), 'classifier__learning_rate_init': 0.01, 'feature_selection__k': 12}
Selected Features: HRV_MeanNN, HRV_pNN50, HRV_LFHF, HRV_pNN20, HRV_maxNN, HRV_madNN, EDA_Tonic, SCR_Onsets, HRV_pNN50_norm, HRV_pNN20_norm, HRV_minNN_norm, EDA_Tonic_norm

Best MLP (SFS) ROC_AUC: 0.8408
Best Params: {'feature_selection__n_features_to_select': 10, 'classifier__learning_rate_init': 0.001, 'classifier__hidden_layer_sizes': (64, 32, 16), 'classifier__alpha': 0.01}
Selected Features: HRV_MeanNN, HRV_pNN50, HRV_LFHF, HRV_pNN20, HRV_maxNN, HRV_madNN, EDA_Tonic, SCR_Onsets, HRV_pNN20_norm, EDA_Tonic_norm



## Save Data

In [15]:
# ---- SAVE BEST MODELS ----

for model_name, model in best_models.items():
    joblib.dump(model, f"../models/{model_name.replace(' ', '_').replace('(', '').replace(')', '')}.pkl")

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)

filename = f"../data/results/hyperparameter_tuning_results_{datetime.datetime.now().strftime("%d-%m-%Y-%H-%M")}.csv"
results_df.to_csv(filename, index=False)

print("Results and best performing models saved for future evaluation.")

Results and best performing models saved for future evaluation.
