# Model Training

In [91]:
from sklearn.model_selection import cross_val_score, train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import joblib

In [92]:
# ---- LOAD DATASET ----

df = pd.read_csv("../data/features/filtered_labeled_feature_matrix.csv")
X = df.drop(columns=["performance_class"])
y = df["performance_class"]

# ---- SPLIT DATASET ----

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Save split datasets for hyperparameter tuning for consistent splits.
joblib.dump(X_train, "../data/splits/X_train.pkl")
joblib.dump(X_test, "../data/splits/X_test.pkl")
joblib.dump(y_train, "../data/splits/y_train.pkl")
joblib.dump(y_test, "../data/splits/y_test.pkl")

print("Train/test split saved successfully!")

# ---- DEFINE MODELS ----

models = {
    #"Logistic Regression": LogisticRegression(),
    #"RF": RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=5, min_samples_leaf=4, random_state=42, max_features="sqrt"),
    "MLP": MLPClassifier(hidden_layer_sizes=(32, 16, 8), solver="adam", activation="relu", max_iter=1000, learning_rate_init=0.01),
    #"SVM": SVC(probability=True),
    #"kNN": KNeighborsClassifier(n_neighbors=5),
    #"Naive Bayes": GaussianNB()
}


Train/test split saved successfully!


In [94]:
# ---- MODEL TRAINING ----

k_default = 5
cv_results = {}
for name, model in models.items():
    for fs_method in ["ANOVA", "SFS"]:
        
        selector = None
        
        if fs_method == "ANOVA":
            selector = SelectKBest(score_func=f_classif, k=k_default)
        elif fs_method == "SFS":
            selector = SequentialFeatureSelector(model, n_features_to_select=k_default)

        pipeline = Pipeline([
            ("feature_selection", selector),
            ("classifier", model)
        ])

        # Perform cross-validation
        scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring="roc_auc", n_jobs=-1)
        
        # Fit Pipeline on Full Dataset (to check train accuracy)
        pipeline.fit(X_train, y_train)
        
        # ---- Evaluate on Test Set ----
        y_test_probs = pipeline.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC
        test_auc = roc_auc_score(y_test, y_test_probs)
        
        # Selected feature names
        feature_names = X_train.columns
        selected_feature_indices = []

        if fs_method == "ANOVA":
            selected_feature_indices = selector.get_support(indices=True)
        if fs_method == "SFS":
            selected_feature_indices = selector.get_support(indices=True)
            
        selected_features = [f"{feature_names[i]}({i})" for i in selected_feature_indices]
        
        cv_results[f"{name} ({fs_method})"] = {
            "CV_ROC_AUC": scores.mean(),
            "Test_ROC_AUC": test_auc,
            "Features": selected_features
        }

        print(f"{fs_method} Selected Features:", selected_features)
        print(f"{name} with {fs_method}: CV ROC_AUC = {scores.mean():.4f}, Test ROC_AUC = {test_auc:.4f}")

ANOVA Selected Features: ['HRV_pNN50(1)', 'HRV_pNN20(3)', 'EDA_Tonic(7)', 'HRV_pNN20_norm(10)', 'EDA_Tonic_norm(12)']
MLP with ANOVA: CV ROC_AUC = 0.8355, Test ROC_AUC = 0.8369
SFS Selected Features: ['HRV_pNN20(3)', 'HRV_minNN(5)', 'EDA_Tonic(7)', 'SCR_Onsets(8)', 'HRV_minNN_norm(11)']
MLP with SFS: CV ROC_AUC = 0.7718, Test ROC_AUC = 0.8180


In [95]:
# ---- SAVE RESULTS FOR COMPARISONS ----

cv_results_df = pd.DataFrame.from_dict(cv_results, orient="index").reset_index()
cv_results_df.columns = ["Model", "CV_ROC_AUC", "Test_ROC_AUC", "Features"]
cv_results_df.to_csv("../data/results/initial_training_results.csv", index=False)

print("Cross-Validation completed. Results saved.")

Cross-Validation completed. Results saved.
