# Model Training

In [57]:
from sklearn.model_selection import cross_val_score, train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [58]:
# ---- LOAD DATASET ----

df = pd.read_csv("../data/features/filtered_labeled_feature_matrix.csv")
X = df.drop(columns=["performance_class"])
y = df["performance_class"]

# ---- SPLIT DATASET ----
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# ---- DEFINE MODELS ----

models = {
    #"Logistic Regression": LogisticRegression(),
    #"RF": RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=5, min_samples_leaf=4, random_state=42, max_features="sqrt"),
    "MLP": MLPClassifier(hidden_layer_sizes=(32, 16, 8), solver="adam", activation="relu", max_iter=1000, learning_rate_init=0.01),
    #"SVM": SVC(probability=True),
    #"kNN": KNeighborsClassifier(n_neighbors=5),
    #"Naive Bayes": GaussianNB()
}


In [59]:
# ---- FEATURE SELECTION METHODS ----

k_default = 5

In [60]:
# ---- MODEL TRAINING ----

cv_results = {}
for name, model in models.items():

    for fs_method in ["ANOVA", "SFS"]:
        
        selector = None
        
        if fs_method == "ANOVA":
            selector = SelectKBest(score_func=f_classif, k=k_default)
        elif fs_method == "SFS":
            selector = SequentialFeatureSelector(model, n_features_to_select=k_default)


        pipeline = Pipeline([
            ("feature_selection", selector),
            ("classifier", model)
        ])


        # Perform cross-validation
        scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring="roc_auc", n_jobs=-1)
        cv_results[f"{name} ({fs_method})"] = scores.mean()
        
         # Fit Pipeline on Full Dataset (to check train accuracy)
        pipeline.fit(X_train, y_train)
        # ---- Evaluate on Test Set ----
        y_test_probs = pipeline.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC
        test_auc = roc_auc_score(y_test, y_test_probs)
        
        if fs_method == "ANOVA":
            print("ANOVA Selected Features:", selector.get_support(indices=True))
        if fs_method == "SFS":
            print("SFS Selected Features:", selector.get_support(indices=True))

        print(f"{name} with {fs_method}: CV ROC_AUC = {scores.mean():.4f}, Test ROC_AUC = {test_auc:.4f}")

ANOVA Selected Features: [ 1  3  7 10 12]
MLP with ANOVA: CV ROC_AUC = 0.8662, Test ROC_AUC = 0.8123
SFS Selected Features: [ 0  1  7  9 12]
MLP with SFS: CV ROC_AUC = 0.8001, Test ROC_AUC = 0.8115


In [61]:
# ---- SAVE RESULTS FOR COMPARISONS ----

cv_results_df = pd.DataFrame(cv_results.items(), columns=["Model", "ROC_AUC"])
cv_results_df.to_csv("../data/results/cv_results.csv", index=False)

print("Cross-Validation completed. Results saved.")

Cross-Validation completed. Results saved.
