# Model Training

In [89]:
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [90]:
# ---- LOAD DATASET ----

# Read the labeled feature matrix; every column except "performance_class"
# is used as a predictor, and "performance_class" is the target.
df = pd.read_csv("../data/features/filtered_labeled_feature_matrix.csv")
X = df.drop(columns=["performance_class"])
y = df["performance_class"]

# ---- DEFINE MODELS ----

# Fixed seed so that a fresh "Restart & Run All" yields the same models.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used in the results table.
models = {
    "Logistic Regression": LogisticRegression(),
    # MLP weight initialisation (and batch shuffling) is random; seed it.
    "MLP": MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
    # With probability=True, SVC shuffles data internally to fit its
    # probability estimates; seed that shuffling as well.
    "SVM": SVC(probability=True, random_state=RANDOM_STATE),
    "kNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB()
}


In [91]:
# ---- FEATURE SELECTION METHODS ----

# Number of features each selector keeps.
k_default = 5

# Filter method: keep the k features with the highest ANOVA F-score.
anova_selector = SelectKBest(score_func=f_classif, k=k_default)

# Wrapper method: sequential selection driven by a random forest.
# The forest is seeded so the chosen feature subset (and therefore every
# "SFS" score downstream) is the same on each run instead of varying with
# the forest's random bootstrapping and feature sub-sampling.
sfs_selector = SequentialFeatureSelector(
    RandomForestClassifier(random_state=42),
    n_features_to_select=k_default,
)

In [92]:
# ---- MODEL TRAINING ----

# Feature-selection strategies to combine with every classifier, keyed by
# the short label used in the result names and in the printed report.
selection_strategies = {"ANOVA": anova_selector, "SFS": sfs_selector}

# Maps "<model name> (<selection label>)" to the mean 10-fold CV accuracy.
cv_results = {}

for name, model in models.items():
    for fs_method, selector in selection_strategies.items():
        # Selection happens inside the pipeline, so it is re-fitted on the
        # training portion of every fold.
        pipeline = Pipeline(
            steps=[
                ("feature_selection", selector),
                ("classifier", model),
            ]
        )

        # 10-fold cross-validation scored by accuracy.
        scores = cross_val_score(pipeline, X, y, scoring="accuracy", cv=10)
        mean_accuracy = scores.mean()

        cv_results[f"{name} ({fs_method})"] = mean_accuracy
        print(f"{name} with {fs_method}: Accuracy = {mean_accuracy:.4f}")

Logistic Regression with ANOVA: Accuracy = 0.8000
Logistic Regression with SFS: Accuracy = 0.7500
MLP with ANOVA: Accuracy = 0.9333
MLP with SFS: Accuracy = 0.9667
SVM with ANOVA: Accuracy = 0.8750
SVM with SFS: Accuracy = 0.9417
kNN with ANOVA: Accuracy = 0.8750
kNN with SFS: Accuracy = 0.9417
Naive Bayes with ANOVA: Accuracy = 0.7917
Naive Bayes with SFS: Accuracy = 0.8083


In [93]:
# ---- SAVE RESULTS FOR COMPARISONS ----

results_path = Path("../data/results/cv_results.csv")

# to_csv does not create missing parent directories and would raise OSError
# on a fresh checkout, so make sure the output folder exists first.
results_path.parent.mkdir(parents=True, exist_ok=True)

# One row per "<model> (<selection method>)" entry with its mean CV accuracy.
cv_results_df = pd.DataFrame(list(cv_results.items()), columns=["Model", "Accuracy"])
cv_results_df.to_csv(results_path, index=False)

print("Cross-Validation completed. Results saved.")

Cross-Validation completed. Results saved.
