# Model Training

In [51]:
from sklearn.model_selection import cross_val_score, train_test_split
import datetime
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
import joblib
import time


In [52]:
# ---- LOAD DATASET ----

# Full labelled feature matrix, split into predictors (X) and target (y).
# The training loop works from the pre-computed split loaded below; X and y
# are kept because other cells of the notebook may still reference them.
df = pd.read_csv("../data/features/filtered_labeled_feature_matrix.csv")
X = df.drop(columns=["performance_class"])
y = df["performance_class"]

# ---- LOAD PRE-COMPUTED TRAIN/TEST SPLIT ----

# NOTE(review): joblib.load unpickles the file contents, which can execute
# arbitrary code -- only load split files produced by this project.
X_train = joblib.load("../data/splits/X_train_augmented_best.pkl")
X_test = joblib.load("../data/splits/X_test.pkl")
y_train = joblib.load("../data/splits/y_train_augmented_best.pkl")
y_test = joblib.load("../data/splits/y_test.pkl")

# Fail early if a features file and its labels file are out of sync.
assert len(X_train) == len(y_train), "X_train / y_train length mismatch"
assert len(X_test) == len(y_test), "X_test / y_test length mismatch"

# This cell only reads the split from disk; nothing is written here.
print("Train/test split loaded successfully!")



Train/test split saved successfully!


In [53]:
# ---- DEFINE MODELS ----

# Candidate classifiers, keyed by the short name used in the results table
# and in the saved-model filenames. Every estimator that accepts a
# random_state is seeded with 42 so reruns are comparable.
models = {
    "RF": RandomForestClassifier(random_state=42),
    # probability=True enables predict_proba (used for train/test AUC below).
    # The probability estimates rely on an internal randomised procedure, so
    # the estimator is seeded like the others.
    "SVM": SVC(probability=True, random_state=42),
    "MLP2": MLPClassifier(hidden_layer_sizes=(2,), max_iter=5000, random_state=42),
    "MLP4": MLPClassifier(hidden_layer_sizes=(4,), max_iter=5000, random_state=42),
    "MLP6": MLPClassifier(hidden_layer_sizes=(6,), max_iter=5000, random_state=42),
    "MLP8": MLPClassifier(hidden_layer_sizes=(8,), max_iter=5000, random_state=42),
    "3NN": KNeighborsClassifier(n_neighbors=3),
    "5NN": KNeighborsClassifier(n_neighbors=5),
    "10NN": KNeighborsClassifier(n_neighbors=10),
    "15NN": KNeighborsClassifier(n_neighbors=15),
}


# ---- MODEL TRAINING ----

# Seeded, shuffled 10-fold stratified CV shared by every model.
# NOTE(review): CV runs on X_train as loaded by an earlier cell; if that set
# contains augmented samples derived from rows in other folds, CV scores may
# be optimistic relative to the test set -- confirm.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One result row (dict) per model name.
cv_results = {}

# Computed once so every model file written by this run shares one suffix,
# even if training crosses a minute boundary.
timestamp = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M")

for name, model in models.items():
    # Single-step pipeline: the saved artifact is a Pipeline object.
    pipeline = Pipeline([
        ("classifier", model)
    ])

    # ---- Measure cross-validation time ----
    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments.
    start_cv = time.perf_counter()
    scores = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring="roc_auc", n_jobs=-1)
    cv_time = time.perf_counter() - start_cv  # total CV time (seconds)

    print(scores)

    # ---- Measure final fit time on the full training set ----
    start_fit = time.perf_counter()
    pipeline.fit(X_train, y_train)
    fit_time = time.perf_counter() - start_fit  # final pipeline fit time (seconds)

    # ---- Evaluate on training set (ROC AUC) ----
    # Column 1 of predict_proba is used as the score.
    # NOTE(review): assumes a binary target whose positive class is the second
    # entry of the classifier's classes_ -- confirm.
    y_train_probs = pipeline.predict_proba(X_train)[:, 1]
    train_auc = roc_auc_score(y_train, y_train_probs)

    # ---- Evaluate on test set (ROC AUC and accuracy) ----
    y_test_probs = pipeline.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_probs)

    y_test_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Store results
    cv_results[name] = {
        "Model": name,
        "ROC AUC (Train)": train_auc,
        "ROC AUC (CV)": scores.mean(),
        "ROC AUC (Test)": test_auc,
        "Accuracy (Test)": test_accuracy,
        "CV Time (s)": cv_time,
        "Final Fit Time (s)": fit_time,
    }

    # Save the trained pipeline as a .pkl file
    model_filename = f"../models/03_{name}_{timestamp}.pkl"
    joblib.dump(pipeline, model_filename)

    print(
        f"{name}: Train ROC_AUC = {train_auc:.4f}, "
        f"CV ROC_AUC mean = {scores.mean():.4f}, "
        f"Test ROC_AUC = {test_auc:.4f}, "
        f"CV Time = {cv_time:.2f}s, Fit Time = {fit_time:.2f}s"
    )

[0.9787  0.9749  0.97385 0.9806  0.9959  0.98425 0.95905 0.97615 0.98535
 0.97765]
RF: Train ROC_AUC = 1.0000, CV ROC_AUC mean = 0.9786, Test ROC_AUC = 0.8402, CV Time = 2.25s, Fit Time = 0.60s
[0.972  0.9541 0.9696 0.9663 0.9904 0.9809 0.9607 0.9741 0.9819 0.9737]
SVM: Train ROC_AUC = 0.9892, CV ROC_AUC mean = 0.9724, Test ROC_AUC = 0.8459, CV Time = 0.36s, Fit Time = 0.28s
[0.7869  0.83285 0.76475 0.7274  0.8215  0.8544  0.81335 0.779   0.8407
 0.7719 ]
MLP2: Train ROC_AUC = 0.8242, CV ROC_AUC mean = 0.7993, Test ROC_AUC = 0.7623, CV Time = 0.69s, Fit Time = 0.36s
[0.8372  0.88705 0.85745 0.86765 0.8893  0.9022  0.8794  0.8717  0.8983
 0.8978 ]
MLP4: Train ROC_AUC = 0.9156, CV ROC_AUC mean = 0.8788, Test ROC_AUC = 0.7172, CV Time = 1.27s, Fit Time = 0.66s
[0.9049 0.924  0.9293 0.9276 0.9691 0.9539 0.9582 0.9288 0.9355 0.9328]
MLP6: Train ROC_AUC = 0.9708, CV ROC_AUC mean = 0.9364, Test ROC_AUC = 0.8131, CV Time = 1.31s, Fit Time = 0.84s
[0.9218 0.9218 0.9269 0.9506 0.969  0.9807 0.94

In [54]:
# ---- SAVE RESULTS FOR COMPARISONS ----

# Columns written to the CSV, in output order. Any other keys stored in the
# per-model result dicts are not persisted.
saved_columns = [
    "Model",
    "ROC AUC (CV)",
    "ROC AUC (Test)",
    "CV Time (s)",
    "Final Fit Time (s)",
]

# Timestamp (day-month-year-hour-minute) embedded in the results filename.
timestamp = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M")

# One row per model, default integer index, restricted to the saved columns.
result_rows = list(cv_results.values())
cv_results_df = pd.DataFrame(result_rows)[saved_columns]

results_filename = f"../data/results/03_model_training_no_feature_selection_{timestamp}.csv"
cv_results_df.to_csv(results_filename, index=False)

print(f"Cross-Validation completed. Results saved to {results_filename}.")

Cross-Validation completed. Results saved to ../data/results/03_model_training_no_feature_selection_17-03-2025-22-48.csv.
