# Model Training

In [3]:
from sklearn.model_selection import cross_val_score, train_test_split
import datetime
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import joblib
import time

## Load the feature matrix and split it into training and test sets.
The training and test sets are saved to disk so that hyperparameter tuning can reuse exactly the same split.

In [10]:
# ---- LOAD DATASET ----
# The target lives in the "performance_class" column; every other column of
# the feature matrix is passed to the models as an input.

df = pd.read_csv("../data/features/filtered_labeled_feature_matrix.csv")
y = df["performance_class"]
X = df.drop(columns=["performance_class"])

# ---- SPLIT DATASET ----
# 80/20 hold-out split, stratified on the target, with a fixed seed so the
# same rows land in each part on every run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Persist each part of the split so hyperparameter tuning can load the
# identical train/test partition instead of re-splitting.
for split_part, split_path in (
    (X_train, "../data/splits/X_train.pkl"),
    (X_test, "../data/splits/X_test.pkl"),
    (y_train, "../data/splits/y_train.pkl"),
    (y_test, "../data/splits/y_test.pkl"),
):
    joblib.dump(split_part, split_path)

print("Train/test split saved successfully!")



Train/test split saved successfully!


## Models

In [None]:
# Candidate classifiers, keyed by the short label that is later used in the
# results table and in the saved-model filenames.
# Every estimator with a random component gets the same fixed seed so that
# rerunning the notebook reproduces the same scores.
models = {
    "RF": RandomForestClassifier(random_state=42),
    # probability=True is required for predict_proba. The probability
    # calibration shuffles the data internally, so it must be seeded too;
    # without random_state the SVM AUCs vary from run to run.
    "SVM": SVC(probability=True, random_state=42),
    # Single-hidden-layer MLPs that differ only in the number of hidden units.
    "MLP2": MLPClassifier(hidden_layer_sizes=(2,), max_iter=5000, random_state=42),
    "MLP4": MLPClassifier(hidden_layer_sizes=(4,), max_iter=5000, random_state=42),
    "MLP6": MLPClassifier(hidden_layer_sizes=(6,), max_iter=5000, random_state=42),
    "MLP8": MLPClassifier(hidden_layer_sizes=(8,), max_iter=5000, random_state=42),
    # k-nearest-neighbour classifiers for several neighbourhood sizes.
    "3NN": KNeighborsClassifier(n_neighbors=3),
    "5NN": KNeighborsClassifier(n_neighbors=5),
    "10NN": KNeighborsClassifier(n_neighbors=10),
    "15NN": KNeighborsClassifier(n_neighbors=15),
}

## Initial Model training and evaluation.
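Each model is scored with stratified 10-fold cross-validation (ROC AUC) on the training set, then refit on the full training set and evaluated on the held-out test set.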

In [4]:
# Requires `models`, `X_train`, `X_test`, `y_train` and `y_test` to exist in the
# kernel already; run the cells that define them first.

# Stratified 10-fold splitter; the shuffle is seeded so every model is scored
# on the same folds.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One timestamp for the whole run, so all models saved by this cell share the
# same filename suffix even if the loop crosses a minute boundary.
timestamp = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M")

cv_results = {}
for name, model in models.items():
    # NOTE(review): the pipeline contains only the classifier, i.e. no scaling
    # step. SVM, MLP and k-NN are usually sensitive to feature scale -- confirm
    # that the feature matrix is already scaled upstream.
    pipeline = Pipeline([
        ("classifier", model)
    ])

    # ---- CROSS-VALIDATE ON THE TRAINING SET (TIMED) ----

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments.
    start_cv = time.perf_counter()
    scores = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring="roc_auc", n_jobs=-1)
    cv_time = time.perf_counter() - start_cv
    cv_auc = scores.mean()
    print(scores)

    # ---- REFIT ON THE FULL TRAINING SET (TIMED) ----

    # cross_val_score fits clones, so the pipeline itself is still unfitted here.
    start_fit = time.perf_counter()
    pipeline.fit(X_train, y_train)
    fit_time = time.perf_counter() - start_fit

    # Column 1 of predict_proba is used as the positive-class score.
    # assumes a binary target -- TODO confirm
    y_train_probs = pipeline.predict_proba(X_train)[:, 1]
    train_auc = roc_auc_score(y_train, y_train_probs)

    # ---- EVALUATE MODEL ----

    # NOTE(review): the test AUC is recorded for every candidate; if it is used
    # to choose between models it is no longer an unbiased estimate.
    y_test_probs = pipeline.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_probs)

    # ---- STORE RESULTS ----

    # The mean CV score is stored under "ROC AUC (CV)" so that it is available
    # to anything that reads cv_results, not only printed.
    cv_results[name] = {
        "Model": name,
        "ROC AUC (Train)": train_auc,
        "ROC AUC (CV)": cv_auc,
        "ROC AUC (Test)": test_auc,
        "CV Time (s)": cv_time,
        "Final Fit Time (s)": fit_time,
    }

    # ---- SAVE MODELS AS PKL FILE ----

    model_filename = f"../models/03_{name}_{timestamp}.pkl"
    joblib.dump(pipeline, model_filename)

    print(
        f"{name}: Train ROC_AUC = {train_auc:.4f}, "
        f"CV ROC_AUC mean = {cv_auc:.4f}, "
        f"Test ROC_AUC = {test_auc:.4f}, "
        f"CV Time = {cv_time:.2f}s, Fit Time = {fit_time:.2f}s"
    )

NameError: name 'X_train' is not defined

## Save the results for comparisons.

In [12]:
# Columns written to the results file, in output order. Each one must be a key
# of the per-model dicts in cv_results; a missing column raises KeyError.
report_columns = ["Model", "ROC AUC (CV)", "ROC AUC (Test)", "CV Time (s)", "Final Fit Time (s)"]

# The timestamp in the filename keeps results from separate runs apart.
timestamp = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M")
results_filename = f"../data/results/03_model_training_no_feature_selection_{timestamp}.csv"

# One row per model: build the frame from the results dict, discard the
# model-name index (the name is already in the "Model" column) and keep only
# the report columns.
cv_results_df = (
    pd.DataFrame.from_dict(cv_results, orient="index")
    .reset_index(drop=True)
    .loc[:, report_columns]
)
cv_results_df.to_csv(results_filename, index=False)

print(f"Cross-Validation completed. Results saved to {results_filename}.")

Cross-Validation completed. Results saved to ../data/results/03_model_training_no_feature_selection_11-03-2025-01-04.csv.
