In [1]:
import os
import numpy as np
import joblib
import logging
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from IPython.display import display

In [2]:
# Ask IPython to render matplotlib figures inline in the cell output.
# NOTE(review): no plotting call is visible in the cells of this notebook —
# confirm this magic is still needed.
%matplotlib inline

In [3]:
# ================= PATHS =================
# All locations are relative to the notebook's working directory.
FEATURE_PATH = "features"
MODEL_PATH = "models"
LOG_PATH = "logs"

# Create the output folders up front so later writes cannot fail on a
# missing directory.
os.makedirs(MODEL_PATH, exist_ok=True)
os.makedirs(LOG_PATH, exist_ok=True)

# ================= LOGGER =================
# logging.basicConfig() silently does nothing when the root logger already
# has handlers (a pre-configured kernel, or this cell being run a second
# time). Detach and close any existing root handlers first so the file
# handler below is always installed and filemode="w" really starts a fresh
# log on every run of this cell.
logger = logging.getLogger()
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

logging.basicConfig(
    filename=os.path.join(LOG_PATH, "train.log"),
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filemode="w"
)
logger.info("===== TRAINING PIPELINE STARTED =====")


In [4]:
# ================= LOAD DATA =================
# Feature matrix and label vector stored as .npy files under FEATURE_PATH
# (presumably written by a separate feature-extraction step — confirm).
features_file = os.path.join(FEATURE_PATH, "X.npy")
labels_file = os.path.join(FEATURE_PATH, "y.npy")
X, y = np.load(features_file), np.load(labels_file)

# NOTE(security): joblib.load unpickles arbitrary objects; only load an
# encoder file that comes from a trusted source.
encoder_file = os.path.join(MODEL_PATH, "label_encoder.pkl")
encoder = joblib.load(encoder_file)

# Human-readable class names, used to label reports and confusion matrices.
class_names = encoder.classes_

In [5]:
# ================= SPLIT =================
# Stage 1: hold out 10% of all samples as the final test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

# Stage 2: 0.1111 of the remaining 90% is roughly 10% of the full data set,
# which leaves roughly 80% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.1111,
    random_state=42,
    stratify=y_temp,
)

# Record the resulting split sizes in the training log.
for split_label, split_features in (
    ("Train size", X_train),
    ("Val size  ", X_val),
    ("Test size ", X_test),
):
    logger.info(f"{split_label}: {split_features.shape[0]}")


In [6]:

# ================= CV STRATEGY =================
# Five shuffled, class-stratified folds; the fixed seed keeps the fold
# assignment repeatable between runs.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [7]:
# Candidate models for the grid search.
# Each entry maps a display name to (estimator, param_grid). Grid keys for
# pipelines use the "clf__" prefix so they address the classifier step.
models = {
    # Distance-based model: features are standardised inside the pipeline so
    # the scaler is fitted on the training folds only.
    "KNN": (
        Pipeline([
            ("scaler", StandardScaler()),
            ("clf", KNeighborsClassifier())
        ]),
        {
            "clf__n_neighbors": [3, 5, 7],
            "clf__weights": ["uniform", "distance"]
        }
    ),

    # probability=True fits an internal, randomised cross-validation to
    # calibrate probabilities; random_state pins it so the saved model's
    # predict_proba output is reproducible like the other seeded components.
    "SVC": (
        Pipeline([
            ("scaler", StandardScaler()),
            ("clf", SVC(probability=True, random_state=42))
        ]),
        {
            "clf__C": [1, 10],
            "clf__gamma": ["scale", 0.01]
        }
    ),

    # Disabled candidate, kept for reference.
    # NOTE(review): either re-enable or delete this entry.
    # "LogisticRegression": (
    #     Pipeline([
    #         ("scaler", StandardScaler()),
    #         ("clf", LogisticRegression(max_iter=1000))
    #     ]),
    #     {
    #         "clf__C": [0.1, 1, 10]
    #     }
    # ),

    # No hyper-parameters are searched: the empty grid evaluates the default
    # estimator once.
    "NaiveBayes": (
        GaussianNB(),
        {}
    ),

    # Used without a scaler; grid keys therefore have no "clf__" prefix.
    "RandomForest": (
        RandomForestClassifier(random_state=42, n_jobs=-1),
        {
            "n_estimators": [200, 300],
            "max_depth": [20, 30, None]
        }
    )
}


In [8]:
def _weighted_scores(y_true, y_pred):
    """Return accuracy plus support-weighted precision, recall and F1.

    The keys match the column names of the results table. ``zero_division=0``
    yields the same value scikit-learn reports by default for a class that is
    never predicted, but without an UndefinedMetricWarning per model.
    """
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(
            y_true, y_pred, average="weighted", zero_division=0
        ),
        "Recall": recall_score(
            y_true, y_pred, average="weighted", zero_division=0
        ),
        "F1 Score": f1_score(
            y_true, y_pred, average="weighted", zero_division=0
        ),
    }


results = []              # one metrics row per model, for the summary table
confusion_matrices = {}   # model name -> labelled confusion-matrix DataFrame

best_model = None
best_f1 = -1
best_model_name = None

# Label ids come from the full label array rather than from the validation
# split, so the confusion matrix and report keep one row/column per class
# even when a class is missing from y_val and from the predictions. Without
# this the matrix shape would no longer match class_names.
all_labels = np.unique(y)

for name, (model, param_grid) in models.items():

    print(f"\n================ {name} =================")

    # Hyper-parameter search on the training split only; the best setting is
    # refitted on all of X_train and exposed as best_estimator_.
    grid = GridSearchCV(
        model,
        param_grid,
        scoring="f1_weighted",
        cv=cv,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)

    # Score the tuned model on the held-out validation split.
    val_preds = grid.best_estimator_.predict(X_val)
    scores = _weighted_scores(y_val, val_preds)
    f1 = scores["F1 Score"]

    # Store results
    results.append({"Model": name, **scores})

    # Confusion Matrix
    cm = confusion_matrix(y_val, val_preds, labels=all_labels)
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
    confusion_matrices[name] = cm_df

    # Display in notebook
    print("Best Params:", grid.best_params_)
    print(
        classification_report(
            y_val,
            val_preds,
            labels=all_labels,
            target_names=class_names,
            zero_division=0,
        )
    )
    display(cm_df)

    # Keep a record in the training log as well as in the notebook output.
    logger.info(
        f"{name}: best params={grid.best_params_}, "
        f"CV F1={grid.best_score_:.4f}, val F1={f1:.4f}"
    )

    # Track best model (strict '>' keeps the earlier model on a tie)
    if f1 > best_f1:
        best_f1 = f1
        best_model = grid.best_estimator_
        best_model_name = name

logger.info(f"Selected model: {best_model_name} (val F1={best_f1:.4f})")



Best Params: {'clf__n_neighbors': 3, 'clf__weights': 'distance'}
                      precision    recall  f1-score   support

        Good Morning       0.71      1.00      0.83         5
        How are you_       0.75      0.60      0.67         5
           I am fine       0.57      0.80      0.67         5
  I am going to home       1.00      0.25      0.40         4
        I don’t know       0.67      1.00      0.80         4
      Please help me       0.80      0.80      0.80         5
  What is your name_       1.00      0.80      0.89         5
Where are you going_       1.00      0.75      0.86         4

            accuracy                           0.76        37
           macro avg       0.81      0.75      0.74        37
        weighted avg       0.81      0.76      0.74        37



Unnamed: 0,Good Morning,How are you_,I am fine,I am going to home,I don’t know,Please help me,What is your name_,Where are you going_
Good Morning,5,0,0,0,0,0,0,0
How are you_,0,3,2,0,0,0,0,0
I am fine,0,1,4,0,0,0,0,0
I am going to home,1,0,0,1,1,1,0,0
I don’t know,0,0,0,0,4,0,0,0
Please help me,0,0,1,0,0,4,0,0
What is your name_,1,0,0,0,0,0,4,0
Where are you going_,0,0,0,0,1,0,0,3



Best Params: {'clf__C': 10, 'clf__gamma': 'scale'}
                      precision    recall  f1-score   support

        Good Morning       0.83      1.00      0.91         5
        How are you_       0.71      1.00      0.83         5
           I am fine       1.00      0.80      0.89         5
  I am going to home       1.00      0.50      0.67         4
        I don’t know       0.50      0.75      0.60         4
      Please help me       1.00      0.80      0.89         5
  What is your name_       1.00      0.80      0.89         5
Where are you going_       0.75      0.75      0.75         4

            accuracy                           0.81        37
           macro avg       0.85      0.80      0.80        37
        weighted avg       0.86      0.81      0.81        37



Unnamed: 0,Good Morning,How are you_,I am fine,I am going to home,I don’t know,Please help me,What is your name_,Where are you going_
Good Morning,5,0,0,0,0,0,0,0
How are you_,0,5,0,0,0,0,0,0
I am fine,0,1,4,0,0,0,0,0
I am going to home,0,0,0,2,2,0,0,0
I don’t know,0,0,0,0,3,0,0,1
Please help me,0,1,0,0,0,4,0,0
What is your name_,1,0,0,0,0,0,4,0
Where are you going_,0,0,0,0,1,0,0,3



Best Params: {}
                      precision    recall  f1-score   support

        Good Morning       0.71      1.00      0.83         5
        How are you_       0.38      0.60      0.46         5
           I am fine       0.75      0.60      0.67         5
  I am going to home       1.00      1.00      1.00         4
        I don’t know       1.00      0.50      0.67         4
      Please help me       0.75      0.60      0.67         5
  What is your name_       1.00      0.60      0.75         5
Where are you going_       0.60      0.75      0.67         4

            accuracy                           0.70        37
           macro avg       0.77      0.71      0.71        37
        weighted avg       0.77      0.70      0.71        37



Unnamed: 0,Good Morning,How are you_,I am fine,I am going to home,I don’t know,Please help me,What is your name_,Where are you going_
Good Morning,5,0,0,0,0,0,0,0
How are you_,0,3,1,0,0,0,0,1
I am fine,0,2,3,0,0,0,0,0
I am going to home,0,0,0,4,0,0,0,0
I don’t know,1,0,0,0,2,0,0,1
Please help me,0,2,0,0,0,3,0,0
What is your name_,1,1,0,0,0,0,3,0
Where are you going_,0,0,0,0,0,1,0,3



Best Params: {'max_depth': 20, 'n_estimators': 300}
                      precision    recall  f1-score   support

        Good Morning       0.83      1.00      0.91         5
        How are you_       0.83      1.00      0.91         5
           I am fine       1.00      1.00      1.00         5
  I am going to home       1.00      0.75      0.86         4
        I don’t know       0.75      0.75      0.75         4
      Please help me       1.00      1.00      1.00         5
  What is your name_       1.00      0.80      0.89         5
Where are you going_       0.75      0.75      0.75         4

            accuracy                           0.89        37
           macro avg       0.90      0.88      0.88        37
        weighted avg       0.90      0.89      0.89        37



Unnamed: 0,Good Morning,How are you_,I am fine,I am going to home,I don’t know,Please help me,What is your name_,Where are you going_
Good Morning,5,0,0,0,0,0,0,0
How are you_,0,5,0,0,0,0,0,0
I am fine,0,0,5,0,0,0,0,0
I am going to home,0,0,0,3,1,0,0,0
I don’t know,0,0,0,0,3,0,0,1
Please help me,0,0,0,0,0,5,0,0
What is your name_,1,0,0,0,0,0,4,0
Where are you going_,0,1,0,0,0,0,0,3


In [9]:
# Validation leaderboard: one row per model, best weighted F1 first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="F1 Score", ascending=False)
display(results_df)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
3,RandomForest,0.891892,0.900901,0.891892,0.890917
1,SVC,0.810811,0.857786,0.810811,0.813841
0,KNN,0.756757,0.806628,0.756757,0.743415
2,NaiveBayes,0.702703,0.76612,0.702703,0.708766


In [13]:
# ================= FINAL TEST EVALUATION =================
# The test split is scored exactly once, with the model chosen on the
# validation split. Fail with a clear message if no model was selected
# instead of crashing on None.predict.
if best_model is None:
    raise RuntimeError(
        "No best model selected: run the model-selection cell before the "
        "final test evaluation."
    )

logger.info("\n===== FINAL TEST RESULTS =====")

# Label ids from the full label array keep one row/column per class in the
# report and confusion matrix even if a class is absent from the test split
# and from the predictions.
test_labels = np.unique(y)

test_preds = best_model.predict(X_test)
test_acc = accuracy_score(y_test, test_preds)
# zero_division=0 returns the same value as the default for a class that is
# never predicted, without emitting UndefinedMetricWarning.
test_prec = precision_score(
    y_test, test_preds, average="weighted", zero_division=0
)
test_rec = recall_score(
    y_test, test_preds, average="weighted", zero_division=0
)
test_f1 = f1_score(
    y_test, test_preds, average="weighted", zero_division=0
)

logger.info("===== BEST MODEL FINAL TEST RESULTS =====")
logger.info(f"BEST MODEL: {best_model_name}")
logger.info(f"Accuracy : {test_acc:.4f}")
logger.info(f"Precision: {test_prec:.4f}")
logger.info(f"Recall   : {test_rec:.4f}")
logger.info(f"F1 Score : {test_f1:.4f}")

# Multi-line payloads start on their own line so the table header is not
# glued to the timestamp prefix of the log record.
test_report = classification_report(
    y_test,
    test_preds,
    labels=test_labels,
    target_names=class_names,
    zero_division=0,
)
logger.info(f"\nClassification Report:\n{test_report}")

cm = confusion_matrix(y_test, test_preds, labels=test_labels)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
logger.info(f"\nConfusion Matrix:\n{cm_df}")

In [None]:
# ================= SAVE MODEL =================
# Refuse to pickle None: that would write a "model" file that only fails
# later, at load/predict time.
if best_model is None:
    raise RuntimeError(
        "No best model selected: run the model-selection cell before saving."
    )

# Build the path once so the saved file and the printed message cannot
# drift apart when MODEL_PATH changes.
best_model_file = os.path.join(MODEL_PATH, "best_model.pkl")
joblib.dump(best_model, best_model_file)

logger.info("===== TRAINING COMPLETED =====")
logger.info(f"BEST MODEL: {best_model_name}")
# Reuse the test F1 from the final test evaluation instead of recomputing it.
logger.info(f"BEST TEST F1: {test_f1:.4f}")

print("\nBEST MODEL:", best_model_name)
print(f"Model saved to {best_model_file}")


BEST MODEL: RandomForest
Model saved to models/best_model.pkl
