# Model Training and Internal Evaluation (for-norm dataset)
This notebook focuses on **training and evaluating multiple machine learning models** using the `for-norm` dataset. The dataset is split into **training**, **validation**, and **testing** sets.

## What this notebook covers:
- Training 5 classic ML models:
  - Logistic Regression
  - Naive Bayes
  - Decision Tree
  - Random Forest
  - Support Vector Machine (SVM)
- Tuning hyperparameters using the **validation set**
- Evaluating model performance on the **internal test set**
- Saving all tuned models and the scaler for reuse in the external evaluation phase

In [1]:
# Importing required libraries
import joblib
import os
import pandas as pd
from sklearn.model_selection import ParameterGrid
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
# Read the extracted feature table for the for-norm dataset
df_norm = pd.read_csv("Processed_Features/for_norm_features.csv")

# Every column except the bookkeeping/label ones is a model input;
# the "LABEL" column is used as the target
non_feature_cols = ["filename", "split", "label", "LABEL"]
X = df_norm.drop(columns=non_feature_cols)
y = df_norm["LABEL"]

# One boolean row mask per value of the "split" column, computed once
is_train = df_norm["split"] == "training"
is_val = df_norm["split"] == "validation"
is_test = df_norm["split"] == "testing"

# Slice features and labels with the same mask so they stay row-aligned
X_train, y_train = X[is_train], y[is_train]
X_val, y_val = X[is_val], y[is_val]
X_test, y_test = X[is_test], y[is_test]

In [3]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to all three splits so validation/test data never influence it
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

In [4]:
# Hyperparameter grids to search, keyed by model name.
# An empty grid leaves the estimator's constructor settings untouched.
param_grids = {
    "LogisticRegression": {"C": [0.1, 1, 10]},
    "NaiveBayes": {},
    "DecisionTree": {
        "max_depth": [10, 20, None],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
    },
    "RandomForest": {
        "n_estimators": [100],
        "max_depth": [10, 20, None],
    },
    "SVM": {"C": [0.1, 1], "kernel": ["linear", "rbf"]},
}

# Unfitted base estimators, keyed by the same model names
base_estimators = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "NaiveBayes": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
}

# Registry: model name -> {"model": estimator, "params": grid}
models = {
    name: {"model": estimator, "params": param_grids[name]}
    for name, estimator in base_estimators.items()
}

In [5]:
# Helper that flattens a classification report into one result row
def get_metrics(y_true, y_pred, dataset_name, model_name):
    """Build a flat metrics record for one model on one dataset split.

    Parameters
    ----------
    y_true : true labels for the split.
    y_pred : labels predicted by the model for the same rows.
    dataset_name : value stored under the "Dataset" key.
    model_name : value stored under the "Model" key.

    Returns
    -------
    dict
        "Model" and "Dataset", then precision / recall / F1 / support for
        the report entries keyed '0' and '1', then overall "Accuracy".

    Raises
    ------
    KeyError
        If the classification report has no entry keyed '0' or '1'.
    """
    per_class = classification_report(y_true, y_pred, output_dict=True)

    row = {"Model": model_name, "Dataset": dataset_name}
    # Insertion order matters: it becomes the column order of the results table
    for cls in ("0", "1"):
        stats = per_class[cls]
        row[f"Precision_{cls}"] = stats["precision"]
        row[f"Recall_{cls}"] = stats["recall"]
        row[f"F1_{cls}"] = stats["f1-score"]
        row[f"Support_{cls}"] = stats["support"]
    row["Accuracy"] = accuracy_score(y_true, y_pred)
    return row

In [6]:
results = []            # one metrics row per (model, split)
best_models = {}        # model name -> best fitted estimator
validation_scores = []  # (model name, best validation F1) pairs

for model_name, config in models.items():
    print(f"\n Tuning {model_name}...")

    # Fit a fresh clone for every grid point and keep the one with the
    # highest validation F1 (computed with class 0 as the positive label).
    # Only a strictly better score replaces the incumbent, so the first
    # candidate wins ties.
    top_score, top_estimator = -1, None
    for candidate_params in ParameterGrid(config["params"]):
        candidate = clone(config["model"])
        candidate.set_params(**candidate_params)
        candidate.fit(X_train_scaled, y_train)
        score = f1_score(y_val, candidate.predict(X_val_scaled), pos_label=0)
        if not score > top_score:
            continue
        top_score, top_estimator = score, candidate

    best_models[model_name] = top_estimator
    validation_scores.append((model_name, top_score))

    # Report the selected estimator on the validation and internal test splits
    for split_name, features, labels in (
        ("Validation", X_val_scaled, y_val),
        ("Internal Test", X_test_scaled, y_test),
    ):
        split_pred = top_estimator.predict(features)
        results.append(get_metrics(labels, split_pred, split_name, model_name))

# Save and view results
results_df = pd.DataFrame(results)
results_df.to_csv("for_norm_internal_training_results.csv", index=False)
display(results_df)



 Tuning LogisticRegression...



 Tuning NaiveBayes...

 Tuning DecisionTree...

 Tuning RandomForest...

 Tuning SVM...


Unnamed: 0,Model,Dataset,Precision_0,Recall_0,F1_0,Support_0,Precision_1,Recall_1,F1_1,Support_1,Accuracy
0,LogisticRegression,Validation,0.851337,0.86143,0.856354,5398.0,0.85982,0.84963,0.854694,5400.0,0.855529
1,LogisticRegression,Internal Test,0.658418,0.572574,0.612503,2370.0,0.606296,0.689046,0.645028,2264.0,0.629478
2,NaiveBayes,Validation,0.782921,0.867914,0.82323,5398.0,0.85189,0.759444,0.803015,5400.0,0.813669
3,NaiveBayes,Internal Test,0.600682,0.966667,0.740944,2370.0,0.903659,0.327297,0.480545,2264.0,0.654294
4,DecisionTree,Validation,0.950147,0.960356,0.955224,5398.0,0.95994,0.94963,0.954757,5400.0,0.954992
5,DecisionTree,Internal Test,0.827723,0.705485,0.761731,2370.0,0.732976,0.84629,0.785568,2264.0,0.774277
6,RandomForest,Validation,0.991116,0.992034,0.991575,5398.0,0.99203,0.991111,0.99157,5400.0,0.991573
7,RandomForest,Internal Test,0.922782,0.610127,0.734569,2370.0,0.698728,0.946555,0.803977,2264.0,0.774493
8,SVM,Validation,0.996119,0.998518,0.997317,5398.0,0.998515,0.996111,0.997312,5400.0,0.997314
9,SVM,Internal Test,0.976274,0.468776,0.633409,2370.0,0.639874,0.988074,0.776736,2264.0,0.722486


In [7]:
# Make sure the output folder exists (no error if it is already there)
os.makedirs("saved_models", exist_ok=True)

# Write each selected estimator to its own joblib file
for name in best_models:
    filepath = f"saved_models/{name}_for_norm.joblib"
    joblib.dump(best_models[name], filepath)
    print(f"Saved: {filepath}")

# Write the fitted scaler next to the models
joblib.dump(scaler, "saved_models/for_norm_scaler.joblib")
print("scaler saved")

Saved: saved_models/LogisticRegression_for_norm.joblib
Saved: saved_models/NaiveBayes_for_norm.joblib
Saved: saved_models/DecisionTree_for_norm.joblib
Saved: saved_models/RandomForest_for_norm.joblib
Saved: saved_models/SVM_for_norm.joblib
scaler saved
