# CIC-IDS-2017: Model Training

This notebook is intended to run in Google Colab.

**What this notebook does**
1. Loads the prepared data splits
2. Trains multiple models with different hyperparameters
3. Compares validation and test results
4. Builds an ensemble from the best models
5. Saves the best models and artifacts


## 1. Install dependencies and import libraries

In [None]:
!pip install -q xgboost lightgbm scikit-learn pandas numpy plotly kaleido joblib tqdm pyyaml

In [None]:
import os
import json
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import joblib
from tqdm.auto import tqdm

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix,
    classification_report
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

import xgboost as xgb
import lightgbm as lgb

import plotly.graph_objects as go
import plotly.express as px

# Ignore every warning category for the rest of the session. This keeps the
# training logs short, but it also hides any convergence or deprecation
# warnings the libraries might raise.
warnings.filterwarnings("ignore")
print("Libraries imported.")

## 2. Load data

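Ensure the following files exist in the Google Drive folders configured below: the three `.parquet` splits are read from `DATA_PATH`, and the two `.json` files are read from `ARTIFACTS_PATH`.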

- `train.parquet`
- `val.parquet`
- `test.parquet`
- `feature_schema.json`
- `label_mapping.json`


In [None]:
# Mount Google Drive at /content/drive; the data, artifact and output
# directories used by this notebook all live under that mount point.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Base directories on the mounted Google Drive.
# Deliberately written WITHOUT a trailing slash: every consumer builds paths as
# f"{BASE}/file", so a trailing slash here would yield ".../splits//train.parquet"
# style double separators in the loaded paths and in the "saved:" log lines.
DATA_PATH = "/content/drive/MyDrive/TraficAnalysis/data/processed/splits"
ARTIFACTS_PATH = "/content/drive/MyDrive/TraficAnalysis/artifacts"
OUTPUT_PATH = "/content/drive/MyDrive/TraficAnalysis/models"

# Create the output directory up front so the save cells cannot fail on a missing folder.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [None]:
print("Loading data...")

# Read the three prepared splits (train / val / test) from the splits folder.
train_df, val_df, test_df = (
    pd.read_parquet(f"{DATA_PATH}/{split}.parquet") for split in ("train", "val", "test")
)

print(f"Train: {len(train_df):,} rows")
print(f"Val:   {len(val_df):,} rows")
print(f"Test:  {len(test_df):,} rows")

# Both JSON artifacts live side by side in the artifacts folder.
json_artifacts = {}
for artifact_stem in ("feature_schema", "label_mapping"):
    with open(f"{ARTIFACTS_PATH}/{artifact_stem}.json", "r") as artifact_file:
        json_artifacts[artifact_stem] = json.load(artifact_file)

feature_schema = json_artifacts["feature_schema"]
label_mapping = json_artifacts["label_mapping"]

# The schema lists the columns that form the model input.
feature_cols = feature_schema["feature_columns"]
print(f"Features: {len(feature_cols)}")

In [None]:
# Split every frame into a feature matrix (schema columns) and the binary target.
X_train, y_train = train_df[feature_cols].values, train_df["label_binary"].values
X_val, y_val = val_df[feature_cols].values, val_df["label_binary"].values
X_test, y_test = test_df[feature_cols].values, test_df["label_binary"].values

print("Data shapes:")
for shape_label, feature_matrix in (("X_train:", X_train), ("X_val:", X_val), ("X_test:", X_test)):
    # Labels are padded to a common width so the shapes line up in a column.
    print(f"{shape_label:<8} {feature_matrix.shape}")

print("Class balance (0=Benign, 1=Attack):")
for balance_label, target in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
    n_benign = (target == 0).sum()
    n_attack = (target == 1).sum()
    print(f"{balance_label:<5} - Benign: {n_benign:,}, Attack: {n_attack:,}")

## 3. Helper functions

In [None]:
def calculate_metrics(y_true, y_pred, y_proba=None):
    """Compute binary-classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Optional scores as an array. Either 1-D scores for the
            positive class, or a 2-D array whose column 1 is used as the
            positive-class score.

    Returns:
        dict with "accuracy", "precision", "recall" and "f1". When ``y_proba``
        is given, "roc_auc" and "pr_auc" are added as well; if scikit-learn
        rejects the inputs for those ranking metrics with a ``ValueError``,
        both are reported as 0.0.
    """
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
    }

    if y_proba is not None:
        # predict_proba returns one column per class; keep the positive-class column.
        if y_proba.ndim > 1:
            y_proba = y_proba[:, 1]
        try:
            metrics["roc_auc"] = roc_auc_score(y_true, y_proba)
            metrics["pr_auc"] = average_precision_score(y_true, y_proba)
        except ValueError:
            # Best-effort fallback for inputs the ranking metrics cannot score.
            # Only ValueError is absorbed: any other exception signals a real
            # bug and must surface instead of being reported as an AUC of 0.
            metrics["roc_auc"] = 0.0
            metrics["pr_auc"] = 0.0

    return metrics


def train_and_evaluate(model, model_name, X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit ``model`` on the training split and score it on train, val and test.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        model_name: Label used in the log output and stored under "name".
        X_train, y_train: Training features and labels (also used for fitting).
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        ``(model, results)`` where ``results`` holds "name", "training_time",
        one "<split>_<metric>" entry per metric from ``calculate_metrics``, and
        the raw test labels / predictions / probabilities under
        "y_test_true", "y_test_pred" and "y_test_proba".
    """
    banner = "=" * 60
    print("\n" + banner)
    print(f"Training: {model_name}")
    print(banner)

    fit_started = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - fit_started

    print(f"Training time: {training_time:.1f}s")

    results = {"name": model_name, "training_time": training_time}

    splits = {
        "train": (X_train, y_train),
        "val": (X_val, y_val),
        "test": (X_test, y_test),
    }
    for split_name, (features, labels) in splits.items():
        predicted = model.predict(features)
        probabilities = model.predict_proba(features)

        split_scores = calculate_metrics(labels, predicted, probabilities)
        results.update({f"{split_name}_{key}": score for key, score in split_scores.items()})

        # Keep the raw test outputs so later cells can reuse them.
        if split_name == "test":
            results.update(y_test_true=labels, y_test_pred=predicted, y_test_proba=probabilities)

    print(f"Val F1: {results['val_f1']:.4f}, Test F1: {results['test_f1']:.4f}")
    print(f"Val ROC-AUC: {results['val_roc_auc']:.4f}, Test ROC-AUC: {results['test_roc_auc']:.4f}")

    return model, results


def print_results_table(all_results):
    """Print a summary table of experiment results, best validation F1 first.

    Args:
        all_results: List of result dicts as produced by ``train_and_evaluate``.

    Returns:
        DataFrame restricted to the summary columns that are present in the
        results, sorted by "val_f1" in descending order.
    """
    preferred_cols = ("name", "training_time", "val_f1", "val_roc_auc", "test_f1", "test_roc_auc")

    table = pd.DataFrame(all_results)
    # Only keep the summary columns that actually exist in the results.
    available_cols = [col for col in preferred_cols if col in table.columns]
    summary = table[available_cols].sort_values("val_f1", ascending=False)

    rule = "=" * 80
    print("\n" + rule)
    print("RESULTS SUMMARY")
    print(rule)
    print(summary.to_string(index=False))

    return summary

## 4. Train models

### 4.1 Random Forest

In [None]:
# Random Forest configurations to compare; keys missing from an entry fall
# back to the defaults given in the constructor call below.
rf_experiments = [
    dict(name="RF_baseline", n_estimators=100, max_depth=None),
    dict(name="RF_deep", n_estimators=200, max_depth=20, min_samples_split=5),
    dict(name="RF_wide", n_estimators=300, max_depth=10, min_samples_leaf=2),
]

rf_results, rf_models = [], {}

for exp in tqdm(rf_experiments, desc="Random Forest experiments"):
    run_name = exp["name"]
    forest = RandomForestClassifier(
        n_estimators=exp.get("n_estimators", 100),
        max_depth=exp.get("max_depth"),
        min_samples_split=exp.get("min_samples_split", 2),
        min_samples_leaf=exp.get("min_samples_leaf", 1),
        class_weight="balanced",
        n_jobs=-1,
        random_state=42,
        verbose=0,
    )

    # Store the fitted model under its experiment name and keep its metrics.
    rf_models[run_name], run_results = train_and_evaluate(
        forest, run_name, X_train, y_train, X_val, y_val, X_test, y_test
    )
    rf_results.append(run_results)

print_results_table(rf_results)

### 4.2 XGBoost

In [None]:
# XGBoost configurations to compare; keys missing from an entry fall back to
# the defaults given in the constructor call below.
xgb_experiments = [
    {"name": "XGB_baseline", "n_estimators": 100, "max_depth": 6, "learning_rate": 0.1},
    {"name": "XGB_deep", "n_estimators": 200, "max_depth": 10, "learning_rate": 0.05},
    {
        "name": "XGB_regularized",
        "n_estimators": 150,
        "max_depth": 8,
        "learning_rate": 0.1,
        "reg_alpha": 0.1,
        "reg_lambda": 1.0,
    },
]

# Class-imbalance correction: ratio of negative (benign) to positive (attack)
# training samples. Guard the division so a training split without any
# positive sample fails loudly instead of producing an infinite weight.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("y_train contains no samples with label 1; cannot compute scale_pos_weight.")
scale_pos_weight = n_negative / n_positive
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

xgb_results = []
xgb_models = {}

for exp in tqdm(xgb_experiments, desc="XGBoost experiments"):
    # `use_label_encoder` is intentionally not passed: the option was removed
    # from XGBoost and current releases only report it as an unused parameter.
    model = xgb.XGBClassifier(
        n_estimators=exp.get("n_estimators", 100),
        max_depth=exp.get("max_depth", 6),
        learning_rate=exp.get("learning_rate", 0.1),
        subsample=exp.get("subsample", 0.8),
        colsample_bytree=exp.get("colsample_bytree", 0.8),
        reg_alpha=exp.get("reg_alpha", 0),
        reg_lambda=exp.get("reg_lambda", 1),
        scale_pos_weight=scale_pos_weight,
        eval_metric="logloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    trained_model, results = train_and_evaluate(
        model, exp["name"], X_train, y_train, X_val, y_val, X_test, y_test
    )

    xgb_results.append(results)
    xgb_models[exp["name"]] = trained_model

print_results_table(xgb_results)

### 4.3 LightGBM

In [None]:
# LightGBM configurations to compare; keys missing from an entry fall back to
# the defaults given in the constructor call below.
lgb_experiments = [
    {"name": "LGBM_baseline", "n_estimators": 100, "num_leaves": 31, "learning_rate": 0.1},
    {"name": "LGBM_deep", "n_estimators": 200, "num_leaves": 63, "max_depth": 10, "learning_rate": 0.05},
    {"name": "LGBM_fast", "n_estimators": 300, "num_leaves": 15, "learning_rate": 0.15},
]

lgb_results = []
lgb_models = {}

for exp in tqdm(lgb_experiments, desc="LightGBM experiments"):
    model = lgb.LGBMClassifier(
        n_estimators=exp.get("n_estimators", 100),
        num_leaves=exp.get("num_leaves", 31),
        max_depth=exp.get("max_depth", -1),
        learning_rate=exp.get("learning_rate", 0.1),
        subsample=exp.get("subsample", 0.8),
        # LightGBM only applies row subsampling (bagging) when the bagging
        # frequency is non-zero; with the default of 0 the `subsample` value
        # above would be silently ignored. Re-sample on every iteration.
        subsample_freq=exp.get("subsample_freq", 1),
        colsample_bytree=exp.get("colsample_bytree", 0.8),
        class_weight="balanced",
        n_jobs=-1,
        random_state=42,
        verbose=-1,
    )

    trained_model, results = train_and_evaluate(
        model, exp["name"], X_train, y_train, X_val, y_val, X_test, y_test
    )

    lgb_results.append(results)
    lgb_models[exp["name"]] = trained_model

print_results_table(lgb_results)

### 4.4 Neural Network

In [None]:
# MLP configurations to compare: three network sizes, the largest with a
# smaller initial learning rate.
# NOTE(review): presumably the features were already scaled during data
# preparation; this notebook applies no scaling itself — confirm.
nn_experiments = [
    dict(name="NN_small", hidden_layer_sizes=(64, 32), learning_rate_init=0.001),
    dict(name="NN_medium", hidden_layer_sizes=(128, 64, 32), learning_rate_init=0.001),
    dict(name="NN_large", hidden_layer_sizes=(256, 128, 64), learning_rate_init=0.0005),
]

nn_results, nn_models = [], {}

for exp in tqdm(nn_experiments, desc="Neural Network experiments"):
    run_name = exp["name"]
    network = MLPClassifier(
        hidden_layer_sizes=exp.get("hidden_layer_sizes", (100,)),
        activation="relu",
        learning_rate_init=exp.get("learning_rate_init", 0.001),
        max_iter=100,
        early_stopping=True,
        validation_fraction=0.1,
        random_state=42,
        verbose=False,
    )

    # Store the fitted model under its experiment name and keep its metrics.
    nn_models[run_name], run_results = train_and_evaluate(
        network, run_name, X_train, y_train, X_val, y_val, X_test, y_test
    )
    nn_results.append(run_results)

print_results_table(nn_results)

## 5. Compare all models

In [None]:
# Pool the per-family results and fitted models into single collections.
all_results = [*rf_results, *xgb_results, *lgb_results, *nn_results]

all_models = {}
for family_models in (rf_models, xgb_models, lgb_models, nn_models):
    all_models.update(family_models)

# Rank every experiment by validation F1, best first.
results_df = pd.DataFrame(all_results).sort_values("val_f1", ascending=False)

comparison_rule = "=" * 80
print("\n" + comparison_rule)
print("ALL MODELS COMPARISON (sorted by Val F1)")
print(comparison_rule)

# Name and timing first, then the validation metrics, then the test metrics.
display_cols = ["name", "training_time"] + [
    f"{split}_{metric}" for split in ("val", "test") for metric in ("f1", "roc_auc", "pr_auc")
]
print(results_df[display_cols].to_string(index=False))

In [None]:
metrics_to_plot = ["f1", "roc_auc", "precision", "recall"]
colors = px.colors.qualitative.Set2

# One bar series per test metric, grouped by model on the x axis.
fig = go.Figure(
    data=[
        go.Bar(
            name=metric.upper(),
            x=results_df["name"],
            y=results_df[f"test_{metric}"],
            marker_color=bar_color,
        )
        for metric, bar_color in zip(metrics_to_plot, colors)
    ]
)

fig.update_layout(
    title="Model Comparison (Test Set)",
    xaxis_title="Model",
    yaxis_title="Score",
    barmode="group",
    height=500,
    width=1000,
)

fig.show()

In [None]:
from sklearn.metrics import roc_curve

fig = go.Figure()

# One ROC line per trained model, labelled with its test AUC.
for model_label, fitted_model in all_models.items():
    test_scores = fitted_model.predict_proba(X_test)
    # Keep the positive-class column when one column per class is returned.
    positive_scores = test_scores[:, 1] if len(test_scores.shape) > 1 else test_scores

    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    auc = roc_auc_score(y_test, positive_scores)

    fig.add_trace(
        go.Scatter(x=fpr, y=tpr, mode="lines", name=f"{model_label} (AUC={auc:.3f})")
    )

# Diagonal reference line for a random classifier.
fig.add_trace(
    go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode="lines",
        name="Random",
        line=dict(dash="dash", color="gray"),
    )
)

fig.update_layout(
    title="ROC Curves - All Models",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    height=600,
    width=800,
)

fig.show()

## 6. Build an ensemble

In [None]:
# results_df is already sorted by validation F1, so its first rows are the best models.
top_k = 5
top_models_df = results_df.iloc[:top_k]

top_model_names = top_models_df["name"].tolist()
top_models = {model_name: all_models[model_name] for model_name in top_model_names}

print(f"Top-{top_k} models for the ensemble:")
print(top_models_df[["name", "val_f1", "test_f1"]].to_string(index=False))

In [None]:
class SimpleEnsemble:
    """Soft-voting ensemble: a weighted average of the members' class probabilities.

    Args:
        models: Mapping of name -> fitted model, or a plain sequence of fitted
            models. Every member must expose ``predict_proba(X)`` and all
            members must return arrays of the same shape.
        weights: Optional per-model weights, in the same order as ``models``.
            They are normalized to sum to 1. ``None`` means equal weights.

    Raises:
        ValueError: If ``models`` is empty, if the number of weights differs
            from the number of models, or if the weights do not sum to a
            positive value.
    """

    def __init__(self, models, weights=None):
        self.models = models
        n_models = len(models)
        if n_models == 0:
            raise ValueError("SimpleEnsemble needs at least one model.")

        if weights is None:
            self.weights = [1.0 / n_models] * n_models
        else:
            weights = list(weights)
            # A silent mismatch would drop models (or weights) in the weighted sum.
            if len(weights) != n_models:
                raise ValueError(
                    f"Got {len(weights)} weights for {n_models} models; counts must match."
                )
            total = sum(weights)
            if not total > 0:
                raise ValueError("Ensemble weights must sum to a positive value.")
            self.weights = [w / total for w in weights]

    def _members(self):
        """Return the member models as a list, for both mapping and sequence input."""
        models = self.models
        return list(models.values()) if hasattr(models, "values") else list(models)

    def predict_proba(self, X):
        """Return the weighted average of the members' ``predict_proba(X)`` outputs.

        The sum is accumulated in float64 regardless of the dtype each member
        returns, so the result does not depend on which model comes first.
        """
        weighted_proba = None
        for model, weight in zip(self._members(), self.weights):
            contribution = np.asarray(model.predict_proba(X), dtype=np.float64) * weight
            if weighted_proba is None:
                weighted_proba = contribution
            else:
                weighted_proba = weighted_proba + contribution

        return weighted_proba

    def predict(self, X):
        """Return the index of the highest averaged probability for each row.

        The index equals the class label only when classes are encoded as
        0..k-1, which is the case for the 0/1 labels used in this notebook.
        """
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)


# Each selected model's validation F1 serves as its raw voting weight;
# SimpleEnsemble rescales the weights so they sum to 1.
weights = top_models_df["val_f1"].tolist()

ensemble_equal = SimpleEnsemble(models=top_models)
ensemble_weighted = SimpleEnsemble(models=top_models, weights=weights)

In [None]:
ensemble_rule = "=" * 60
print("\n" + ensemble_rule)
print("ENSEMBLE RESULTS")
print(ensemble_rule)

# (printed label, metric key) pairs, in the order they are reported.
report_rows = (
    ("F1:", "f1"),
    ("ROC-AUC:", "roc_auc"),
    ("PR-AUC:", "pr_auc"),
    ("Precision:", "precision"),
    ("Recall:", "recall"),
)

ensembles = {"Ensemble_Equal": ensemble_equal, "Ensemble_Weighted": ensemble_weighted}
for name, ensemble in ensembles.items():
    y_pred = ensemble.predict(X_test)
    y_proba = ensemble.predict_proba(X_test)

    metrics = calculate_metrics(y_test, y_pred, y_proba)

    print(f"\n{name}:")
    for row_label, metric_key in report_rows:
        # Labels are padded to a common width so the values line up.
        print(f"{row_label:<10} {metrics[metric_key]:.4f}")

In [None]:
# results_df is sorted by validation F1, so row 0 is the best single model.
best_single = results_df.iloc[0]
best_single_model = all_models[best_single["name"]]

y_pred_best = best_single_model.predict(X_test)
y_proba_best = best_single_model.predict_proba(X_test)
metrics_best = calculate_metrics(y_test, y_pred_best, y_proba_best)

# Score the weighted ensemble on the same test split.
y_pred_ens = ensemble_weighted.predict(X_test)
y_proba_ens = ensemble_weighted.predict_proba(X_test)
metrics_ens = calculate_metrics(y_test, y_pred_ens, y_proba_ens)

versus_rule = "=" * 60
print("\n" + versus_rule)
print("BEST SINGLE MODEL vs ENSEMBLE")
print(versus_rule)

print(f"\n{'Metric':<15} {'Best Single':<15} {'Ensemble':<15} {'Diff':<10}")
print("-" * 55)
for metric in ("f1", "roc_auc", "pr_auc", "precision", "recall"):
    best_value = metrics_best[metric]
    ens_value = metrics_ens[metric]
    diff = ens_value - best_value
    # Prefix strictly positive differences with "+"; negatives carry their own sign.
    sign = "+" if diff > 0 else ""
    print(f"{metric:<15} {best_value:<15.4f} {ens_value:<15.4f} {sign}{diff:.4f}")

## 7. Save models

In [None]:
# results_df is sorted by validation F1, so row 0 is the best single model.
best_model_name = results_df.iloc[0]["name"]
best_model = all_models[best_model_name]

best_model_path = f"{OUTPUT_PATH}/best_model_{best_model_name}.joblib"
joblib.dump(best_model, best_model_path)
print(f"Best model saved: {best_model_path}")

# Persist every ensemble member under its own experiment name.
for name, model in top_models.items():
    model_path = f"{OUTPUT_PATH}/{name}.joblib"
    joblib.dump(model, model_path)
    print(f"Model saved: {model_path}")

# Write only the scalar result columns. The y_test_* columns hold whole
# per-sample arrays; to_csv would stringify them, and numpy abbreviates large
# arrays with "...", leaving unreadable and unrecoverable cells in the CSV.
scalar_cols = [col for col in results_df.columns if not col.startswith("y_test_")]
results_df[scalar_cols].to_csv(f"{OUTPUT_PATH}/experiment_results.csv", index=False)
print(f"Results saved: {OUTPUT_PATH}/experiment_results.csv")

ensemble_config = {
    "models": top_model_names,
    # Raw (unnormalized) weights: the validation F1 of each listed model.
    "weights": weights,
    "voting": "soft",
    # NOTE: this is the validation F1 of the best single model (row 0 of
    # results_df); the ensemble itself is only scored on the test split.
    "val_f1": float(results_df.iloc[0]["val_f1"]),
    "test_f1": float(metrics_ens["f1"]),
    "test_roc_auc": float(metrics_ens["roc_auc"]),
}

with open(f"{OUTPUT_PATH}/ensemble_config.json", "w") as f:
    json.dump(ensemble_config, f, indent=2)
print(f"Ensemble config saved: {OUTPUT_PATH}/ensemble_config.json")

## 8. Final report

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

class_names = ["Benign", "Attack"]

# Raw counts, plus a copy in which every row (actual class) sums to 1.
cm = confusion_matrix(y_test, y_pred_ens)
cm_normalized = cm.astype("float") / cm.sum(axis=1, keepdims=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (matrix, annotation format, panel title) for the left and right panels.
panels = [
    (cm, "d", "Confusion Matrix (Counts)"),
    (cm_normalized, ".2%", "Confusion Matrix (Normalized)"),
]
for ax, (matrix, number_format, panel_title) in zip(axes, panels):
    sns.heatmap(
        matrix,
        annot=True,
        fmt=number_format,
        cmap="Blues",
        ax=ax,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.set_title(panel_title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

plt.tight_layout()
plt.savefig(f"{OUTPUT_PATH}/confusion_matrix.png", dpi=150)
plt.show()

In [None]:
report_rule = "=" * 60
print("\n" + report_rule)
print("CLASSIFICATION REPORT (Ensemble)")
print(report_rule)

# Per-class precision / recall / F1 for the weighted ensemble's test predictions.
ensemble_report = classification_report(y_test, y_pred_ens, target_names=["Benign", "Attack"])
print(ensemble_report)

In [None]:
# Feature importances are taken from the baseline XGBoost model, when it was trained.
if "XGB_baseline" in all_models:
    importance_df = pd.DataFrame(
        {
            "feature": feature_cols,
            "importance": all_models["XGB_baseline"].feature_importances_,
        }
    ).sort_values("importance", ascending=False)

    top_20 = importance_df.head(20)
    # Reverse the order so the most important feature ends up at the top of
    # the horizontal bar chart.
    bottom_up = top_20.iloc[::-1]

    fig = go.Figure(
        go.Bar(
            x=bottom_up["importance"],
            y=bottom_up["feature"],
            orientation="h",
            marker_color="steelblue",
        )
    )

    fig.update_layout(
        title="Top-20 Feature Importance (XGBoost)",
        xaxis_title="Importance",
        yaxis_title="Feature",
        height=600,
        width=800,
        margin=dict(l=200),
    )

    fig.show()

    # The CSV keeps the full ranking, not just the plotted top 20.
    importance_df.to_csv(f"{OUTPUT_PATH}/feature_importance.csv", index=False)

## Done

**Artifacts in `models/`**
- `best_model_*.joblib`: best single model (by validation F1)
- `*.joblib`: top-5 models used for the ensemble
- `experiment_results.csv`: results for all experiments
- `ensemble_config.json`: ensemble configuration and summary metrics
- `confusion_matrix.png`: confusion matrix (ensemble)
- `feature_importance.csv`: feature importances of the `XGB_baseline` model (written only if that model was trained)
