In [None]:
# mqtt_ids_boost_train_bayes.py
# Batch + epoch training of boosted decision tree (XGBoost) for multi-class detection
# with Bayesian optimization of hyperparameters (per-epoch using validation set).

import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
import xgboost as xgb
import random

from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

# ------------------------------
# Config
# ------------------------------
# Root directory that contains the per-granularity feature folders.
base_path = "./"

# Feature granularity -> sub-folder name (joined onto base_path when streaming).
folders = dict(
    packet="packet_features",
    uniflow="uniflow_features",
    biflow="biflow_features",
)

# Traffic class -> bare CSV file name (no granularity prefix).
# The keys double as the class labels assigned to every row of that file.
files = dict(
    normal="normal.csv",
    sparta="sparta.csv",
    scan_A="scan_A.csv",
    mqtt_bruteforce="mqtt_bruteforce.csv",
    scan_sU="scan_sU.csv",
)


def build_filenames(prefix, base_files=None):
    """Return a class -> file-name mapping with ``prefix_`` prepended to each name.

    Args:
        prefix: Text placed, followed by an underscore, in front of every
            file name (e.g. ``"uniflow"`` turns ``normal.csv`` into
            ``uniflow_normal.csv``).
        base_files: Mapping of traffic class to bare file name. When omitted,
            the module-level ``files`` table is used, which is the original
            behaviour of this helper.

    Returns:
        A new dict with the same keys (in the same order) as ``base_files``
        and ``f"{prefix}_{name}"`` values. The input mapping is not modified.
    """
    if base_files is None:
        base_files = files
    return {key: f"{prefix}_{name}" for key, name in base_files.items()}


# Granularity -> {traffic class -> CSV file name}.
# Packet-level CSVs use the bare names from `files` (same dict object);
# the flow-level tables are built with a "<level>_" prefix.
feature_files = {
    "packet": files,
    **{level: build_filenames(level) for level in ("uniflow", "biflow")},
}

# Training params
# Rows pulled from a CSV per pandas chunk; each chunk is used as one batch.
CHUNKSIZE = 200_000

# Per-row routing probabilities for the train / validation / test split.
# NOTE(review): TEST_FRACTION is not read anywhere in this file; the test
# share is whatever remains after the train and validation fractions.
TRAIN_FRACTION = 0.80
VAL_FRACTION = 0.10
TEST_FRACTION = 0.10

EPOCHS = 3
# Boosting rounds added to the model for every training batch.
ROUNDS_PER_BATCH = 1

# Upper bounds on the in-memory validation / test pools (rows).
SAMPLE_VAL_MAX = 20_000
SAMPLE_TEST_MAX = 20_000

# Seed both the stdlib and the legacy NumPy global generators.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# ------------------------------
# Hyperparameter search space for Bayesian optimization
# ------------------------------
# Search dimensions handed to gp_minimize. The order matters: the optimizer's
# result vector is zipped back against this list to recover parameter names.
space = [
    Real(low=0.01, high=0.3, name="eta"),
    Integer(low=3, high=10, name="max_depth"),
    Real(low=0.5, high=1.0, name="subsample"),
    Real(low=0.5, high=1.0, name="colsample_bytree"),
]

# Initial defaults
# Tunable hyperparameters; overwritten in place after every tuning pass.
current_best_params = dict(
    eta=0.1,
    max_depth=6,
    subsample=1.0,
    colsample_bytree=1.0,
)

# Settings shared by every xgb.train call in this file (never tuned).
XGB_FIXED_PARAMS = dict(
    objective="multi:softprob",
    eval_metric="mlogloss",
    verbosity=0,
)

# ------------------------------
# Label encoding
# ------------------------------
# Class names known to the model; they match the keys of `files`.
# NOTE: LabelEncoder stores classes in sorted order, so the integer code of a
# class is its position in `label_encoder.classes_`, not its position here.
attack_type_names = [
    "normal",
    "sparta",
    "scan_A",
    "mqtt_bruteforce",
    "scan_sU",
]
label_encoder = LabelEncoder().fit(attack_type_names)

# ------------------------------
# Stream CSVs
# ------------------------------
def stream_chunks_for_all_files(feature_files_map, base_path, chunksize=CHUNKSIZE):
    """Lazily yield every configured CSV, one pandas chunk at a time.

    Files are visited in the iteration order of ``feature_files_map`` (outer)
    and of each per-level file table (inner), so the sequence of chunks is the
    same on every call as long as the files on disk do not change.

    Args:
        feature_files_map: Mapping of granularity level to a
            ``{class key: file name}`` dict.
        base_path: Directory that contains the folders listed in ``folders``.
        chunksize: Maximum number of rows per yielded DataFrame.

    Yields:
        ``(level, key, fpath, chunk)`` where ``chunk`` is a DataFrame holding
        up to ``chunksize`` consecutive rows of the file at ``fpath``.

    Raises:
        KeyError: If a level in ``feature_files_map`` has no entry in ``folders``.
    """
    for level, file_dict in feature_files_map.items():
        folder_path = os.path.join(base_path, folders[level])
        for key, fname in file_dict.items():
            fpath = os.path.join(folder_path, fname)
            if not os.path.isfile(fpath):
                # Stay best-effort, but report the gap: a missing file means
                # its rows (and possibly its whole class) never reach the model.
                print(f"[WARN] Skipping missing file: {fpath}")
                continue
            for chunk in pd.read_csv(fpath, chunksize=chunksize, low_memory=False):
                yield (level, key, fpath, chunk)


# ------------------------------
# Preprocess
# ------------------------------
def preprocess_chunk(df, file_key, expected_feature_names=None):
    """Turn one raw CSV chunk into a float32 feature matrix and integer labels.

    Every row of the chunk receives the class of the file it came from
    (``file_key``). Only numeric columns are kept as features; numeric columns
    named ``label`` or ``attack_type`` are dropped so a target column cannot
    leak into the features. The input DataFrame is not modified.

    Args:
        df: Chunk as produced by ``pd.read_csv``.
        file_key: Class name of the source file; must be known to
            ``label_encoder``.
        expected_feature_names: Optional column list that fixes the feature
            layout. Columns missing from the chunk are created and filled with
            0.0; numeric columns not in the list are discarded.

    Returns:
        ``(X, y, feature_names)`` where ``X`` is a float32 array of shape
        ``(len(df), len(feature_names))``, ``y`` is an int32 array of length
        ``len(df)``, and ``feature_names`` lists the columns of ``X``.

    Raises:
        ValueError: If ``file_key`` is not one of the encoder's classes
            (raised by ``label_encoder.transform``).
    """
    # The whole chunk shares one class, so encode it once instead of adding a
    # string column to a full copy of the frame and transforming every row.
    label_code = label_encoder.transform([str(file_key)])[0]
    y_encoded = np.full(len(df), label_code, dtype=np.int32)

    numeric_df = df.select_dtypes(include=[np.number])
    numeric_df = numeric_df.drop(columns=["label", "attack_type"], errors="ignore")

    if expected_feature_names is not None:
        # Reindex to expected features, filling missing with zeros
        numeric_df = numeric_df.reindex(columns=expected_feature_names, fill_value=0.0)
        feature_names = list(expected_feature_names)
    else:
        feature_names = list(numeric_df.columns)

    features = numeric_df.fillna(0.0).values.astype(np.float32)
    # fillna leaves +/-inf untouched, and the float32 cast can overflow to inf.
    # Treat those like missing values so DMatrix construction does not fail.
    features[~np.isfinite(features)] = 0.0
    return features, y_encoded, feature_names


# ------------------------------
# Bayesian objective function (per-epoch)
# ------------------------------
# Module-level state shared between the training loop and `objective`.
# All four start unset and are filled in by train_boosted_tree_batchwise.
val_X_global = None  # validation feature matrix used for tuning
val_y_global = None  # validation labels aligned with val_X_global
feature_names_master = None  # feature column order fixed by the first chunk
booster_snapshot = None  # latest trained booster; tuning continues from it


@use_named_args(space)
def objective(**params):
    """Score one hyperparameter candidate for ``gp_minimize``.

    The candidate continues boosting from ``booster_snapshot`` (or from
    scratch when it is ``None``) for three rounds on one half of the published
    validation set, and is scored by macro F1 on the other half. Fitting and
    scoring on the same rows would reward memorising the validation data, so
    the two halves are kept disjoint. The split is derived from
    ``RANDOM_SEED`` only, so every candidate in a tuning pass is judged on the
    same rows and the global NumPy random state is left untouched.

    Args:
        **params: One value per dimension of ``space`` (``eta``, ``max_depth``,
            ``subsample``, ``colsample_bytree``), supplied by ``use_named_args``.

    Returns:
        Negative macro-averaged F1 on the scoring half (skopt minimises).

    Raises:
        RuntimeError: If no validation set has been published in
            ``val_X_global`` / ``val_y_global`` yet.
    """
    if val_X_global is None or val_y_global is None:
        raise RuntimeError(
            "objective() requires val_X_global / val_y_global to be set first"
        )

    # skopt hands over NumPy scalars; convert them to plain Python numbers
    # before they are passed on as XGBoost parameters.
    candidate = {
        name: (value.item() if isinstance(value, np.generic) else value)
        for name, value in params.items()
    }

    n_rows = val_X_global.shape[0]
    order = np.random.RandomState(RANDOM_SEED).permutation(n_rows)
    n_fit = n_rows // 2
    if n_fit == 0:
        # A single validation row cannot be split; use it for both roles.
        fit_rows = score_rows = order
    else:
        fit_rows, score_rows = order[:n_fit], order[n_fit:]

    dfit = xgb.DMatrix(
        val_X_global[fit_rows],
        label=val_y_global[fit_rows],
        feature_names=feature_names_master,
    )
    dscore = xgb.DMatrix(val_X_global[score_rows], feature_names=feature_names_master)

    candidate_booster = xgb.train(
        params={
            **XGB_FIXED_PARAMS,
            **candidate,
            "num_class": len(label_encoder.classes_),
        },
        dtrain=dfit,
        num_boost_round=3,
        xgb_model=booster_snapshot,
        verbose_eval=False,
    )
    preds = np.argmax(candidate_booster.predict(dscore), axis=1)
    f1 = f1_score(val_y_global[score_rows], preds, average="macro")
    return -f1  # skopt minimizes


# ------------------------------
# Main training loop
# ------------------------------
def _split_masks(n_rows, chunk_index):
    """Return reproducible (train, val, test) masks plus sampling keys for a chunk."""
    # Seeding by chunk position gives the same chunk the same split in every
    # epoch, so a row held out once is never trained on later.
    rng = np.random.default_rng([RANDOM_SEED, chunk_index])
    draw = rng.random(n_rows)
    train_mask = draw < TRAIN_FRACTION
    test_mask = draw >= TRAIN_FRACTION + VAL_FRACTION
    val_mask = ~(train_mask | test_mask)
    keys = rng.random(n_rows)
    return train_mask, val_mask, test_mask, keys


def _reservoir_add(reservoir, X_new, y_new, keys_new, capacity):
    """Merge new rows into a bounded pool, keeping the ``capacity`` smallest keys."""
    if X_new.shape[0] == 0:
        return reservoir
    if reservoir is None:
        X, y, keys = X_new, y_new, keys_new
    else:
        X = np.vstack([reservoir[0], X_new])
        y = np.concatenate([reservoir[1], y_new])
        keys = np.concatenate([reservoir[2], keys_new])
    if X.shape[0] > capacity:
        # Keys are i.i.d. uniform, so the rows with the smallest keys form a
        # uniform sample of everything seen so far (no bias towards late chunks).
        keep = np.argpartition(keys, capacity - 1)[:capacity]
        X, y, keys = X[keep], y[keep], keys[keep]
    return X, y, keys


def train_boosted_tree_batchwise(feature_files_map, base_path, epochs=EPOCHS):
    """Train an XGBoost multi-class model incrementally over streamed CSV chunks.

    For every epoch the CSVs are streamed chunk by chunk. Each chunk is split
    row-wise into train / validation / test parts; the train part adds
    ``ROUNDS_PER_BATCH`` boosting rounds to the running model. After each
    epoch the tunable hyperparameters are re-optimised with ``gp_minimize``
    and used for the following epoch.

    The split of a chunk depends only on ``RANDOM_SEED`` and the chunk's
    position in the stream, so it is identical in every epoch. Validation and
    test pools are collected during the first epoch only and are capped at
    ``SAMPLE_VAL_MAX`` / ``SAMPLE_TEST_MAX`` rows by uniform reservoir sampling.

    Caveats visible in this file:
        * The feature layout is fixed by the first chunk processed; later
          chunks are reindexed to it, so columns that only exist in other
          files are discarded.
        * Each chunk comes from a single file and therefore carries a single
          class, so every training batch is single-class.

    Side effects:
        Updates the module globals ``feature_names_master``,
        ``booster_snapshot``, ``val_X_global``, ``val_y_global`` and
        ``current_best_params``; prints progress and the final test report.

    Args:
        feature_files_map: Mapping of granularity level to
            ``{class key: file name}``.
        base_path: Directory containing the feature folders.
        epochs: Number of passes over the data.

    Returns:
        ``(booster, feature_names_master)``; ``booster`` is ``None`` when no
        training rows were seen.
    """
    global val_X_global, val_y_global, feature_names_master, booster_snapshot, current_best_params

    booster = None
    val_pool, test_pool = None, None
    num_class = len(label_encoder.classes_)

    for epoch in range(1, epochs + 1):
        print(f"\n=== EPOCH {epoch}/{epochs} ===")
        # Held-out rows are identical in every epoch, so gather them once.
        collect_holdout = epoch == 1
        gen = stream_chunks_for_all_files(feature_files_map, base_path)
        for chunk_index, (_level, file_key, _filepath, chunk) in enumerate(gen):
            if feature_names_master is None:
                Xb, yb, feature_names_master = preprocess_chunk(chunk, file_key)
            else:
                Xb, yb, _ = preprocess_chunk(
                    chunk, file_key, expected_feature_names=feature_names_master
                )

            train_mask, val_mask, test_mask, keys = _split_masks(
                Xb.shape[0], chunk_index
            )

            if collect_holdout:
                val_pool = _reservoir_add(
                    val_pool, Xb[val_mask], yb[val_mask], keys[val_mask], SAMPLE_VAL_MAX
                )
                test_pool = _reservoir_add(
                    test_pool, Xb[test_mask], yb[test_mask], keys[test_mask], SAMPLE_TEST_MAX
                )

            X_train, y_train = Xb[train_mask], yb[train_mask]
            if X_train.shape[0] == 0:
                continue

            dtrain = xgb.DMatrix(
                X_train, label=y_train, feature_names=feature_names_master
            )
            booster = xgb.train(
                params={
                    **XGB_FIXED_PARAMS,
                    **current_best_params,
                    "num_class": num_class,
                },
                dtrain=dtrain,
                num_boost_round=ROUNDS_PER_BATCH,
                xgb_model=booster,
                verbose_eval=False,
            )
            booster_snapshot = booster

        # --- Hyperparameter tuning after epoch ---
        if val_pool is not None:
            val_X_global, val_y_global = val_pool[0], val_pool[1]
            print("[Bayesian Opt] Running hyperparameter tuning on validation set...")
            res = gp_minimize(
                objective,
                space,
                n_calls=10,
                random_state=RANDOM_SEED,
                acq_func="EI",
            )
            # Store plain Python numbers rather than NumPy scalars.
            best_params = {
                dim.name: (val.item() if isinstance(val, np.generic) else val)
                for dim, val in zip(space, res.x)
            }
            current_best_params.update(best_params)
            print(f"[EPOCH {epoch}] Best hyperparams: {current_best_params}")

    if test_pool is not None and booster is not None:
        test_X, test_y = test_pool[0], test_pool[1]
        dtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names_master)
        preds = np.argmax(booster.predict(dtest), axis=1)
        acc = accuracy_score(test_y, preds)
        f1 = f1_score(test_y, preds, average="macro")
        print("\n=== FINAL TEST ===")
        print(f"Accuracy={acc:.4f} F1={f1:.4f}")
        # Pass the full label list so the report still works when a class is
        # absent from the test pool (target_names alone would then mismatch).
        print(
            classification_report(
                test_y,
                preds,
                labels=np.arange(num_class),
                target_names=label_encoder.classes_,
                zero_division=0,
            )
        )
    return booster, feature_names_master


# ------------------------------
# Run
# ------------------------------
if __name__ == "__main__":
    # Script entry point: run the full training pipeline with the module defaults.
    train_boosted_tree_batchwise(
        feature_files_map=feature_files,
        base_path=base_path,
        epochs=EPOCHS,
    )