In [6]:
# mqtt_ids_boost_train_bayes_fixed.py
# Batch + epoch training of boosted decision tree (XGBoost) for multi-class detection
# with Bayesian optimization of hyperparameters (per-epoch using validation set).
# Fixes:
#  - global train/val/test split (no per-chunk splitting)
#  - Bayesian objective trains on training set only and evaluates on validation set
#  - optional MAX_SAMPLES cap to limit memory usage

import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
import xgboost as xgb
import random
import math
import time

from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

# ------------------------------
# Config (edit as needed)
# ------------------------------
base_path = "./"
folders = {
    "packet": "packet_features",
    "uniflow": "uniflow_features",
    "biflow": "biflow_features",
}
files = {
    "normal": "normal.csv",
    "sparta": "sparta.csv",
    "scan_A": "scan_A.csv",
    "mqtt_bruteforce": "mqtt_bruteforce.csv",
    "scan_sU": "scan_sU.csv",
}


def build_filenames(prefix):
    return {k: f"{prefix}_{v}" for k, v in files.items()}


feature_files = {
    "packet": files,
    "uniflow": build_filenames("uniflow"),
    "biflow": build_filenames("biflow"),
}

# training params
CHUNKSIZE = 2000
TRAIN_FRACTION, VAL_FRACTION, TEST_FRACTION = 0.80, 0.10, 0.10
EPOCHS = 3
ROUNDS_PER_BATCH = 2     # how many boosting rounds per batch update
SAMPLE_VAL_MAX, SAMPLE_TEST_MAX = 20000, 20000

# Limit total samples processed (set to None for unlimited)
MAX_SAMPLES = 50000   # set to None to process entire dataset

BATCH_SIZE = 4096     # number of rows used per incremental update to XGBoost (not ML minibatch for NN)
GP_N_CALLS = 10       # gp_minimize calls per epoch
GP_INIT_POINTS = 3

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# ------------------------------
# Hyperparameter search space for Bayesian optimization
# ------------------------------
space = [
    Real(0.01, 0.3, name="eta"),
    Integer(3, 10, name="max_depth"),
    Real(0.5, 1.0, name="subsample"),
    Real(0.5, 1.0, name="colsample_bytree"),
]

# initial defaults
current_best_params = {
    "eta": 0.1,
    "max_depth": 6,
    "subsample": 1.0,
    "colsample_bytree": 1.0,
}

XGB_FIXED_PARAMS = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "verbosity": 0,
}

# ------------------------------
# Label encoding
# ------------------------------
attack_type_names = ["normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU"]
label_encoder = LabelEncoder()
label_encoder.fit(attack_type_names)

# ------------------------------
# Stream CSVs utility
# ------------------------------
def stream_chunks_for_all_files(feature_files_map, base_path, chunksize=CHUNKSIZE):
    """
    Generator yielding (level, key, filepath, chunk_df)
    """
    for level, file_dict in feature_files_map.items():
        folder_path = os.path.join(base_path, folders[level])
        for key, fname in file_dict.items():
            fpath = os.path.join(folder_path, fname)
            if not os.path.isfile(fpath):
                continue
            for chunk in pd.read_csv(fpath, chunksize=chunksize, low_memory=False):
                yield (level, key, fpath, chunk)

# ------------------------------
# Preprocess chunk
# ------------------------------
def preprocess_chunk(df, file_key, expected_feature_names=None):
    """
    Returns (X_numpy, y_numpy, feature_names_list)
    Ensures numeric-only features, consistent column order via expected_feature_names (if provided).
    """
    df = df.copy()
    df["attack_type"] = file_key
    # target (text -> encoded int)
    y_encoded = label_encoder.transform(df["attack_type"].astype(str).values)

    # numeric features only, drop potential numeric label columns that leak
    numeric_df = df.select_dtypes(include=[np.number]).copy()
    for col in ["label", "attack_type"]:
        if col in numeric_df.columns:
            numeric_df = numeric_df.drop(columns=[col])

    if expected_feature_names is not None:
        # reindex to the master feature set, filling missing with zeros
        numeric_df = numeric_df.reindex(columns=expected_feature_names, fill_value=0.0)
        feature_names = list(expected_feature_names)
    else:
        feature_names = list(numeric_df.columns)

    # fill NaN
    numeric_df = numeric_df.fillna(0.0)
    X = numeric_df.values.astype(np.float32)
    y = y_encoded.astype(np.int32)
    return X, y, feature_names

# ------------------------------
# Collect up to MAX_SAMPLES globally (streaming)
# ------------------------------
def collect_samples(feature_files_map, base_path, max_samples=None):
    """
    Stream through CSVs and collect up to max_samples rows into X_all and y_all.
    Returns X_all (numpy), y_all (numpy), feature_names_master (list).
    """
    X_list = []
    y_list = []
    feature_names_master = None
    samples_collected = 0

    for level, key, path, chunk in stream_chunks_for_all_files(feature_files_map, base_path):
        # preprocess chunk to numeric features
        if feature_names_master is None:
            Xb, yb, feature_names_master = preprocess_chunk(chunk, key, expected_feature_names=None)
        else:
            Xb, yb, _ = preprocess_chunk(chunk, key, expected_feature_names=feature_names_master)

        if Xb.shape[0] == 0:
            continue

        # figure out how many to take from this chunk
        if max_samples is not None:
            remaining = max_samples - samples_collected
            if remaining <= 0:
                break
            if Xb.shape[0] > remaining:
                idx = np.random.choice(Xb.shape[0], remaining, replace=False)
                Xb = Xb[idx]
                yb = yb[idx]

        X_list.append(Xb)
        y_list.append(yb)
        samples_collected += Xb.shape[0]

        if max_samples is not None and samples_collected >= max_samples:
            break

    if samples_collected == 0:
        raise RuntimeError("No samples found while collecting data. Check paths and CSVs.")

    X_all = np.vstack(X_list)
    y_all = np.concatenate(y_list)
    print(f"[COLLECT] Collected {X_all.shape[0]} samples with {len(feature_names_master)} features.")
    return X_all, y_all, feature_names_master

# ------------------------------
# Bayesian objective (train on train set and score on val set)
# ------------------------------
X_train_global = None
y_train_global = None
X_val_global = None
y_val_global = None
feature_names_master_global = None
num_class_global = None

@use_named_args(space)
def objective(eta, max_depth, subsample, colsample_bytree):
    """
    Objective function for gp_minimize. Trains on training set, evaluates on validation set.
    """
    global X_train_global, y_train_global, X_val_global, y_val_global, feature_names_master_global, num_class_global

    if X_train_global is None or X_val_global is None:
        return 1e6  # fail safe

    params = {
        **XGB_FIXED_PARAMS,
        "eta": float(eta),
        "max_depth": int(max_depth),
        "subsample": float(subsample),
        "colsample_bytree": float(colsample_bytree),
        "num_class": int(num_class_global)
    }

    dtrain = xgb.DMatrix(X_train_global, label=y_train_global, feature_names=feature_names_master_global)
    dval = xgb.DMatrix(X_val_global, label=y_val_global, feature_names=feature_names_master_global)

    try:
        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=50,
            evals=[(dval, "val")],
            early_stopping_rounds=5,
            verbose_eval=False
        )
    except Exception as e:
        print(f"[BO-OBJ] xgb.train failed: {e}")
        return 1e6

    preds_prob = booster.predict(dval)
    preds = np.argmax(preds_prob, axis=1)
    f1 = f1_score(y_val_global, preds, average="macro")
    return -float(f1)

# ------------------------------
# Main training (use global split and batch updates)
# ------------------------------
def train_boosted_tree_batchwise(feature_files_map, base_path, epochs=EPOCHS, max_samples=MAX_SAMPLES, batch_size=BATCH_SIZE):
    global X_train_global, y_train_global, X_val_global, y_val_global, feature_names_master_global, num_class_global, current_best_params

    # 1) Collect data (streaming, capped)
    X_all, y_all, feature_names_master = collect_samples(feature_files_map, base_path, max_samples=max_samples)
    feature_names_master_global = feature_names_master
    num_class_global = len(label_encoder.classes_)

    # 2) Global train/val/test split
    n = X_all.shape[0]
    idx = np.arange(n)
    rng = np.random.RandomState(RANDOM_SEED)
    rng.shuffle(idx)

    n_test = int(math.floor(TEST_FRACTION * n))
    n_val = int(math.floor(VAL_FRACTION * n))
    n_test = max(1, n_test) if n > 2 else 0
    n_val = max(1, n_val) if n > 2 else 0
    n_train = n - n_val - n_test
    if n_train <= 0:
        raise RuntimeError("Not enough samples for train after split; reduce VAL/TEST fractions or increase MAX_SAMPLES.")

    test_idx = idx[:n_test]
    val_idx = idx[n_test:n_test + n_val]
    train_idx = idx[n_test + n_val:]

    X_train = X_all[train_idx]
    y_train = y_all[train_idx]
    X_val = X_all[val_idx]
    y_val = y_all[val_idx]
    X_test = X_all[test_idx]
    y_test = y_all[test_idx]

    print(f"[SPLIT] total={n}, train={X_train.shape[0]}, val={X_val.shape[0]}, test={X_test.shape[0]}")

    # expose to BO objective
    X_train_global = X_train
    y_train_global = y_train
    X_val_global = X_val
    y_val_global = y_val

    booster = None
    n_train_rows = X_train.shape[0]
    batch_indices = [np.arange(i, min(i + batch_size, n_train_rows)) for i in range(0, n_train_rows, batch_size)]

    for epoch in range(1, epochs + 1):
        print(f"\n=== EPOCH {epoch}/{epochs} ===")
        rng.shuffle(batch_indices)
        for bi, inds in enumerate(batch_indices, start=1):
            Xb = X_train[inds]
            yb = y_train[inds]
            if Xb.shape[0] == 0:
                continue
            dtrain = xgb.DMatrix(Xb, label=yb, feature_names=feature_names_master)
            booster = xgb.train(
                params={**XGB_FIXED_PARAMS, **current_best_params, "num_class": num_class_global},
                dtrain=dtrain,
                num_boost_round=ROUNDS_PER_BATCH,
                xgb_model=booster,
                verbose_eval=False
            )
            if bi % 10 == 0:
                print(f"  [epoch {epoch}] processed {bi}/{len(batch_indices)} batches")

        # Bayesian optimization after epoch
        print("[Bayesian Opt] Running hyperparameter tuning on validation set...")
        t0 = time.time()
        res = gp_minimize(
            objective,
            space,
            n_calls=GP_N_CALLS,
            n_initial_points=GP_INIT_POINTS,
            random_state=RANDOM_SEED,
            acq_func="EI",
            verbose=False
        )
        t_elapsed = time.time() - t0
        print(f"[Bayes] Done in {t_elapsed:.1f}s. Best objective value={res.fun:.6f}")

        best_params = {dim.name: val for dim, val in zip(space, res.x)}
        current_best_params.update(best_params)
        print(f"[EPOCH {epoch}] Updated best params: {current_best_params}")

    # Final test
    if booster is not None:
        dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names_master)
        preds_prob = booster.predict(dtest)
        preds = np.argmax(preds_prob, axis=1)
        acc = accuracy_score(y_test, preds)
        f1 = f1_score(y_test, preds, average="macro")
        print("\n=== FINAL TEST ===")
        print(f"Accuracy={acc:.4f}  F1={f1:.4f}")
        print(classification_report(y_test, preds, labels=np.arange(num_class_global), target_names=label_encoder.classes_, zero_division=0))
    else:
        print("[FINAL] No booster was trained.")

    return booster, feature_names_master

# ------------------------------
# Run entrypoint
# ------------------------------
if __name__ == "__main__":
    booster_final, feature_names_used = train_boosted_tree_batchwise(feature_files, base_path, epochs=EPOCHS, max_samples=MAX_SAMPLES, batch_size=BATCH_SIZE)
    print("Done.")


[COLLECT] Collected 50000 samples with 27 features.
[SPLIT] total=50000, train=40000, val=5000, test=5000

=== EPOCH 1/3 ===
  [epoch 1] processed 10/10 batches
[Bayesian Opt] Running hyperparameter tuning on validation set...
[Bayes] Done in 2.7s. Best objective value=-1.000000
[EPOCH 1] Updated best params: {'eta': 0.24099746618946757, 'max_depth': np.int64(4), 'subsample': 0.8898455001363847, 'colsample_bytree': 0.7984250789732436}

=== EPOCH 2/3 ===
  [epoch 2] processed 10/10 batches
[Bayesian Opt] Running hyperparameter tuning on validation set...
[Bayes] Done in 2.6s. Best objective value=-1.000000
[EPOCH 2] Updated best params: {'eta': 0.24099746618946757, 'max_depth': np.int64(4), 'subsample': 0.8898455001363847, 'colsample_bytree': 0.7984250789732436}

=== EPOCH 3/3 ===
  [epoch 3] processed 10/10 batches
[Bayesian Opt] Running hyperparameter tuning on validation set...
[Bayes] Done in 2.8s. Best objective value=-1.000000
[EPOCH 3] Updated best params: {'eta': 0.2409974661894