In [1]:
#!/usr/bin/env python3
"""
balanced_xgboost_bayes.py

Train an XGBoost model with Bayesian optimization, ensuring class balance:
 - Streams CSVs to collect data
 - Balances dataset by downsampling every class (normal traffic included) to the size of the smallest class
 - Splits into train/val/test
 - Runs batchwise training with Bayesian hyperparameter optimization after each epoch
"""

import os
import numpy as np
import pandas as pd
import random
import math
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
import xgboost as xgb

from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

# ------------------------------
# Config
# ------------------------------
# Root directory that contains the per-level feature folders.
base_path = "./"

# Feature level -> sub-directory (relative to base_path) holding its CSVs.
folders = {
    level: f"{level}_features"
    for level in ("packet", "uniflow", "biflow")
}

# Class key -> bare CSV file name. The key doubles as the class label.
files = dict(
    normal="normal.csv",
    sparta="sparta.csv",
    scan_A="scan_A.csv",
    mqtt_bruteforce="mqtt_bruteforce.csv",
    scan_sU="scan_sU.csv",
)


def build_filenames(prefix):
    """Return a copy of `files` whose file names are prefixed with '<prefix>_'."""
    prefixed = {}
    for class_key, csv_name in files.items():
        prefixed[class_key] = prefix + "_" + csv_name
    return prefixed


# Feature level -> {class key -> file name}. Packet-level files use the bare
# names from `files`; uniflow/biflow files carry the level name as a prefix.
feature_files = {
    "packet": files,
    **{level: build_filenames(level) for level in ("uniflow", "biflow")},
}

# Training params
# Rows per pandas chunk when streaming CSVs (default `chunksize` of
# stream_chunks_for_all_files).
CHUNKSIZE = 2000
# Dataset split fractions. NOTE: the split code in this file only reads
# VAL_FRACTION and TEST_FRACTION; the training share is whatever remains
# after those two, so TRAIN_FRACTION is informational.
TRAIN_FRACTION, VAL_FRACTION, TEST_FRACTION = 0.80, 0.10, 0.10
# Number of passes over the training batches.
EPOCHS = 3
# Boosting rounds added to the model for every training batch.
ROUNDS_PER_BATCH = 2
# Rows per training batch.
BATCH_SIZE = 4096
# Bayesian optimization budget: total objective evaluations per gp_minimize
# run, and how many of them are initial points.
GP_N_CALLS = 10
GP_INIT_POINTS = 3

# Seeds the global NumPy and `random` generators; also reused for the split
# RNG and the Bayesian optimizer.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Hyperparameter search space explored by gp_minimize. The dimension names
# must match the keyword arguments of `objective`.
space = [
    Real(0.01, 0.3, name="eta"),
    Integer(3, 10, name="max_depth"),
    Real(0.5, 1.0, name="subsample"),
    Real(0.5, 1.0, name="colsample_bytree"),
]

# Hyperparameters used for batch training. These are the starting values;
# train_balanced_xgboost() updates this dict in place after each epoch.
current_best_params = {
    "eta": 0.1,
    "max_depth": 6,
    "subsample": 1.0,
    "colsample_bytree": 1.0,
}

# XGBoost settings that are never tuned; merged into every params dict.
XGB_FIXED_PARAMS = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "verbosity": 0,
}

# ------------------------------
# Label encoding
# ------------------------------
# Class names known to the encoder. preprocess_chunk() labels every row with
# the file key it came from, so each file key must appear in this list.
attack_type_names = ["normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU"]
# NOTE: LabelEncoder numbers classes in sorted order of the names (inspect
# label_encoder.classes_), not in the order listed above.
label_encoder = LabelEncoder()
label_encoder.fit(attack_type_names)

# ------------------------------
# Data loading utilities
# ------------------------------
def stream_chunks_for_all_files(feature_files_map, base_path, chunksize=CHUNKSIZE):
    """
    Lazily read every configured CSV in chunks.

    Args:
        feature_files_map: mapping of feature level -> {class key -> file name}.
            Each level must also be a key of the module-level `folders`.
        base_path: directory that contains the per-level folders.
        chunksize: number of rows per yielded DataFrame.

    Yields:
        (level, key, filepath, chunk_df) for each chunk of each existing file,
        in the iteration order of the mappings.

    Files that do not exist are skipped, with a message so that a missing
    class does not go unnoticed.
    """
    for level, file_dict in feature_files_map.items():
        folder_path = os.path.join(base_path, folders[level])
        for key, fname in file_dict.items():
            fpath = os.path.join(folder_path, fname)
            if not os.path.isfile(fpath):
                print(f"[STREAM] Skipping missing file: {fpath}")
                continue
            # The chunked reader holds an open file handle; the context manager
            # closes it even if the consumer abandons this generator early.
            with pd.read_csv(fpath, chunksize=chunksize, low_memory=False) as reader:
                for chunk in reader:
                    yield (level, key, fpath, chunk)

def preprocess_chunk(df, file_key, expected_feature_names=None):
    """
    Turn one raw CSV chunk into model-ready arrays.

    Every row of the chunk gets the same label, taken from `file_key` and
    encoded with the module-level `label_encoder`. Only numeric columns are
    kept as features; columns named "label" or "attack_type" are dropped.

    Args:
        df: chunk DataFrame (not modified).
        file_key: class name of the file the chunk came from; must be known
            to `label_encoder` (otherwise its transform raises ValueError).
        expected_feature_names: if given, the output columns are aligned to
            exactly these names, in this order; columns absent from the chunk
            are filled with 0.0 and extra columns are discarded. If None, the
            chunk's own numeric columns define the feature set.

    Returns (X_numpy, y_numpy, feature_names_list) where X is float32 with
    NaNs replaced by 0.0 and y is int32 with one entry per row.
    """
    # All rows share one class: encode the key once and broadcast it, instead
    # of copying the frame and encoding an identical string per row.
    class_id = label_encoder.transform([str(file_key)])[0]
    y = np.full(len(df), class_id, dtype=np.int32)

    # select_dtypes returns a new frame, so the caller's chunk stays untouched.
    numeric_df = df.select_dtypes(include=[np.number])
    numeric_df = numeric_df.drop(columns=["label", "attack_type"], errors="ignore")

    if expected_feature_names is not None:
        numeric_df = numeric_df.reindex(columns=expected_feature_names, fill_value=0.0)
        feature_names = list(expected_feature_names)
    else:
        feature_names = list(numeric_df.columns)

    numeric_df = numeric_df.fillna(0.0)
    X = numeric_df.values.astype(np.float32)
    return X, y, feature_names

def collect_all_samples(feature_files_map, base_path):
    """
    Load every chunk of every configured file into memory.

    The feature columns of the first chunk read become the master feature
    list; all later chunks are aligned to it by preprocess_chunk().

    Returns (X_all, y_all, feature_names_master).
    Raises RuntimeError when no rows were read at all.
    """
    feature_names_master = None
    feature_blocks = []
    label_blocks = []

    stream = stream_chunks_for_all_files(feature_files_map, base_path)
    for _level, key, _path, chunk in stream:
        # Passing None on the first chunk lets it define the feature set.
        Xb, yb, names = preprocess_chunk(
            chunk, key, expected_feature_names=feature_names_master
        )
        if feature_names_master is None:
            feature_names_master = names
        if Xb.shape[0] > 0:
            feature_blocks.append(Xb)
            label_blocks.append(yb)

    if len(feature_blocks) == 0:
        raise RuntimeError("No data found in dataset.")

    X_all, y_all = np.vstack(feature_blocks), np.concatenate(label_blocks)
    print(f"[COLLECT] Collected {X_all.shape[0]} samples across {len(label_encoder.classes_)} classes.")
    return X_all, y_all, feature_names_master

def balance_classes(X, y):
    """
    Randomly downsample every class to the size of the rarest one.

    Rows are drawn without replacement from the global NumPy RNG, one class
    at a time in ascending label order; the output is grouped by class.

    Returns (X_balanced, y_balanced).
    """
    labels = np.unique(y)

    class_counts = {}
    for label in labels:
        class_counts[label] = np.sum(y == label)
    min_count = min(class_counts.values())
    print(f"[BALANCE] Class counts before: {class_counts}, using N={min_count} per class.")

    # One draw per class, in label order, so the RNG stream is consumed
    # deterministically for a given seed.
    picked_rows = [
        np.random.choice(np.nonzero(y == label)[0], size=min_count, replace=False)
        for label in labels
    ]

    X_balanced = np.vstack([X[rows] for rows in picked_rows])
    y_balanced = np.concatenate([y[rows] for rows in picked_rows])
    print(f"[BALANCE] Balanced dataset has {X_balanced.shape[0]} samples total.")
    return X_balanced, y_balanced

# ------------------------------
# Global variables for Bayesian objective
# ------------------------------
# Data shared with `objective`, which the optimizer calls with hyperparameters
# only. All are None until train_balanced_xgboost() assigns them.
X_train_global = y_train_global = X_val_global = y_val_global = None
feature_names_master_global = None
num_class_global = None

@use_named_args(space)
def objective(eta, max_depth, subsample, colsample_bytree):
    """
    Bayesian optimization objective: negative macro-F1 on the validation set.

    Trains a fresh booster on the full training split (read from the
    module-level *_global variables) for up to 50 rounds, with early stopping
    after 5 rounds without improvement on the validation metric, then scores
    its validation predictions.

    Returns:
        -macro_F1 (lower is better for the minimizer), or 1e6 if training
        raised, so that a failing configuration is simply ranked last.
    """
    params = {
        **XGB_FIXED_PARAMS,
        "eta": float(eta),
        "max_depth": int(max_depth),
        "subsample": float(subsample),
        "colsample_bytree": float(colsample_bytree),
        "num_class": int(num_class_global),
    }
    dtrain = xgb.DMatrix(X_train_global, label=y_train_global, feature_names=feature_names_master_global)
    dval = xgb.DMatrix(X_val_global, label=y_val_global, feature_names=feature_names_master_global)
    try:
        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=50,
            evals=[(dval, "val")],
            early_stopping_rounds=5,
            verbose_eval=False
        )
    except Exception as e:
        # Deliberate best-effort: a bad configuration must not abort the search.
        print(f"[BO-OBJ] xgb.train failed: {e}")
        return 1e6

    # xgb.train returns the model from the last round, not the best one.
    # Score only the trees up to the best iteration so the rounds that
    # triggered early stopping do not distort the objective.
    best_iteration = getattr(booster, "best_iteration", None)
    if best_iteration is None:
        proba = booster.predict(dval)
    else:
        proba = booster.predict(dval, iteration_range=(0, int(best_iteration) + 1))

    preds = np.argmax(proba, axis=1)
    f1 = f1_score(y_val_global, preds, average="macro")
    return -f1

# ------------------------------
# Training pipeline
# ------------------------------
def train_balanced_xgboost(feature_files_map, base_path, epochs=EPOCHS, batch_size=BATCH_SIZE):
    """
    Run the full pipeline: load, balance, split, train in batches, tune, test.

    Steps:
      1. Load all samples and downsample every class to the smallest class.
      2. Shuffle and split into test / validation / train (train is what
         remains after the TEST_FRACTION and VAL_FRACTION shares).
      3. Publish the train/val split through module-level globals so the
         Bayesian objective can read it.
      4. For each epoch, continue boosting the same model on every training
         batch (ROUNDS_PER_BATCH rounds each), then run Bayesian optimization
         and keep the best hyperparameters found so far for later batches.
      5. Report accuracy, macro-F1 and a classification report on the test set.

    Args:
        feature_files_map: mapping of feature level -> {class key -> file name}.
        base_path: directory that contains the per-level folders.
        epochs: number of passes over the training batches.
        batch_size: rows per training batch.

    Returns:
        (booster, feature_names_master); booster is None if no batch was trained.

    Side effects:
        Assigns the *_global variables and updates `current_best_params` in place.
    """
    global X_train_global, y_train_global, X_val_global, y_val_global, feature_names_master_global, num_class_global, current_best_params

    # 1. Collect and balance dataset
    X_all, y_all, feature_names_master = collect_all_samples(feature_files_map, base_path)
    X_all, y_all = balance_classes(X_all, y_all)
    feature_names_master_global = feature_names_master
    num_class_global = len(label_encoder.classes_)

    # 2. Train/val/test split
    n = X_all.shape[0]
    idx = np.arange(n)
    rng = np.random.RandomState(RANDOM_SEED)
    rng.shuffle(idx)

    n_test = int(math.floor(TEST_FRACTION * n))
    n_val = int(math.floor(VAL_FRACTION * n))
    test_idx = idx[:n_test]
    val_idx = idx[n_test:n_test+n_val]
    train_idx = idx[n_test+n_val:]

    X_train, y_train = X_all[train_idx], y_all[train_idx]
    X_val, y_val = X_all[val_idx], y_all[val_idx]
    X_test, y_test = X_all[test_idx], y_all[test_idx]

    print(f"[SPLIT] train={X_train.shape[0]}, val={X_val.shape[0]}, test={X_test.shape[0]}")

    # 3. Expose for BO
    X_train_global, y_train_global = X_train, y_train
    X_val_global, y_val_global = X_val, y_val

    # 4. Batch training + BO
    booster = None
    n_train_rows = X_train.shape[0]
    batch_indices = [np.arange(i, min(i+batch_size, n_train_rows)) for i in range(0, n_train_rows, batch_size)]

    # Best (lowest) objective value seen in any epoch's search so far.
    best_bo_score = math.inf

    for epoch in range(1, epochs+1):
        print(f"\n=== EPOCH {epoch}/{epochs} ===")
        # Batch membership is fixed; only the visiting order changes per epoch.
        rng.shuffle(batch_indices)
        for bi, inds in enumerate(batch_indices, start=1):
            Xb, yb = X_train[inds], y_train[inds]
            dtrain = xgb.DMatrix(Xb, label=yb, feature_names=feature_names_master)
            # xgb_model=booster continues from the existing trees.
            booster = xgb.train(
                params={**XGB_FIXED_PARAMS, **current_best_params, "num_class": num_class_global},
                dtrain=dtrain,
                num_boost_round=ROUNDS_PER_BATCH,
                xgb_model=booster,
                verbose_eval=False
            )
        print("[Bayesian Opt] Running hyperparameter tuning...")
        # The objective does not depend on the epoch, so a fixed seed would
        # replay the identical search every time. Offsetting the seed makes
        # each epoch explore new points (epoch 1 still uses RANDOM_SEED).
        res = gp_minimize(
            objective, space,
            n_calls=GP_N_CALLS,
            n_initial_points=GP_INIT_POINTS,
            random_state=RANDOM_SEED + epoch - 1,
            acq_func="EI"
        )
        # Adopt this epoch's result only if it beats every earlier search, so
        # a weaker search never replaces better hyperparameters.
        if res.fun < best_bo_score:
            best_bo_score = res.fun
            # Cast NumPy scalars to native types before they reach XGBoost.
            best_params = {
                dim.name: int(val) if isinstance(dim, Integer) else float(val)
                for dim, val in zip(space, res.x)
            }
            current_best_params.update(best_params)
        print(f"[EPOCH {epoch}] Updated best params: {current_best_params}")

    # 5. Final evaluation
    if booster is not None:
        dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names_master)
        preds = np.argmax(booster.predict(dtest), axis=1)
        acc = accuracy_score(y_test, preds)
        f1 = f1_score(y_test, preds, average="macro")
        print("\n=== FINAL TEST ===")
        print(f"Accuracy={acc:.4f}  F1={f1:.4f}")
        print(classification_report(y_test, preds, labels=np.arange(num_class_global), target_names=label_encoder.classes_, zero_division=0))
    else:
        print("[FINAL] No model trained.")
    return booster, feature_names_master

# ------------------------------
# Entrypoint
# ------------------------------
if __name__ == "__main__":
    # Run the whole pipeline with the module-level defaults. The trained
    # booster and the feature names stay bound at module level afterwards.
    booster_final, feature_names_used = train_balanced_xgboost(feature_files, base_path, epochs=EPOCHS, batch_size=BATCH_SIZE)
    print("Done.")

[COLLECT] Collected 32877238 samples across 5 classes.
[BALANCE] Class counts before: {np.int32(0): np.int64(10095091), np.int32(1): np.int64(1314075), np.int32(2): np.int64(188443), np.int32(3): np.int64(329764), np.int32(4): np.int64(20949865)}, using N=188443 per class.
[BALANCE] Balanced dataset has 942215 samples total.
[SPLIT] train=753773, val=94221, test=94221

=== EPOCH 1/3 ===
[Bayesian Opt] Running hyperparameter tuning...
[EPOCH 1] Updated best params: {'eta': 0.2130355435357968, 'max_depth': np.int64(8), 'subsample': 0.7240191796025336, 'colsample_bytree': 0.7775368807814533}

=== EPOCH 2/3 ===
[Bayesian Opt] Running hyperparameter tuning...
[EPOCH 2] Updated best params: {'eta': 0.2130355435357968, 'max_depth': np.int64(8), 'subsample': 0.7240191796025336, 'colsample_bytree': 0.7775368807814533}

=== EPOCH 3/3 ===
[Bayesian Opt] Running hyperparameter tuning...
[EPOCH 3] Updated best params: {'eta': 0.2130355435357968, 'max_depth': np.int64(8), 'subsample': 0.724019179602