In [4]:
# mqtt_ids_svm_bayesopt.py
# Batch + epoch training of linear soft-margin SVM (SGD hinge loss approximation)
# Hyperparameters are tuned after each epoch using Bayesian optimization (scikit-optimize).
#
# Binary classification: secure (False) vs under attack (True).
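# Datasets are read from CSV in chunks; note that all chunks are still accumulated in memory
# before balanced subsampling, so the numeric part of the raw data must fit in RAM.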

import os
import glob
import time
import tempfile
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from skopt import gp_minimize
from skopt.space import Real
from skopt.utils import use_named_args
import random

# ------------------------------
# Config
# ------------------------------
# Directory that the per-level feature folders are resolved against.
base_path = "./"

# Feature level -> sub-folder (joined onto base_path) that holds its CSV files.
folders = dict(
    packet="packet_features",
    uniflow="uniflow_features",
    biflow="biflow_features",
)

# Traffic class -> un-prefixed CSV file name ("<class>.csv").
files = {
    traffic: f"{traffic}.csv"
    for traffic in ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
}

def build_filenames(prefix):
    """Return the traffic-class -> file-name map for CSVs named ``<prefix>_<class>.csv``."""
    traffic_classes = ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
    return {name: f"{prefix}_{name}.csv" for name in traffic_classes}

# Feature level -> {traffic class -> CSV file name}.
# The packet level uses the un-prefixed names; the flow levels use names
# prefixed with the level itself.
feature_files = {
    "packet": files,
    **{level: build_filenames(level) for level in ("uniflow", "biflow")},
}

# Default number of rows per chunk when reading a CSV.
CHUNKSIZE = 200_000

# Split fractions. The visible training code uses TRAIN_FRACTION and
# VAL_FRACTION explicitly and gives the remaining rows to the test split.
TRAIN_FRACTION, VAL_FRACTION, TEST_FRACTION = 0.80, 0.10, 0.10

# Default number of training epochs.
EPOCHS = 3

# Sampling caps for validation / test (not referenced by the code visible in this file).
SAMPLE_VAL_MAX = SAMPLE_TEST_MAX = 20_000

# Maximum rows to use overall (balanced between classes).
MAX_TRAIN_SAMPLES = 2_000

# Seed both global RNGs so the subsampling and shuffling are repeatable.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# ------------------------------
# Cleanup helpers
# ------------------------------
def safe_remove(path):
    """Delete *path* if it exists, reporting the outcome on stdout.

    Never raises: any exception from the existence check or the removal is
    caught and printed as a ``[CLEANUP]`` message. A path that does not
    exist is silently ignored.
    """
    try:
        if not os.path.exists(path):
            return
        os.remove(path)
        print(f"[CLEANUP] Removed file: {path}")
    except Exception as err:
        print(f"[CLEANUP] Could not remove {path}: {err}")

# ------------------------------
# Data streaming
# ------------------------------
def stream_chunks(feature_files_map, base_path, chunksize=CHUNKSIZE):
    """Lazily read every configured CSV and yield it chunk by chunk.

    Args:
        feature_files_map: Mapping of feature level -> {file key -> file name}.
        base_path: Directory the per-level folders (looked up in ``folders``)
            are joined onto.
        chunksize: Rows per chunk, passed to ``pandas.read_csv``.

    Yields:
        ``(level, file_key, filepath, chunk_df)`` tuples.

    A file that does not exist is reported with a ``[WARN]`` line and skipped.
    An exception raised while reading a file is reported with an ``[ERROR]``
    line and iteration continues with the next file.
    """
    for level, name_by_key in feature_files_map.items():
        level_dir = os.path.join(base_path, folders[level])
        for file_key, filename in name_by_key.items():
            csv_path = os.path.join(level_dir, filename)
            if os.path.isfile(csv_path):
                try:
                    reader = pd.read_csv(csv_path, chunksize=chunksize, low_memory=False)
                    for chunk_df in reader:
                        yield (level, file_key, csv_path, chunk_df)
                except Exception as err:
                    print(f"[ERROR] Failed to read {csv_path}: {err}")
            else:
                print(f"[WARN] Missing: {csv_path}")

# ------------------------------
# Preprocess chunk
# ------------------------------
def preprocess_chunk(df, file_key, expected_features=None, label_columns=("attack", "label")):
    """Turn one raw CSV chunk into a float32 feature matrix and a label vector.

    Every row of the chunk receives the same label, derived from the file it
    came from: 0 when ``file_key == "normal"``, 1 otherwise.

    Args:
        df: Raw chunk as a pandas DataFrame. It is not modified.
        file_key: Traffic-class key of the source file.
        expected_features: Optional list of column names. When given, the
            numeric columns are reindexed to exactly this list (extra columns
            are dropped, missing ones are created and filled with 0.0).
        label_columns: Names of numeric columns that must not be used as
            features; any that are present are dropped.

    Returns:
        ``(X, y, feature_names)`` where ``X`` is a 2-D float32 array with only
        finite values, ``y`` is a 1-D integer array of 0/1 labels with one
        entry per row, and ``feature_names`` is the column order of ``X``
        (``expected_features`` itself when that was supplied).
    """
    # One constant label for the whole chunk; building it directly avoids
    # copying the entire DataFrame just to attach a label column.
    y = np.full(len(df), int(file_key != "normal"), dtype=int)

    numeric_df = df.select_dtypes(include=[np.number])
    # NOTE(review): if the CSVs carry another numeric ground-truth column
    # under a different name, it is kept as a feature unless it is listed in
    # label_columns -- confirm against the dataset schema.
    numeric_df = numeric_df.drop(
        columns=[col for col in label_columns if col in numeric_df.columns]
    )

    if expected_features is not None:
        numeric_df = numeric_df.reindex(columns=expected_features, fill_value=0.0)
        feature_names = expected_features
    else:
        feature_names = list(numeric_df.columns)

    # Values beyond the float32 range become +/-inf in the cast; the overflow
    # warning is suppressed because those entries are zeroed just below.
    with np.errstate(over="ignore"):
        X = numeric_df.fillna(0.0).values.astype(np.float32)
    # fillna leaves +/-inf untouched, and a non-finite entry makes the
    # downstream scaler fail, so treat it like any other missing value.
    X[~np.isfinite(X)] = 0.0
    return X, y, feature_names

# ------------------------------
# Bayesian optimization objective
# ------------------------------
def make_objective(train_X, train_y, val_X, val_y):
    """Build the objective function and search space for hyper-parameter tuning.

    The search space covers ``alpha`` and ``eta0`` (log-uniform) and
    ``l1_ratio`` (uniform). For a candidate point the objective fits a fresh
    hinge-loss, elastic-net ``SGDClassifier`` with a constant learning rate on
    the training data and scores it on the validation data.

    Returns:
        ``(objective, space)``. ``objective`` returns the negated macro-F1, so
        minimising it maximises validation F1.
    """
    log_uniform = "log-uniform"
    space = [
        Real(1e-6, 1e-2, prior=log_uniform, name="alpha"),
        Real(1e-4, 1.0, prior=log_uniform, name="eta0"),
        Real(0.0, 1.0, prior="uniform", name="l1_ratio"),
    ]

    @use_named_args(space)
    def objective(alpha, eta0, l1_ratio):
        candidate = SGDClassifier(
            loss="hinge",
            penalty="elasticnet",
            alpha=alpha,
            l1_ratio=l1_ratio,
            learning_rate="constant",
            eta0=eta0,
            max_iter=1000,
            tol=1e-3,
            random_state=RANDOM_SEED,
        )
        candidate.fit(train_X, train_y)
        val_f1 = f1_score(val_y, candidate.predict(val_X), average="macro")
        # gp_minimize minimises, so hand it the negated score.
        return -val_f1

    return objective, space

# ------------------------------
# Training loop
# ------------------------------
def train_svm_bayesopt(feature_files_map, base_path, epochs=EPOCHS):
    """Train a linear SVM (SGD, hinge loss) with Bayesian-optimised hyper-parameters.

    Steps:
      1. Read every chunk from ``stream_chunks`` and preprocess it; all usable
         chunks are stacked in memory.
      2. If more than ``MAX_TRAIN_SAMPLES`` rows were collected, draw a
         class-balanced random subsample of that size.
      3. Shuffle, split into train / validation / test (the test split is the
         remainder after the train and validation fractions) and scale with a
         ``StandardScaler`` fitted on the training split only.
      4. For each epoch run one ``partial_fit`` pass over the training split,
         then store the tuned ``alpha`` / ``eta0`` / ``l1_ratio`` on the model
         and print validation metrics.
      5. Print test metrics.

    Args:
        feature_files_map: Mapping of feature level -> {file key -> file name}.
        base_path: Directory the per-level folders are resolved against.
        epochs: Number of ``partial_fit`` passes over the training split.

    Returns:
        ``(svm, feature_names_master)``: the trained ``SGDClassifier`` and the
        feature-name list taken from the first chunk.

    Raises:
        RuntimeError: If no usable data was collected, or if either class has
            fewer than ``MAX_TRAIN_SAMPLES // 2`` rows when subsampling.
    """
    scaler = StandardScaler(with_mean=False)
    feature_names_master = None

    # -----------------
    # Collect full dataset first
    # -----------------
    # The feature schema is taken from the first chunk; every later chunk is
    # reindexed to it (columns not in the schema are dropped, missing ones are
    # zero-filled).
    # NOTE(review): this also applies to chunks from other feature levels --
    # confirm that mixing levels in one matrix is intended.
    collected_X, collected_y = [], []
    for _level, file_key, _filepath, chunk in stream_chunks(feature_files_map, base_path):
        try:
            if feature_names_master is None:
                X, y, features = preprocess_chunk(chunk, file_key)
                feature_names_master = features
            else:
                X, y, _ = preprocess_chunk(chunk, file_key, expected_features=feature_names_master)
        except Exception as e:
            print(f"[SKIP] {e}")
            continue

        if X.shape[0] == 0 or X.shape[1] == 0:
            continue

        collected_X.append(X)
        collected_y.append(y)

    if not collected_X:
        raise RuntimeError("No data collected for training.")

    X_all = np.vstack(collected_X)
    y_all = np.concatenate(collected_y)

    # -----------------
    # Balanced subsampling
    # -----------------
    if X_all.shape[0] > MAX_TRAIN_SAMPLES:
        n_per_class = MAX_TRAIN_SAMPLES // 2
        idx_class0 = np.where(y_all == 0)[0]
        idx_class1 = np.where(y_all == 1)[0]

        if len(idx_class0) < n_per_class or len(idx_class1) < n_per_class:
            raise RuntimeError(f"Not enough samples to balance classes: "
                               f"class0={len(idx_class0)}, class1={len(idx_class1)}, need {n_per_class} each")

        sampled_idx0 = np.random.choice(idx_class0, n_per_class, replace=False)
        sampled_idx1 = np.random.choice(idx_class1, n_per_class, replace=False)
        idxs = np.concatenate([sampled_idx0, sampled_idx1])
        np.random.shuffle(idxs)

        X_all, y_all = X_all[idxs], y_all[idxs]

    # Shuffle and split into train/val/test
    idxs = np.arange(X_all.shape[0])
    np.random.shuffle(idxs)
    X_all, y_all = X_all[idxs], y_all[idxs]

    n_train = int(TRAIN_FRACTION * X_all.shape[0])
    n_val = int(VAL_FRACTION * X_all.shape[0])
    X_train, y_train = X_all[:n_train], y_all[:n_train]
    X_val, y_val = X_all[n_train:n_train+n_val], y_all[n_train:n_train+n_val]
    X_test, y_test = X_all[n_train+n_val:], y_all[n_train+n_val:]

    # Fit the scaler on the training split only, then apply it to all splits.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Initial SGD model
    svm = SGDClassifier(
        loss="hinge",
        penalty="elasticnet",
        alpha=0.0001,
        l1_ratio=0.15,
        learning_rate="constant",
        eta0=0.01,
        max_iter=1,
        tol=None,
        warm_start=True,
        random_state=RANDOM_SEED
    )

    # -----------------
    # Training epochs
    # -----------------
    # The search fits fresh classifiers on the same train/validation arrays
    # with the same seed, so its inputs do not change between epochs. It is
    # therefore run once (after the first epoch) and the result is reused.
    best_params = None
    for epoch in range(1, epochs + 1):
        print(f"\n=== EPOCH {epoch}/{epochs} ===")

        # Train model
        if not hasattr(svm, "classes_"):
            svm.partial_fit(X_train, y_train, classes=np.array([0, 1]))
        else:
            svm.partial_fit(X_train, y_train)
        print(f"[EPOCH {epoch}] Trained on {X_train.shape[0]} rows.")

        # Hyperparameter tuning
        if X_val.shape[0] > 0:
            if best_params is None:
                objective, space = make_objective(train_X=X_train, train_y=y_train,
                                                  val_X=X_val, val_y=y_val)
                print(f"[EPOCH {epoch}] Running Bayesian optimization...")
                res = gp_minimize(objective, space, n_calls=10, random_state=RANDOM_SEED)
                best_params = res.x
            else:
                print(f"[EPOCH {epoch}] Reusing Bayesian optimization result (search inputs unchanged).")
            print(f"[EPOCH {epoch}] Best params: alpha={best_params[0]:.6f}, eta0={best_params[1]:.4f}, l1_ratio={best_params[2]:.2f}")

            svm.alpha = best_params[0]
            svm.eta0 = best_params[1]
            svm.l1_ratio = best_params[2]

            # No refit happens between assigning the tuned values and this
            # evaluation, so these metrics describe the model as trained so far.
            preds = svm.predict(X_val)
            acc = accuracy_score(y_val, preds)
            f1 = f1_score(y_val, preds, average="macro")
            print(f"[EPOCH {epoch}] Val Acc={acc:.4f}  F1={f1:.4f}")

    # Final test
    if X_test.shape[0] > 0:
        preds = svm.predict(X_test)
        acc = accuracy_score(y_test, preds)
        f1 = f1_score(y_test, preds, average="macro")
        print("\n=== FINAL TEST METRICS ===")
        print(f"Test Acc={acc:.4f}  F1={f1:.4f}")
        # labels is given explicitly so the two target names still line up
        # when the test split happens to contain only one class.
        print(classification_report(y_test, preds, labels=[0, 1],
                                    target_names=["Secure(False)", "Attack(True)"]))
    else:
        print("No test data available.")

    return svm, feature_names_master

# ------------------------------
# Main
# ------------------------------
if __name__ == "__main__":
    # Local import: only needed on the failure path of this script entry point.
    import traceback

    start = time.time()
    print("[START] Training linear SVM with Bayesian optimization (balanced subsampling)...")
    try:
        svm, features = train_svm_bayesopt(feature_files, base_path, epochs=EPOCHS)
    except Exception as e:
        # Best-effort run: report the failure (with its traceback, so the
        # cause can be located) and still print the timing below.
        print(f"[FATAL] Error: {e}")
        traceback.print_exc()
    elapsed = time.time() - start
    print(f"[DONE] Total time {elapsed:.1f}s")

    # The previous version removed every file matching "*.tmp", "*.temp" or
    # "tmp*" in the system temp directory. Nothing in the code visible here
    # creates such files, so that sweep could only delete files belonging to
    # other programs; it has been removed on purpose.
    # NOTE(review): if a temp file is ever created by this script, track its
    # path and pass exactly that path to safe_remove().
    print("[CLEANUP] Skipped: no temporary files are created by this script.")

[START] Training linear SVM with Bayesian optimization (balanced subsampling)...

=== EPOCH 1/3 ===
[EPOCH 1] Trained on 1600 rows.
[EPOCH 1] Running Bayesian optimization...
[EPOCH 1] Best params: alpha=0.000002, eta0=0.7853, l1_ratio=0.23
[EPOCH 1] Val Acc=0.8450  F1=0.8395

=== EPOCH 2/3 ===
[EPOCH 2] Trained on 1600 rows.
[EPOCH 2] Running Bayesian optimization...
[EPOCH 2] Best params: alpha=0.000002, eta0=0.7853, l1_ratio=0.23
[EPOCH 2] Val Acc=0.8450  F1=0.8395

=== EPOCH 3/3 ===
[EPOCH 3] Trained on 1600 rows.
[EPOCH 3] Running Bayesian optimization...
[EPOCH 3] Best params: alpha=0.000002, eta0=0.7853, l1_ratio=0.23
[EPOCH 3] Val Acc=0.8450  F1=0.8395

=== FINAL TEST METRICS ===
Test Acc=0.9000  F1=0.8992
               precision    recall  f1-score   support

Secure(False)       0.83      1.00      0.91        99
 Attack(True)       1.00      0.80      0.89       101

     accuracy                           0.90       200
    macro avg       0.92      0.90      0.90       200