In [None]:
# mqtt_ids_svm_train_rbf_bayes.py
# Batch training of nonlinear RBF SVM with Bayesian optimization after each epoch.
# - Reads CSV files in chunks (batches).
# - Uses 3 epochs (configurable).
# - After each epoch, runs Bayesian optimization (skopt.BayesSearchCV with GP surrogate
#   and Expected Improvement acquisition) to tune C and gamma for RBF SVM.
# - Validation/test pools are sampled & stored in-memory.
#
# NOTE: Install scikit-optimize for BayesSearchCV:
#    pip install scikit-optimize
# If scikit-optimize is not available, script falls back to GridSearchCV.

import os
import glob
import time
import tempfile
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
import random

# Optional dependency: scikit-optimize supplies BayesSearchCV and the search
# space types. If importing it fails for any reason, the script records that
# in `use_bayes` and later falls back to a plain grid search.
try:
    from skopt import BayesSearchCV
    from skopt.space import Real, Categorical
except Exception:
    use_bayes = False
    print("[WARN] scikit-optimize not available. Falling back to GridSearchCV (grid search).")
else:
    use_bayes = True
    print("[INFO] scikit-optimize detected: using BayesSearchCV (GP surrogate).")

# ------------------------------
# Config
# ------------------------------
# Root directory that holds the per-level feature folders.
base_path = "./"

# Feature level -> sub-folder name ("<level>_features").
folders = {level: f"{level}_features" for level in ("packet", "uniflow", "biflow")}

# Traffic class -> un-prefixed CSV file name ("<class>.csv").
files = {
    traffic: f"{traffic}.csv"
    for traffic in ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
}

def build_filenames(prefix):
    """Return a dict mapping each traffic class to "<prefix>_<class>.csv"."""
    traffic_classes = ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
    return {name: f"{prefix}_{name}.csv" for name in traffic_classes}

# Feature level -> {traffic class -> CSV file name}. The packet level uses the
# un-prefixed names in `files`; the flow levels use "<level>_<class>.csv".
feature_files = {"packet": files}
feature_files.update((lvl, build_filenames(lvl)) for lvl in ("uniflow", "biflow"))

# Rows per chunk handed to pandas.read_csv.
CHUNKSIZE = 200_000

# Per-row split proportions (train / validation / test).
TRAIN_FRACTION, VAL_FRACTION, TEST_FRACTION = 0.80, 0.10, 0.10

# Number of passes over the data.
EPOCHS = 3

# Upper bounds on the in-memory validation and test pools.
SAMPLE_VAL_MAX = 20_000
SAMPLE_TEST_MAX = 20_000

# Upper bound on the number of training rows used per epoch.
MAX_TRAIN_SAMPLES = 200_000

# Seed both global random generators once at import time.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Hyper-parameter search settings (applied once per epoch).
BAYES_N_ITER = 25      # evaluations per Bayesian search
CV_FOLDS = 3           # cross-validation folds
SCORING = "f1_macro"   # model-selection metric

# Grid used only when scikit-optimize is unavailable.
GRID_PARAM_GRID = dict(
    C=[0.1, 1, 10],
    gamma=["scale", 0.01, 0.001],
    kernel=["rbf"],
)

# ------------------------------
# Helpers
# ------------------------------
def safe_remove(path):
    """Delete `path` if it exists, reporting the outcome on stdout.

    Best-effort: any error raised while checking or deleting is caught and
    printed rather than propagated. A path that does not exist is silently
    ignored. Returns None.
    """
    try:
        if not os.path.exists(path):
            return
        os.remove(path)
        print(f"[CLEANUP] Removed file: {path}")
    except Exception as err:
        print(f"[CLEANUP] Could not remove {path}: {err}")

def stream_chunks(feature_files_map, base_path, chunksize=CHUNKSIZE):
    """Lazily read every configured CSV file and yield it piece by piece.

    Parameters:
        feature_files_map: mapping of feature level -> {file key -> CSV name}.
        base_path: directory containing the per-level folders; the folder for
            a level is looked up in the module-level `folders` table.
        chunksize: number of rows per chunk passed to pandas.read_csv.

    Yields:
        (level, file_key, filepath, chunk_df) tuples, one per chunk.

    Files that do not exist are reported and skipped. A read error is
    reported and ends iteration of that file only.
    """
    for level, level_files in feature_files_map.items():
        level_dir = os.path.join(base_path, folders[level])
        for file_key, csv_name in level_files.items():
            csv_path = os.path.join(level_dir, csv_name)
            if os.path.isfile(csv_path):
                try:
                    reader = pd.read_csv(csv_path, chunksize=chunksize, low_memory=False)
                    for piece in reader:
                        yield (level, file_key, csv_path, piece)
                except Exception as e:
                    print(f"[ERROR] Failed to read {csv_path}: {e}")
            else:
                print(f"[WARN] Missing: {csv_path}")

def preprocess_chunk(df, file_key, expected_features=None):
    """Turn one raw CSV chunk into a feature matrix and a binary target.

    Parameters:
        df: pandas DataFrame holding one chunk. It is not modified.
        file_key: key of the source file; every row is labelled 0 when the
            key is "normal" and 1 (attack) otherwise.
        expected_features: optional list of column names. When given, the
            numeric columns are re-ordered to this list and columns missing
            from the chunk are filled with 0.

    Returns:
        (X, y, feature_names) where
          - X is a float32 array of shape (rows, features) in which NaN and
            +/-inf have been replaced by 0,
          - y is an int array with one label per row,
          - feature_names is `expected_features` when it was supplied,
            otherwise the list of numeric column names found in the chunk.
    """
    n_rows = len(df)
    # The label depends only on the source file, so build it directly instead
    # of copying the whole chunk just to add a constant column.
    y = np.full(n_rows, int(file_key != "normal"), dtype=int)

    # Keep numeric columns only and drop anything that would leak the target.
    numeric_df = df.select_dtypes(include=[np.number]).drop(
        columns=["attack", "label"], errors="ignore"
    )

    if expected_features is not None:
        # Ensure a consistent column order and fill columns absent from this chunk.
        numeric_df = numeric_df.reindex(columns=expected_features, fill_value=0.0)
        feature_names = expected_features
    else:
        feature_names = list(numeric_df.columns)

    if len(feature_names) == 0:
        # No numeric features at all: return an empty-width matrix.
        X = np.zeros((n_rows, 0), dtype=np.float32)
    else:
        # A float64 value too large for float32 becomes inf in the cast; that
        # is handled together with inf values already present in the data.
        with np.errstate(over="ignore"):
            X = numeric_df.fillna(0.0).values.astype(np.float32)
        # Infinite values would make the downstream scaler / SVM raise, so
        # treat them like missing values.
        X[~np.isfinite(X)] = 0.0

    return X, y, feature_names

# ------------------------------
# Training loop with epochs + per-epoch Bayesian tuning
# ------------------------------
def _split_masks(n_rows, chunk_index):
    """Return (train_mask, val_mask, test_mask, rng) for one chunk.

    The generator is seeded from RANDOM_SEED and the chunk's position in the
    stream, so a given row lands in the same split in every epoch.
    """
    rng = np.random.RandomState((RANDOM_SEED + chunk_index) % (2 ** 32))
    draw = rng.random_sample(n_rows)
    val_upper = TRAIN_FRACTION + VAL_FRACTION
    train_mask = draw < TRAIN_FRACTION
    val_mask = (draw >= TRAIN_FRACTION) & (draw < val_upper)
    test_mask = draw >= val_upper
    return train_mask, val_mask, test_mask, rng


def _add_to_pool(pool, X_new, y_new, max_size, rng):
    """Merge new rows into a bounded pool and return the updated pool.

    `pool` is None or a tuple (X, y, keys). Every row receives a random key
    and the pool keeps the `max_size` rows with the smallest keys, which is a
    uniform sample of all rows offered so far regardless of arrival order.
    """
    if X_new.shape[0] == 0:
        return pool
    keys_new = rng.random_sample(X_new.shape[0])
    if pool is None:
        X_all, y_all, keys_all = X_new, y_new, keys_new
    else:
        X_all = np.vstack([pool[0], X_new])
        y_all = np.concatenate([pool[1], y_new])
        keys_all = np.concatenate([pool[2], keys_new])
    if X_all.shape[0] > max_size:
        keep = np.argpartition(keys_all, max_size - 1)[:max_size]
        X_all, y_all, keys_all = X_all[keep], y_all[keep], keys_all[keep]
    return X_all, y_all, keys_all


def _pool_has_rows(pool):
    """Return True when the pool exists and holds at least one row."""
    return pool is not None and pool[0].shape[0] > 0


def _tune_svm(X_train, y_train, epoch):
    """Search C and gamma for an RBF SVC; return (fitted_model, best_params).

    Uses BayesSearchCV when scikit-optimize was imported, GridSearchCV
    otherwise. Both searches are left at their default refit behaviour, so
    the returned estimator is the search's `best_estimator_`.
    """
    if use_bayes:
        # Log-uniform priors for C and gamma.
        search_spaces = {
            "C": Real(1e-3, 1e3, prior="log-uniform"),
            "gamma": Real(1e-4, 1e1, prior="log-uniform"),
            "kernel": Categorical(["rbf"])
        }
        search = BayesSearchCV(
            estimator=SVC(),
            search_spaces=search_spaces,
            n_iter=BAYES_N_ITER,
            scoring=SCORING,
            cv=CV_FOLDS,
            n_jobs=-1,
            verbose=0,
            random_state=RANDOM_SEED,
            optimizer_kwargs={"acq_func": "EI"}  # Expected Improvement acquisition
        )
        print(f"[EPOCH {epoch}] Running BayesSearchCV (n_iter={BAYES_N_ITER}, cv={CV_FOLDS}) ...")
        search.fit(X_train, y_train)
        print(f"[EPOCH {epoch}] Bayes best params: {search.best_params_}  best CV {SCORING}={search.best_score_:.4f}")
    else:
        print(f"[EPOCH {epoch}] scikit-optimize not installed; running GridSearchCV fallback.")
        search = GridSearchCV(SVC(), GRID_PARAM_GRID, cv=CV_FOLDS, scoring=SCORING, n_jobs=-1, verbose=1)
        search.fit(X_train, y_train)
        print(f"[EPOCH {epoch}] Grid best params: {search.best_params_}  best CV {SCORING}={search.best_score_:.4f}")
    return search.best_estimator_, search.best_params_


def _score(model, X_scaled, y_true):
    """Return (accuracy, macro F1, predictions) of `model` on one split."""
    preds = model.predict(X_scaled)
    return accuracy_score(y_true, preds), f1_score(y_true, preds, average="macro"), preds


def train_svm_rbf_bayes(feature_files_map, base_path, epochs=EPOCHS):
    """Train an RBF SVM with a hyper-parameter search after every epoch.

    Each epoch streams all chunks, splits their rows into train / validation
    / test, caps the training set at MAX_TRAIN_SAMPLES rows (a fresh random
    subsample per epoch), fits a StandardScaler on it, tunes C and gamma, and
    reports accuracy and macro F1 on the validation and test pools.

    The row split is deterministic per chunk, so it is the same in every
    epoch, and the validation/test pools are collected during the first epoch
    only. Rows held out for evaluation are therefore never used for training
    in a later epoch.

    Parameters:
        feature_files_map: mapping passed through to stream_chunks.
        base_path: root data directory passed through to stream_chunks.
        epochs: number of passes over the data.

    Returns:
        (final_model, feature_names_master, best_params_overall):
          - final_model: estimator from the last epoch that trained, or None.
          - feature_names_master: feature list taken from the first chunk
            that produced numeric features; all later chunks are aligned to
            it. None if no such chunk was seen.
          - best_params_overall: best parameters of that last search, or None.

    NOTE(review): the model is fitted on features scaled by that epoch's
    StandardScaler, which is not part of the return value; callers that want
    to predict on new data need an equivalent scaler.
    """
    feature_names_master = None
    val_pool = None
    test_pool = None
    final_model = None
    final_scaler = None
    best_params_overall = None

    for epoch in range(1, epochs + 1):
        print(f"\n=== EPOCH {epoch}/{epochs} ===")
        # The split is identical in every epoch, so the held-out pools only
        # need to be collected once.
        build_holdout = epoch == 1
        train_chunks_X = []
        train_chunks_y = []

        chunk_iter = stream_chunks(feature_files_map, base_path)
        for chunk_index, (level, file_key, _filepath, chunk) in enumerate(chunk_iter):
            print(f"[EPOCH {epoch}] CHUNK {level}/{file_key} shape={chunk.shape}")
            try:
                if feature_names_master is None:
                    Xc, yc, feat_names = preprocess_chunk(chunk, file_key, expected_features=None)
                    # Do not lock in an empty feature list; wait for a chunk
                    # that actually has numeric columns.
                    if len(feat_names) > 0:
                        feature_names_master = feat_names
                        print(f"[INFO] Established master feature count: {len(feature_names_master)}")
                else:
                    Xc, yc, _ = preprocess_chunk(chunk, file_key, expected_features=feature_names_master)
            except Exception as e:
                print(f"[SKIP] Preprocess error: {e}")
                continue

            if Xc.shape[1] == 0 or Xc.shape[0] == 0:
                # Nothing to train on.
                print("[SKIP] chunk has no numeric features or rows, skipping.")
                continue

            train_mask, val_mask, test_mask, rng = _split_masks(Xc.shape[0], chunk_index)

            if train_mask.any():
                train_chunks_X.append(Xc[train_mask])
                train_chunks_y.append(yc[train_mask])

            if build_holdout:
                val_pool = _add_to_pool(val_pool, Xc[val_mask], yc[val_mask], SAMPLE_VAL_MAX, rng)
                test_pool = _add_to_pool(test_pool, Xc[test_mask], yc[test_mask], SAMPLE_TEST_MAX, rng)

        if not train_chunks_X:
            print("[WARN] No training data collected this epoch; skipping tuning/training.")
            continue

        X_train = np.vstack(train_chunks_X)
        y_train = np.concatenate(train_chunks_y)
        # Release the per-chunk arrays before the search allocates more memory.
        del train_chunks_X, train_chunks_y

        # Limit the number of training samples.
        if X_train.shape[0] > MAX_TRAIN_SAMPLES:
            idxs = np.random.choice(X_train.shape[0], MAX_TRAIN_SAMPLES, replace=False)
            X_train = X_train[idxs]
            y_train = y_train[idxs]
            print(f"[INFO] Downsampled training set to {MAX_TRAIN_SAMPLES} samples.")

        # A fresh scaler per epoch keeps each model paired with the scaling
        # it was trained under.
        scaler = StandardScaler(with_mean=False)
        X_train = scaler.fit_transform(X_train)

        # The search already refits the best estimator on the whole training
        # set, so no further fit is needed here.
        model, best_params_overall = _tune_svm(X_train, y_train, epoch)
        final_model, final_scaler = model, scaler

        # Evaluate on the validation pool.
        if _pool_has_rows(val_pool):
            acc_val, f1_val, _ = _score(model, scaler.transform(val_pool[0]), val_pool[1])
            print(f"[EPOCH {epoch}] Validation: Acc={acc_val:.4f}  F1_macro={f1_val:.4f}")
        else:
            print(f"[EPOCH {epoch}] No validation pool available to evaluate.")

        # Report (only) on the test pool.
        if _pool_has_rows(test_pool):
            acc_test, f1_test, _ = _score(model, scaler.transform(test_pool[0]), test_pool[1])
            print(f"[EPOCH {epoch}] Test (sampled pool): Acc={acc_test:.4f}  F1_macro={f1_test:.4f}")

    # After all epochs: final evaluation with the scaler that belongs to the
    # final model.
    if final_model is not None and _pool_has_rows(test_pool):
        test_X, test_y = test_pool[0], test_pool[1]
        acc, f1, preds = _score(final_model, final_scaler.transform(test_X), test_y)
        print("\n=== FINAL EVALUATION ON TEST POOL ===")
        print(f"Test Acc={acc:.4f}  F1_macro={f1:.4f}")
        # Explicit labels keep the report valid when only one class appears.
        print(classification_report(test_y, preds, labels=[0, 1],
                                    target_names=["Secure(False)", "Attack(True)"]))
    else:
        print("No final model or test pool to evaluate.")

    return final_model, feature_names_master, best_params_overall

# ------------------------------
# Main
# ------------------------------
if __name__ == "__main__":
    # Script entry point: run the training once and report timing and the
    # parameters found by the last search.
    start_time = time.time()
    print(f"[START] Training RBF SVM with per-epoch Bayesian optimization (epochs={EPOCHS})...")
    try:
        svm_model, features, best_params = train_svm_rbf_bayes(feature_files, base_path, epochs=EPOCHS)
    except Exception as e:
        import traceback

        print(f"[FATAL] Training aborted: {e}")
        # Show where the failure happened; the message alone is rarely enough.
        traceback.print_exc()
        svm_model, features, best_params = None, None, None
    elapsed = time.time() - start_time
    print(f"[DONE] Total time: {elapsed:.1f}s")
    if best_params is not None:
        print(f"[RESULT] Best params (last epoch): {best_params}")

    # NOTE: no temp-file cleanup is performed. This script creates no
    # temporary files of its own, and deleting "*.tmp" / "*.temp" / "tmp*"
    # from the shared system temp directory would remove files that belong to
    # other running programs.

[INFO] scikit-optimize detected: using BayesSearchCV (GP surrogate).
[START] Training RBF SVM with per-epoch Bayesian optimization (epochs=3)...

=== EPOCH 1/3 ===
[EPOCH 1] CHUNK packet/normal shape=(200000, 31)
[INFO] Established master feature count: 27
[EPOCH 1] CHUNK packet/normal shape=(200000, 31)
[EPOCH 1] CHUNK packet/normal shape=(200000, 31)
[EPOCH 1] CHUNK packet/normal shape=(200000, 31)
[EPOCH 1] CHUNK packet/normal shape=(200000, 31)
[EPOCH 1] CHUNK packet/normal shape=(56231, 31)
[EPOCH 1] CHUNK packet/sparta shape=(200000, 31)
[EPOCH 1] CHUNK packet/sparta shape=(200000, 31)
[EPOCH 1] CHUNK packet/sparta shape=(200000, 31)
[EPOCH 1] CHUNK packet/sparta shape=(200000, 31)
[EPOCH 1] CHUNK packet/sparta shape=(200000, 31)
[EPOCH 1] CHUNK packet/sparta shape=(200000, 31)
[EPOCH 1] CHUNK packet/sparta shape=(200000, 31)
[EPOCH 1] CHUNK packet/sparta shape=(200000, 31)
[EPOCH 1] CHUNK packet/sparta shape=(200000, 31)
[EPOCH 1] CHUNK packet/sparta shape=(200000, 31)
[EPOCH 1]