# In [None]:
# mqtt_ids_svm_train_rbf.py
# Full batch training of nonlinear RBF SVM with GridSearchCV
# for binary classification: secure (False) vs under attack (True).
#
# Dataset files are read in chunks to support large data.
# The target column "attack" is True if under attack, False if secure.
# Validation and test pools are sampled and stored in-memory.
# Cleanup removes intermediate files created during preprocessing.

import os
import glob
import time
import tempfile
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
import random

# ------------------------------
# Config
# ------------------------------
# Root directory that contains the per-level feature folders.
base_path = "./"

# Feature level -> sub-folder (relative to base_path) holding its CSV files.
folders = {
    level: f"{level}_features"
    for level in ("packet", "uniflow", "biflow")
}

# Traffic class -> un-prefixed CSV file name.
files = {
    traffic: f"{traffic}.csv"
    for traffic in ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
}

def build_filenames(prefix):
    """Return a traffic-class -> CSV file name mapping for one feature level.

    Every file name has the form ``"<prefix>_<traffic class>.csv"``.
    """
    traffic_classes = ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
    return {name: f"{prefix}_{name}.csv" for name in traffic_classes}

# Feature level -> {traffic class -> CSV file name}.
# The packet level uses the un-prefixed names; the flow levels are prefixed.
feature_files = {
    "packet": files,
    **{level: build_filenames(level) for level in ("uniflow", "biflow")},
}

# Number of CSV rows read per chunk.
CHUNKSIZE = 200_000

# Per-row random split proportions (train / validation / test).
TRAIN_FRACTION, VAL_FRACTION, TEST_FRACTION = 0.80, 0.10, 0.10

# Upper bound on the number of rows kept in the validation / test pools.
SAMPLE_VAL_MAX = 20_000
SAMPLE_TEST_MAX = 20_000

# Seed both the stdlib and the NumPy global generators.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# ------------------------------
# Cleanup helpers
# ------------------------------
def safe_remove(path):
    """Delete *path* if it exists, reporting the outcome on stdout.

    Best effort: a missing path is ignored silently and any failure is
    printed instead of being raised.
    """
    try:
        if not os.path.exists(path):
            return
        os.remove(path)
        print(f"[CLEANUP] Removed file: {path}")
    except Exception as err:
        print(f"[CLEANUP] Could not remove {path}: {err}")

# ------------------------------
# Data streaming
# ------------------------------
def stream_chunks(feature_files_map, base_path, chunksize=CHUNKSIZE):
    """Lazily read every configured CSV file in pieces.

    Yields tuples ``(level, file_key, filepath, chunk_df)`` where
    ``chunk_df`` holds at most ``chunksize`` rows of the file at
    ``filepath``.  A missing file is reported with a ``[WARN]`` line and
    skipped.  A read failure is reported with an ``[ERROR]`` line, after
    which iteration moves on to the next file.
    """
    for level, names_by_key in feature_files_map.items():
        level_dir = os.path.join(base_path, folders[level])
        for file_key, csv_name in names_by_key.items():
            csv_path = os.path.join(level_dir, csv_name)
            if os.path.isfile(csv_path):
                try:
                    reader = pd.read_csv(csv_path, chunksize=chunksize, low_memory=False)
                    for frame in reader:
                        yield (level, file_key, csv_path, frame)
                except Exception as err:
                    print(f"[ERROR] Failed to read {csv_path}: {err}")
            else:
                print(f"[WARN] Missing: {csv_path}")

# ------------------------------
# Preprocess chunk
# ------------------------------
def preprocess_chunk(df, file_key, expected_features=None):
    """Turn one raw chunk into a feature matrix and a label vector.

    Args:
        df: Raw chunk as read from a CSV file; it is not modified.
        file_key: Traffic class of the source file.  ``"normal"`` rows are
            labelled 0 (secure), every other key is labelled 1 (attack).
        expected_features: Optional ordered list of column names.  When
            given, the numeric columns are re-indexed to exactly this list
            and columns absent from the chunk are filled with 0.

    Returns:
        ``(X, y, feature_names)`` where ``X`` is a float32 array of the
        numeric columns (NaN replaced by 0), ``y`` is an integer array
        with one label per row, and ``feature_names`` is
        ``expected_features`` when supplied, else the chunk's own numeric
        column names.
    """
    # Every row of a file shares the same label, derived from the file key.
    is_attack = file_key != "normal"
    y = np.full(len(df), int(is_attack), dtype=int)

    # Keep numeric columns only; target-like columns never become features.
    numeric_df = df.select_dtypes(include=[np.number])
    target_like = [name for name in ("attack", "label") if name in numeric_df.columns]
    numeric_df = numeric_df.drop(columns=target_like)

    if expected_features is None:
        feature_names = list(numeric_df.columns)
    else:
        # Align to the master feature set; columns missing here become 0.
        numeric_df = numeric_df.reindex(columns=expected_features, fill_value=0.0)
        feature_names = expected_features

    X = numeric_df.fillna(0.0).values.astype(np.float32)
    return X, y, feature_names

# ------------------------------
# Training with RBF SVM + GridSearch
# ------------------------------
def _merge_into_pool(pool, X_new, y_new, capacity):
    """Fold new rows into a bounded, uniformly sampled hold-out pool.

    Each row receives an independent uniform random key when it arrives and
    the pool keeps the ``capacity`` rows with the smallest keys seen so far.
    That is a uniform sample without replacement of all rows offered,
    regardless of which chunk they came from.

    Args:
        pool: ``None`` for an empty pool, else a ``(X, y, keys)`` tuple as
            returned by a previous call.
        X_new: 2-D array of rows to offer to the pool.
        y_new: Labels matching ``X_new``.
        capacity: Maximum number of rows the pool may hold.

    Returns:
        The updated ``(X, y, keys)`` tuple.
    """
    keys_new = np.random.rand(X_new.shape[0])
    if pool is None:
        X_all, y_all, keys_all = X_new, y_new, keys_new
    else:
        X_all = np.vstack([pool[0], X_new])
        y_all = np.concatenate([pool[1], y_new])
        keys_all = np.concatenate([pool[2], keys_new])

    if keys_all.shape[0] > capacity:
        # Indices of the `capacity` smallest keys (order among them is irrelevant).
        keep = np.argpartition(keys_all, capacity - 1)[:capacity]
        X_all, y_all, keys_all = X_all[keep], y_all[keep], keys_all[keep]
    return X_all, y_all, keys_all


def train_svm_rbf(feature_files_map, base_path):
    """Train an RBF-kernel SVC with a grid search and report hold-out metrics.

    Every chunk produced by ``stream_chunks`` is preprocessed and its rows
    are split at random into train / validation / test according to
    ``TRAIN_FRACTION`` and ``VAL_FRACTION`` (the remainder is test).  All
    training rows are kept in memory; the validation and test rows are
    reduced to uniform random samples of at most ``SAMPLE_VAL_MAX`` and
    ``SAMPLE_TEST_MAX`` rows.  Features are scaled to unit variance, a
    3-fold ``GridSearchCV`` over ``C`` and ``gamma`` selects the model, and
    accuracy / macro-F1 are printed for the validation and test pools.

    Args:
        feature_files_map: Mapping of feature level to a
            ``{file_key: file_name}`` dict, as consumed by ``stream_chunks``.
        base_path: Directory that contains the per-level feature folders.

    Returns:
        ``(best_svm, feature_names_master)``: the best estimator found by
        the grid search and the ordered list of feature names it was
        trained on.

    Raises:
        ValueError: If no training rows could be collected.
    """
    # Unit-variance scaling without centering.
    scaler = StandardScaler(with_mean=False)

    feature_names_master = None
    train_X, train_y = [], []
    val_pool = None    # (X, y, keys) once the first validation rows arrive
    test_pool = None   # (X, y, keys) once the first test rows arrive

    print("\n=== Collecting data ===")
    for level, file_key, _, chunk in stream_chunks(feature_files_map, base_path):
        print(f"[CHUNK] {level}/{file_key} shape={chunk.shape}")
        try:
            X, y, features = preprocess_chunk(
                chunk, file_key, expected_features=feature_names_master
            )
        except Exception as e:
            print(f"[SKIP] {e}")
            continue

        if X.shape[1] == 0 or X.shape[0] == 0:
            continue

        # Freeze the master feature list only from a usable chunk; an empty
        # list taken from a chunk without numeric columns would make every
        # later chunk reindex to zero columns and be discarded.
        if feature_names_master is None:
            feature_names_master = features

        rnd = np.random.rand(X.shape[0])
        train_mask = rnd < TRAIN_FRACTION
        val_mask = (rnd >= TRAIN_FRACTION) & (rnd < TRAIN_FRACTION + VAL_FRACTION)
        test_mask = rnd >= TRAIN_FRACTION + VAL_FRACTION

        train_X.append(X[train_mask])
        train_y.append(y[train_mask])

        # Hold-out pools: bounded uniform samples over all chunks seen so far.
        if np.any(val_mask):
            val_pool = _merge_into_pool(val_pool, X[val_mask], y[val_mask], SAMPLE_VAL_MAX)
        if np.any(test_mask):
            test_pool = _merge_into_pool(test_pool, X[test_mask], y[test_mask], SAMPLE_TEST_MAX)

    # Combine all training chunks
    if not train_X:
        raise ValueError(
            "No training data collected; check base_path and the feature CSV files."
        )
    train_X = np.vstack(train_X)
    train_y = np.concatenate(train_y)
    if train_X.shape[0] == 0:
        raise ValueError(
            "No training data collected; check base_path and the feature CSV files."
        )

    val_X, val_y = (None, None) if val_pool is None else val_pool[:2]
    test_X, test_y = (None, None) if test_pool is None else test_pool[:2]

    # Scale: statistics come from the training rows only.
    train_X = scaler.fit_transform(train_X)
    if val_X is not None:
        val_X = scaler.transform(val_X)
    if test_X is not None:
        test_X = scaler.transform(test_X)

    # ------------------------------
    # Grid Search with RBF SVM
    # ------------------------------
    print("\n=== Training RBF SVM with GridSearchCV ===")
    param_grid = {
        "C": [0.1, 1, 10],
        "gamma": ["scale", 0.01, 0.001],
        "kernel": ["rbf"]
    }
    grid = GridSearchCV(SVC(), param_grid, cv=3, scoring="f1_macro", verbose=2, n_jobs=-1)
    grid.fit(train_X, train_y)

    print(f"\n[GRID SEARCH] Best Params: {grid.best_params_}")
    print(f"[GRID SEARCH] Best CV Score (F1 macro): {grid.best_score_:.4f}")

    best_svm = grid.best_estimator_

    # Validation metrics
    if val_X is not None:
        preds = best_svm.predict(val_X)
        acc = accuracy_score(val_y, preds)
        f1 = f1_score(val_y, preds, average="macro")
        print("\n=== VALIDATION METRICS ===")
        print(f"Val Acc={acc:.4f}  F1={f1:.4f}")

    # Test metrics
    if test_X is not None:
        preds = best_svm.predict(test_X)
        acc = accuracy_score(test_y, preds)
        f1 = f1_score(test_y, preds, average="macro")
        print("\n=== FINAL TEST METRICS ===")
        print(f"Test Acc={acc:.4f}  F1={f1:.4f}")
        # Explicit labels keep the two target names valid even when only one
        # class occurs in the test pool and the predictions.
        print(classification_report(
            test_y, preds,
            labels=[0, 1],
            target_names=["Secure(False)", "Attack(True)"],
        ))
    else:
        print("No test data available.")

    return best_svm, feature_names_master

# ------------------------------
# Main
# ------------------------------
if __name__ == "__main__":
    start = time.time()
    print("[START] Training nonlinear RBF SVM with GridSearchCV...")
    # Pre-bind the results so both names exist even when training fails.
    svm, features = None, None
    try:
        svm, features = train_svm_rbf(feature_files, base_path)
    except Exception as e:
        # Top-level boundary: report the failure, then still print the
        # timing and run the cleanup below.
        print(f"[FATAL] Error: {e}")
    elapsed = time.time() - start
    print(f"[DONE] Total time {elapsed:.1f}s")

    # Cleanup
    print("\n[START CLEANUP]")
    # Only paths listed here are deleted.  Globbing the shared system temp
    # directory for "tmp*"-style names would remove files that belong to
    # other programs.  Nothing in this script writes intermediate files, so
    # the list is empty; add paths here if a future step creates any.
    intermediate_files = []
    for f in intermediate_files:
        safe_remove(f)
    print("[CLEANUP] Completed.")