In [1]:
# mqtt_ids_svm_train.py
# Batch + epoch training of a nonlinear soft-margin SVM (via SGD hinge loss approximation)
# for binary classification: secure (False) vs under attack (True).
#
# Dataset files are read in chunks to support large data.
# The target column "attack" is True if under attack, False if secure.
# Validation and test pools are sampled and stored in-memory.
# Cleanup removes intermediate files created during preprocessing.

import os
import glob
import time
import tempfile
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
import random

# ------------------------------
# Config
# ------------------------------
base_path = "./"

folders = {
    "packet": "packet_features",
    "uniflow": "uniflow_features",
    "biflow": "biflow_features"
}

files = {
    "normal": "normal.csv",
    "sparta": "sparta.csv",
    "scan_A": "scan_A.csv",
    "mqtt_bruteforce": "mqtt_bruteforce.csv",
    "scan_sU": "scan_sU.csv"
}

def build_filenames(prefix):
    return {
        "normal": f"{prefix}_normal.csv",
        "sparta": f"{prefix}_sparta.csv",
        "scan_A": f"{prefix}_scan_A.csv",
        "mqtt_bruteforce": f"{prefix}_mqtt_bruteforce.csv",
        "scan_sU": f"{prefix}_scan_sU.csv"
    }

feature_files = {
    "packet": files,
    "uniflow": build_filenames("uniflow"),
    "biflow": build_filenames("biflow")
}

CHUNKSIZE = 200000
TRAIN_FRACTION = 0.80
VAL_FRACTION = 0.10
TEST_FRACTION = 0.10

EPOCHS = 3
SAMPLE_VAL_MAX = 20000
SAMPLE_TEST_MAX = 20000

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# ------------------------------
# Cleanup helpers
# ------------------------------
def safe_remove(path):
    try:
        if os.path.exists(path):
            os.remove(path)
            print(f"[CLEANUP] Removed file: {path}")
    except Exception as e:
        print(f"[CLEANUP] Could not remove {path}: {e}")

# ------------------------------
# Data streaming
# ------------------------------
def stream_chunks(feature_files_map, base_path, chunksize=CHUNKSIZE):
    """Yield (level, file_key, filepath, chunk_df)."""
    for level, file_dict in feature_files_map.items():
        folder_path = os.path.join(base_path, folders[level])
        for key, fname in file_dict.items():
            fpath = os.path.join(folder_path, fname)
            if not os.path.isfile(fpath):
                print(f"[WARN] Missing: {fpath}")
                continue
            try:
                for chunk in pd.read_csv(fpath, chunksize=chunksize, low_memory=False):
                    yield (level, key, fpath, chunk)
            except Exception as e:
                print(f"[ERROR] Failed to read {fpath}: {e}")

# ------------------------------
# Preprocess chunk
# ------------------------------
def preprocess_chunk(df, file_key, expected_features=None):
    df = df.copy()
    # Boolean target: False = normal, True = attack
    df["attack"] = (file_key != "normal")

    y = df["attack"].astype(int).values
    numeric_df = df.select_dtypes(include=[np.number]).copy()
    numeric_df = numeric_df.drop(columns=[c for c in ["attack", "label"] if c in numeric_df.columns])

    if expected_features is not None:
        # Reindex to master feature set, fill missing with 0
        numeric_df = numeric_df.reindex(columns=expected_features, fill_value=0.0)
        feature_names = expected_features
    else:
        feature_names = list(numeric_df.columns)

    X = numeric_df.fillna(0.0).values.astype(np.float32)
    return X, y, feature_names

# ------------------------------
# Training loop
# ------------------------------
def train_svm_batchwise(feature_files_map, base_path, epochs=EPOCHS):
    scaler = StandardScaler(with_mean=False)  # sparse-friendly
    svm = SGDClassifier(
        loss="hinge",     # SVM hinge loss
        penalty="l2",     # soft-margin
        alpha=0.0001,     # regularization strength
        max_iter=1,
        learning_rate="optimal",
        tol=None,
        warm_start=True
    )

    feature_names_master = None
    val_X, val_y = None, None
    test_X, test_y = None, None
    total_batches = 0

    for epoch in range(1, epochs + 1):
        print(f"\n=== EPOCH {epoch}/{epochs} ===")
        for level, file_key, filepath, chunk in stream_chunks(feature_files_map, base_path):
            total_batches += 1
            tag = f"ep{epoch}_b{total_batches}"
            print(f"[BATCH {tag}] {level}/{file_key} shape={chunk.shape}")

            try:
                if feature_names_master is None:
                    X, y, features = preprocess_chunk(chunk, file_key)
                    feature_names_master = features
                else:
                    X, y, _ = preprocess_chunk(chunk, file_key, expected_features=feature_names_master)
            except Exception as e:
                print(f"[SKIP] {e}")
                continue

            if X.shape[1] == 0:
                print(f"[SKIP] {tag} has 0 features after preprocessing.")
                continue

            n = X.shape[0]
            if n == 0: 
                continue

            rnd = np.random.rand(n)
            train_mask = rnd < TRAIN_FRACTION
            val_mask = (rnd >= TRAIN_FRACTION) & (rnd < TRAIN_FRACTION + VAL_FRACTION)
            test_mask = rnd >= TRAIN_FRACTION + VAL_FRACTION

            X_train, y_train = X[train_mask], y[train_mask]
            X_val, y_val = X[val_mask], y[val_mask]
            X_test, y_test_ = X[test_mask], y[test_mask]

            # Scale + partial fit
            if X_train.shape[0] > 0:
                X_train = scaler.partial_fit(X_train).transform(X_train)
                if not hasattr(svm, "classes_"):
                    svm.partial_fit(X_train, y_train, classes=np.array([0, 1]))
                else:
                    svm.partial_fit(X_train, y_train)
                print(f"[BATCH {tag}] trained on {X_train.shape[0]} rows.")

            # Validation pool
            if X_val.shape[0] > 0:
                X_val = scaler.transform(X_val)
                if val_X is None:
                    take = min(X_val.shape[0], SAMPLE_VAL_MAX)
                    idxs = np.random.choice(X_val.shape[0], take, replace=False)
                    val_X, val_y = X_val[idxs], y_val[idxs]
                else:
                    val_X = np.vstack([val_X, X_val])
                    val_y = np.concatenate([val_y, y_val])
                    if val_X.shape[0] > SAMPLE_VAL_MAX:
                        idxs = np.random.choice(val_X.shape[0], SAMPLE_VAL_MAX, replace=False)
                        val_X, val_y = val_X[idxs], val_y[idxs]

            # Test pool
            if X_test.shape[0] > 0:
                X_test = scaler.transform(X_test)
                if test_X is None:
                    take = min(X_test.shape[0], SAMPLE_TEST_MAX)
                    idxs = np.random.choice(X_test.shape[0], take, replace=False)
                    test_X, test_y = X_test[idxs], y_test_[idxs]
                else:
                    test_X = np.vstack([test_X, X_test])
                    test_y = np.concatenate([test_y, y_test_])
                    if test_X.shape[0] > SAMPLE_TEST_MAX:
                        idxs = np.random.choice(test_X.shape[0], SAMPLE_TEST_MAX, replace=False)
                        test_X, test_y = test_X[idxs], test_y[idxs]

        # Epoch validation
        if val_X is not None and val_X.shape[0] > 0:
            preds = svm.predict(val_X)
            acc = accuracy_score(val_y, preds)
            f1 = f1_score(val_y, preds, average="macro")
            print(f"[EPOCH {epoch}] Val Acc={acc:.4f}  F1={f1:.4f}")

    # Final test
    if test_X is not None and test_X.shape[0] > 0:
        preds = svm.predict(test_X)
        acc = accuracy_score(test_y, preds)
        f1 = f1_score(test_y, preds, average="macro")
        print("\n=== FINAL TEST METRICS ===")
        print(f"Test Acc={acc:.4f}  F1={f1:.4f}")
        print(classification_report(test_y, preds, target_names=["Secure(False)", "Attack(True)"]))
    else:
        print("No test data available.")

    return svm, feature_names_master

# ------------------------------
# Main
# ------------------------------
if __name__ == "__main__":
    start = time.time()
    print("[START] Training nonlinear SVM with batch/epoch approach...")
    try:
        svm, features = train_svm_batchwise(feature_files, base_path, epochs=EPOCHS)
    except Exception as e:
        print(f"[FATAL] Error: {e}")
    elapsed = time.time() - start
    print(f"[DONE] Total time {elapsed:.1f}s")

    # Cleanup temp files
    print("\n[START CLEANUP]")
    tmpdir = tempfile.gettempdir()
    for pattern in ["*.tmp", "*.temp", "tmp*"]:
        for f in glob.glob(os.path.join(tmpdir, pattern)):
            safe_remove(f)
    print("[CLEANUP] Completed.")


[START] Training nonlinear SVM with batch/epoch approach...

=== EPOCH 1/3 ===
[BATCH ep1_b1] packet/normal shape=(200000, 31)
[BATCH ep1_b1] trained on 159902 rows.
[BATCH ep1_b2] packet/normal shape=(200000, 31)
[BATCH ep1_b2] trained on 160096 rows.
[BATCH ep1_b3] packet/normal shape=(200000, 31)
[BATCH ep1_b3] trained on 160037 rows.
[BATCH ep1_b4] packet/normal shape=(200000, 31)
[BATCH ep1_b4] trained on 160029 rows.
[BATCH ep1_b5] packet/normal shape=(200000, 31)
[BATCH ep1_b5] trained on 160198 rows.
[BATCH ep1_b6] packet/normal shape=(56231, 31)
[BATCH ep1_b6] trained on 44973 rows.
[BATCH ep1_b7] packet/sparta shape=(200000, 31)
[BATCH ep1_b7] trained on 159897 rows.
[BATCH ep1_b8] packet/sparta shape=(200000, 31)
[BATCH ep1_b8] trained on 159957 rows.
[BATCH ep1_b9] packet/sparta shape=(200000, 31)
[BATCH ep1_b9] trained on 160138 rows.
[BATCH ep1_b10] packet/sparta shape=(200000, 31)
[BATCH ep1_b10] trained on 160133 rows.
[BATCH ep1_b11] packet/sparta shape=(200000, 31)
[B

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
