With Hyperparameter Tuning

In [2]:
import os
import glob
import time
import tempfile
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
import random

# ------------------------------
# Config
# ------------------------------
base_path = "./"

folders = {
    "packet": "packet_features",
    "uniflow": "uniflow_features",
    "biflow": "biflow_features"
}

files = {
    "normal": "normal.csv",
    "sparta": "sparta.csv",
    "scan_A": "scan_A.csv",
    "mqtt_bruteforce": "mqtt_bruteforce.csv",
    "scan_sU": "scan_sU.csv"
}

def build_filenames(prefix):
    return {
        "normal": f"{prefix}_normal.csv",
        "sparta": f"{prefix}_sparta.csv",
        "scan_A": f"{prefix}_scan_A.csv",
        "mqtt_bruteforce": f"{prefix}_mqtt_bruteforce.csv",
        "scan_sU": f"{prefix}_scan_sU.csv"
    }

feature_files = {
    "packet": files,
    "uniflow": build_filenames("uniflow"),
    "biflow": build_filenames("biflow")
}

CHUNKSIZE = 200000
TRAIN_FRACTION = 0.80
VAL_FRACTION = 0.10
TEST_FRACTION = 0.10

EPOCHS = 3
SAMPLE_VAL_MAX = 20000
SAMPLE_TEST_MAX = 20000

# 🔹 Increased training cap for better accuracy
MAX_TRAIN_SAMPLES = 5000

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

CV_FOLDS = 3
SCORING = "f1_macro"

# Expanded hyperparameter grid
GRID_PARAM_GRID = {
    "C": [0.1, 1, 10, 100, 1000],
    "gamma": ["scale", "auto", 1, 0.1, 0.01, 0.001],
    "kernel": ["rbf", "linear"],
    "class_weight": [None, "balanced"]
}

# ------------------------------
# Helpers
# ------------------------------
def safe_remove(path):
    try:
        if os.path.exists(path):
            os.remove(path)
            print(f"[CLEANUP] Removed file: {path}")
    except Exception as e:
        print(f"[CLEANUP] Could not remove {path}: {e}")

def stream_chunks(feature_files_map, base_path, chunksize=CHUNKSIZE):
    """Yield (level, file_key, filepath, chunk_df)."""
    for level, file_dict in feature_files_map.items():
        folder_path = os.path.join(base_path, folders[level])
        for key, fname in file_dict.items():
            fpath = os.path.join(folder_path, fname)
            if not os.path.isfile(fpath):
                print(f"[WARN] Missing: {fpath}")
                continue
            try:
                for chunk in pd.read_csv(fpath, chunksize=chunksize, low_memory=False):
                    yield (level, key, fpath, chunk)
            except Exception as e:
                print(f"[ERROR] Failed to read {fpath}: {e}")

def preprocess_chunk(df, file_key, expected_features=None):
    """
    Preprocess chunk:
      - set attack bool target column 'attack'
      - select numeric features, drop 'attack' or 'label'
      - align to expected_features if provided
    """
    df = df.copy()
    df["attack"] = (file_key != "normal")

    y = df["attack"].astype(int).values
    numeric_df = df.select_dtypes(include=[np.number]).copy()
    if "attack" in numeric_df.columns:
        numeric_df = numeric_df.drop(columns=["attack"])
    if "label" in numeric_df.columns:
        numeric_df = numeric_df.drop(columns=["label"])

    if expected_features is not None:
        numeric_df = numeric_df.reindex(columns=expected_features, fill_value=0.0)
        feature_names = expected_features
    else:
        feature_names = list(numeric_df.columns)

    if len(feature_names) == 0:
        X = np.zeros((numeric_df.shape[0], 0), dtype=np.float32)
    else:
        X = numeric_df.fillna(0.0).values.astype(np.float32)

    return X, y, feature_names

# ------------------------------
# Training with stable pools + GridSearchCV
# ------------------------------
def train_svm_rbf_grid(feature_files_map, base_path, epochs=EPOCHS):
    scaler = StandardScaler(with_mean=False)

    feature_names_master = None
    val_X = None; val_y = None
    test_X = None; test_y = None

    best_model = None
    best_params_overall = None

    # ------------------------------
    # Build stable validation/test pools ONCE
    # ------------------------------
    train_chunks_X = []
    train_chunks_y = []
    for level, file_key, filepath, chunk in stream_chunks(feature_files_map, base_path):
        try:
            if feature_names_master is None:
                Xc, yc, feat_names = preprocess_chunk(chunk, file_key, expected_features=None)
                feature_names_master = feat_names
            else:
                Xc, yc, _ = preprocess_chunk(chunk, file_key, expected_features=feature_names_master)
        except Exception as e:
            print(f"[SKIP] Preprocess error: {e}")
            continue

        if Xc.shape[1] == 0 or Xc.shape[0] == 0:
            continue

        n = Xc.shape[0]
        rnd = np.random.rand(n)
        train_mask = rnd < TRAIN_FRACTION
        val_mask = (rnd >= TRAIN_FRACTION) & (rnd < TRAIN_FRACTION + VAL_FRACTION)
        test_mask = rnd >= TRAIN_FRACTION + VAL_FRACTION

        X_train_chunk = Xc[train_mask]; y_train_chunk = yc[train_mask]
        X_val_chunk = Xc[val_mask]; y_val_chunk = yc[val_mask]
        X_test_chunk = Xc[test_mask]; y_test_chunk = yc[test_mask]

        if X_train_chunk.shape[0] > 0:
            train_chunks_X.append(X_train_chunk)
            train_chunks_y.append(y_train_chunk)

        if X_val_chunk.shape[0] > 0:
            if val_X is None:
                take = min(X_val_chunk.shape[0], SAMPLE_VAL_MAX)
                idxs = np.random.choice(X_val_chunk.shape[0], take, replace=False)
                val_X = X_val_chunk[idxs]; val_y = y_val_chunk[idxs]
            else:
                val_X = np.vstack([val_X, X_val_chunk])
                val_y = np.concatenate([val_y, y_val_chunk])
                if val_X.shape[0] > SAMPLE_VAL_MAX:
                    idxs = np.random.choice(val_X.shape[0], SAMPLE_VAL_MAX, replace=False)
                    val_X = val_X[idxs]; val_y = val_y[idxs]

        if X_test_chunk.shape[0] > 0:
            if test_X is None:
                take = min(X_test_chunk.shape[0], SAMPLE_TEST_MAX)
                idxs = np.random.choice(X_test_chunk.shape[0], take, replace=False)
                test_X = X_test_chunk[idxs]; test_y = y_test_chunk[idxs]
            else:
                test_X = np.vstack([test_X, X_test_chunk])
                test_y = np.concatenate([test_y, y_test_chunk])
                if test_X.shape[0] > SAMPLE_TEST_MAX:
                    idxs = np.random.choice(test_X.shape[0], SAMPLE_TEST_MAX, replace=False)
                    test_X = test_X[idxs]; test_y = test_y[idxs]

    # Combine all training data
    if len(train_chunks_X) == 0:
        print("[FATAL] No training data collected.")
        return None, None, None

    X_train_full = np.vstack(train_chunks_X)
    y_train_full = np.concatenate(train_chunks_y)

    # ------------------------------
    # Epoch loop (with resampling for train only)
    # ------------------------------
    for epoch in range(1, epochs + 1):
        print(f"\n=== EPOCH {epoch}/{epochs} ===")

        # Downsample train each epoch
        if X_train_full.shape[0] > MAX_TRAIN_SAMPLES:
            idxs = np.random.choice(X_train_full.shape[0], MAX_TRAIN_SAMPLES, replace=False)
            X_train = X_train_full[idxs]
            y_train = y_train_full[idxs]
            print(f"[INFO] Downsampled training set to {MAX_TRAIN_SAMPLES} samples.")
        else:
            X_train = X_train_full
            y_train = y_train_full

        # Scale consistently
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        val_X_scaled = scaler.transform(val_X) if val_X is not None else None
        test_X_scaled = scaler.transform(test_X) if test_X is not None else None

        # ------------------------------
        # GridSearchCV tuning
        # ------------------------------
        print(f"[EPOCH {epoch}] Running GridSearchCV (cv={CV_FOLDS}) ...")
        grid = GridSearchCV(
            SVC(),
            GRID_PARAM_GRID,
            cv=CV_FOLDS,
            scoring=SCORING,
            n_jobs=-1,
            verbose=1
        )
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_params_overall = grid.best_params_
        print(f"[EPOCH {epoch}] Best params: {best_params_overall}  CV {SCORING}={grid.best_score_:.4f}")

        # Fit best model
        best_model.fit(X_train, y_train)

        # Validation
        if val_X_scaled is not None:
            preds_val = best_model.predict(val_X_scaled)
            acc_val = accuracy_score(val_y, preds_val)
            f1_val = f1_score(val_y, preds_val, average="macro")
            print(f"[EPOCH {epoch}] Validation: Acc={acc_val:.4f}  F1_macro={f1_val:.4f}")

        # Test
        if test_X_scaled is not None:
            preds_test = best_model.predict(test_X_scaled)
            acc_test = accuracy_score(test_y, preds_test)
            f1_test = f1_score(test_y, preds_test, average="macro")
            print(f"[EPOCH {epoch}] Test: Acc={acc_test:.4f}  F1_macro={f1_test:.4f}")

    # ------------------------------
    # Final evaluation
    # ------------------------------
    final_model = best_model
    if final_model is not None and test_X is not None:
        preds = final_model.predict(test_X_scaled)
        acc = accuracy_score(test_y, preds)
        f1 = f1_score(test_y, preds, average="macro")
        print("\n=== FINAL EVALUATION ON TEST POOL ===")
        print(f"Test Acc={acc:.4f}  F1_macro={f1:.4f}")
        print(classification_report(test_y, preds, target_names=["Secure(False)", "Attack(True)"]))
    else:
        print("No final model or test pool to evaluate.")

    return final_model, feature_names_master, best_params_overall

# ------------------------------
# Main
# ------------------------------
if __name__ == "__main__":
    start_time = time.time()
    print(f"[START] Training RBF/Linear SVM with GridSearchCV (epochs={EPOCHS})...")
    try:
        svm_model, features, best_params = train_svm_rbf_grid(feature_files, base_path, epochs=EPOCHS)
    except Exception as e:
        print(f"[FATAL] Training aborted: {e}")
        svm_model, features, best_params = None, None, None
    elapsed = time.time() - start_time
    print(f"[DONE] Total time: {elapsed:.1f}s")
    if best_params is not None:
        print(f"[RESULT] Best params (last epoch): {best_params}")

    print("\n[START CLEANUP]")
    tmpdir = tempfile.gettempdir()
    for pattern in ["*.tmp", "*.temp", "tmp*"]:
        for f in glob.glob(os.path.join(tmpdir, pattern)):
            safe_remove(f)
    print("[CLEANUP] Completed.")


[START] Training RBF/Linear SVM with GridSearchCV (epochs=3)...

=== EPOCH 1/3 ===
[INFO] Downsampled training set to 5000 samples.
[EPOCH 1] Running GridSearchCV (cv=3) ...
Fitting 3 folds for each of 120 candidates, totalling 360 fits
[EPOCH 1] Best params: {'C': 10, 'class_weight': None, 'gamma': 'scale', 'kernel': 'linear'}  CV f1_macro=0.8004
[EPOCH 1] Validation: Acc=0.7991  F1_macro=0.4442
[EPOCH 1] Test: Acc=0.7997  F1_macro=0.4444

=== EPOCH 2/3 ===
[INFO] Downsampled training set to 5000 samples.
[EPOCH 2] Running GridSearchCV (cv=3) ...
Fitting 3 folds for each of 120 candidates, totalling 360 fits
[EPOCH 2] Best params: {'C': 10, 'class_weight': None, 'gamma': 1, 'kernel': 'rbf'}  CV f1_macro=0.7582
[EPOCH 2] Validation: Acc=0.8015  F1_macro=0.4449
[EPOCH 2] Test: Acc=0.8012  F1_macro=0.4448

=== EPOCH 3/3 ===
[INFO] Downsampled training set to 5000 samples.
[EPOCH 3] Running GridSearchCV (cv=3) ...
Fitting 3 folds for each of 120 candidates, totalling 360 fits
[EPOCH 3] Be