In [9]:
# ==============================
# Exploratory Data Analysis for MQTT IDS Dataset (Batch Mode, 3 feature levels)
# ==============================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import tempfile
import glob

# ------------------------------
# 1. Define file structure
# ------------------------------
# Root folder that contains the per-level feature folders (the folder this
# script is stored in).
base_path = "./"

# Feature level -> sub-folder name; every folder is named "<level>_features".
folders = {
    feature_level: f"{feature_level}_features"
    for feature_level in ("packet", "uniflow", "biflow")
}

# Capture name -> un-prefixed CSV file name ("<capture>.csv").
files = {
    capture: f"{capture}.csv"
    for capture in ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
}

def build_filenames(prefix):
    """Return the capture-name -> CSV-name mapping for a prefixed feature level.

    Every value has the form "<prefix>_<capture>.csv". The keys are the five
    capture names, in the order normal, sparta, scan_A, mqtt_bruteforce,
    scan_sU.
    """
    capture_names = ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
    return {capture: f"{prefix}_{capture}.csv" for capture in capture_names}

# Feature level -> {capture name -> CSV file name}. The packet level uses the
# un-prefixed names; the two flow levels prefix each name with the level.
feature_files = dict(
    packet=files,
    uniflow=build_filenames("uniflow"),
    biflow=build_filenames("biflow"),
)

# ------------------------------
# 2. Create output folder
# ------------------------------
# Directory that receives every statistics file and plot; the path is relative,
# so it resolves against the current working directory.
OUTPUT_DIR = "outputs"
# exist_ok=True keeps re-runs from failing when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------------------
# 3. EDA Function (save plots/statistics)
# ------------------------------
def run_batch_eda(data, level, batch_id, filename):
    """Run EDA on a single batch and save its statistics and plots.

    Every output is written to OUTPUT_DIR and named
    "<level>_<file stem>_batch<batch_id>_<kind>". The outputs are a text file
    with summary statistics and the label counts, one histogram per column
    whose name contains "length", a count plot of "flags" (if that column
    exists), a correlation heatmap of the numeric columns (if there are at
    least two), and a count plot of the label.

    Args:
        data: DataFrame holding one batch; it must contain a "label" column.
        level: Feature-level name, used as the prefix of every output name.
        batch_id: Batch number embedded in every output name.
        filename: Path of the source CSV; only its stem is used for naming.

    Raises:
        KeyError: If ``data`` has no "label" column.
    """
    source_stem = os.path.splitext(os.path.basename(filename))[0]
    batch_tag = f"{level}_{source_stem}_batch{batch_id}"

    # Save stats
    stats_file = os.path.join(OUTPUT_DIR, f"{batch_tag}_stats.txt")
    with open(stats_file, "w", encoding="utf-8") as f:
        f.write(f"=== Summary Statistics for {batch_tag} ===\n\n")
        # to_string() renders every column. str() obeys the pandas display
        # options and abbreviates wide frames with "...", which would silently
        # drop statistics from the saved file.
        f.write(data.describe(include="all").to_string())
        f.write("\n\nClass distribution:\n")
        f.write(str(data["label"].value_counts()))

    # Univariate: length-like columns
    length_cols = [c for c in data.columns if "length" in c.lower()]
    for col in length_cols:
        plt.figure(figsize=(8, 5))
        sns.histplot(data, x=col, hue="label", bins=50, kde=True)
        plt.title(f"{batch_tag} | {col} Distribution")
        plt.savefig(os.path.join(OUTPUT_DIR, f"{batch_tag}_{col}_hist.png"))
        # Close each figure right after saving so figures do not accumulate
        # across the many batches.
        plt.close()

    # Flags distribution
    if "flags" in data.columns:
        plt.figure(figsize=(8, 5))
        sns.countplot(data=data, x="flags", hue="label")
        plt.title(f"{batch_tag} | Flags Distribution")
        plt.xticks(rotation=45)
        plt.savefig(os.path.join(OUTPUT_DIR, f"{batch_tag}_flags.png"))
        plt.close()

    # Correlation heatmap (a correlation matrix needs at least two numeric columns)
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 1:
        plt.figure(figsize=(12, 8))
        sns.heatmap(data[numeric_cols].corr(), cmap="coolwarm", center=0)
        plt.title(f"{batch_tag} | Correlation Heatmap")
        plt.savefig(os.path.join(OUTPUT_DIR, f"{batch_tag}_corr.png"))
        plt.close()

    # Class balance
    plt.figure(figsize=(6, 4))
    sns.countplot(data=data, x="label")
    plt.title(f"{batch_tag} | Class Distribution")
    plt.savefig(os.path.join(OUTPUT_DIR, f"{batch_tag}_classdist.png"))
    plt.close()

    print(f"[SAVED] EDA outputs for {batch_tag}")

# ------------------------------
# 4. Batch Loader with EDA
# ------------------------------
def load_and_process_in_batches(filepath, label, level, chunksize=200000):
    """Load a large CSV in chunks, add the label, and run EDA on each batch.

    Args:
        filepath: Path of the CSV file to read.
        label: Value written to the "label" column of every chunk.
        level: Feature-level name, forwarded to ``run_batch_eda``.
        chunksize: Maximum number of rows read per batch.
    """
    # Using the chunked reader as a context manager releases the file handle
    # even when EDA on one of the batches raises part-way through the file.
    with pd.read_csv(filepath, chunksize=chunksize, low_memory=False) as reader:
        # Batches are numbered from 1 in the output names.
        for batch_id, chunk in enumerate(reader, start=1):
            chunk["label"] = label
            run_batch_eda(chunk, level, batch_id, filepath)

# ------------------------------
# 5. Run across all datasets
# ------------------------------
# Visit every feature level and each CSV registered for it. The "normal" file
# is labelled 0; every other file is labelled 1.
for level in feature_files:
    file_dict = feature_files[level]
    folder_path = os.path.join(base_path, folders[level])
    for name in file_dict:
        fname = file_dict[name]
        fpath = os.path.join(folder_path, fname)
        if name == "normal":
            label = 0
        else:
            label = 1
        print(f"\n[INFO] Processing {fpath} ...")
        load_and_process_in_batches(fpath, label, level)

# ------------------------------
# 6. Cleanup Intermediate Files
# ------------------------------
# Delete leftover *.tmp / *.temp files from the project folder only.
# The code in this script writes no such files itself, and the system temp
# directory is shared with every other program on the machine. Deleting all
# *.tmp / *.temp files found there could destroy data another process is
# still using, so that directory is deliberately not scanned.
tmp_dirs = [base_path]
removed_count = 0
for d in tmp_dirs:
    for pattern in ("*.tmp", "*.temp"):
        for f in glob.glob(os.path.join(d, pattern)):
            try:
                os.remove(f)
            except OSError as e:
                # Best effort: a locked or already-deleted file must not abort
                # the rest of the cleanup.
                print(f"[CLEANUP] Could not remove {f}: {e}")
            else:
                removed_count += 1
                print(f"[CLEANUP] Removed temp file: {f}")

print("\n[SUCCESS] All batch-wise EDA completed. Outputs saved in 'outputs/' folder.")
# Report what actually happened instead of claiming removal unconditionally.
print(f"[CLEANUP] Removed {removed_count} temporary file(s).")



[INFO] Processing ./packet_features\normal.csv ...
[SAVED] EDA outputs for packet_normal_batch1
[SAVED] EDA outputs for packet_normal_batch2
[SAVED] EDA outputs for packet_normal_batch3
[SAVED] EDA outputs for packet_normal_batch4
[SAVED] EDA outputs for packet_normal_batch5
[SAVED] EDA outputs for packet_normal_batch6

[INFO] Processing ./packet_features\sparta.csv ...
[SAVED] EDA outputs for packet_sparta_batch1
[SAVED] EDA outputs for packet_sparta_batch2
[SAVED] EDA outputs for packet_sparta_batch3
[SAVED] EDA outputs for packet_sparta_batch4
[SAVED] EDA outputs for packet_sparta_batch5
[SAVED] EDA outputs for packet_sparta_batch6
[SAVED] EDA outputs for packet_sparta_batch7
[SAVED] EDA outputs for packet_sparta_batch8
[SAVED] EDA outputs for packet_sparta_batch9
[SAVED] EDA outputs for packet_sparta_batch10
[SAVED] EDA outputs for packet_sparta_batch11
[SAVED] EDA outputs for packet_sparta_batch12
[SAVED] EDA outputs for packet_sparta_batch13
[SAVED] EDA outputs for packet_sparta