In [1]:
#!/usr/bin/env python3
"""
Memory-safe single scatter plot for a linear separability check (attack vs no-attack).

This script:
 - Streams CSV files in chunks (no full concat)
 - Selects two numeric columns (from the first chunk that has >=2 numeric features)
 - Randomly samples up to MAX_POINTS combined across all files
 - Produces one scatter plot (saved to OUTPUT_DIR/combined_scatter.png)
 - Cleans up temporary *.tmp / *.temp files in the system temp dir and base folder

EDIT:
 - MAX_POINTS controls how many points will be drawn in total (default 20000)
 - CHUNK_SIZE controls pandas chunk size when streaming (default 50k)
"""

import os
import glob
import tempfile
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ------------------------------
# Config
# ------------------------------
# Root directory under which the per-level feature folders are looked up.
base_path = "./"

# Feature level -> name of the sub-folder (relative to base_path) holding its CSVs.
folders = dict(
    packet="packet_features",
    uniflow="uniflow_features",
    biflow="biflow_features",
)

# Capture name -> un-prefixed CSV file name ("<capture>.csv").
files = {
    capture: f"{capture}.csv"
    for capture in ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
}
def build_filenames(prefix):
    """
    Build the capture-name -> CSV file-name mapping for one filename prefix.

    Every file name has the form ``"<prefix>_<capture>.csv"``; the captures are
    always emitted in the order normal, sparta, scan_A, mqtt_bruteforce, scan_sU.
    """
    capture_names = ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
    return {capture: f"{prefix}_{capture}.csv" for capture in capture_names}
# Feature level -> {capture name -> CSV file name}.
# The packet level uses the un-prefixed names in `files`; the flow levels use
# "<level>_<capture>.csv" names produced by build_filenames().
feature_files = {
    "packet": files,
    **{flow_level: build_filenames(flow_level) for flow_level in ("uniflow", "biflow")},
}

# Destination folder for the generated plot; created up front if it is missing.
OUTPUT_DIR = "outputs_scatter"
os.makedirs(OUTPUT_DIR, exist_ok=True)

CHUNK_SIZE = 50_000     # rows per chunk handed to pandas.read_csv
MAX_POINTS = 20_000     # upper bound on plotted points, summed over every file
RANDOM_SEED = 42

# Seed both generators so repeated runs draw the same sample.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# ------------------------------
# Helper: find two numeric columns to plot
# ------------------------------
def find_two_numeric_columns(feature_files_map, base_path, chunksize=CHUNK_SIZE):
    """
    Pick the first two numeric, non-label columns available in the dataset.

    Files are visited in the iteration order of ``feature_files_map``. For each
    file that exists, only its first ``chunksize`` rows are read (later rows are
    never inspected); the first file whose leading rows expose at least two
    numeric columns decides the result. A column named "label" (any casing) is
    ignored because the label is assigned per file by the sampler.

    Args:
        feature_files_map: feature level -> {capture name -> CSV file name}.
            Every level must be a key of the module-level ``folders`` mapping.
        base_path: root directory that contains the per-level folders.
        chunksize: number of leading rows to read from each candidate file.

    Returns:
        ``(x_col, y_col)`` column names, or ``(None, None)`` when no readable
        file offers two numeric columns.
    """
    for level, file_dict in feature_files_map.items():
        folder_path = os.path.join(base_path, folders[level])
        for fname in file_dict.values():
            fpath = os.path.join(folder_path, fname)
            if not os.path.exists(fpath):
                continue
            try:
                # nrows reads the same leading rows the first chunk would hold,
                # but read_csv closes the file itself instead of leaving an
                # abandoned chunk iterator (and its open handle) behind.
                head = pd.read_csv(fpath, nrows=chunksize, low_memory=False)
                numeric_cols = [
                    c
                    for c in head.select_dtypes(include=[np.number]).columns
                    if c.lower() != "label"
                ]
            except Exception as e:
                # Deliberately best-effort: an unreadable file must not stop the
                # search through the remaining files.
                print(f"[WARN] Could not read {fpath}: {e}")
                continue
            if len(numeric_cols) >= 2:
                print(f"[INFO] Chosen features: '{numeric_cols[0]}' and '{numeric_cols[1]}' (from {fpath})")
                return numeric_cols[0], numeric_cols[1]
    return None, None

# ------------------------------
# Streaming sampler
# ------------------------------
def _select_plot_columns(chunk, x_col, y_col):
    """Return the (x, y) column names to read from ``chunk``, or None if it has no usable pair."""
    if x_col in chunk.columns and y_col in chunk.columns:
        return x_col, y_col
    # NOTE(review): fallback kept from the original behaviour - a file lacking the
    # requested columns contributes its own first two numeric columns, so points
    # from such files are plotted on axes labelled with x_col / y_col even though
    # they measure something else. Confirm this mixing is intended.
    numeric_cols = [
        c
        for c in chunk.select_dtypes(include=[np.number]).columns
        if c.lower() != "label"
    ]
    if len(numeric_cols) >= 2:
        return numeric_cols[0], numeric_cols[1]
    return None


def _merge_into_reservoir(kept_xy, kept_labels, kept_keys, new_xy, new_label, max_points):
    """Add ``new_xy`` rows to the reservoir and keep only the ``max_points`` smallest random keys."""
    new_count = len(new_xy)
    # Every row gets an independent uniform key; keeping the rows with the
    # smallest keys overall is a uniform sample without replacement.
    all_keys = np.concatenate((kept_keys, np.random.random(new_count)))
    all_xy = np.vstack((kept_xy, new_xy))
    all_labels = np.concatenate((kept_labels, np.full(new_count, new_label, dtype=np.int32)))
    if len(all_keys) > max_points:
        winners = np.argpartition(all_keys, max_points - 1)[:max_points]
        all_keys, all_xy, all_labels = all_keys[winners], all_xy[winners], all_labels[winners]
    return all_xy, all_labels, all_keys


def stream_and_sample(feature_files_map, base_path, x_col, y_col, max_points=MAX_POINTS, chunksize=CHUNK_SIZE):
    """
    Stream every CSV chunk-by-chunk and draw a uniform random sample of rows.

    All existing files are read to the end, one chunk at a time, and at most
    ``max_points`` rows are retained at any moment, so memory stays bounded by
    roughly one chunk plus the sample. Each usable row (both plotted values
    present) has the same chance of ending up in the sample regardless of which
    file or chunk it came from; previously the first chunk alone could exhaust
    the budget, leaving only rows of the first file in the plot.

    Args:
        feature_files_map: feature level -> {capture name -> CSV file name}.
            Every level must be a key of the module-level ``folders`` mapping.
        base_path: root directory that contains the per-level folders.
        x_col: preferred column for the x axis.
        y_col: preferred column for the y axis.
        max_points: maximum number of rows to return.
        chunksize: rows per chunk passed to ``pandas.read_csv``.

    Returns:
        ``(X, y)`` where ``X`` is a float64 array of shape (n, 2) and ``y`` is an
        int32 array of shape (n,) holding 0 for rows from the "normal" capture
        and 1 for rows from any other capture. Row order is arbitrary.

    Raises:
        RuntimeError: if no row could be collected.
    """
    kept_xy = np.empty((0, 2), dtype=np.float64)
    kept_labels = np.empty(0, dtype=np.int32)
    kept_keys = np.empty(0, dtype=np.float64)

    if max_points > 0:
        for level, file_dict in feature_files_map.items():
            folder_path = os.path.join(base_path, folders[level])
            for name, fname in file_dict.items():
                fpath = os.path.join(folder_path, fname)
                if not os.path.exists(fpath):
                    continue
                label = 0 if name == "normal" else 1
                try:
                    reader = pd.read_csv(fpath, chunksize=chunksize, low_memory=False)
                    try:
                        for chunk in reader:
                            pair = _select_plot_columns(chunk, x_col, y_col)
                            if pair is None:
                                # nothing usable in this chunk
                                continue
                            sub = chunk[[pair[0], pair[1]]].dropna()
                            if sub.shape[0] == 0:
                                continue
                            values = sub.values.astype(np.float64)
                            kept_xy, kept_labels, kept_keys = _merge_into_reservoir(
                                kept_xy, kept_labels, kept_keys, values, label, max_points
                            )
                    finally:
                        # Release the file handle even when a chunk fails to parse.
                        reader.close()
                except pd.errors.EmptyDataError:
                    print(f"[WARN] {fpath} is empty, skipping.")
                except Exception as e:
                    # Deliberately best-effort: rows already merged from this file
                    # are kept and the remaining files are still processed.
                    print(f"[WARN] Error streaming {fpath}: {e}")
                print(f"[SAMPLED] {len(kept_keys)}/{max_points} points collected...")

    if len(kept_keys) == 0:
        raise RuntimeError("No samples collected from the dataset. Check that numeric features exist.")

    print(f"[DONE] Collected {kept_xy.shape[0]} points for plotting.")
    return kept_xy, kept_labels

# ------------------------------
# Plotting
# ------------------------------
def plot_scatter(X, y, x_label, y_label, out_path):
    """
    Draw the two-class scatter plot and save it to ``out_path``.

    Args:
        X: array indexed as ``X[mask, 0]`` / ``X[mask, 1]`` - column 0 is plotted
            on the x axis, column 1 on the y axis.
        y: array of class labels aligned with the rows of ``X``; rows equal to 0
            are drawn in blue as "No Attack", rows equal to 1 in red as "Attack".
        x_label: text for the x axis (also used in the title).
        y_label: text for the y axis (also used in the title).
        out_path: file path handed to ``Figure.savefig``.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        classes = ((0, "blue", "No Attack"), (1, "red", "Attack"))
        for class_value, colour, legend_name in classes:
            mask = (y == class_value)
            ax.scatter(X[mask, 0], X[mask, 1], c=colour, label=legend_name, alpha=0.5, s=8)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(f"Scatter ({x_label} vs {y_label}) — attack vs no-attack")
        ax.legend()
        fig.tight_layout()
        fig.savefig(out_path)
    finally:
        # Close this specific figure even if drawing or saving fails, so a
        # failed run does not leave the figure registered with pyplot.
        plt.close(fig)
    print(f"[SAVED] Combined scatter saved to: {out_path}")

# ------------------------------
# Cleanup temp files (.tmp/.temp)
# ------------------------------
def cleanup_temp_files(base_dirs=None):
    """
    Delete ``*.tmp`` / ``*.temp`` files located directly inside the given directories.

    WARNING: matching is by file name only. Nothing here checks that a file was
    created by this script, so pointing this at a shared location such as the
    system temp directory removes other programs' temporary files as well.
    Pass an explicit, script-owned directory list if that is not wanted.

    Args:
        base_dirs: iterable of directory paths to clean. Defaults to the system
            temp directory plus the module-level ``base_path``.

    Returns:
        The number of files that were removed.
    """
    if base_dirs is None:
        base_dirs = [tempfile.gettempdir(), base_path]
    patterns = ("*.tmp", "*.temp")
    removed = 0
    visited = set()
    for d in base_dirs:
        # Skip a directory listed twice so failures are not reported twice.
        dir_key = os.path.normcase(os.path.abspath(d))
        if dir_key in visited:
            continue
        visited.add(dir_key)
        # Escape the directory part so characters such as "[" in the path are
        # matched literally instead of being read as glob syntax.
        safe_dir = glob.escape(d)
        for pat in patterns:
            for f in glob.glob(os.path.join(safe_dir, pat)):
                if os.path.isdir(f) and not os.path.islink(f):
                    # A real directory that merely matches the pattern is not a temp file.
                    continue
                try:
                    os.remove(f)
                except OSError as e:
                    # Typically "file in use" or "permission denied"; keep going.
                    print(f"[CLEANUP] Could not remove {f}: {e}")
                else:
                    removed += 1
                    print(f"[CLEANUP] Removed temp file: {f}")
    if removed == 0:
        print("[CLEANUP] No temp files found to remove.")
    else:
        print(f"[CLEANUP] Removed {removed} temp files.")
    return removed

# ------------------------------
# Main
# ------------------------------
if __name__ == "__main__":
    # Step 1: settle on the two numeric features that become the plot axes.
    chosen_axes = find_two_numeric_columns(feature_files, base_path, chunksize=CHUNK_SIZE)
    x_col, y_col = chosen_axes
    if None in chosen_axes:
        print("[ERROR] Could not find two numeric columns to plot in the dataset.")
        raise SystemExit(1)

    # Step 2: stream the CSVs and collect at most MAX_POINTS labelled points.
    X_sampled, y_sampled = stream_and_sample(
        feature_files,
        base_path,
        x_col,
        y_col,
        max_points=MAX_POINTS,
        chunksize=CHUNK_SIZE,
    )

    # Step 3: render the combined scatter plot into the output folder.
    out_file = os.path.join(OUTPUT_DIR, "combined_scatter.png")
    plot_scatter(X_sampled, y_sampled, x_col, y_col, out_file)

    # Step 4: remove *.tmp / *.temp files from the system temp dir and base folder.
    cleanup_dirs = [tempfile.gettempdir(), base_path]
    cleanup_temp_files(cleanup_dirs)

    print("\n[SUCCESS] Done.")

[INFO] Chosen features: 'ttl' and 'ip_len' (from ./packet_features\normal.csv)
[SAMPLED] 20000/20000 points collected...
[DONE] Collected 20000 points for plotting.
[SAVED] Combined scatter saved to: outputs_scatter\combined_scatter.png
[CLEANUP] Could not remove C:\Users\VALMIK~1\AppData\Local\Temp\0aaf6103-e421-47be-a08d-275a36fc6fe8.tmp: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\VALMIK~1\\AppData\\Local\\Temp\\0aaf6103-e421-47be-a08d-275a36fc6fe8.tmp'
[CLEANUP] Could not remove C:\Users\VALMIK~1\AppData\Local\Temp\10dc3b92-bb32-419f-8837-66bc69d2cf90.tmp: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\VALMIK~1\\AppData\\Local\\Temp\\10dc3b92-bb32-419f-8837-66bc69d2cf90.tmp'
[CLEANUP] Could not remove C:\Users\VALMIK~1\AppData\Local\Temp\1cef3d27-3501-41ad-9004-ce7b2e5e72c2.tmp: [WinError 32] The process cannot access the file because it is being used by another 