In [1]:
#!/usr/bin/env python3
"""
Histogram plot of number of data points per class (attack type + normal).

This script:
 - Streams CSV files in chunks (no full concat, memory safe)
 - Counts rows per class (normal + attack types)
 - Plots a histogram (bar chart) saved to OUTPUT_DIR/class_histogram.png
 - Cleans up temporary *.tmp / *.temp files in system temp dir and base folder
"""

import os
import glob
import tempfile
import pandas as pd
import matplotlib.pyplot as plt

# ------------------------------
# Config
# ------------------------------
base_path = "./"   # root folder where feature folders live

# Feature level -> sub-folder (under base_path) holding that level's CSV files.
folders = {
    level: f"{level}_features"
    for level in ("packet", "uniflow", "biflow")
}

# Class label -> CSV file name, in the un-prefixed naming scheme.
files = {
    label: f"{label}.csv"
    for label in ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
}
def build_filenames(prefix):
    """Return a class label -> file name map using ``<prefix>_<label>.csv`` names."""
    labels = ("normal", "sparta", "scan_A", "mqtt_bruteforce", "scan_sU")
    return {label: f"{prefix}_{label}.csv" for label in labels}
# Feature level -> {class label: CSV file name}. The packet level uses the
# un-prefixed names; the flow levels prefix each file with the level name.
feature_files = {"packet": files}
feature_files.update(
    (level, build_filenames(level)) for level in ("uniflow", "biflow")
)

# Directory that receives the rendered histogram; created up front.
OUTPUT_DIR = "outputs_histogram"
os.makedirs(OUTPUT_DIR, exist_ok=True)

CHUNK_SIZE = 50_000  # pandas read_csv chunksize

# ------------------------------
# Count samples per class
# ------------------------------
def count_samples(feature_files_map, base_path, chunksize=CHUNK_SIZE):
    """Count CSV rows per class label by streaming each file in chunks.

    Totals are keyed by class label only, so rows from every feature level
    that uses the same label are summed into a single count.

    Args:
        feature_files_map: Mapping of feature level -> {class label: CSV file
            name}. Each level must also be a key of the module-level
            ``folders`` mapping, which supplies the level's sub-folder.
        base_path: Root directory that contains the per-level folders.
        chunksize: Number of rows pandas reads per chunk.

    Returns:
        dict mapping class label -> total number of rows counted.

    Raises:
        KeyError: If a level in ``feature_files_map`` is not in ``folders``.
        RuntimeError: If no file contributed a count at all.
    """
    class_counts = {}

    for level, file_dict in feature_files_map.items():
        folder_path = os.path.join(base_path, folders[level])
        for name, fname in file_dict.items():
            fpath = os.path.join(folder_path, fname)
            if not os.path.exists(fpath):
                # Report the gap instead of skipping silently, so a typo in a
                # file name does not quietly shrink a class's total.
                print(f"[WARN] {fpath} not found, skipping.")
                continue
            try:
                # Only the per-chunk lengths are retained; each chunk itself
                # is released as soon as its length has been taken.
                chunk_rows = [
                    len(chunk)
                    for chunk in pd.read_csv(
                        fpath, chunksize=chunksize, low_memory=False
                    )
                ]
            except pd.errors.EmptyDataError:
                print(f"[WARN] {fpath} is empty, skipping.")
            except Exception as e:
                # Best-effort: one unreadable file must not abort the run.
                print(f"[WARN] Could not read {fpath}: {e}")
            else:
                # Commit only after the whole file was read, so a file that
                # fails part-way never contributes a truncated count.
                if chunk_rows:
                    class_counts[name] = class_counts.get(name, 0) + sum(chunk_rows)

    if not class_counts:
        raise RuntimeError("No data points found in any CSV file.")
    return class_counts

# ------------------------------
# Plot histogram
# ------------------------------
def plot_histogram(class_counts, out_path):
    """Render one bar per class and save the figure to ``out_path``.

    Args:
        class_counts: Mapping of class label -> number of data points. Bars
            are drawn in the mapping's iteration order.
        out_path: Destination path of the image file.
    """
    labels = list(class_counts)
    heights = list(class_counts.values())

    plt.figure(figsize=(8, 6))
    plt.bar(labels, heights, color="skyblue", edgecolor="black")
    plt.title("Data Points per Class (Attack Types + Normal)")
    plt.ylabel("Number of Data Points")
    # Slant the class names so neighbouring labels stay readable.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(out_path)
    # Close the figure so repeated calls do not pile up open figures.
    plt.close()
    print(f"[SAVED] Histogram saved to: {out_path}")

# ------------------------------
# Cleanup temp files (.tmp/.temp)
# ------------------------------
def cleanup_temp_files(base_dirs=None):
    """Delete ``*.tmp`` / ``*.temp`` files directly inside the given directories.

    Only the top level of each directory is scanned (no recursion). Files that
    cannot be deleted, for example because another process holds them open,
    are reported and skipped, so cleanup never aborts the script.

    Args:
        base_dirs: Iterable of directory paths to sweep. Defaults to the
            system temp directory plus the module-level ``base_path``.
            NOTE(review): the default sweeps the shared system temp
            directory, where matching files may belong to other programs.
    """
    if base_dirs is None:
        base_dirs = [tempfile.gettempdir(), base_path]
    patterns = ("*.tmp", "*.temp")
    removed = 0
    for d in base_dirs:
        # Escape the directory part so glob metacharacters in its path
        # ("[", "]", "*", "?") are matched literally, not as wildcards.
        literal_dir = glob.escape(os.fspath(d))
        for pat in patterns:
            for f in glob.glob(os.path.join(literal_dir, pat)):
                # os.remove cannot delete directories; skip them rather than
                # reporting a spurious failure. Symlinks are still removed.
                if os.path.isdir(f) and not os.path.islink(f):
                    continue
                try:
                    os.remove(f)
                except OSError as e:
                    print(f"[CLEANUP] Could not remove {f}: {e}")
                else:
                    removed += 1
                    print(f"[CLEANUP] Removed temp file: {f}")
    if removed == 0:
        print("[CLEANUP] No temp files found to remove.")
    else:
        print(f"[CLEANUP] Removed {removed} temp files.")

# ------------------------------
# Main
# ------------------------------
if __name__ == "__main__":
    # Step 1: tally rows per class across every configured feature level.
    counts = count_samples(feature_files, base_path, CHUNK_SIZE)
    print("[INFO] Sample counts per class:", counts)

    # Step 2: draw the bar chart and store it in the output directory.
    out_file = os.path.join(OUTPUT_DIR, "class_histogram.png")
    plot_histogram(class_counts=counts, out_path=out_file)

    # Step 3: sweep leftover *.tmp / *.temp files.
    # NOTE(review): this also targets the shared system temp directory, where
    # matching files may belong to other programs; confirm that is intended.
    temp_roots = [tempfile.gettempdir(), base_path]
    cleanup_temp_files(temp_roots)

    print("\n[SUCCESS] Done.")

[INFO] Sample counts per class: {'normal': 1314075, 'sparta': 20949865, 'scan_A': 188443, 'mqtt_bruteforce': 10095091, 'scan_sU': 329764}
[SAVED] Histogram saved to: outputs_histogram\class_histogram.png
[CLEANUP] Could not remove C:\Users\VALMIK~1\AppData\Local\Temp\29acb3a2-ae9f-4900-bbe8-e6f9d7d72d33.tmp: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\VALMIK~1\\AppData\\Local\\Temp\\29acb3a2-ae9f-4900-bbe8-e6f9d7d72d33.tmp'
[CLEANUP] Could not remove C:\Users\VALMIK~1\AppData\Local\Temp\3804e259-4296-49c5-85be-77042dfa429c.tmp: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\VALMIK~1\\AppData\\Local\\Temp\\3804e259-4296-49c5-85be-77042dfa429c.tmp'
[CLEANUP] Could not remove C:\Users\VALMIK~1\AppData\Local\Temp\4883b47a-5728-47b4-857f-9fba5d2ce8ad.tmp: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\VALMIK~1\\Ap