# In [None]:
# mqtt_pcap_to_csv.py
# Standalone script that converts pcap files into packet-, uniflow-, and biflow-level CSV feature sets.
# Produces three folders: packet_features, uniflow_features, biflow_features.
# Each folder receives one CSV per capture label (normal, sparta, scan_A, mqtt_bruteforce, scan_sU);
# files in the uniflow and biflow folders carry a "uniflow_" / "biflow_" file-name prefix.

import os
import pandas as pd
from scapy.all import rdpcap, IP, TCP, UDP
from collections import defaultdict

# ------------------------------
# CONFIG
# ------------------------------
# Capture label -> pcap file path (relative paths resolve against the working directory).
INPUT_PCAPS = dict(
    normal="normal.pcap",
    sparta="sparta.pcap",
    scan_A="scan_A.pcap",
    mqtt_bruteforce="mqtt_bruteforce.pcap",
    scan_sU="scan_sU.pcap",
)
# Feature granularity -> directory that receives the CSVs for that granularity.
OUTPUT_DIRS = dict(
    packet="packet_features",
    uniflow="uniflow_features",
    biflow="biflow_features",
)

# Create every output directory up front; exist_ok makes re-runs harmless.
for folder in OUTPUT_DIRS.values():
    os.makedirs(folder, exist_ok=True)


# ------------------------------
# FEATURE EXTRACTION HELPERS
# ------------------------------

def extract_packet_features(packets):
    """Extract raw packet-level features into a DataFrame.

    Args:
        packets: Iterable of scapy packets (for example the result of
            ``rdpcap``). Packets without an ``IP`` layer are skipped.

    Returns:
        A DataFrame with one row per IP packet and the fixed columns
        ``src_ip, dst_ip, protocol, packet_len, ttl, sport, dport, flags``.
        ``packet_len`` is ``len(packet)``. ``sport``/``dport`` are 0 for
        packets that are neither TCP nor UDP, and ``flags`` is 0 for every
        non-TCP packet. The columns are present even when no packet
        qualifies, so the resulting CSV always has the same header.
    """
    columns = [
        "src_ip",
        "dst_ip",
        "protocol",
        "packet_len",
        "ttl",
        "sport",
        "dport",
        "flags",
    ]
    rows = []
    for p in packets:
        if IP not in p:
            continue
        ip = p[IP]

        if TCP in p:
            tcp = p[TCP]
            sport, dport, flags = tcp.sport, tcp.dport, int(tcp.flags)
        elif UDP in p:
            udp = p[UDP]
            # UDP carries no flags; use 0 (as for other non-TCP packets) so
            # the column never turns into NaN/float or goes missing.
            sport, dport, flags = udp.sport, udp.dport, 0
        else:
            sport = dport = flags = 0

        rows.append({
            "src_ip": ip.src,
            "dst_ip": ip.dst,
            "protocol": ip.proto,
            "packet_len": len(p),
            "ttl": ip.ttl,
            "sport": sport,
            "dport": dport,
            "flags": flags,
        })
    # Passing columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)


def extract_uniflow_features(packets):
    """Aggregate features per unidirectional flow.

    A flow is identified by ``(src_ip, dst_ip, sport, dport, protocol)``;
    only packets that have an ``IP`` layer and a TCP or UDP layer are counted.

    Args:
        packets: Iterable of scapy packets (for example the result of
            ``rdpcap``).

    Returns:
        A DataFrame with one row per flow and the fixed columns
        ``src_ip, dst_ip, sport, dport, protocol, pkt_count, bytes_total,
        bytes_mean``, where the byte figures are based on ``len(packet)``.
        The columns are present even when no flow was found, so the
        resulting CSV always has the same header.
    """
    columns = [
        "src_ip",
        "dst_ip",
        "sport",
        "dport",
        "protocol",
        "pkt_count",
        "bytes_total",
        "bytes_mean",
    ]
    flows = defaultdict(list)
    for p in packets:
        if IP not in p:
            continue
        # TCP takes precedence over UDP, matching the original if/elif order.
        if TCP in p:
            l4 = p[TCP]
        elif UDP in p:
            l4 = p[UDP]
        else:
            continue
        ip = p[IP]
        flows[(ip.src, ip.dst, l4.sport, l4.dport, ip.proto)].append(len(p))

    rows = []
    for (src, dst, sport, dport, proto), sizes in flows.items():
        total = sum(sizes)
        rows.append({
            "src_ip": src,
            "dst_ip": dst,
            "sport": sport,
            "dport": dport,
            "protocol": proto,
            "pkt_count": len(sizes),
            "bytes_total": total,
            # sizes is never empty: a key only exists once a packet was appended.
            "bytes_mean": total / len(sizes),
        })
    # Passing columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)


def extract_biflow_features(packets):
    """Aggregate features per bidirectional flow (ignores direction).

    The two ``(ip, port)`` endpoints of each packet are sorted before being
    used as the flow key, so packets travelling in either direction between
    the same endpoints land in the same flow. Only packets that have an
    ``IP`` layer and a TCP or UDP layer are counted.

    Args:
        packets: Iterable of scapy packets (for example the result of
            ``rdpcap``).

    Returns:
        A DataFrame with one row per flow and the fixed columns
        ``endA_ip, endA_port, endB_ip, endB_port, protocol, pkt_count,
        bytes_total, bytes_mean``, where endpoint A is the one that sorts
        first and the byte figures are based on ``len(packet)``. The columns
        are present even when no flow was found, so the resulting CSV always
        has the same header.
    """
    columns = [
        "endA_ip",
        "endA_port",
        "endB_ip",
        "endB_port",
        "protocol",
        "pkt_count",
        "bytes_total",
        "bytes_mean",
    ]
    flows = defaultdict(list)
    for p in packets:
        if IP not in p:
            continue
        # TCP takes precedence over UDP, matching the original if/elif order.
        if TCP in p:
            l4 = p[TCP]
        elif UDP in p:
            l4 = p[UDP]
        else:
            continue
        ip = p[IP]
        # Sorting the endpoints makes the key direction-independent.
        endpoints = sorted([(ip.src, l4.sport), (ip.dst, l4.dport)])
        flows[tuple(endpoints) + (ip.proto,)].append(len(p))

    rows = []
    for ((a_ip, a_port), (b_ip, b_port), proto), sizes in flows.items():
        total = sum(sizes)
        rows.append({
            "endA_ip": a_ip,
            "endA_port": a_port,
            "endB_ip": b_ip,
            "endB_port": b_port,
            "protocol": proto,
            "pkt_count": len(sizes),
            "bytes_total": total,
            # sizes is never empty: a key only exists once a packet was appended.
            "bytes_mean": total / len(sizes),
        })
    # Passing columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)


# ------------------------------
# MAIN PIPELINE
# ------------------------------

def process_pcap(label, filepath):
    """Convert one pcap file into its three feature CSVs.

    Reads ``filepath`` with ``rdpcap``, runs the packet, uniflow and biflow
    extractors on the loaded packets, then writes each result (without the
    index) into the matching ``OUTPUT_DIRS`` folder.

    Args:
        label: Name used to build the output file names and log messages.
        filepath: Path of the pcap file to read.
    """
    print(f"[INFO] Processing {label}: {filepath}")
    capture = rdpcap(filepath)

    # Run every extractor before writing anything, so a failing extractor
    # leaves no partial output behind.
    results = [
        ("packet", f"{label}.csv", extract_packet_features(capture)),
        ("uniflow", f"uniflow_{label}.csv", extract_uniflow_features(capture)),
        ("biflow", f"biflow_{label}.csv", extract_biflow_features(capture)),
    ]

    for kind, csv_name, frame in results:
        target = os.path.join(OUTPUT_DIRS[kind], csv_name)
        frame.to_csv(target, index=False)

    print(f"[OK] Saved features for {label}")


if __name__ == "__main__":
    # Convert every configured capture; a missing file is reported and skipped
    # rather than aborting the whole run.
    for label, fname in INPUT_PCAPS.items():
        if os.path.isfile(fname):
            process_pcap(label, fname)
        else:
            print(f"[WARN] File {fname} not found, skipping.")

    print("[DONE] All pcaps processed. Features are in packet_features/, uniflow_features/, biflow_features/")