In [None]:
from __future__ import annotations

import numpy as np
import pandas as pd
from pathlib import Path
from typing import Iterable

# Column order of the converted CSVs. Both converters in this file build a
# frame keyed by these names and then select ``result[BOTIOT_COLUMNS]`` right
# before writing, so this list alone decides which columns are emitted and in
# what order.
BOTIOT_COLUMNS = [
    "pkSeqID", "stime", "flgs", "proto", "saddr",
    "sport", "daddr", "dport", "pkts", "bytes",
    "state", "ltime", "seq", "dur", "mean",
    "stddev", "smac", "dmac", "sum", "min",
    "max", "soui", "doui", "sco", "dco",
    "spkts", "dpkts", "sbytes", "dbytes", "rate",
    "srate", "drate", "attack", "category", "subcategory",
]

def _safe_numeric(series: pd.Series, default: float = 0.0) -> pd.Series:
    values = pd.to_numeric(series, errors="coerce")
    if default == 0:
        return values.fillna(0.0)
    return values.fillna(default)

def transform_toniot(source: Path, target: Path) -> None:
    """Rewrite one TON_IoT network CSV in the BoT-IoT column layout.

    Reads ``source`` (with its own header row), derives packet/byte totals,
    the end timestamp and byte rates from the per-direction counters, and
    writes the result to ``target`` with exactly the columns listed in
    ``BOTIOT_COLUMNS`` and no index column.

    Input columns that are absent are tolerated instead of raising: numeric
    ones count as 0, ``conn_state`` falls back to "-" for ``flgs``, and the
    remaining pass-through columns are written empty. Output fields with no
    source here (``mean``, ``stddev``, ``min``, ``max``, ``soui``, ``doui``,
    ``sco``, ``dco``) are filled with 0 and ``smac``/``dmac`` with "".

    Args:
        source: CSV file to read.
        target: CSV file to write.
    """
    df = pd.read_csv(source)

    def numeric(column: str) -> pd.Series:
        # Always hand _safe_numeric a real Series. The scalar fallback of
        # ``df.get(column, 0)`` has no ``.fillna`` and used to raise when the
        # column was missing.
        if column in df.columns:
            return _safe_numeric(df[column])
        return pd.Series(0.0, index=df.index)

    duration = numeric("duration")
    src_bytes = numeric("src_bytes")
    dst_bytes = numeric("dst_bytes")
    src_pkts = numeric("src_pkts")
    dst_pkts = numeric("dst_pkts")
    ts = numeric("ts")
    missed_bytes = numeric("missed_bytes")

    total_bytes = src_bytes + dst_bytes
    total_pkts = src_pkts + dst_pkts
    ltime = ts + duration

    # Rows with a non-positive duration get a rate of 0 rather than inf/nan.
    with np.errstate(divide="ignore", invalid="ignore"):
        has_duration = duration > 0
        rate = np.where(has_duration, total_bytes / duration, 0)
        srate = np.where(has_duration, src_bytes / duration, 0)
        drate = np.where(has_duration, dst_bytes / duration, 0)

    # Binary attack flag. A textual label is an attack unless it reads
    # "normal" (case-insensitive). A numeric label (0/1, also as "0"/"1"
    # strings) is an attack when non-zero; comparing those to "normal" would
    # flag every row.
    # NOTE(review): a missing ``label`` column behaves like an empty label and
    # therefore flags every row, mirroring the original "" default - confirm
    # that this is the desired fallback.
    if "label" in df.columns:
        label = df["label"]
    else:
        label = pd.Series("", index=df.index, dtype=object)
    label_text = label.astype(str).str.lower()
    label_code = pd.to_numeric(label, errors="coerce")
    attack_numeric = np.where(
        label_code.notna(),
        label_code.ne(0),
        label_text.ne("normal"),
    ).astype(int)

    # Fallback is built on df.index so it lines up with the other columns.
    if "conn_state" in df.columns:
        flags = df["conn_state"]
    else:
        flags = pd.Series("-", index=df.index)

    result = pd.DataFrame({
        "pkSeqID": np.arange(1, len(df) + 1),
        "stime": df.get("ts"),
        "flgs": flags,
        "proto": df.get("proto"),
        "saddr": df.get("src_ip"),
        "sport": df.get("src_port"),
        "daddr": df.get("dst_ip"),
        "dport": df.get("dst_port"),
        "pkts": total_pkts,
        "bytes": total_bytes,
        "state": df.get("conn_state"),
        "ltime": ltime,
        "seq": missed_bytes,
        "dur": duration,
        "mean": 0,
        "stddev": 0,
        "smac": "",
        "dmac": "",
        "sum": total_bytes,
        "min": 0,
        "max": 0,
        "soui": 0,
        "doui": 0,
        "sco": 0,
        "dco": 0,
        "spkts": src_pkts,
        "dpkts": dst_pkts,
        "sbytes": src_bytes,
        "dbytes": dst_bytes,
        "rate": rate,
        "srate": srate,
        "drate": drate,
        "attack": attack_numeric,
        "category": df.get("type"),
        "subcategory": df.get("label"),
    })

    # Enforce the shared column order before writing.
    result = result[BOTIOT_COLUMNS]
    result.to_csv(target, index=False)

def transform_unsw(source: Path, target: Path) -> None:
    """Rewrite one headerless UNSW CSV in the BoT-IoT column layout.

    ``source`` is read without a header row and its columns are named
    positionally from the list below. Totals and byte rates are derived from
    the per-direction counters, several UNSW fields are placed into BoT-IoT
    slots, and the result is written to ``target`` with exactly the columns
    in ``BOTIOT_COLUMNS`` and no index column.

    Slot mapping used here: ``service`` -> ``flgs`` and ``subcategory``;
    ``ct_state_ttl`` -> ``seq``; average of ``smeansz``/``dmeansz`` ->
    ``mean``; average of ``Sjit``/``Djit`` -> ``stddev``; ``Sintpkt`` ->
    ``min``; ``Dintpkt`` -> ``max``; ``ct_srv_src``/``ct_srv_dst`` ->
    ``soui``/``doui``; ``ct_dst_ltm``/``ct_src_ltm`` -> ``sco``/``dco``.
    ``smac``/``dmac`` are written as "".

    Args:
        source: Headerless CSV file to read.
        target: CSV file to write.
    """
    unsw_columns = [
        "srcip",
        "sport",
        "dstip",
        "dsport",
        "proto",
        "state",
        "dur",
        "sbytes",
        "dbytes",
        "sttl",
        "dttl",
        "sloss",
        "dloss",
        "service",
        "Sload",
        "Dload",
        "Spkts",
        "Dpkts",
        "swin",
        "dwin",
        "stcpb",
        "dtcpb",
        "smeansz",
        "dmeansz",
        "trans_depth",
        "res_bdy_len",
        "Sjit",
        "Djit",
        "Stime",
        "Ltime",
        "Sintpkt",
        "Dintpkt",
        "tcprtt",
        "synack",
        "ackdat",
        "is_sm_ips_ports",
        "ct_state_ttl",
        "ct_flw_http_mthd",
        "is_ftp_login",
        "ct_ftp_cmd",
        "ct_srv_src",
        "ct_srv_dst",
        "ct_dst_ltm",
        "ct_src_ltm",
        "ct_src_dport_ltm",
        "ct_dst_sport_ltm",
        "ct_dst_src_ltm",
        "attack_cat",
        "Label",
    ]

    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk, avoiding mixed-type columns (and the
    # accompanying DtypeWarning) when a column mixes numbers and text.
    df = pd.read_csv(source, header=None, names=unsw_columns, low_memory=False)

    def numeric(column: str) -> pd.Series:
        # ``names=`` guarantees every listed column exists (short rows are
        # NaN-padded), so index directly instead of using a scalar fallback
        # that _safe_numeric could not process.
        return _safe_numeric(df[column])

    dur = numeric("dur")
    sbytes = numeric("sbytes")
    dbytes = numeric("dbytes")
    spkts = numeric("Spkts")
    dpkts = numeric("Dpkts")
    sum_bytes = sbytes + dbytes
    total_pkts = spkts + dpkts
    stime = numeric("Stime")
    ltime = numeric("Ltime")
    sintpkt = numeric("Sintpkt")
    dintpkt = numeric("Dintpkt")
    ct_state_ttl = numeric("ct_state_ttl")
    ct_srv_src = numeric("ct_srv_src")
    ct_srv_dst = numeric("ct_srv_dst")
    ct_dst_ltm = numeric("ct_dst_ltm")
    ct_src_ltm = numeric("ct_src_ltm")

    # Per-direction figures are averaged into the single mean/stddev slots.
    mean_vals = (numeric("smeansz") + numeric("dmeansz")) / 2.0
    std_vals = (numeric("Sjit") + numeric("Djit")) / 2.0

    # Rows with a non-positive duration get a rate of 0 rather than inf/nan.
    with np.errstate(divide="ignore", invalid="ignore"):
        has_duration = dur > 0
        rate = np.where(has_duration, sum_bytes / dur, 0)
        srate = np.where(has_duration, sbytes / dur, 0)
        drate = np.where(has_duration, dbytes / dur, 0)

    result = pd.DataFrame({
        "pkSeqID": np.arange(1, len(df) + 1),
        "stime": stime,
        "flgs": df["service"],
        "proto": df["proto"],
        "saddr": df["srcip"],
        "sport": df["sport"],
        "daddr": df["dstip"],
        "dport": df["dsport"],
        "pkts": total_pkts,
        "bytes": sum_bytes,
        "state": df["state"],
        "ltime": ltime,
        "seq": ct_state_ttl,
        "dur": dur,
        "mean": mean_vals,
        "stddev": std_vals,
        "smac": "",
        "dmac": "",
        "sum": sum_bytes,
        "min": sintpkt,
        "max": dintpkt,
        "soui": ct_srv_src,
        "doui": ct_srv_dst,
        "sco": ct_dst_ltm,
        "dco": ct_src_ltm,
        "spkts": spkts,
        "dpkts": dpkts,
        "sbytes": sbytes,
        "dbytes": dbytes,
        "rate": rate,
        "srate": srate,
        "drate": drate,
        # NOTE(review): label and category are passed through untouched; rows
        # with a blank ``attack_cat`` are written with an empty category -
        # confirm downstream code expects that.
        "attack": df["Label"],
        "category": df["attack_cat"],
        "subcategory": df["service"],
    })

    # Enforce the shared column order before writing.
    result = result[BOTIOT_COLUMNS]
    result.to_csv(target, index=False)

def report_missing(files: Iterable[Path], base: Path) -> None:
    """Print one "Skipping ..." line for every path in ``files`` that does not exist.

    Args:
        files: Paths to check, visited in iteration order.
        base: Location named in the message; it is only interpolated into the
            text and never used to resolve ``files``.
    """
    absent = (candidate for candidate in files if not candidate.exists())
    for candidate in absent:
        print(f"Skipping {candidate.name}: not found in {base}")

In [None]:
from pathlib import Path
from convert_to_botiot import transform_toniot

# TON_IoT conversion settings - adjust the pattern and both directories
# before running this cell.
toniot_pattern = "*.csv"  # glob applied inside the input directory
toniot_input_dir = Path(r"C:\Users\z-pc\Desktop\lightnet-botnet-detector\data\raw\Processed_TON_IoT_dataset\Processed_Network_dataset")
toniot_output_dir = Path(r"C:\Users\z-pc\Desktop\data pro\TONIOT")

# Create the destination (and any missing parents); an existing directory is fine.
toniot_output_dir.mkdir(parents=True, exist_ok=True)

# Sorted so files are converted in a stable, name-based order. Each output
# keeps the input's stem and always gets a .csv extension.
toniot_files = sorted(toniot_input_dir.glob(toniot_pattern))
for csv_path in toniot_files:
    output_path = toniot_output_dir.joinpath(f"{csv_path.stem}.csv")
    transform_toniot(csv_path, output_path)
    print(f"Wrote {output_path}")

if not toniot_files:
    print("No TONIOT files matched the provided pattern.")

In [None]:
from pathlib import Path
from convert_to_botiot import transform_unsw

# UNSW conversion settings - adjust the pattern and both directories before
# running this cell. The matched files must not contain a header row.
unsw_pattern = "*.csv"  # glob applied inside the input directory
unsw_input_dir = Path(r"C:\Users\z-pc\Desktop\data pro\New folder")
unsw_output_dir = Path(r"C:\Users\z-pc\Desktop\data pro\NUSW")

# Create the destination (and any missing parents); an existing directory is fine.
unsw_output_dir.mkdir(parents=True, exist_ok=True)

# Sorted so files are converted in a stable, name-based order. Each output
# keeps the input's stem and always gets a .csv extension.
unsw_files = sorted(unsw_input_dir.glob(unsw_pattern))
for csv_path in unsw_files:
    output_path = unsw_output_dir.joinpath(f"{csv_path.stem}.csv")
    transform_unsw(csv_path, output_path)
    print(f"Wrote {output_path}")

if not unsw_files:
    print("No UNSW files matched the provided pattern.")