In [None]:
from pathlib import Path
import pandas as pd

folder_path = Path("MIX")

csv_files = sorted(folder_path.glob("*.csv"))
if not csv_files:
    print("No CSV files found.")
else:
    for csv_path in csv_files:
        df = pd.read_csv(csv_path)
        print(f"{csv_path.name}:")
        print(f"  columns: {list(df.columns)}")
        print(f"  rows: {len(df)}\n")

In [None]:
from pathlib import Path
import pandas as pd

botiot_folder = Path("BOTIOT")
output_folder = Path("OUTPUT")
output_folder.mkdir(parents=True, exist_ok=True)

label_candidates = (
    "label", "Label", "attack", "Attack", "attack_label", "attack type",
    "attack_type", "type", "Type", "category", "Category", "class", "Class"
)
max_rows = 1_000_000

def save_partitions(df, base_name, suffix):
    if df.empty:
        return 0
    parts = (len(df) + max_rows - 1) // max_rows
    for idx in range(parts):
        chunk = df.iloc[idx * max_rows : (idx + 1) * max_rows]
        filename = (
            f"{base_name}_{suffix}.csv"
            if parts == 1
            else f"{base_name}_{suffix}_{idx + 1}.csv"
        )
        chunk.to_csv(output_folder / filename, index=False)
    return parts

attack_frames = []
normal_frames = []

for csv_path in sorted(botiot_folder.glob("*.csv")):
    try:
        source_df = pd.read_csv(csv_path)
    except Exception as exc:
        print(f"Skipping {csv_path.name}: failed to read ({exc}).")
        continue

    label_col = next((col for col in label_candidates if col in source_df.columns), None)
    if label_col is None:
        print(f"Skipping {csv_path.name}: no label column found.")
        continue

    labels = source_df[label_col]
    if pd.api.types.is_numeric_dtype(labels):
        labels_int = labels.fillna(-1).astype(int)
        attack_mask = labels_int == 1
        normal_mask = labels_int == 0
    else:
        labels_lower = labels.astype(str).str.strip().str.lower()
        attack_mask = labels_lower.str.contains("attack") | labels_lower.str.contains("malic") | labels_lower.str.contains("bot")
        normal_mask = labels_lower.str.contains("normal") | labels_lower.str.contains("benign") | labels_lower.str.contains("legit")

    attack_df = source_df[attack_mask]
    normal_df = source_df[normal_mask]

    if attack_df.empty and normal_df.empty:
        print(f"Skipping {csv_path.name}: could not separate attack/normal rows.")
        continue

    if not attack_df.empty:
        attack_frames.append(attack_df)
    if not normal_df.empty:
        normal_frames.append(normal_df)

    print(f"{csv_path.name}: attack_rows={len(attack_df)}, normal_rows={len(normal_df)}")

combined_attack = pd.concat(attack_frames, ignore_index=True) if attack_frames else pd.DataFrame()
combined_normal = pd.concat(normal_frames, ignore_index=True) if normal_frames else pd.DataFrame()

attack_files = save_partitions(combined_attack, botiot_folder.name, "attack")
normal_files = save_partitions(combined_normal, botiot_folder.name, "normal")

print(f"Total attack rows={len(combined_attack)} ({attack_files} file(s)), total normal rows={len(combined_normal)} ({normal_files} file(s))")

In [None]:
datasets = [Path("NUSW"), Path("TONIOT")]
dataset_summaries = {}

for dataset_folder in datasets:
    if not dataset_folder.exists():
        print(f"{dataset_folder.name}: folder not found, skipping.")
        continue

    attack_parts = []
    normal_parts = []

    for csv_path in sorted(dataset_folder.glob("*.csv")):
        try:
            source_df = pd.read_csv(csv_path)
        except Exception as exc:
            print(f"Skipping {dataset_folder.name}/{csv_path.name}: failed to read ({exc}).")
            continue

        label_col = next((col for col in label_candidates if col in source_df.columns), None)
        if label_col is None:
            print(f"Skipping {dataset_folder.name}/{csv_path.name}: no label column found.")
            continue

        labels = source_df[label_col]
        if pd.api.types.is_numeric_dtype(labels):
            labels_int = labels.fillna(-1).astype(int)
            attack_mask = labels_int == 1
            normal_mask = labels_int == 0
        else:
            labels_lower = labels.astype(str).str.strip().str.lower()
            attack_mask = (
                labels_lower.str.contains("attack")
                | labels_lower.str.contains("malic")
                | labels_lower.str.contains("bot")
            )
            normal_mask = (
                labels_lower.str.contains("normal")
                | labels_lower.str.contains("benign")
                | labels_lower.str.contains("legit")
            )

        attack_rows = source_df[attack_mask]
        normal_rows = source_df[normal_mask]

        if attack_rows.empty and normal_rows.empty:
            print(f"Skipping {dataset_folder.name}/{csv_path.name}: could not separate attack/normal rows.")
            continue

        if not attack_rows.empty:
            attack_parts.append(attack_rows)
        if not normal_rows.empty:
            normal_parts.append(normal_rows)

        print(
            f"{dataset_folder.name}/{csv_path.name}: "
            f"attack_rows={len(attack_rows)}, normal_rows={len(normal_rows)}"
        )

    combined_attack = pd.concat(attack_parts, ignore_index=True) if attack_parts else pd.DataFrame()
    combined_normal = pd.concat(normal_parts, ignore_index=True) if normal_parts else pd.DataFrame()

    attack_files = save_partitions(combined_attack, dataset_folder.name, "attack")
    normal_files = save_partitions(combined_normal, dataset_folder.name, "normal")

    dataset_summaries[dataset_folder.name] = {
        "attack_rows": len(combined_attack),
        "attack_files": attack_files,
        "normal_rows": len(combined_normal),
        "normal_files": normal_files,
    }

    print(
        f"{dataset_folder.name}: total attack rows={len(combined_attack)} ({attack_files} file(s)), "
        f"total normal rows={len(combined_normal)} ({normal_files} file(s))"
    )

dataset_summaries

In [None]:
output_folder = Path("OUTPUT")
output_v2_folder = Path("output_v2")
output_v2_folder.mkdir(parents=True, exist_ok=True)

attack_files = sorted(output_folder.glob("*attack*.csv"))

if not attack_files:
    print("No attack files found in OUTPUT.")
else:
    for attack_file in attack_files:
        try:
            df = pd.read_csv(attack_file)
        except Exception as exc:
            print(f"Skipping {attack_file.name}: failed to read ({exc}).")
            continue

        if "category" not in df.columns:
            print(f"Skipping {attack_file.name}: 'category' column missing.")
            continue

        subgroups = df.groupby(df["category"].fillna("unknown"))
        for subcat_value, sub_df in subgroups:
            subcat_str = str(subcat_value).strip() or "unknown"
            sanitized = "".join(ch.lower() if ch.isalnum() else "_" for ch in subcat_str)
            sanitized = "_".join(filter(None, sanitized.split("_"))) or "unknown"

            output_name = f"{attack_file.stem}_{sanitized}.csv"
            sub_df.to_csv(output_v2_folder / output_name, index=False)
            print(f"Saved {output_name} ({len(sub_df)} rows)")

In [12]:
category_counts = {}
missing_present = False
missing_count = 0

for csv_path in sorted(output_v2_folder.glob("*.csv")):
    try:
        data = pd.read_csv(csv_path, usecols=["category"], dtype="string")
    except ValueError:
        continue

    value_counts = data["category"].value_counts(dropna=False)
    for value, count in value_counts.items():
        if pd.isna(value):
            missing_present = True
            missing_count += count
        else:
            category_counts[value] = category_counts.get(value, 0) + count

unique_categories = sorted(category_counts)
if missing_present:
    unique_categories.append(pd.NA)

counts_data = []
for category in unique_categories:
    if pd.isna(category):
        counts_data.append((pd.NA, missing_count))
    else:
        counts_data.append((category, category_counts[category]))

result = pd.DataFrame(counts_data, columns=["category", "count"])
result["category"] = result["category"].astype("string")
result["count"] = result["count"].astype("Int64")
result

Unnamed: 0,category,count
0,Fuzzers,5051
1,Fuzzers,38390
2,Reconnaissance,12228
3,Shellcode,1288
4,Analysis,5354
5,Backdoor,3590
6,Backdoors,1068
7,DDoS,38532480
8,DoS,33037900
9,Exploits,89050


In [None]:
output_v3_folder = Path("output_v3")
output_v3_folder.mkdir(parents=True, exist_ok=True)

def normalize_category(value):
    if pd.isna(value):
        return "unknown"
    cleaned = " ".join(str(value).strip().split())
    return cleaned.lower() or "unknown"

def sanitize_filename(text):
    safe = "".join(ch.lower() if ch.isalnum() else "_" for ch in text)
    safe = "_".join(filter(None, safe.split("_")))
    return f"{safe or 'unknown'}_combined.csv"

category_to_file = {}

for csv_path in sorted(output_v2_folder.glob("*.csv")):
    try:
        for chunk in pd.read_csv(csv_path, chunksize=200_000):
            if "category" not in chunk.columns:
                continue
            chunk = chunk.copy()
            chunk.loc[:, "category"] = chunk["category"].apply(normalize_category)
            for category_value, group in chunk.groupby("category"):
                target_name = sanitize_filename(category_value)
                target_path = output_v3_folder / target_name
                category_to_file[category_value] = target_name
                group.to_csv(
                    target_path,
                    mode="a",
                    index=False,
                    header=not target_path.exists(),
                )
        print(f"Processed {csv_path.name}")
    except Exception as exc:
        print(f"Skipping {csv_path.name}: {exc}")

pd.Series(category_to_file, name="filename").sort_index()

In [14]:
v3_counts = []

for csv_path in sorted(output_v3_folder.glob("*.csv")):
    try:
        row_total = len(pd.read_csv(csv_path))
    except Exception as exc:
        print(f"Skipping {csv_path.name}: {exc}")
        continue

    v3_counts.append({"filename": csv_path.name, "rows": row_total})

v3_counts = pd.DataFrame(v3_counts).sort_values("filename").reset_index(drop=True)
v3_counts


  row_total = len(pd.read_csv(csv_path))
  row_total = len(pd.read_csv(csv_path))
  row_total = len(pd.read_csv(csv_path))


Unnamed: 0,filename,rows
0,analysis_combined.csv,5354
1,backdoor_combined.csv,511706
2,backdoors_combined.csv,1068
3,ddos_combined.csv,44697488
4,dos_combined.csv,36413228
5,exploits_combined.csv,89050
6,fuzzers_combined.csv,43441
7,generic_combined.csv,430962
8,injection_combined.csv,452659
9,mitm_combined.csv,1052


Quick take: in IoT datasets, “botnet-related” usually means attack behaviors commonly orchestrated by a distributed set of compromised devices.

Primary botnet behaviors: DDoS, DoS, Scanning, Reconnaissance, Password attacks
Propagation/foothold aids: Worms, Backdoor
Often included in botnet campaigns: Exploits, Injection
Not typically botnet-specific: MITM, XSS, Shellcode, Analysis, Theft, Generic, Normal
Related to botnets from your list:

DDoS: hallmark botnet activity leveraging many nodes.
DoS: often botnet-driven when large-scale.
Scanning: used by botnets to discover new targets.
Reconnaissance: pre-attack info gathering by botnets.
Password (brute-force/credential stuffing): common automated botnet task.
Worms: used to self-propagate and grow the botnet.
Backdoor: used to maintain remote control; common in botnet staging.
Exploits and Injection: frequently used by botnets to gain/extend access.
Less directly botnet-specific:

MITM, XSS, Shellcode, Theft, Generic, Analysis, Normal.

In [15]:
output_v4_folder = Path("output_v4")

category_counts_v4 = {}
missing_present_v4 = False
missing_count_v4 = 0

if output_v4_folder.exists():
    for csv_path in sorted(output_v4_folder.glob("*.csv")):
        try:
            data_v4 = pd.read_csv(csv_path, usecols=["category"], dtype="string")
        except ValueError:
            continue

        value_counts_v4 = data_v4["category"].value_counts(dropna=False)
        for value, count in value_counts_v4.items():
            if pd.isna(value):
                missing_present_v4 = True
                missing_count_v4 += int(count)
            else:
                category_counts_v4[value] = category_counts_v4.get(value, 0) + int(count)

    unique_categories_v4 = sorted(category_counts_v4)
    if missing_present_v4:
        unique_categories_v4.append(pd.NA)

    counts_data_v4 = []
    for category in unique_categories_v4:
        if pd.isna(category):
            counts_data_v4.append((pd.NA, missing_count_v4))
        else:
            counts_data_v4.append((category, category_counts_v4[category]))

    result_v4 = pd.DataFrame(counts_data_v4, columns=["category", "count"])
    result_v4["category"] = result_v4["category"].astype("string")
    result_v4["count"] = result_v4["count"].astype("Int64")
    result_v4
else:
    print("output_v4 folder not found.")

In [16]:
result_v4

Unnamed: 0,category,count
0,backdoor,511706
1,backdoors,1068
2,ddos,44697488
3,dos,36413228
4,exploits,89050
5,fuzzers,43441
6,injection,452659
7,normal,1005059
8,password,1718568
9,ransomware,72805


In [17]:
from pathlib import Path
import pandas as pd
import numpy as np

# Balanced dataset: keep all minority classes, downsample majorities,
# and balance total attacks vs normal.
source_folder = Path("output_v3")  # each file is one category combined
balanced_output = Path("balanced_output")
balanced_output.mkdir(parents=True, exist_ok=True)

# Configure caps
min_per_class = 5_000     # try to take at least this many when available
max_per_class = 80_000    # cap any single attack class to this

def read_category_file(category: str) -> pd.DataFrame:
    filename = f"{category.lower()}_combined.csv"
    path = source_folder / filename
    if not path.exists():
        raise FileNotFoundError(path)
    return pd.read_csv(path)

# Build counts table (reuse v3_counts if present)
try:
    v3_counts_df = v3_counts.copy()
except NameError:
    v3_counts_df = []
    for csv_path in sorted(source_folder.glob("*.csv")):
        try:
            c = len(pd.read_csv(csv_path))
        except Exception:
            c = 0
        v3_counts_df.append({"filename": csv_path.name, "rows": c})
    v3_counts_df = pd.DataFrame(v3_counts_df)

if v3_counts_df.empty:
    print("No category files to balance.")

def fname_to_cat(name: str) -> str:
    return name.replace("_combined.csv", "")

v3_counts_df["category"] = v3_counts_df["filename"].map(fname_to_cat)
available = v3_counts_df.set_index("category")["rows"].to_dict()

# Separate normal from attack categories
categories = sorted(available.keys())
attack_cats = [c for c in categories if c != "normal"]
has_normal = "normal" in available

balanced_parts = []
summary_rows = []

# For each attack class: take all if minority, else random downsample to cap
total_attack_taken = 0
for cat in attack_cats:
    avail = available[cat]
    df_cat = read_category_file(cat)
    take = avail
    # keep all minority classes fully
    if avail <= min_per_class:
        take = avail
    else:
        # majority classes: limit by max_per_class
        take = min(avail, max_per_class)
    if take < len(df_cat):
        df_cat = df_cat.sample(n=take, random_state=42)
    if "category" not in df_cat.columns:
        df_cat["category"] = cat
    balanced_parts.append(df_cat)
    total_attack_taken += len(df_cat)
    summary_rows.append({"category": cat, "available": avail, "taken": len(df_cat)})

# Balance normal to match total_attack_taken (or take all if smaller)
if has_normal:
    normal_df = read_category_file("normal")
    normal_avail = len(normal_df)
    normal_take = min(normal_avail, total_attack_taken)
    if normal_take < normal_avail:
        normal_df = normal_df.sample(n=normal_take, random_state=42)
    if "category" not in normal_df.columns:
        normal_df["category"] = "normal"
    balanced_parts.append(normal_df)
    summary_rows.append({"category": "normal", "available": normal_avail, "taken": len(normal_df)})

balanced_df = pd.concat(balanced_parts, ignore_index=True) if balanced_parts else pd.DataFrame()
summary_df = pd.DataFrame(summary_rows).sort_values("category")

out_csv = balanced_output / "balanced_sample.csv"
balanced_df.to_csv(out_csv, index=False)

print(f"Balanced sample saved to: {out_csv}")
summary_df

  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)


Balanced sample saved to: balanced_output\balanced_sample.csv


Unnamed: 0,category,available,taken
0,analysis,5354,5354
1,backdoor,511706,80000
2,backdoors,1068,1068
3,ddos,44697488,80000
4,dos,36413228,80000
5,exploits,89050,80000
6,fuzzers,43441,43441
7,generic,430962,80000
8,injection,452659,80000
9,mitm,1052,1052


In [20]:
attack_total = summary_df.loc[summary_df["category"] != "normal", "taken"].sum()
normal_total = summary_df.loc[summary_df["category"] == "normal", "taken"].sum()

pd.DataFrame(
    {"attack_total": [attack_total], "normal_total": [normal_total], "difference": [attack_total - normal_total]}
)

Unnamed: 0,attack_total,normal_total,difference
0,927389,927389,0


In [23]:
missing_summary = (
    balanced_df.isna()
    .sum()
    .rename("missing_count")
    .to_frame()
    .assign(total_rows=len(balanced_df))
)
missing_summary["missing_pct"] = (missing_summary["missing_count"] / missing_summary["total_rows"] * 100).round(2)
missing_summary.reset_index().rename(columns={"index": "column"})

Unnamed: 0,column,missing_count,total_rows,missing_pct
0,pkSeqID,0,1854778,0.0
1,stime,0,1854778,0.0
2,flgs,0,1854778,0.0
3,proto,0,1854778,0.0
4,saddr,0,1854778,0.0
5,sport,27,1854778,0.0
6,daddr,0,1854778,0.0
7,dport,27,1854778,0.0
8,pkts,0,1854778,0.0
9,bytes,0,1854778,0.0


In [28]:
missing_summary = (
    balanced_df.isna()
    .sum()
    .rename("missing_count")
    .to_frame()
    .assign(total_rows=len(balanced_df))
)
missing_summary["missing_pct"] = (missing_summary["missing_count"] / missing_summary["total_rows"] * 100).round(2)
missing_summary.reset_index().rename(columns={"index": "column"})

Unnamed: 0,column,missing_count,total_rows,missing_pct
0,pkSeqID,0,1854778,0.0
1,stime,0,1854778,0.0
2,flgs,0,1854778,0.0
3,proto,0,1854778,0.0
4,saddr,0,1854778,0.0
5,sport,27,1854778,0.0
6,daddr,0,1854778,0.0
7,dport,27,1854778,0.0
8,pkts,0,1854778,0.0
9,bytes,0,1854778,0.0


In [31]:
columns_to_drop = ["attack", "category", "subcategory","subcategory ","smac", "dmac","soui","doui","sco","dco"]

attacks_output = balanced_df.loc[balanced_df["category"] != "normal"].copy()
normals_output = balanced_df.loc[balanced_df["category"] == "normal"].copy()

attacks_output.drop(columns=columns_to_drop, inplace=True, errors="ignore")
normals_output.drop(columns=columns_to_drop, inplace=True, errors="ignore")

attacks_path = balanced_output / "attacks.csv"
normals_path = balanced_output / "normal.csv"

attacks_output.to_csv(attacks_path, index=False)
normals_output.to_csv(normals_path, index=False)

print(f"Saved {len(attacks_output)} attack rows to {attacks_path.name}")
print(f"Saved {len(normals_output)} normal rows to {normals_path.name}")

Saved 927389 attack rows to attacks.csv
Saved 927389 normal rows to normal.csv


In [32]:
def count_empty_cells(df: pd.DataFrame) -> pd.Series:
    na_counts = df.isna().sum()
    blank_counts = df.select_dtypes(include="object").eq("").sum()
    blank_counts = blank_counts.reindex(df.columns, fill_value=0)
    return na_counts.add(blank_counts, fill_value=0)

attack_empty = count_empty_cells(attacks_output)
normal_empty = count_empty_cells(normals_output)

empty_summary = pd.DataFrame(
    {"attack_empty": attack_empty, "normal_empty": normal_empty}
).astype("Int64")

empty_summary.loc["__TOTAL__"] = {
    col: int(empty_summary[col].sum()) for col in empty_summary.columns
}
empty_summary

Unnamed: 0,attack_empty,normal_empty
pkSeqID,0,0
stime,0,0
flgs,0,0
proto,0,0
saddr,0,0
sport,27,0
daddr,0,0
dport,27,0
pkts,0,0
bytes,0,0


In [39]:
chunk_size = 20_000_000

def drop_empty_rows(df: pd.DataFrame) -> pd.DataFrame:
    cleaned = df.copy()
    obj_cols = cleaned.select_dtypes(include="object").columns
    if len(obj_cols):
        cleaned[obj_cols] = cleaned[obj_cols].replace(r"^\s*$", pd.NA, regex=True)
    return cleaned.dropna()

attack_clean = drop_empty_rows(attacks_output)
normal_clean = drop_empty_rows(normals_output)

chunk_n = min(chunk_size, len(attack_clean), len(normal_clean))
if chunk_n == 0:
    raise ValueError("Cannot create a balanced chunk: one of the sources has no complete rows.")

attack_chunk = attack_clean.sample(n=chunk_n, random_state=42).copy()
attack_chunk["source"] = "attack"

normal_chunk = normal_clean.sample(n=chunk_n, random_state=42).copy()
normal_chunk["source"] = "normal"

chunk_df = (
    pd.concat([attack_chunk, normal_chunk], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

if chunk_df.isna().any().any() or chunk_df.select_dtypes(include="object").eq("").any().any():
    raise ValueError("Chunk still contains empty values after cleaning.")

chunk_path = balanced_output / "attack_normal_chunk_full.csv"
chunk_df.to_csv(chunk_path, index=False)

print(
    f"Saved {len(chunk_df)} rows to {chunk_path.name} "
    f"({(chunk_df['source'] == 'attack').sum()} attack / {(chunk_df['source'] == 'normal').sum()} normal)"
)
chunk_df.head()

  cleaned[obj_cols] = cleaned[obj_cols].replace(r"^\s*$", pd.NA, regex=True)


Saved 1854724 rows to attack_normal_chunk_full.csv (927362 attack / 927362 normal)


Unnamed: 0,pkSeqID,stime,flgs,proto,saddr,sport,daddr,dport,pkts,bytes,...,min,max,spkts,dpkts,sbytes,dbytes,rate,srate,drate,source
0,66424,1556087000.0,S0,udp,192.168.1.31,57056,192.168.1.194,10081,2,132.0,...,0.0,0.0,2,0,132.0,0,26.588754,26.588754,0.0,attack
1,747890,1556143000.0,S0,udp,192.168.1.195,57839,239.255.255.250,1900,24,3016.0,...,0.0,0.0,24,0,3016.0,0,910.589803,910.589803,0.0,normal
2,139150,1554314000.0,OTH,tcp,127.0.0.1,42100,127.0.0.1,7878,0,0.0,...,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,normal
3,137880,1556085000.0,SF,tcp,192.168.1.195,61775,192.168.1.1,41952,16,4204.0,...,0.0,0.0,7,9,226.0,3978,330529.129649,17768.692507,312760.437141,normal
4,81498,1556085000.0,S0,tcp,192.168.1.32,19824,192.168.1.194,6517,1,0.0,...,0.0,0.0,1,0,0.0,0,0.0,0.0,0.0,attack


In [40]:
from IPython.display import display
from ipaddress import ip_address
from numbers import Integral

if "chunk_df" in globals():
    chunk_source = chunk_df.copy()
else:
    chunk_path = balanced_output / "attack_normal_chunk2.csv"
    chunk_source = pd.read_csv(chunk_path)
    chunk_df = chunk_source.copy()

chunk_source = chunk_source.replace(r"^\s*$", pd.NA, regex=True)
non_empty_columns = chunk_source.columns[chunk_source.notna().any()]
chunk_source = chunk_source.dropna(axis=0, how="any", subset=non_empty_columns)
chunk_df = chunk_source.copy()

chunk_numeric = chunk_source.copy()
numeric_casts = {}
label_encoding_map = {}

INT64_MAX = 2**63 - 1

def ip_to_int(value):
    if pd.isna(value):
        return pd.NA
    text = str(value).strip()
    if not text:
        return pd.NA
    if ":" not in text:
        cleaned = text.replace(".", "")
        if cleaned.isdigit():
            return int(cleaned)
    try:
        return int(ip_address(text))
    except ValueError as exc:
        cleaned_hex = text.replace(".", "").replace(":", "")
        if cleaned_hex and all(ch in "0123456789abcdefABCDEF" for ch in cleaned_hex):
            return int(cleaned_hex, 16)
        raise ValueError(f"Unrecognized IP address format: {text}") from exc

address_cols = [col for col in ("saddr", "daddr") if col in chunk_numeric.columns]
for col in address_cols:
    converted_values = []
    bad_values = []
    for value in chunk_numeric[col]:
        try:
            converted_values.append(ip_to_int(value))
        except ValueError:
            bad_values.append(value)
            converted_values.append(pd.NA)
    if bad_values:
        bad_preview = pd.unique(bad_values)[:5]
        raise ValueError(f"Failed to convert {col} due to non-numeric values: {bad_preview}")
    converted_series = pd.Series(converted_values, index=chunk_numeric.index, dtype="object")
    non_na = converted_series.dropna()
    if not non_na.empty and all(isinstance(val, Integral) and abs(val) <= INT64_MAX for val in non_na):
        chunk_numeric[col] = converted_series.astype("Int64")
    else:
        chunk_numeric[col] = converted_series.astype("object")
    numeric_casts[col] = "ip_to_int"
    chunk_numeric[col] = converted.astype("Int64")
    numeric_casts[col] = "strip_dots_to_int"

skip_cols = set(address_cols) | {"sport", "dport"}
object_cols = [
    col for col, dtype in original_dtypes.items()
    if dtype == "object" and col not in skip_cols
]

for col in object_cols:
    coerced = pd.to_numeric(chunk_numeric[col], errors="coerce")
    if not coerced.isna().any():
        chunk_numeric[col] = coerced.astype("Int64")
        numeric_casts[col] = "pd.to_numeric"
    else:
        codes, uniques = pd.factorize(chunk_numeric[col], sort=True)
        chunk_numeric[col] = pd.Series(codes, index=chunk_numeric.index, dtype="int64")
        label_encoding_map[col] = {str(val): int(idx) for idx, val in enumerate(uniques)}

conversion_records = []
for col in chunk_numeric.columns:
    if col in numeric_casts:
        method = "numeric_cast"
    elif col in label_encoding_map:
        method = "label_encoding"
    else:
        method = "already_numeric"
    conversion_records.append(
        {
            "column": col,
            "original_dtype": original_dtypes[col],
            "new_dtype": str(chunk_numeric[col].dtype),
            "method": method,
            "distinct_values": chunk_source[col].nunique(dropna=False),
        }
    )

conversion_summary = (
    pd.DataFrame(conversion_records)
    .sort_values(["method", "column"])
    .reset_index(drop=True)
)

conversion_map_preview = {
    col: dict(list(mapping.items())[:5])
    for col, mapping in label_encoding_map.items()
}

chunk_int_path = balanced_output / "chunk_int_data.csv"
chunk_numeric.to_csv(chunk_int_path, index=False)

chunk_conversion_map = {
    "numeric_casts": numeric_casts,
    "label_encodings": label_encoding_map,
}

print(f"Saved numeric chunk data to {chunk_int_path}")
print("Full mapping stored in chunk_conversion_map.")
display(conversion_summary)
conversion_map_preview

Saved numeric chunk data to balanced_output\chunk_int_data.csv
Full mapping stored in chunk_conversion_map.


Unnamed: 0,column,original_dtype,new_dtype,method,distinct_values
0,bytes,float64,float64,already_numeric,18869
1,dbytes,int64,int64,already_numeric,11326
2,dpkts,int64,int64,already_numeric,1293
3,dport,object,object,already_numeric,56952
4,drate,float64,float64,already_numeric,416890
5,dur,float64,float64,already_numeric,496678
6,ltime,float64,float64,already_numeric,1098962
7,max,float64,float64,already_numeric,176068
8,mean,float64,float64,already_numeric,135976
9,min,float64,float64,already_numeric,116005


{'flgs': {'-': 0, 'OTH': 1, 'REJ': 2, 'RSTO': 3, 'RSTOS0': 4},
 'proto': {'3pc': 0, 'a/n': 1, 'aes-sp3-d': 2, 'any': 3, 'argus': 4},
 'state': {'ACC': 0, 'CON': 1, 'ECO': 2, 'FIN': 3, 'INT': 4},
 'source': {'attack': 0, 'normal': 1}}

In [41]:
if "chunk_df" not in globals():
    if "balanced_output" not in globals():
        balanced_output = Path("balanced_output")
    chunk_df = pd.read_csv(balanced_output / "attack_normal_chunk_full.csv")

class_counts = chunk_df["source"].value_counts()
if class_counts.size < 2:
    raise ValueError("chunk_df must contain both attack and normal samples.")

per_class_limit = min(500, class_counts.min())
if per_class_limit == 0:
    raise ValueError("One of the classes has no available rows for sampling.")

samples = []
selected_idx = []
for label in class_counts.index:
    sample_n = min(per_class_limit, class_counts[label])
    sampled = chunk_df[chunk_df["source"] == label].sample(n=sample_n, random_state=42)
    samples.append(sampled)
    selected_idx.extend(sampled.index.tolist())

test_df = (
    pd.concat(samples, ignore_index=False)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

remaining_chunk = chunk_df.drop(index=selected_idx).reset_index(drop=True)

if "balanced_output" not in globals():
    balanced_output = Path("balanced_output")
test_path = balanced_output / "test_data.csv"

if "chunk_path" not in globals():
    chunk_path = balanced_output / "attack_normal_chunk_full.csv"

test_df.to_csv(test_path, index=False)
remaining_chunk.to_csv(chunk_path, index=False)

chunk_df = remaining_chunk

test_counts = test_df["source"].value_counts().to_dict()
print(
    f"Saved {len(test_df)} balanced test rows ({test_counts}) to {test_path.name} "
    f"and updated {chunk_path.name} with {len(chunk_df)} rows."
)

Saved 1000 balanced test rows ({'normal': 500, 'attack': 500}) to test_data.csv and updated attack_normal_chunk_full.csv with 1853724 rows.
