# Step 1: Split Classic Low‑Rate DDoS (Slowloris) vs. Benign Traffic

This notebook:
1. Loads the full CICIoT2023 dataset.
2. Cleans the data by dropping missing values and duplicates.
3. Filters to only Slowloris (classic low‑rate DDoS) and Benign labels.
4. Splits the filtered data by label and saves each subset to CSV.

In [1]:
# Import required libraries
import os
import glob
import pandas as pd

In [2]:
# Load and concatenate all CSV parts
dataset_dir = "./CICIoT2023"
file_pattern = os.path.join(dataset_dir, "part*.csv")
csv_files = glob.glob(file_pattern)
print("Number of CSV files found:", len(csv_files))

data_list = []
for file in csv_files:
    print("Reading file:", file)
    df = pd.read_csv(file)
    data_list.append(df)

data = pd.concat(data_list, ignore_index=True)
print("Combined raw dataset shape:", data.shape)

Number of CSV files found: 169
Reading file: ./CICIoT2023/part-00090-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Reading file: ./CICIoT2023/part-00162-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Reading file: ./CICIoT2023/part-00040-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Reading file: ./CICIoT2023/part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Reading file: ./CICIoT2023/part-00133-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Reading file: ./CICIoT2023/part-00141-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Reading file: ./CICIoT2023/part-00063-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Reading file: ./CICIoT2023/part-00032-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Reading file: ./CICIoT2023/part-00110-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Reading file: ./CICIoT2023/part-00087-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Reading file: ./CICIoT2023/part-00029-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Reading file: ./CICIoT2023/part-00057-363d1ba

In [3]:
# Data Cleaning: drop missing values and duplicates
missing_counts = data.isnull().sum()
print("Missing values per column:\n", missing_counts)

data_cleaned = data.dropna().reset_index(drop=True)
print("Shape after dropping missing values:", data_cleaned.shape)

data_cleaned = data_cleaned.drop_duplicates().reset_index(drop=True)
print("Shape after dropping duplicates:", data_cleaned.shape)

Missing values per column:
 flow_duration      0
Header_Length      0
Protocol Type      0
Duration           0
Rate               0
Srate              0
Drate              0
fin_flag_number    0
syn_flag_number    0
rst_flag_number    0
psh_flag_number    0
ack_flag_number    0
ece_flag_number    0
cwr_flag_number    0
ack_count          0
syn_count          0
fin_count          0
urg_count          0
rst_count          0
HTTP               0
HTTPS              0
DNS                0
Telnet             0
SMTP               0
SSH                0
IRC                0
TCP                0
UDP                0
DHCP               0
ARP                0
ICMP               0
IPv                0
LLC                0
Tot sum            0
Min                0
Max                0
AVG                0
Std                0
Tot size           0
IAT                0
Number             0
Magnitue           0
Radius             0
Covariance         0
Variance           0
Weight             0
label 

In [4]:
# Preview cleaned data
print("First 5 rows of cleaned dataset:")
print(data_cleaned.head())

First 5 rows of cleaned dataset:
   flow_duration  Header_Length  Protocol Type  Duration          Rate  \
0       0.037456       15099.00           17.0      64.0  10001.102371   
1       0.000000          54.00            6.0      64.0      0.000000   
2       0.010346        9662.50           17.0      64.0  21380.056228   
3       0.000000          54.00            6.0      64.0    241.333973   
4       0.195109          95.58            6.0      64.0      6.762174   

          Srate  Drate  fin_flag_number  syn_flag_number  rst_flag_number  \
0  10001.102371    0.0              0.0              0.0              0.0   
1      0.000000    0.0              0.0              0.0              0.0   
2  21380.056228    0.0              0.0              0.0              0.0   
3    241.333973    0.0              0.0              0.0              0.0   
4      6.762174    0.0              0.0              1.0              0.0   

   ...  Std  Tot size           IAT  Number   Magnitue  Rad

In [5]:
# save the preprocessed dataset for future use
preprocessed_file = "preprocessed_CICIoT2023.csv"
data_cleaned.to_csv(preprocessed_file, index=False)
print(f"Saved cleaned data to {preprocessed_file}")

Saved cleaned data to preprocessed_CICIoT2023.csv


In [6]:
# Filter to only Slowloris vs. Benign
labels_of_interest = ["DDoS-SlowLoris", "BenignTraffic"]
data_filtered = data_cleaned[data_cleaned["label"].isin(labels_of_interest)].reset_index(drop=True)
print(f"Filtered dataset shape (Slowloris vs. Benign): {data_filtered.shape}")
print("Labels after filtering:", data_filtered["label"].unique())

Filtered dataset shape (Slowloris vs. Benign): (1121621, 47)
Labels after filtering: ['BenignTraffic' 'DDoS-SlowLoris']


In [7]:
# Split Data by Type based on the filtered labels
unique_labels = data_filtered["label"].unique()
print("Unique labels in the filtered dataset:", unique_labels)

data_by_label = {
    label: data_filtered[data_filtered["label"] == label]
    for label in unique_labels
}

for label, df in data_by_label.items():
    print(f"Label: {label}, Number of samples: {len(df)}")

Unique labels in the filtered dataset: ['BenignTraffic' 'DDoS-SlowLoris']
Label: BenignTraffic, Number of samples: 1098195
Label: DDoS-SlowLoris, Number of samples: 23426


In [8]:
# Save each subset to CSV
output_dir = "./split_by_label"
os.makedirs(output_dir, exist_ok=True)

for label, df in data_by_label.items():
    filename = os.path.join(output_dir, f"{label}.csv")
    df.to_csv(filename, index=False)
    print(f"Saved {label} data to {filename}")

Saved BenignTraffic data to ./split_by_label/BenignTraffic.csv
Saved DDoS-SlowLoris data to ./split_by_label/DDoS-SlowLoris.csv
