In [1]:
import pandas as pd
# Load the raw flow records into a DataFrame.
data = pd.read_csv("ACI-IoT-2023.csv")

In [2]:
# Columns removed before any analysis, grouped by kind.
columns_to_drop = [
    # flow identity, addresses and capture time
    "Flow ID", "Src IP", "Dst IP", "Timestamp",
    # header lengths
    "Fwd Header Length", "Bwd Header Length",
    # per-direction packet rates
    "Fwd Packets/s", "Bwd Packets/s",
    # subflow counters
    "Subflow Fwd Packets", "Subflow Fwd Bytes",
    "Subflow Bwd Packets", "Subflow Bwd Bytes",
    # per-direction PSH / URG flags
    "Fwd PSH Flags", "Bwd PSH Flags",
    "Fwd URG Flags", "Bwd URG Flags",
]

# Raises KeyError if any listed column is absent from the frame.
data = data.drop(columns_to_drop, axis="columns")

In [3]:
# List every remaining column together with its dtype.
for col, dtype in data.dtypes.items():
    print(f"{col}: {dtype}")

Src Port: int64
Dst Port: int64
Protocol: int64
Flow Duration: int64
Total Fwd Packet: int64
Total Bwd packets: int64
Total Length of Fwd Packet: float64
Total Length of Bwd Packet: float64
Fwd Packet Length Max: float64
Fwd Packet Length Min: float64
Fwd Packet Length Mean: float64
Fwd Packet Length Std: float64
Bwd Packet Length Max: float64
Bwd Packet Length Min: float64
Bwd Packet Length Mean: float64
Bwd Packet Length Std: float64
Flow Bytes/s: float64
Flow Packets/s: float64
Flow IAT Mean: float64
Flow IAT Std: float64
Flow IAT Max: float64
Flow IAT Min: float64
Fwd IAT Total: float64
Fwd IAT Mean: float64
Fwd IAT Std: float64
Fwd IAT Max: float64
Fwd IAT Min: float64
Bwd IAT Total: float64
Bwd IAT Mean: float64
Bwd IAT Std: float64
Bwd IAT Max: float64
Bwd IAT Min: float64
Packet Length Min: float64
Packet Length Max: float64
Packet Length Mean: float64
Packet Length Std: float64
Packet Length Variance: float64
FIN Flag Count: int64
SYN Flag Count: int64
RST Flag Count: int64
PS

In [4]:
# Number of rows per distinct "Connection Type" value.
data["Connection Type"].value_counts()

Connection Type
wireless    742758
wired       488653
Name: count, dtype: int64

In [5]:
# Encode the connection type as an integer code (wireless -> 0, wired -> 1).
# Series.replace on a string column relies on an implicit object -> int
# downcast, which pandas 2.2 deprecates (FutureWarning). An explicit
# element-wise lookup produces the integer column without that downcast.
# Values missing from the mapping (including codes written by an earlier run
# of this cell) are passed through unchanged, so re-running the cell is safe.
connection_type_codes = {"wireless": 0, "wired": 1}
data["Connection Type"] = data["Connection Type"].map(
    lambda value: connection_type_codes.get(value, value)
)
# Confirm the encoded distribution.
data["Connection Type"].value_counts()

Connection Type
0    742758
1    488653
Name: count, dtype: int64

In [6]:
import pandas as pd
import numpy as np

# Pairwise correlations between all columns except the label.
correlation_matrix = data.drop(columns=["Label"], errors="ignore").corr()

correlation_threshold = 0.9

# Keep only the entries strictly above the diagonal so that each pair of
# columns is examined once; everything else becomes NaN.
above_diagonal = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper_triangle = correlation_matrix.where(above_diagonal)

# A column is flagged when its correlation with any column listed before it
# exceeds the threshold. The comparison is signed, so strongly negative
# correlations are not flagged.
exceeds_threshold = (upper_triangle > correlation_threshold).any(axis=0)
columns_to_drop = exceeds_threshold[exceeds_threshold].index.tolist()

data2 = data.drop(columns=columns_to_drop)

print(f"Columns dropped: {columns_to_drop}")
print("Updated dataset shape:", data2.shape)

Columns dropped: ['Total Bwd packets', 'Total Length of Bwd Packet', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Max', 'Bwd IAT Min', 'Packet Length Min', 'Packet Length Std', 'ACK Flag Count', 'ECE Flag Count', 'Average Packet Size', 'Fwd Segment Size Avg', 'Bwd Segment Size Avg', 'Bwd Packet/Bulk Avg', 'Active Max', 'Active Min', 'Idle Min']
Updated dataset shape: (1231411, 45)


In [7]:
# Number of missing values in each column of data2.
data2.isna().sum()

Src Port                         0
Dst Port                         0
Protocol                         0
Flow Duration                    0
Total Fwd Packet                 0
Total Length of Fwd Packet       0
Fwd Packet Length Max            0
Fwd Packet Length Min            0
Bwd Packet Length Max            0
Bwd Packet Length Min            0
Flow Bytes/s                  1009
Flow Packets/s                   0
Flow IAT Mean                    0
Flow IAT Std                     0
Flow IAT Max                     0
Fwd IAT Std                      0
Bwd IAT Total                    0
Bwd IAT Mean                     0
Bwd IAT Std                      0
Packet Length Max                0
Packet Length Mean               0
Packet Length Variance           0
FIN Flag Count                   0
SYN Flag Count                   0
RST Flag Count                   0
PSH Flag Count                   0
URG Flag Count                   0
CWR Flag Count                   0
Down/Up Ratio       

In [8]:
# Build data3 from data2 without the "Label" column (no error if it is absent);
# data2 itself is left unchanged.
data3 = data2.drop(columns=["Label"], errors="ignore")


In [9]:
# Frequency of each distinct "Flow Bytes/s" value (missing values are not counted).
data3["Flow Bytes/s"].value_counts()

Flow Bytes/s
0.000000e+00    788809
6.666667e+05      3527
1.200000e+07      1766
1.000000e+06      1611
2.222222e+05      1412
                 ...  
1.571307e+05         1
3.665566e+00         1
5.300571e-01         1
7.407526e+03         1
2.182612e+03         1
Name: count, Length: 262187, dtype: int64

In [10]:
# Recompute the correlation matrix on data3 and show how every column
# correlates with "Flow Bytes/s".
correlation_matrix = data3.corr()
correlation_matrix["Flow Bytes/s"]

Src Port                     -0.004029
Dst Port                      0.005398
Protocol                      0.042153
Flow Duration                -0.017839
Total Fwd Packet              0.004665
Total Length of Fwd Packet    0.000623
Fwd Packet Length Max         0.007330
Fwd Packet Length Min         0.139708
Bwd Packet Length Max        -0.000317
Bwd Packet Length Min        -0.002433
Flow Bytes/s                  1.000000
Flow Packets/s                0.300436
Flow IAT Mean                -0.017678
Flow IAT Std                 -0.009426
Flow IAT Max                 -0.018670
Fwd IAT Std                  -0.006497
Bwd IAT Total                -0.009473
Bwd IAT Mean                 -0.007750
Bwd IAT Std                  -0.005141
Packet Length Max             0.005541
Packet Length Mean            0.041478
Packet Length Variance        0.001811
FIN Flag Count                0.048443
SYN Flag Count               -0.034978
RST Flag Count               -0.030854
PSH Flag Count           

In [11]:
# Discard the rows whose "Flow Bytes/s" value is missing.
data3 = data3.dropna(subset=["Flow Bytes/s"])

In [12]:
# Inspect which values "Fwd Bytes/Bulk Avg" actually takes.
data3["Fwd Bytes/Bulk Avg"].value_counts()

Fwd Bytes/Bulk Avg
0    1230402
Name: count, dtype: int64

In [13]:
# Forward bulk statistics and the two port columns are removed from data3.
columns_to_remove = [
    "Fwd Bytes/Bulk Avg", "Fwd Packet/Bulk Avg", "Fwd Bulk Rate Avg",
    "Src Port", "Dst Port",
]

# Columns that are already absent are skipped rather than raising.
data3 = data3.drop(columns_to_remove, axis="columns", errors="ignore")

In [14]:
# Display the current state of data3.
data3

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Bwd Packet Length Max,Bwd Packet Length Min,Flow Bytes/s,Flow Packets/s,...,FWD Init Win Bytes,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Idle Mean,Idle Std,Idle Max,Connection Type
0,6,379933,11,720.0,517.0,0.0,2736.0,0.0,1.813214e+04,5.790495e+01,...,29200,131,4,32,0.000000e+00,0.000000e+00,1.698846e+15,0.000000e+00,1.698846e+15,1
1,6,205637,3,87.0,87.0,0.0,189.0,0.0,1.342171e+03,2.917763e+01,...,29200,508,1,32,0.000000e+00,0.000000e+00,1.698846e+15,0.000000e+00,1.698846e+15,1
3,6,5030379,1,6.0,6.0,6.0,0.0,0.0,1.192753e+00,5.963765e-01,...,2048,123,0,20,0.000000e+00,0.000000e+00,8.494231e+14,1.201266e+15,1.698846e+15,1
4,6,72278,3,87.0,87.0,0.0,148.0,0.0,3.251335e+03,9.684828e+01,...,29200,508,1,32,0.000000e+00,0.000000e+00,1.698846e+15,0.000000e+00,1.698846e+15,1
5,6,2457,2,0.0,0.0,0.0,0.0,0.0,0.000000e+00,8.140008e+02,...,237,0,0,32,0.000000e+00,0.000000e+00,1.698846e+15,0.000000e+00,1.698846e+15,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1231406,6,1,2,12.0,6.0,6.0,0.0,0.0,1.200000e+07,2.000000e+06,...,510,0,1,20,0.000000e+00,0.000000e+00,1.698946e+15,0.000000e+00,1.698946e+15,1
1231407,6,50077229,4,140.0,70.0,0.0,195.0,195.0,1.058365e+01,1.198149e-01,...,1232,274,1,32,7.708200e+04,0.000000e+00,8.494730e+14,1.201336e+15,1.698946e+15,1
1231408,0,46365348,15,0.0,0.0,0.0,0.0,0.0,0.000000e+00,3.235175e-01,...,0,0,0,0,3.713166e+06,3.789721e+06,4.247365e+14,8.494730e+14,1.698946e+15,1
1231409,6,146262,15,20340.0,9327.0,0.0,5949.0,0.0,2.364387e+05,3.555264e+02,...,64240,16385,14,20,0.000000e+00,0.000000e+00,1.698946e+15,0.000000e+00,1.698946e+15,1


In [15]:
# Re-attach the labels from the original frame. Column assignment aligns on
# the index, so each remaining row of data3 receives the label of the row
# with the same index in data; rows removed from data3 are simply not matched.
data3["Label"] = data["Label"]

In [16]:
# Preview the first rows, now including the "Label" column.
data3.head()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Bwd Packet Length Max,Bwd Packet Length Min,Flow Bytes/s,Flow Packets/s,...,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Idle Mean,Idle Std,Idle Max,Connection Type,Label
0,6,379933,11,720.0,517.0,0.0,2736.0,0.0,18132.144352,57.904946,...,131,4,32,0.0,0.0,1698846000000000.0,0.0,1698846000000000.0,1,Benign
1,6,205637,3,87.0,87.0,0.0,189.0,0.0,1342.170913,29.177629,...,508,1,32,0.0,0.0,1698846000000000.0,0.0,1698846000000000.0,1,Benign
3,6,5030379,1,6.0,6.0,6.0,0.0,0.0,1.192753,0.596377,...,123,0,20,0.0,0.0,849423100000000.0,1201266000000000.0,1698846000000000.0,1,Benign
4,6,72278,3,87.0,87.0,0.0,148.0,0.0,3251.335123,96.84828,...,508,1,32,0.0,0.0,1698846000000000.0,0.0,1698846000000000.0,1,Benign
5,6,2457,2,0.0,0.0,0.0,0.0,0.0,0.0,814.000814,...,0,0,32,0.0,0.0,1698846000000000.0,0.0,1698846000000000.0,1,Benign


In [17]:
# Number of rows per label.
data3["Label"].value_counts()

Label
Port Scan             441271
Benign                328300
ICMP Flood            225234
Ping Sweep             71928
DNS Flood              46935
Vulnerability Scan     39534
OS Scan                37524
Slowloris              18643
SYN Flood              13857
Dictionary Attack       6380
UDP Flood                791
ARP Spoofing               5
Name: count, dtype: int64

In [18]:
# Print a 1-based position, the name and the dtype of every column in data3.
# enumerate(..., start=1) replaces the hand-maintained counter.
for count, col in enumerate(data3.columns, start=1):
    print(f"{count}- {col}: {data3[col].dtype}")

1- Protocol: int64
2- Flow Duration: int64
3- Total Fwd Packet: int64
4- Total Length of Fwd Packet: float64
5- Fwd Packet Length Max: float64
6- Fwd Packet Length Min: float64
7- Bwd Packet Length Max: float64
8- Bwd Packet Length Min: float64
9- Flow Bytes/s: float64
10- Flow Packets/s: float64
11- Flow IAT Mean: float64
12- Flow IAT Std: float64
13- Flow IAT Max: float64
14- Fwd IAT Std: float64
15- Bwd IAT Total: float64
16- Bwd IAT Mean: float64
17- Bwd IAT Std: float64
18- Packet Length Max: float64
19- Packet Length Mean: float64
20- Packet Length Variance: float64
21- FIN Flag Count: int64
22- SYN Flag Count: int64
23- RST Flag Count: int64
24- PSH Flag Count: int64
25- URG Flag Count: int64
26- CWR Flag Count: int64
27- Down/Up Ratio: float64
28- Bwd Bytes/Bulk Avg: int64
29- Bwd Bulk Rate Avg: int64
30- FWD Init Win Bytes: int64
31- Bwd Init Win Bytes: int64
32- Fwd Act Data Pkts: int64
33- Fwd Seg Size Min: int64
34- Active Mean: float64
35- Active Std: float64
36- Idle Mean

In [19]:
# Save the cleaned frame to CSV; the index is not written to the file.
data3.to_csv("cleaned_data_iotid23_1.csv", index=False)

In [20]:
# Reload the cleaned CSV; read_csv assigns a fresh default integer index.
data3 = pd.read_csv("cleaned_data_iotid23_1.csv")

In [21]:
# Further columns removed from the reloaded frame.
cols_to_remove = [
    "URG Flag Count",
    "CWR Flag Count",
    "Bwd IAT Total",
    "Packet Length Variance",
    "Bwd Bytes/Bulk Avg",
]
# errors="ignore" matches the other data3 drops in this notebook and keeps the
# cell re-runnable: without it, a second run raises KeyError because the
# columns are already gone.
data3 = data3.drop(columns=cols_to_remove, errors="ignore")

In [22]:
# Display data3 after the additional columns were removed.
data3

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Bwd Packet Length Max,Bwd Packet Length Min,Flow Bytes/s,Flow Packets/s,...,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Idle Mean,Idle Std,Idle Max,Connection Type,Label
0,6,379933,11,720.0,517.0,0.0,2736.0,0.0,1.813214e+04,5.790495e+01,...,131,4,32,0.000000e+00,0.000000e+00,1.698846e+15,0.000000e+00,1.698846e+15,1,Benign
1,6,205637,3,87.0,87.0,0.0,189.0,0.0,1.342171e+03,2.917763e+01,...,508,1,32,0.000000e+00,0.000000e+00,1.698846e+15,0.000000e+00,1.698846e+15,1,Benign
2,6,5030379,1,6.0,6.0,6.0,0.0,0.0,1.192753e+00,5.963765e-01,...,123,0,20,0.000000e+00,0.000000e+00,8.494231e+14,1.201266e+15,1.698846e+15,1,Benign
3,6,72278,3,87.0,87.0,0.0,148.0,0.0,3.251335e+03,9.684828e+01,...,508,1,32,0.000000e+00,0.000000e+00,1.698846e+15,0.000000e+00,1.698846e+15,1,Benign
4,6,2457,2,0.0,0.0,0.0,0.0,0.0,0.000000e+00,8.140008e+02,...,0,0,32,0.000000e+00,0.000000e+00,1.698846e+15,0.000000e+00,1.698846e+15,1,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1230397,6,1,2,12.0,6.0,6.0,0.0,0.0,1.200000e+07,2.000000e+06,...,0,1,20,0.000000e+00,0.000000e+00,1.698946e+15,0.000000e+00,1.698946e+15,1,Benign
1230398,6,50077229,4,140.0,70.0,0.0,195.0,195.0,1.058365e+01,1.198149e-01,...,274,1,32,7.708200e+04,0.000000e+00,8.494730e+14,1.201336e+15,1.698946e+15,1,Benign
1230399,0,46365348,15,0.0,0.0,0.0,0.0,0.0,0.000000e+00,3.235175e-01,...,0,0,0,3.713166e+06,3.789721e+06,4.247365e+14,8.494730e+14,1.698946e+15,1,Benign
1230400,6,146262,15,20340.0,9327.0,0.0,5949.0,0.0,2.364387e+05,3.555264e+02,...,16385,14,20,0.000000e+00,0.000000e+00,1.698946e+15,0.000000e+00,1.698946e+15,1,Benign


In [23]:
# Print the value distribution of every column, one section per column.
separator = "\n" + "=" * 50 + "\n"  # Separator for better readability
for column, values in data3.items():
    print(f"Value counts for column: {column}")
    print(values.value_counts())
    print(separator)

Value counts for column: Protocol
Protocol
6     815865
0     236808
17    177729
Name: count, dtype: int64


Value counts for column: Flow Duration
Flow Duration
2           8234
3           7602
1           6407
4           2212
9           1689
            ... 
18583617       1
10510736       1
21998999       1
9112427        1
146262         1
Name: count, Length: 447786, dtype: int64


Value counts for column: Total Fwd Packet
Total Fwd Packet
1       722482
2       318451
3        52570
4        42838
5         9989
         ...  
1799         1
3661         1
3626         1
3604         1
508          1
Name: count, Length: 1143, dtype: int64


Value counts for column: Total Length of Fwd Packet
Total Length of Fwd Packet
0.0        797832
2.0         69810
28.0        42304
6.0         24678
24.0        20122
            ...  
14944.0         1
75307.0         1
19257.0         1
40474.0         1
20340.0         1
Name: count, Length: 10480, dtype: int64


Value counts for col

In [24]:
# Save the final reduced frame to CSV; the index is not written to the file.
data3.to_csv("cleaned_data2_iotid23.csv", index=False)