# Step 1: Load Data and Drop Initial Unwanted Columns

In [1]:
import pandas as pd

# Read the raw ACI-IoT-2023 CSV from the Kaggle input directory
data = pd.read_csv("/kaggle/input/iot-network-intrusion-dataset-2023/ACI-IoT-2023.csv")

# Columns discarded up front: flow identifiers / endpoints / timestamp, header
# lengths, per-direction packet rates, subflow counters and per-direction
# PSH/URG flags
initial_columns_to_drop = [
    "Flow ID",
    "Src IP",
    "Dst IP",
    "Timestamp",
    "Fwd Header Length",
    "Bwd Header Length",
    "Fwd Packets/s",
    "Bwd Packets/s",
    "Subflow Fwd Packets",
    "Subflow Fwd Bytes",
    "Subflow Bwd Packets",
    "Subflow Bwd Bytes",
    "Fwd PSH Flags",
    "Bwd PSH Flags",
    "Fwd URG Flags",
    "Bwd URG Flags",
]
# errors="ignore": a listed name that is absent from the frame is skipped silently
data = data.drop(columns=initial_columns_to_drop, errors="ignore")

# List every remaining column with its dtype, numbered from 1
print("Column data types after initial drop:")
for idx, (col, col_dtype) in enumerate(data.dtypes.items(), start=1):
    print(f"{idx}- {col}: {col_dtype}")

Column data types after initial drop:
1- Src Port: int64
2- Dst Port: int64
3- Protocol: int64
4- Flow Duration: int64
5- Total Fwd Packet: int64
6- Total Bwd packets: int64
7- Total Length of Fwd Packet: float64
8- Total Length of Bwd Packet: float64
9- Fwd Packet Length Max: float64
10- Fwd Packet Length Min: float64
11- Fwd Packet Length Mean: float64
12- Fwd Packet Length Std: float64
13- Bwd Packet Length Max: float64
14- Bwd Packet Length Min: float64
15- Bwd Packet Length Mean: float64
16- Bwd Packet Length Std: float64
17- Flow Bytes/s: float64
18- Flow Packets/s: float64
19- Flow IAT Mean: float64
20- Flow IAT Std: float64
21- Flow IAT Max: float64
22- Flow IAT Min: float64
23- Fwd IAT Total: float64
24- Fwd IAT Mean: float64
25- Fwd IAT Std: float64
26- Fwd IAT Max: float64
27- Fwd IAT Min: float64
28- Bwd IAT Total: float64
29- Bwd IAT Mean: float64
30- Bwd IAT Std: float64
31- Bwd IAT Max: float64
32- Bwd IAT Min: float64
33- Packet Length Min: float64
34- Packet Length Max

# Step 2: Process the 'Connection Type' Column

In [2]:
# Show the raw 'Connection Type' distribution before encoding
print("'Connection Type' value counts before replacement:")
print(data["Connection Type"].value_counts())

# Encode the link type as an integer: 'wireless' -> 0, 'wired' -> 1.
# Series.map is used rather than Series.replace: replacing every string of an
# object column with ints depends on pandas' implicit downcasting, which is
# deprecated and emits a FutureWarning on recent pandas versions.
connection_type_codes = {"wireless": 0, "wired": 1}
encoded_connection_type = data["Connection Type"].map(connection_type_codes)

# map() produces NaN for any value outside the mapping (including values that
# were already encoded by a previous run of this cell). Fail loudly instead of
# carrying unencoded values into the numeric steps that follow.
unmapped_rows = encoded_connection_type.isna()
if unmapped_rows.any():
    unexpected_values = data.loc[unmapped_rows, "Connection Type"].unique().tolist()
    raise ValueError(
        f"Unexpected 'Connection Type' values (expected only "
        f"{list(connection_type_codes)}): {unexpected_values}"
    )

# Explicit cast so the resulting dtype does not depend on inference
data["Connection Type"] = encoded_connection_type.astype("int64")
print()
# Confirm the encoded distribution matches the counts printed above
print("'Connection Type' value counts after replacement:")
print(data["Connection Type"].value_counts())

'Connection Type' value counts before replacement:
Connection Type
wireless    742758
wired       488653
Name: count, dtype: int64

'Connection Type' value counts after replacement:
Connection Type
0    742758
1    488653
Name: count, dtype: int64


  data["Connection Type"] = data["Connection Type"].replace({"wireless": 0, "wired": 1})


# Step 3: Remove Highly Correlated Features

In [3]:
import numpy as np

# Pairwise correlation between feature columns ('Label' is excluded if present)
correlation_matrix = data.drop(columns=["Label"], errors="ignore").corr()

# Two features are treated as redundant when the magnitude of their correlation
# exceeds this threshold
correlation_threshold = 0.9

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair is
# examined once and no column is compared with itself. The absolute value is
# used so that strongly negative correlations (< -threshold) are flagged as
# well, not just strongly positive ones.
upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper_triangle = correlation_matrix.abs().where(upper_mask)

# A column is dropped when it is highly correlated with any column that comes
# before it; masked cells are NaN and compare as False.
exceeds_threshold = upper_triangle.gt(correlation_threshold).any(axis=0)
columns_to_drop_corr = exceeds_threshold[exceeds_threshold].index.tolist()

# Remove the redundant columns from the working frame
data = data.drop(columns=columns_to_drop_corr, errors="ignore")

print(f"Columns dropped due to high correlation: {columns_to_drop_corr}")
print("Updated dataset shape:", data.shape)



Columns dropped due to high correlation: ['Total Bwd packets', 'Total Length of Bwd Packet', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Max', 'Bwd IAT Min', 'Packet Length Min', 'Packet Length Std', 'ACK Flag Count', 'ECE Flag Count', 'Average Packet Size', 'Fwd Segment Size Avg', 'Bwd Segment Size Avg', 'Bwd Packet/Bulk Avg', 'Active Max', 'Active Min', 'Idle Min']
Updated dataset shape: (1231411, 45)


  return op(a, b)


In [4]:
# Per-column count of missing (NaN) entries in the current frame
print("Missing value counts:")
missing_per_column = data.isna().sum()
print(missing_per_column)

Missing value counts:
Src Port                         0
Dst Port                         0
Protocol                         0
Flow Duration                    0
Total Fwd Packet                 0
Total Length of Fwd Packet       0
Fwd Packet Length Max            0
Fwd Packet Length Min            0
Bwd Packet Length Max            0
Bwd Packet Length Min            0
Flow Bytes/s                  1009
Flow Packets/s                   0
Flow IAT Mean                    0
Flow IAT Std                     0
Flow IAT Max                     0
Fwd IAT Std                      0
Bwd IAT Total                    0
Bwd IAT Mean                     0
Bwd IAT Std                      0
Packet Length Max                0
Packet Length Mean               0
Packet Length Variance           0
FIN Flag Count                   0
SYN Flag Count                   0
RST Flag Count                   0
PSH Flag Count                   0
URG Flag Count                   0
CWR Flag Count                   

# Step 4: Process and Remove Additional Columns

In [5]:
# Build a feature-only frame: set 'Label' aside (if present) and discard the
# rows whose 'Flow Bytes/s' is missing
data_numeric = (
    data
    .drop(columns=["Label"], errors="ignore")
    .dropna(subset=["Flow Bytes/s"])
)

# Show the value distribution of the three forward bulk columns; only the first
# one is preceded by a heading, the following ones by a blank line
print("'Fwd Bytes/Bulk Avg' value counts:")
fwd_bulk_columns = ["Fwd Bytes/Bulk Avg", "Fwd Packet/Bulk Avg", "Fwd Bulk Rate Avg"]
for position, bulk_col in enumerate(fwd_bulk_columns):
    if position > 0:
        print()
    print(data_numeric[bulk_col].value_counts())


'Fwd Bytes/Bulk Avg' value counts:
Fwd Bytes/Bulk Avg
0    1230402
Name: count, dtype: int64

Fwd Packet/Bulk Avg
0    1230402
Name: count, dtype: int64

Fwd Bulk Rate Avg
0    1230402
Name: count, dtype: int64


In [6]:
# The three forward bulk columns and both port columns are removed here
columns_to_remove = [
    "Fwd Bytes/Bulk Avg",
    "Fwd Packet/Bulk Avg",
    "Fwd Bulk Rate Avg",
    "Src Port",
    "Dst Port",
]
data_numeric = data_numeric.drop(columns_to_remove, axis="columns", errors="ignore")

# Put 'Label' back. The assignment aligns on the index, so only labels for rows
# still present in data_numeric are attached.
data_numeric["Label"] = data["Label"]

print("'Label' value counts:")
label_counts = data_numeric["Label"].value_counts()
print(label_counts)

# From here on `data` refers to an independent copy of the cleaned frame
data = data_numeric.copy(deep=True)

'Label' value counts:
Label
Port Scan             441271
Benign                328300
ICMP Flood            225234
Ping Sweep             71928
DNS Flood              46935
Vulnerability Scan     39534
OS Scan                37524
Slowloris              18643
SYN Flood              13857
Dictionary Attack       6380
UDP Flood                791
ARP Spoofing               5
Name: count, dtype: int64


# Step 5: Further Clean by Dropping Extra Columns

In [7]:
# Second round of column removal; names missing from the frame are skipped
additional_cols_to_drop = [
    "URG Flag Count",
    "CWR Flag Count",
    "Bwd IAT Total",
    "Packet Length Variance",
    "Bwd Bytes/Bulk Avg",
]
data = data.drop(additional_cols_to_drop, axis="columns", errors="ignore")


# Step 6: Prepare for SMOTE Oversampling

In [8]:
from imblearn.over_sampling import SMOTE
import numpy as np

# Before oversampling: turn +/-inf into NaN, then drop every row that contains
# a NaN in any column
data = data.replace([np.inf, -np.inf], np.nan).dropna()

# Class distribution at this point (still includes 'ARP Spoofing')
print("Class distribution before SMOTE:")
print(data["Label"].value_counts())

# Exclude the 'ARP Spoofing' class entirely
is_arp_spoofing = data["Label"] == "ARP Spoofing"
data = data[~is_arp_spoofing]

# Split into the target vector (y) and the feature matrix (X)
y = data["Label"]
X = data.drop(columns=["Label"])


Class distribution before SMOTE:
Label
Port Scan             441260
Benign                327505
ICMP Flood            225234
Ping Sweep             71928
DNS Flood              46934
Vulnerability Scan     39533
OS Scan                37524
Slowloris              18537
SYN Flood              13857
Dictionary Attack       6379
UDP Flood                791
ARP Spoofing               5
Name: count, dtype: int64


# Step 7: Apply SMOTE Oversampling

In [9]:
# Target sample counts for the two classes SMOTE should synthesise up to;
# classes not listed here are left at their current size
smote_targets = {"Dictionary Attack": 10000, "UDP Flood": 10000}
smote = SMOTE(sampling_strategy=smote_targets, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Rebuild a single frame: features under their original column names, plus the target
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled["Label"] = y_resampled

# Confirm the per-class counts after oversampling
print("Class distribution after oversampling:")
resampled_counts = df_resampled["Label"].value_counts()
print(resampled_counts)


Class distribution after oversampling:
Label
Port Scan             441260
Benign                327505
ICMP Flood            225234
Ping Sweep             71928
DNS Flood              46934
Vulnerability Scan     39533
OS Scan                37524
Slowloris              18537
SYN Flood              13857
UDP Flood              10000
Dictionary Attack      10000
Name: count, dtype: int64


# Step 8: Save the Final Oversampled Dataset

In [10]:
# Write the oversampled frame to a CSV file (relative path), omitting the index column
df_resampled.to_csv(path_or_buf="IotFinalDataset.csv", index=False)