#### Data Preprocessing Pipeline

In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [2]:
# Directory that holds the sampled input and the files this notebook writes.
DOCS_DIR = './docs'
# Path of the sampled dataset that the preprocessing steps read.
sample_file = DOCS_DIR + '/sampled_data.csv'

In [3]:
# Load the sampled dataset. Fail fast when the file is missing: every later
# cell needs `df`, and only printing a warning would either end in a NameError
# further down or silently reuse a stale `df` left in the kernel.
if not os.path.exists(sample_file):
    raise FileNotFoundError(
        "⚠️ Sample data not found. Please perform an overview to generate the sample dataset for preprocessing."
    )

df = pd.read_csv(sample_file)
print("✅ Sample data loaded successfully.")

✅ Sample data loaded successfully.


In [4]:
# Display basic info about the loaded frame.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
print("\n" + "="*50)
print("DATASET OVERVIEW")
print("="*50)
df.info()


DATASET OVERVIEW
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623869 entries, 0 to 623868
Data columns (total 85 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Flow ID                       566047 non-null  object 
 1    Source IP                    566047 non-null  object 
 2    Source Port                  566047 non-null  float64
 3    Destination IP               566047 non-null  object 
 4    Destination Port             566047 non-null  float64
 5    Protocol                     566047 non-null  float64
 6    Timestamp                    566047 non-null  object 
 7    Flow Duration                566047 non-null  float64
 8    Total Fwd Packets            566047 non-null  float64
 9    Total Backward Packets       566047 non-null  float64
 10  Total Length of Fwd Packets   566047 non-null  float64
 11   Total Length of Bwd Packets  566047 non-null  float64
 12   Fwd Packet Length Max    

In [5]:
# Enumerate every column (1-based) so the raw header names can be inspected.
all_columns = list(df.columns)
print("📃 All Columns:")
position = 0
for column_name in all_columns:
    position += 1
    print(f"{position}. {column_name}")

# Report how many columns the frame has in total.
column_count = len(all_columns)
print(f"🧮 Total Columns: {column_count}")


📃 All Columns:
1. Flow ID
2.  Source IP
3.  Source Port
4.  Destination IP
5.  Destination Port
6.  Protocol
7.  Timestamp
8.  Flow Duration
9.  Total Fwd Packets
10.  Total Backward Packets
11. Total Length of Fwd Packets
12.  Total Length of Bwd Packets
13.  Fwd Packet Length Max
14.  Fwd Packet Length Min
15.  Fwd Packet Length Mean
16.  Fwd Packet Length Std
17. Bwd Packet Length Max
18.  Bwd Packet Length Min
19.  Bwd Packet Length Mean
20.  Bwd Packet Length Std
21. Flow Bytes/s
22.  Flow Packets/s
23.  Flow IAT Mean
24.  Flow IAT Std
25.  Flow IAT Max
26.  Flow IAT Min
27. Fwd IAT Total
28.  Fwd IAT Mean
29.  Fwd IAT Std
30.  Fwd IAT Max
31.  Fwd IAT Min
32. Bwd IAT Total
33.  Bwd IAT Mean
34.  Bwd IAT Std
35.  Bwd IAT Max
36.  Bwd IAT Min
37. Fwd PSH Flags
38.  Bwd PSH Flags
39.  Fwd URG Flags
40.  Bwd URG Flags
41.  Fwd Header Length
42.  Bwd Header Length
43. Fwd Packets/s
44.  Bwd Packets/s
45.  Min Packet Length
46.  Max Packet Length
47.  Packet Length Mean
48.  Packet Len

In [6]:
# Clean the dataset: remove duplicates, infinities and missing values.

# 1. Drop exact duplicate rows.
df = df.drop_duplicates()

# 2. Replace inf/-inf with NaN so they are treated as missing values.
df = df.replace([np.inf, -np.inf], np.nan)

# 3. Drop columns that are entirely NaN. This has to run before the row-wise
#    drop: an all-NaN column would otherwise mark every row as incomplete and
#    the next step would discard the whole dataset.
df = df.dropna(axis=1, how='all')

# 4. Drop the remaining rows that contain any NaN value.
df = df.dropna()

In [7]:
# Column reshaping

In [8]:
# Clean and normalize column names: strip surrounding whitespace, lowercase,
# replace spaces with underscores and remove brackets.
# regex=False makes every replacement a literal match. '(' and ')' are regex
# metacharacters, and the default of `regex` differs between pandas versions,
# so the literal intent is stated explicitly.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

df.head()


Unnamed: 0,flow_id,source_ip,source_port,destination_ip,destination_port,protocol,timestamp,flow_duration,total_fwd_packets,total_backward_packets,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
0,192.168.10.17-72.167.18.239-55220-80-6,192.168.10.17,55220.0,72.167.18.239,80.0,6.0,4/7/2017 11:04,155.0,2.0,0.0,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,192.168.10.3-192.168.10.14-53-64644-17,192.168.10.14,64644.0,192.168.10.3,53.0,17.0,4/7/2017 2:39,47536.0,2.0,2.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,192.168.10.5-213.180.204.92-49163-443-6,192.168.10.5,49163.0,213.180.204.92,443.0,6.0,03/07/2017 02:44:01,4755.0,2.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,192.168.10.3-192.168.10.14-53-60505-17,192.168.10.14,60505.0,192.168.10.3,53.0,17.0,4/7/2017 9:48,193.0,2.0,2.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,192.168.10.1-192.168.10.3-53-61857-17,192.168.10.3,61857.0,192.168.10.1,53.0,17.0,6/7/2017 1:26,162586.0,1.0,1.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [9]:
# Drop the flow-identifier / metadata columns (the first seven columns of the
# file) and keep the flow statistics plus the label.
# Selecting by name instead of `df.iloc[:, 7:]` keeps the cell independent of
# column order and safe to re-run: a second execution no longer strips seven
# more columns.
identifier_columns = [
    'flow_id', 'source_ip', 'source_port', 'destination_ip',
    'destination_port', 'protocol', 'timestamp',
]
df = df.drop(columns=[name for name in identifier_columns if name in df.columns])

In [10]:
# Inspect the column index of `df` at this point in the pipeline.
df.columns

Index(['flow_duration', 'total_fwd_packets', 'total_backward_packets',
       'total_length_of_fwd_packets', 'total_length_of_bwd_packets',
       'fwd_packet_length_max', 'fwd_packet_length_min',
       'fwd_packet_length_mean', 'fwd_packet_length_std',
       'bwd_packet_length_max', 'bwd_packet_length_min',
       'bwd_packet_length_mean', 'bwd_packet_length_std', 'flow_bytes/s',
       'flow_packets/s', 'flow_iat_mean', 'flow_iat_std', 'flow_iat_max',
       'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std',
       'fwd_iat_max', 'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean',
       'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags',
       'bwd_psh_flags', 'fwd_urg_flags', 'bwd_urg_flags', 'fwd_header_length',
       'bwd_header_length', 'fwd_packets/s', 'bwd_packets/s',
       'min_packet_length', 'max_packet_length', 'packet_length_mean',
       'packet_length_std', 'packet_length_variance', 'fin_flag_count',
       'syn_flag_count', 'rst_flag_count', 'ps

In [11]:
# Normalise the label text: trim surrounding whitespace, then swap the stray
# '\x96' character for a plain hyphen (literal match, not a regex).
label_text = df['label'].str.strip()
df['label'] = label_text.str.replace('\x96', '-', regex=False)

# Show the distinct labels that remain after the clean-up.
df['label'].unique()

# (Disabled) Optional grouping of similar attack types into broader categories.
# label_map = {
#     'DoS Hulk': 'Dos/Ddos',
#     'DoS GoldenEye': 'Dos/Ddos',
#     'DoS Slowhttptest': 'Dos/Ddos',
#     'DoS slowloris': 'Dos/Ddos',
#     'Heartbleed': 'Dos/Ddos',
#     'DDoS': 'Dos/Ddos',

#     'FTP-Patator': 'Brute Force',
#     'SSH-Patator': 'Brute Force',
#     'Web Attack - Brute Force': 'Brute Force',
#     'Web Attack - XSS': 'Brute Force',
#     'Web Attack - Sql Injection': 'Brute Force',

#     # Other labels stay the same
#     'BENIGN': 'BENIGN',
#     'PortScan': 'PortScan',
#     'Bot': 'Bot',
#     'Infiltration': 'Infiltration',
# }

# df['label'] = df['label'].map(label_map)

array(['BENIGN', 'Web Attack - Brute Force', 'DoS Hulk', 'DDoS',
       'Web Attack - XSS', 'PortScan', 'DoS slowloris', 'SSH-Patator',
       'FTP-Patator', 'Bot', 'DoS Slowhttptest', 'DoS GoldenEye',
       'Infiltration', 'Web Attack - Sql Injection', 'Heartbleed'],
      dtype=object)

In [12]:
# Encode the string labels as integer class ids: fit the encoder on the label
# column first, then transform that same column.
le = LabelEncoder().fit(df['label'])
df['label'] = le.transform(df['label'])

# Print the classes known to the encoder.
print("✅ Encoded labels:", list(le.classes_))

✅ Encoded labels: ['BENIGN', 'Bot', 'DDoS', 'DoS GoldenEye', 'DoS Hulk', 'DoS Slowhttptest', 'DoS slowloris', 'FTP-Patator', 'Heartbleed', 'Infiltration', 'PortScan', 'SSH-Patator', 'Web Attack - Brute Force', 'Web Attack - Sql Injection', 'Web Attack - XSS']


In [13]:
# Show which integer id corresponds to each class name of the encoder.
print("\n🔢 Encoded Label Mapping:")
class_names = le.classes_
for code in range(len(class_names)):
    print(f"{code} → {class_names[code]}")


🔢 Encoded Label Mapping:
0 → BENIGN
1 → Bot
2 → DDoS
3 → DoS GoldenEye
4 → DoS Hulk
5 → DoS Slowhttptest
6 → DoS slowloris
7 → FTP-Patator
8 → Heartbleed
9 → Infiltration
10 → PortScan
11 → SSH-Patator
12 → Web Attack - Brute Force
13 → Web Attack - Sql Injection
14 → Web Attack - XSS


In [14]:
# Number of columns currently in `df`.
# No `df.head()` call precedes this: a notebook cell only renders its last
# expression, so a preview placed before it would be computed and discarded.
len(df.columns)

78

In [15]:
# Feature matrix: every column except the target.
# Selecting by name (rather than by position) does not depend on 'label'
# being the last column of the frame.
X = df.drop(columns='label')

X.head()

Unnamed: 0,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,bwd_packet_length_max,...,act_data_pkt_fwd,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min
0,155.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,47536.0,2.0,2.0,62.0,208.0,31.0,31.0,31.0,0.0,104.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4755.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,0.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,193.0,2.0,2.0,80.0,112.0,40.0,40.0,40.0,0.0,56.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,162586.0,1.0,1.0,57.0,173.0,57.0,57.0,57.0,0.0,173.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Target vector: the 'label' column, selected by name so the cell does not
# rely on it being the last column of the frame.
Y = df['label']

Y.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [17]:

# 1. Select the top 15 features using Mutual Information.
# mutual_info_classif is a randomised estimator, so its seed is pinned to make
# the scores, and therefore the selected features, reproducible across runs.
k_best = 15
mi_random_state = 42


def _seeded_mutual_info(features, target):
    """Score features with mutual_info_classif using a fixed random seed."""
    return mutual_info_classif(features, target, random_state=mi_random_state)


selector_mi = SelectKBest(score_func=_seeded_mutual_info, k=k_best)
X_selected = selector_mi.fit_transform(X, Y)

# 2. Get the names of the selected features.
selected_feature_names = X.columns[selector_mi.get_support()]

# 3. Display the selected features.
print(f"✅ Selected Top {k_best} Features:")
for i, feature in enumerate(selected_feature_names, 1):
    print(f"{i}. {feature}")

# 4. Create a DataFrame with only the selected features. It is built from the
#    selector output, so it gets a fresh 0..n-1 index.
X_final = pd.DataFrame(X_selected, columns=selected_feature_names)

# 5. Attach the target. Y still carries the row labels of the cleaned frame,
#    so its index is reset to line up positionally with X_final.
X_final['label'] = Y.reset_index(drop=True)


✅ Selected Top 15 Features:
1. total_length_of_fwd_packets
2. total_length_of_bwd_packets
3. fwd_packet_length_max
4. bwd_packet_length_max
5. bwd_packet_length_mean
6. max_packet_length
7. packet_length_mean
8. packet_length_std
9. packet_length_variance
10. average_packet_size
11. avg_bwd_segment_size
12. subflow_fwd_bytes
13. subflow_bwd_bytes
14. init_win_bytes_forward
15. init_win_bytes_backward


In [18]:
# Display the selected features together with the label column.
X_final

Unnamed: 0,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,bwd_packet_length_max,bwd_packet_length_mean,max_packet_length,packet_length_mean,packet_length_std,packet_length_variance,average_packet_size,avg_bwd_segment_size,subflow_fwd_bytes,subflow_bwd_bytes,init_win_bytes_forward,init_win_bytes_backward,label
0,0.0,0.0,0.0,0.0,0.0000,0.0,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,264.0,-1.0,0
1,62.0,208.0,31.0,104.0,104.0000,104.0,60.200000,39.983747,1598.700000,75.250000,104.0000,62.0,208.0,-1.0,-1.0,0
2,12.0,0.0,6.0,0.0,0.0000,6.0,6.000000,0.000000,0.000000,9.000000,0.0000,12.0,0.0,258.0,-1.0,0
3,80.0,112.0,40.0,56.0,56.0000,56.0,46.400000,8.763561,76.800000,58.000000,56.0000,80.0,112.0,-1.0,-1.0,0
4,57.0,173.0,57.0,173.0,173.0000,173.0,95.666667,66.972631,4485.333333,143.500000,173.0000,57.0,173.0,-1.0,-1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565418,0.0,0.0,0.0,0.0,0.0000,0.0,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,243.0,290.0,0
565419,43.0,171.0,43.0,171.0,171.0000,171.0,85.666667,73.900834,5461.333333,128.500000,171.0000,43.0,171.0,-1.0,-1.0,0
565420,1176.0,5339.0,693.0,1448.0,667.3750,1448.0,296.136364,506.385257,256426.028139,310.238095,667.3750,1176.0,5339.0,65535.0,246.0,0
565421,5823.0,7281.0,2912.0,1448.0,455.0625,2912.0,468.000000,859.325101,738439.629600,485.333333,455.0625,5823.0,7281.0,29200.0,75.0,0


In [19]:

# Destination of the preprocessed dataset used by the modeling step.
processed_data = DOCS_DIR + '/preprocessed_data.csv'

# Write the file only when it is absent; an existing file is left untouched.
# NOTE(review): a file left over from an earlier run is never refreshed, so it
# may be stale -- delete it to force regeneration.
if not os.path.exists(processed_data):
    X_final.to_csv(processed_data, index=False)
    print("✅ Preprocessed data saved successfully.")
else:
    print("⚠️ Preprocessed data already exists. Skipping save.")

# Summarise the final dataset.
print("\n🧾 Dataset Overview:")
X_final.info()

print("\n🚀 Preprocessing complete — proceed to the modeling step.")


✅ Preprocessed data saved successfully.

🧾 Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565423 entries, 0 to 565422
Data columns (total 16 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   total_length_of_fwd_packets  565423 non-null  float64
 1   total_length_of_bwd_packets  565423 non-null  float64
 2   fwd_packet_length_max        565423 non-null  float64
 3   bwd_packet_length_max        565423 non-null  float64
 4   bwd_packet_length_mean       565423 non-null  float64
 5   max_packet_length            565423 non-null  float64
 6   packet_length_mean           565423 non-null  float64
 7   packet_length_std            565423 non-null  float64
 8   packet_length_variance       565423 non-null  float64
 9   average_packet_size          565423 non-null  float64
 10  avg_bwd_segment_size         565423 non-null  float64
 11  subflow_fwd_bytes            565423 non-null  float64
 1

#### Exporting a test JSON file for testing the model

In [20]:
# Split X_final positionally: every column before the last one is a feature,
# the last column is the label.
last_column = X_final.shape[1] - 1
features = X_final.iloc[:, :last_column]
labels = X_final.iloc[:, last_column]

In [21]:
# Show one example row per class: keep the first occurrence of each label
# value and order the result by label.
is_first_of_label = ~X_final['label'].duplicated(keep='first')
X_final[is_first_of_label].sort_values(by='label')


Unnamed: 0,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,bwd_packet_length_max,bwd_packet_length_mean,max_packet_length,packet_length_mean,packet_length_std,packet_length_variance,average_packet_size,avg_bwd_segment_size,subflow_fwd_bytes,subflow_bwd_bytes,init_win_bytes_forward,init_win_bytes_backward,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,264.0,-1.0,0
359,6.0,6.0,6.0,6.0,6.0,6.0,6.0,0.0,0.0,9.0,6.0,6.0,6.0,237.0,256.0,1
15,24.0,0.0,6.0,0.0,0.0,6.0,6.0,0.0,0.0,7.5,0.0,24.0,0.0,256.0,-1.0,2
1093,1884.0,11632.0,471.0,7240.0,1938.666667,7240.0,795.058824,1959.457657,3839474.0,844.75,1938.666667,1884.0,11632.0,29200.0,235.0,3
9,369.0,11595.0,369.0,8688.0,2898.75,8688.0,854.571429,2383.3406,5680312.0,920.307692,2898.75,369.0,11595.0,251.0,235.0,4
779,531.0,0.0,520.0,0.0,0.0,520.0,59.0,172.913273,29899.0,66.375,0.0,531.0,0.0,29200.0,235.0,5
107,2541.0,6.0,231.0,6.0,2.0,231.0,134.052632,116.803479,13643.05,141.5,2.0,2541.0,6.0,29200.0,0.0,6
140,106.0,188.0,23.0,34.0,12.533333,34.0,11.76,12.617052,159.19,12.25,12.533333,106.0,188.0,29200.0,227.0,7
144901,13712.0,7878135.0,5792.0,14480.0,3733.71327,14480.0,1610.776871,2414.090913,5827835.0,1611.105467,3733.71327,13712.0,7878135.0,235.0,235.0,8
4657,3734.0,120.0,705.0,6.0,6.0,705.0,94.878049,180.882447,32718.46,97.25,6.0,3734.0,120.0,255.0,1452.0,9


In [22]:
# Map each encoded id back to its class name. The mapping is read from the
# fitted LabelEncoder instead of being typed out by hand, so it cannot drift
# from the actual encoding if the label set changes.
label_mapping = {code: str(name) for code, name in enumerate(le.classes_)}

# One representative row per class: the first occurrence of each label,
# ordered by label id, with a clean 0..n-1 index.
filtered_df = (
    X_final.drop_duplicates(subset='label', keep='first')
           .sort_values(by='label')
           .reset_index(drop=True)
)

# Replace the numeric ids with the readable class names.
filtered_df['label'] = filtered_df['label'].map(label_mapping)


In [23]:
# Display the per-class sample rows with their mapped label names.
filtered_df

Unnamed: 0,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,bwd_packet_length_max,bwd_packet_length_mean,max_packet_length,packet_length_mean,packet_length_std,packet_length_variance,average_packet_size,avg_bwd_segment_size,subflow_fwd_bytes,subflow_bwd_bytes,init_win_bytes_forward,init_win_bytes_backward,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,264.0,-1.0,BENIGN
1,6.0,6.0,6.0,6.0,6.0,6.0,6.0,0.0,0.0,9.0,6.0,6.0,6.0,237.0,256.0,Bot
2,24.0,0.0,6.0,0.0,0.0,6.0,6.0,0.0,0.0,7.5,0.0,24.0,0.0,256.0,-1.0,DDoS
3,1884.0,11632.0,471.0,7240.0,1938.666667,7240.0,795.058824,1959.457657,3839474.0,844.75,1938.666667,1884.0,11632.0,29200.0,235.0,DoS GoldenEye
4,369.0,11595.0,369.0,8688.0,2898.75,8688.0,854.571429,2383.3406,5680312.0,920.307692,2898.75,369.0,11595.0,251.0,235.0,DoS Hulk
5,531.0,0.0,520.0,0.0,0.0,520.0,59.0,172.913273,29899.0,66.375,0.0,531.0,0.0,29200.0,235.0,DoS Slowhttptest
6,2541.0,6.0,231.0,6.0,2.0,231.0,134.052632,116.803479,13643.05,141.5,2.0,2541.0,6.0,29200.0,0.0,DoS slowloris
7,106.0,188.0,23.0,34.0,12.533333,34.0,11.76,12.617052,159.19,12.25,12.533333,106.0,188.0,29200.0,227.0,FTP-Patator
8,13712.0,7878135.0,5792.0,14480.0,3733.71327,14480.0,1610.776871,2414.090913,5827835.0,1611.105467,3733.71327,13712.0,7878135.0,235.0,235.0,Heartbleed
9,3734.0,120.0,705.0,6.0,6.0,705.0,94.878049,180.882447,32718.46,97.25,6.0,3734.0,120.0,255.0,1452.0,Infiltration


In [24]:
# Build the nested dictionary {label name: {feature name: value}} from the
# per-class sample rows. Indexing by label and exporting with orient='index'
# replaces the row-by-row iterrows() loop.
label_feature_dict = filtered_df.set_index('label').to_dict(orient='index')

# Export to JSON with an explicit encoding so the file does not depend on the
# platform default.
with open(f'{DOCS_DIR}/test.json', 'w', encoding='utf-8') as f:
    json.dump(label_feature_dict, f, indent=4)
