In [1]:
import json
import os
import pickle
from functools import partial

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [2]:
# Locations read from and written to by this notebook.
DOCS_DIR = "./docs"
MODEL_DIR = "./models"

# Sampled dataset that the preprocessing steps start from.
sample_file = DOCS_DIR + "/sampled_data.csv"

In [3]:
# Load the sampled dataset into `df`.
# Fail fast when the file is missing: every later cell needs `df`, and only printing a
# warning would let execution continue into a NameError (or silently reuse a stale `df`
# that is still alive in the kernel).
if not os.path.exists(sample_file):
    raise FileNotFoundError(
        "⚠️ Sample data not found. Please perform an overview to generate the sample dataset for preprocessing."
    )

df = pd.read_csv(sample_file)
print("✅ Sample data loaded successfully.")

✅ Sample data loaded successfully.


In [4]:
# Display basic info
print("\n" + "="*50)
print("DATASET OVERVIEW")
print("="*50)
# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line after the report.
df.info()


DATASET OVERVIEW
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3014826 entries, 0 to 3014825
Data columns (total 33 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   FLOW_ID                     int64  
 1   PROTOCOL_MAP                object 
 2   L4_SRC_PORT                 int64  
 3   IPV4_SRC_ADDR               object 
 4   L4_DST_PORT                 int64  
 5   IPV4_DST_ADDR               object 
 6   FIRST_SWITCHED              int64  
 7   FLOW_DURATION_MILLISECONDS  int64  
 8   LAST_SWITCHED               int64  
 9   PROTOCOL                    int64  
 10  TCP_FLAGS                   int64  
 11  TCP_WIN_MAX_IN              int64  
 12  TCP_WIN_MAX_OUT             int64  
 13  TCP_WIN_MIN_IN              int64  
 14  TCP_WIN_MIN_OUT             int64  
 15  TCP_WIN_MSS_IN              int64  
 16  TCP_WIN_SCALE_IN            int64  
 17  TCP_WIN_SCALE_OUT           int64  
 18  SRC_TOS                     int64  
 19  DST

In [5]:
# Materialise the column index as a plain list and print it as a 1-based numbered list.
all_columns = list(df.columns)

print("📃 All Columns:")
for position, column_name in enumerate(all_columns, 1):
    print("{}. {}".format(position, column_name))

# Finish with the overall column count.
total_columns = len(all_columns)
print(f"🧮 Total Columns: {total_columns}")


📃 All Columns:
1. FLOW_ID
2. PROTOCOL_MAP
3. L4_SRC_PORT
4. IPV4_SRC_ADDR
5. L4_DST_PORT
6. IPV4_DST_ADDR
7. FIRST_SWITCHED
8. FLOW_DURATION_MILLISECONDS
9. LAST_SWITCHED
10. PROTOCOL
11. TCP_FLAGS
12. TCP_WIN_MAX_IN
13. TCP_WIN_MAX_OUT
14. TCP_WIN_MIN_IN
15. TCP_WIN_MIN_OUT
16. TCP_WIN_MSS_IN
17. TCP_WIN_SCALE_IN
18. TCP_WIN_SCALE_OUT
19. SRC_TOS
20. DST_TOS
21. TOTAL_FLOWS_EXP
22. MIN_IP_PKT_LEN
23. MAX_IP_PKT_LEN
24. TOTAL_PKTS_EXP
25. TOTAL_BYTES_EXP
26. IN_BYTES
27. IN_PKTS
28. OUT_BYTES
29. OUT_PKTS
30. ALERT
31. ANALYSIS_TIMESTAMP
32. label
33. ANOMALY
🧮 Total Columns: 33


In [6]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
FLOW_ID,3014826.0,,,,355019690.1962,13578122.846709,20.0,351431611.5,357083444.0,363886806.75,374376306.0
PROTOCOL_MAP,3014826.0,6.0,tcp,1907752.0,,,,,,,
L4_SRC_PORT,3014826.0,,,,44575.619359,16987.793545,0.0,37944.0,49324.0,55884.0,65535.0
IPV4_SRC_ADDR,3014826.0,42479.0,10.114.241.191,641831.0,,,,,,,
L4_DST_PORT,3014826.0,,,,8027.832303,16778.428287,0.0,53.0,443.0,2375.0,65535.0
IPV4_DST_ADDR,3014826.0,58186.0,10.114.224.73,660224.0,,,,,,,
FIRST_SWITCHED,3014826.0,,,,1647536522.28435,120462.104799,1647329738.0,1647440362.0,1647517488.0,1647604656.0,1648225514.0
FLOW_DURATION_MILLISECONDS,3014826.0,,,,5135.963725,18974.526567,0.0,0.0,32.0,3017.0,119999.0
LAST_SWITCHED,3014826.0,,,,1647536527.418399,120462.384589,1647329817.0,1647440372.0,1647517494.0,1647604660.0,1648225517.0
PROTOCOL,3014826.0,,,,9.266743,5.423318,1.0,6.0,6.0,17.0,58.0


In [7]:
# Distinct values present in the label column, in order of first appearance.
pd.unique(df['label'])

array(['dos', 'portscanning', 'normal', 'malware'], dtype=object)

In [8]:
# Count how many rows carry each label value.
label_counts = df.loc[:, 'label'].value_counts()
print(label_counts)


label
normal          1955477
dos              641831
portscanning     417455
malware              63
Name: count, dtype: int64


In [9]:
# Clean and normalize column names (strip spaces, lowercase, replace spaces with underscores, remove brackets)
# Every replacement is a literal substitution, so regex=False is passed explicitly:
# '(' and ')' are regex metacharacters, and the default of `regex` differs between
# pandas versions.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

df.head(20)


Unnamed: 0,flow_id,protocol_map,l4_src_port,ipv4_src_addr,l4_dst_port,ipv4_dst_addr,first_switched,flow_duration_milliseconds,last_switched,protocol,...,total_pkts_exp,total_bytes_exp,in_bytes,in_pkts,out_bytes,out_pkts,alert,analysis_timestamp,label,anomaly
0,356579549,tcp,56962,10.114.241.191,443,10.114.224.73,1647515668,7130,1647515675,6,...,0,0,240,4,0,0,Denial of Service,1647515720,dos,
1,367378020,tcp,33216,10.114.241.166,58939,10.114.224.151,1647679889,0,1647679889,6,...,0,0,44,1,40,1,Port Scanning,1647679992,portscanning,1.0
2,352013022,udp,42836,10.114.225.205,53,10.114.241.113,1647446333,1,1647446333,17,...,0,0,95,1,180,1,,1647446403,normal,
3,350798142,udp,48481,10.114.226.5,53,204.74.108.252,1647434871,33,1647434871,17,...,0,0,84,1,208,1,,1647434909,normal,
4,364558533,icmp,0,35.161.143.192,0,10.114.226.5,1647611427,0,1647611427,1,...,0,0,36,1,0,0,,1647611468,normal,
5,362564981,tcp,41849,170.106.176.49,10023,10.114.224.117,1647598052,0,1647598052,6,...,0,0,52,1,40,1,,1647598108,normal,
6,363582060,tcp,49368,10.114.241.191,443,10.114.224.73,1647603533,3026,1647603536,6,...,0,0,180,3,0,0,Denial of Service,1647603608,dos,
7,334742616,tcp,42814,10.114.241.191,443,10.114.224.73,1647345268,111014,1647345379,6,...,0,0,1473,14,676,13,Denial of Service,1647345417,dos,
8,358540814,tcp,34628,10.114.225.204,443,52.215.192.132,1647530897,1748,1647530898,6,...,0,0,327,4,209,2,,1647530954,normal,
9,351606259,udp,52607,10.114.225.204,53,10.114.226.5,1647441963,15,1647441963,17,...,0,0,190,2,382,2,,1647442048,normal,


In [10]:
# Columns removed before modelling: flow identifier, protocol name, ports, addresses,
# timestamps and the alert / anomaly columns.
drop_cols = [
    'flow_id', 'protocol_map', 'l4_src_port', 'ipv4_src_addr',
    'l4_dst_port', 'ipv4_dst_addr', 'first_switched', 'last_switched', 'alert',
    'analysis_timestamp', 'anomaly',
]

# NOTE(review): in the rows displayed by this notebook, total_flows_exp holds exactly the
# same value as flow_id. If that is true for the whole frame, the column is just a second
# copy of the row identifier, and keeping it would leak the identifier into the feature
# set. It is dropped only when it is an exact copy, so genuine data is never discarded.
if (
    {'flow_id', 'total_flows_exp'}.issubset(df.columns)
    and df['total_flows_exp'].equals(df['flow_id'])
):
    drop_cols.append('total_flows_exp')

# errors='ignore' keeps the cell re-runnable after the columns are already gone.
df = df.drop(columns=drop_cols, errors='ignore')


In [11]:
# Preview the first five rows that remain after the column drop.
df.head(n=5)

Unnamed: 0,flow_duration_milliseconds,protocol,tcp_flags,tcp_win_max_in,tcp_win_max_out,tcp_win_min_in,tcp_win_min_out,tcp_win_mss_in,tcp_win_scale_in,tcp_win_scale_out,...,total_flows_exp,min_ip_pkt_len,max_ip_pkt_len,total_pkts_exp,total_bytes_exp,in_bytes,in_pkts,out_bytes,out_pkts,label
0,7130,6,2,64240,0,64240,0,1460,7,0,...,356579549,0,0,0,0,240,4,0,0,dos
1,0,6,22,1024,0,1024,0,1460,0,0,...,367378020,0,0,0,0,44,1,40,1,portscanning
2,1,17,0,0,0,0,0,0,0,0,...,352013022,0,0,0,0,95,1,180,1,normal
3,33,17,0,0,0,0,0,0,0,0,...,350798142,0,0,0,0,84,1,208,1,normal
4,0,1,0,0,0,0,0,0,0,0,...,364558533,0,0,0,0,36,1,0,0,normal


In [12]:
# Fit an encoder on the label column, then overwrite the column with its integer codes.
le = LabelEncoder()
le.fit(df['label'])
df['label'] = le.transform(df['label'])

# Persist the fitted encoder so the codes can be mapped back to label names later.
os.makedirs(MODEL_DIR, exist_ok=True)
encoder_path = f"{MODEL_DIR}/label_encoder.pkl"
with open(encoder_path, "wb") as f:
    pickle.dump(le, f)

# Show the classes in code order (position in the list == encoded value).
encoded_classes = le.classes_.tolist()
print("✅ Encoded labels:", encoded_classes)

✅ Encoded labels: ['dos', 'malware', 'normal', 'portscanning']


In [13]:
# Show which integer code the encoder assigned to each label name.
print("\n🔢 Encoded Label Mapping:")
for index, label in zip(range(len(le.classes_)), le.classes_):
    print("{} → {}".format(index, label))


🔢 Encoded Label Mapping:
0 → dos
1 → malware
2 → normal
3 → portscanning


In [14]:
# Number of columns left after dropping and encoding.
# The former leading `df.head()` was removed: a notebook cell only renders its last
# expression, so that call computed a preview that was never shown.
len(df.columns)

22

In [15]:
# Feature matrix: every column except the target.
# The target is selected by name rather than by position, so the split stays correct
# even if `label` is not the last column of `df`.
x = df.drop(columns=['label'])
x

Unnamed: 0,flow_duration_milliseconds,protocol,tcp_flags,tcp_win_max_in,tcp_win_max_out,tcp_win_min_in,tcp_win_min_out,tcp_win_mss_in,tcp_win_scale_in,tcp_win_scale_out,...,dst_tos,total_flows_exp,min_ip_pkt_len,max_ip_pkt_len,total_pkts_exp,total_bytes_exp,in_bytes,in_pkts,out_bytes,out_pkts
0,7130,6,2,64240,0,64240,0,1460,7,0,...,0,356579549,0,0,0,0,240,4,0,0
1,0,6,22,1024,0,1024,0,1460,0,0,...,0,367378020,0,0,0,0,44,1,40,1
2,1,17,0,0,0,0,0,0,0,0,...,0,352013022,0,0,0,0,95,1,180,1
3,33,17,0,0,0,0,0,0,0,0,...,0,350798142,0,0,0,0,84,1,208,1
4,0,1,0,0,0,0,0,0,0,0,...,0,364558533,0,0,0,0,36,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3014821,1012,6,2,64240,0,64240,0,1460,7,0,...,0,369503013,0,0,0,0,120,2,0,0
3014822,0,6,2,65535,0,65535,0,0,0,0,...,0,364786722,0,0,0,0,40,1,0,0
3014823,0,6,22,1024,0,1024,0,1460,0,0,...,0,319367151,0,0,0,0,44,1,40,1
3014824,7199,6,2,64240,0,64240,0,1460,7,0,...,0,373989535,0,0,0,0,240,4,0,0


In [16]:
# Target vector: the encoded label column.
# It is selected by name rather than by position, so it does not depend on `label`
# being the last column of `df`.
y = df['label']
y

0          0
1          3
2          2
3          2
4          2
          ..
3014821    0
3014822    2
3014823    3
3014824    0
3014825    2
Name: label, Length: 3014826, dtype: int64

In [17]:

# 1. Select top 15 features using Mutual Information
k_best = 15

# mutual_info_classif is stochastic, so without a fixed seed the scores (and possibly
# the selected columns) can change between runs. Pinning random_state makes the
# selection reproducible under Restart & Run All.
RANDOM_STATE = 42
selector_mi = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=RANDOM_STATE),
    k=k_best,
)
X_selected = selector_mi.fit_transform(x, y)

# 2. Get the names of selected features (boolean support mask applied to the column index)
selected_feature_names = x.columns[selector_mi.get_support()]

# 3. Display selected features
print(f"✅ Selected Top {k_best} Features:")
for i, feature in enumerate(selected_feature_names, 1):
    print(f"{i}. {feature}")

# 4. Create a DataFrame with only selected features
X_final = pd.DataFrame(X_selected, columns=selected_feature_names)


✅ Selected Top 15 Features:
1. flow_duration_milliseconds
2. protocol
3. tcp_flags
4. tcp_win_max_in
5. tcp_win_max_out
6. tcp_win_min_in
7. tcp_win_min_out
8. tcp_win_mss_in
9. tcp_win_scale_in
10. tcp_win_scale_out
11. total_flows_exp
12. in_bytes
13. in_pkts
14. out_bytes
15. out_pkts


In [18]:
# Re-index the target from 0 so it aligns with X_final's fresh index, then append it
# as the last column.
label_column = y.reset_index(drop=True)
X_final['label'] = label_column


In [19]:
# Display the selected features together with the appended label column.
X_final

Unnamed: 0,flow_duration_milliseconds,protocol,tcp_flags,tcp_win_max_in,tcp_win_max_out,tcp_win_min_in,tcp_win_min_out,tcp_win_mss_in,tcp_win_scale_in,tcp_win_scale_out,total_flows_exp,in_bytes,in_pkts,out_bytes,out_pkts,label
0,7130,6,2,64240,0,64240,0,1460,7,0,356579549,240,4,0,0,0
1,0,6,22,1024,0,1024,0,1460,0,0,367378020,44,1,40,1,3
2,1,17,0,0,0,0,0,0,0,0,352013022,95,1,180,1,2
3,33,17,0,0,0,0,0,0,0,0,350798142,84,1,208,1,2
4,0,1,0,0,0,0,0,0,0,0,364558533,36,1,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3014821,1012,6,2,64240,0,64240,0,1460,7,0,369503013,120,2,0,0,0
3014822,0,6,2,65535,0,65535,0,0,0,0,364786722,40,1,0,0,2
3014823,0,6,22,1024,0,1024,0,1460,0,0,319367151,44,1,40,1,3
3014824,7199,6,2,64240,0,64240,0,1460,7,0,373989535,240,4,0,0,0


In [20]:

# Destination of the preprocessed dataset consumed by the modeling step.
processed_data = f'{DOCS_DIR}/preprocessed_data.csv'

# Write the file only when it is not there yet; an existing file is left untouched.
if not os.path.exists(processed_data):
    X_final.to_csv(processed_data, index=False)
    print("✅ Preprocessed data saved successfully.")
else:
    print("⚠️ Preprocessed data already exists. Skipping save.")

# Summarise the final frame (info() prints its own report).
print("\n🧾 Dataset Overview:")
X_final.info()

print("\n🚀 Preprocessing complete — proceed to the modeling step.")


✅ Preprocessed data saved successfully.

🧾 Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3014826 entries, 0 to 3014825
Data columns (total 16 columns):
 #   Column                      Dtype
---  ------                      -----
 0   flow_duration_milliseconds  int64
 1   protocol                    int64
 2   tcp_flags                   int64
 3   tcp_win_max_in              int64
 4   tcp_win_max_out             int64
 5   tcp_win_min_in              int64
 6   tcp_win_min_out             int64
 7   tcp_win_mss_in              int64
 8   tcp_win_scale_in            int64
 9   tcp_win_scale_out           int64
 10  total_flows_exp             int64
 11  in_bytes                    int64
 12  in_pkts                     int64
 13  out_bytes                   int64
 14  out_pkts                    int64
 15  label                       int64
dtypes: int64(16)
memory usage: 368.0 MB

🚀 Preprocessing complete — proceed to the modeling step.


In [21]:
# Display the encoded label column of the final frame.
X_final['label']

0          0
1          3
2          2
3          2
4          2
          ..
3014821    0
3014822    2
3014823    3
3014824    0
3014825    2
Name: label, Length: 3014826, dtype: int64

In [22]:
# Split the final frame: all columns but the last are features, the last is the label.
feature_columns = X_final.columns[:-1]
label_column_name = X_final.columns[-1]

features = X_final[feature_columns]
labels = X_final[label_column_name]

In [23]:
# Keep the first row seen for each label value and order the result by label.
(
    X_final
    .drop_duplicates(subset=['label'], keep='first')
    .sort_values('label')
)


Unnamed: 0,flow_duration_milliseconds,protocol,tcp_flags,tcp_win_max_in,tcp_win_max_out,tcp_win_min_in,tcp_win_min_out,tcp_win_mss_in,tcp_win_scale_in,tcp_win_scale_out,total_flows_exp,in_bytes,in_pkts,out_bytes,out_pkts,label
0,7130,6,2,64240,0,64240,0,1460,7,0,356579549,240,4,0,0,0
209802,41,17,0,0,0,0,0,0,0,0,6126,76,1,76,1,1
2,1,17,0,0,0,0,0,0,0,0,352013022,95,1,180,1,2
1,0,6,22,1024,0,1024,0,1460,0,0,367378020,44,1,40,1,3


In [24]:
# Step 1: Read the LabelEncoder back from the pickle written earlier in this notebook.
encoder_path = os.path.join(MODEL_DIR, "label_encoder.pkl")
with open(encoder_path, "rb") as f:
    le = pickle.load(f)

# Step 2: One representative row per label (the first encountered), ordered by label,
# with a fresh 0-based index.
first_row_per_label = X_final.drop_duplicates(subset='label', keep='first')
filtered_df = first_row_per_label.sort_values(by='label').reset_index(drop=True)

# Step 3: Turn the integer codes back into their original label names.
filtered_df['label'] = le.inverse_transform(filtered_df['label'])

In [25]:
# Display the one-row-per-label frame with decoded label names.
filtered_df

Unnamed: 0,flow_duration_milliseconds,protocol,tcp_flags,tcp_win_max_in,tcp_win_max_out,tcp_win_min_in,tcp_win_min_out,tcp_win_mss_in,tcp_win_scale_in,tcp_win_scale_out,total_flows_exp,in_bytes,in_pkts,out_bytes,out_pkts,label
0,7130,6,2,64240,0,64240,0,1460,7,0,356579549,240,4,0,0,dos
1,41,17,0,0,0,0,0,0,0,0,6126,76,1,76,1,malware
2,1,17,0,0,0,0,0,0,0,0,352013022,95,1,180,1,normal
3,0,6,22,1024,0,1024,0,1460,0,0,367378020,44,1,40,1,portscanning


In [26]:
# Build the desired nested dictionary: {label name: {feature name: value, ...}}.
# filtered_df holds exactly one row per label, so the label column is a unique index and
# to_dict(orient='index') yields the nested mapping directly, without a Python-level
# row loop.
label_feature_dict = filtered_df.set_index('label').to_dict(orient='index')

# Export to JSON (`json` is imported with the other imports at the top of the notebook).
with open('./test.json', 'w') as f:
    json.dump(label_feature_dict, f, indent=4)