In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import pickle
import os
import random
from genetic_selection import GeneticSelectionCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
# Seed both the stdlib and the NumPy global generators with the same value
SEED = 2025
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

In [4]:
# Load the dataset from the parquet file located next to the notebook
dataset_path = 'CiC_IoT23_clean.parquet'
df = pd.read_parquet(dataset_path)

In [5]:
# Dictionary of attack types: 33 attack classes + 1 for benign traffic,
# mapping each label name to an integer class id (0-33).
attacks = {
    'BenignTraffic': 0,
    # DDoS and DoS
    'DDoS-RSTFINFlood': 1, 'DDoS-PSHACK_Flood': 2, 'DDoS-SYN_Flood': 3,
    'DDoS-UDP_Flood': 4, 'DDoS-TCP_Flood': 5, 'DDoS-ICMP_Flood': 6,
    'DDoS-SynonymousIP_Flood': 7, 'DDoS-ACK_Fragmentation': 8,
    'DDoS-UDP_Fragmentation': 9, 'DDoS-ICMP_Fragmentation': 10,
    'DDoS-SlowLoris': 11, 'DDoS-HTTP_Flood': 12,
    'DoS-UDP_Flood': 13, 'DoS-SYN_Flood': 14, 'DoS-TCP_Flood': 15,
    'DoS-HTTP_Flood': 16,
    # Mirai
    'Mirai-greeth_flood': 17, 'Mirai-greip_flood': 18, 'Mirai-udpplain': 19,
    # Reconnaissance
    'Recon-PingSweep': 20, 'Recon-OSScan': 21, 'Recon-PortScan': 22,
    'VulnerabilityScan': 23, 'Recon-HostDiscovery': 24,
    # Spoofing
    'DNS_Spoofing': 25, 'MITM-ArpSpoofing': 26,
    # Web
    'BrowserHijacking': 27, 'Backdoor_Malware': 28, 'XSS': 29,
    'Uploading_Attack': 30, 'SqlInjection': 31, 'CommandInjection': 32,
    # Brute force
    'DictionaryBruteForce': 33,
}

# Series.map() yields NaN for every value that is not a key of `attacks`.
# Validate before overwriting the column so that an unexpected label (or an
# accidental second run of this cell, when the labels are already integers)
# fails loudly instead of silently filling the target with NaN.
mapped_labels = df['label'].map(attacks)
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    unknown_labels = df.loc[unmapped_mask, 'label'].unique().tolist()
    raise ValueError(
        f"Labels without an entry in `attacks` (was this cell already run?): {unknown_labels}"
    )
df['label'] = mapped_labels

In [6]:
# Target vector is the 'label' column; the feature matrix is every other column
y = df['label']
X = df.drop(columns='label')

In [7]:
# Hold out 20% of the rows as the test set (80% train), stratifying on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [8]:
# Undersample the majority classes of the training split.
# NOTE: this cell reads X_train / y_train and then overwrites them, so running
# it a second time without re-running the split undersamples the data again.
df_train = X_train.assign(label=y_train)

# Number of training rows per class
label_counts = df_train['label'].value_counts()

# One (possibly down-sampled) frame per class, in value_counts order
sampled_dfs = []

for label, count in label_counts.items():
    class_subset = df_train[df_train['label'] == label]

    # Pick the fraction of rows to keep; the larger the class, the smaller
    # the fraction. None means "keep the class untouched".
    if count >= 5_000_000:
        keep_frac = 0.2
    elif count >= 500_000:
        keep_frac = 0.4
    elif count >= 100_000:
        keep_frac = 0.6
    else:
        keep_frac = None

    if keep_frac is None:
        sampled = class_subset
    else:
        sampled = class_subset.sample(frac=keep_frac, random_state=42)

    sampled_dfs.append(sampled)

# Stitch the per-class subsets back together with a fresh 0..n-1 index
df_train_sampled = pd.concat(sampled_dfs, ignore_index=True)

# Split the result back into features and target
y_train = df_train_sampled['label']
X_train = df_train_sampled.drop(columns='label')

# Show the resulting class distribution
print("Final distribution on training set:\n")
print(y_train.value_counts())

Final distribution on training set:

label
4     1731932
7      921991
13     904657
15     514912
14     472653
3      434843
2      367298
6      361768
0      351416
5      336001
19     284984
17     260483
1      226200
18     222442
10     204337
26     147643
9      137724
8      125031
25      85847
21      78410
22      65610
24      64450
16      57314
23      29906
12      22969
11      18741
33      10451
27       4687
32       4327
31       4196
29       3077
28       2574
20       1805
30       1002
Name: count, dtype: int64


In [9]:
# Reference model: a random forest fitted on every feature of the
# (undersampled) training set and scored on the untouched test set
rf = RandomForestClassifier(
    n_estimators=20,
    random_state=42,
    n_jobs=10,
).fit(X_train, y_train)

accuracy_base = rf.score(X_test, y_test)
print(accuracy_base)

0.9905935074546834


In [10]:
# Configure the genetic feature selector around the baseline forest `rf`.
# Scoring uses the built-in 'accuracy' metric name (no custom scorer is passed).
n_generations = 5
ga_selector = GeneticSelectionCV(
    estimator=rf,
    scoring='accuracy',
    cv=3,
    # Upper bound equals the number of columns in X_train, i.e. no cap below
    # the full feature set
    max_features=X_train.shape[1],
    n_population=60,
    n_generations=n_generations,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_jobs=1,
    verbose=1,
)

In [11]:
# Genetic Algorithm run.
# The elapsed time is taken with time.perf_counter(), a monotonic clock, so a
# system clock adjustment during this long-running fit cannot distort ga_time
# (seconds).
start_time = time.perf_counter()
ga_selector.fit(X_train, y_train)
ga_time = time.perf_counter() - start_time

Selecting features with genetic algorithm.
gen	nevals	avg                               	std                               	min                            	max                               
0  	60    	[  0.814275  22.466667   0.000112]	[  0.226794  13.835301   0.000063]	[ 0.245338  1.        0.000013]	[  0.984796  44.         0.000311]
1  	35    	[  0.965946  29.95       0.000102]	[ 0.056272  8.978539  0.000045]   	[ 0.681676  8.        0.000029]	[  0.98501   44.         0.000197]
2  	36    	[  0.982069  29.65       0.000097]	[ 0.017828  6.898853  0.000061]   	[  0.845168  17.         0.000009]	[  0.985022  43.         0.000276]
3  	34    	[  0.98244   25.6        0.000073]	[ 0.017872  5.577335  0.00005 ]   	[  0.845181  17.         0.000009]	[  0.985149  41.         0.00031 ]
4  	38    	[  0.984903  23.85       0.00007 ]	[ 0.000244  3.529991  0.00004 ]   	[  0.984322  17.         0.000002]	[  0.985342  30.         0.000197]
5  	29    	[  0.985038  22.75       0.000071]	[ 0.000298  3.

In [12]:
# Column names kept by the genetic search (indexed through the selector's
# support_ attribute)
support_mask = ga_selector.support_
selected_features = X_train.columns[support_mask]
print("Features selected by Genetic Algorithm:", selected_features.tolist())

Features selected by Genetic Algorithm: ['flow_duration', 'Header_Length', 'Protocol Type', 'Rate', 'syn_flag_number', 'psh_flag_number', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'DNS', 'Telnet', 'SSH', 'ARP', 'IPv', 'Max', 'IAT', 'Number', 'Weight']


In [13]:
# Assess the model with selected features: retrain the forest on the reduced
# feature set, time training and inference, persist the model and the feature
# list, and report test-set metrics.
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# perf_counter() is a monotonic clock, so the measured intervals are not
# affected by system clock adjustments.
start_train = time.perf_counter()
rf.fit(X_train_selected, y_train)
train_time = time.perf_counter() - start_train

# Predict the whole test set several times and keep the median duration
num_repetitions = 10
inference_times = []
for _ in range(num_repetitions):
    start_pred = time.perf_counter()
    y_pred = rf.predict(X_test_selected)
    inference_times.append(time.perf_counter() - start_pred)

inference_time_median = np.median(inference_times)

# Reuse the predictions from the last timed run instead of predicting the
# full test set two more times; accuracy is the fraction of exact matches.
accuracy_ga = float(np.mean(y_pred == np.asarray(y_test)))

print(f" Number of selected features: {len(selected_features)}\n")
print(f" Accuracy with selected features: {accuracy_ga:.4f}\n")

# Save model (create the output directory first so open() cannot fail on a
# missing folder after the expensive steps above)
output_dir = "RaspberryPi"
os.makedirs(output_dir, exist_ok=True)
model_filename = os.path.join(output_dir, "GA_model.pkl")
with open(model_filename, "wb") as f:
    pickle.dump(rf, f)

model_size = os.path.getsize(model_filename) / (1024 * 1024)  # bytes -> MB

# Save feature names
# NOTE(review): if this array ends up with object dtype, readers will need
# np.load(..., allow_pickle=True) - confirm on the consumer side.
features_filename = os.path.join(output_dir, "GA_features.npy")
np.save(features_filename, np.array(selected_features))

# Collect results in a one-row summary table
results_ga = pd.DataFrame({
    "Num Features": [len(selected_features)],
    "Selected Features": [list(selected_features)],
    "Accuracy": [accuracy_ga],
    "Train time": [train_time],
    "Inference time": [inference_time_median],
    "Model size": [model_size]  # previously stored train_time by mistake
})

# Print results
print(f" Test accuracy: {accuracy_ga:.4f}\n")
print(f" Training time: {train_time:.4f}\n")
print(f" Inference_time: {inference_time_median:.4f}\n")
print(f" Model Size: {model_size:.4f}\n")
print(f" Selected Features: {list(selected_features)}\n")

print(classification_report(y_test, y_pred, digits=4))

 Number of selected features: 20

 Accuracy with selected features: 0.9913

 Test accuracy: 0.9913

 Training time: 46.7678

 Inference_time: 9.6374

 Model Size: 1576.3602

 Selected Features: ['flow_duration', 'Header_Length', 'Protocol Type', 'Rate', 'syn_flag_number', 'psh_flag_number', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'DNS', 'Telnet', 'SSH', 'ARP', 'IPv', 'Max', 'IAT', 'Number', 'Weight']

              precision    recall  f1-score   support

           0     0.9317    0.9686    0.9498    219636
           1     0.9993    0.9988    0.9990    141375
           2     0.9998    0.9994    0.9996    229562
           3     0.9997    0.9995    0.9996    271777
           4     0.9998    0.9999    0.9998   1082458
           5     0.9996    0.9996    0.9996    210001
           6     0.9998    0.9997    0.9997    226105
           7     0.9998    0.9996    0.9997    576244
           8     0.9969    0.9990    0.9980     52096
           9     0.9980    0.9988 