In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import pickle
import os
import random
from genetic_selection import GeneticSelectionCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
# Seed Python's `random` module and NumPy's legacy global generator with the
# same value, so code drawing from either global RNG starts from a fixed state.
SEED = 2025
random.seed(SEED)
np.random.seed(SEED)

In [4]:
# Load `Edge-IIoTset_clean.csv` (path is relative to the current working
# directory) into the DataFrame `df` used by every later cell.
# NOTE(review): low_memory=False is presumably here to avoid chunked dtype
# inference / mixed-dtype warnings on this file — confirm it is still needed.
df = pd.read_csv("Edge-IIoTset_clean.csv", low_memory=False)

In [5]:
# Dictionary mapping each `Attack_type` string label to its integer class id.
attacks = {'Normal': 0,'MITM': 1, 'Uploading': 2, 'Ransomware': 3, 'SQL_injection': 4,
       'DDoS_HTTP': 5, 'DDoS_TCP': 6, 'Password': 7, 'Port_Scanning': 8,
       'Vulnerability_scanner': 9, 'Backdoor': 10, 'XSS': 11, 'Fingerprinting': 12,
       'DDoS_UDP': 13, 'DDoS_ICMP': 14}

# Encode the label column in place, with two safeguards around Series.map:
#   * re-running this cell must not wipe the column: integers are not keys of
#     `attacks`, so mapping an already-encoded column would yield NaN for every
#     row; the encoding is skipped when all values are already class ids;
#   * a label missing from `attacks` would silently become NaN; fail loudly
#     instead so a typo or a new class is noticed here, not in a later cell.
attack_labels_raw = df['Attack_type']
if not attack_labels_raw.isin(list(attacks.values())).all():
    unknown_labels = sorted(set(attack_labels_raw.dropna().unique()) - set(attacks), key=str)
    if unknown_labels:
        raise ValueError(f"Attack_type contains labels missing from `attacks`: {unknown_labels}")
    df['Attack_type'] = attack_labels_raw.map(attacks)

In [6]:
# Target vector: the multiclass `Attack_type` column.
y = df['Attack_type']
# Feature matrix: every other column once both label columns are removed
# (`Attack_label` is presumably the binary label — it must not leak into X).
X = df.drop(['Attack_label', 'Attack_type'], axis=1)

In [7]:
# 80/20 train/test partition. The split is stratified on y and uses a fixed
# random_state so the same rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Reference model: a 20-tree random forest fitted on all features of the
# training partition (fit() returns the estimator itself, hence the chaining).
rf = RandomForestClassifier(
    n_jobs=10,
    random_state=42,
    n_estimators=20,
).fit(X_train, y_train)

# Accuracy of the reference model on the held-out partition.
accuracy_base = rf.score(X_test, y_test)
print(accuracy_base)

0.9886684314682979


In [9]:
# Inference time of the reference model: median elapsed time over repeated
# predictions on the whole test partition.
num_repetitions = 10
inference_times_base = []
for _ in range(num_repetitions):
    # perf_counter() is monotonic and high-resolution, so it is the right clock
    # for sub-second intervals; time.time() can jump if the system clock is
    # adjusted and has coarser resolution on some platforms.
    start_pred = time.perf_counter()
    _ = rf.predict(X_test)
    inference_times_base.append(time.perf_counter() - start_pred)
inference_time_base = np.median(inference_times_base)

# Reference model size: pickle the fitted model to disk and read the size of
# the resulting file.
model_filename = "rf_base.pkl"
with open(model_filename, "wb") as f:
    pickle.dump(rf, f)
size_base = os.path.getsize(model_filename) / (1024 * 1024)  # bytes -> MB


In [10]:
# Genetic-algorithm feature selector built around the random forest `rf`.
# Scoring uses the built-in 'accuracy' metric with cv=3.
n_generations = 5
ga_settings = dict(
    estimator=rf,
    cv=3,
    verbose=1,
    scoring='accuracy',
    # Upper bound equals the total number of columns, i.e. no extra cap.
    max_features=X_train.shape[1],
    n_population=60,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=n_generations,
    n_jobs=1,
)
ga_selector = GeneticSelectionCV(**ga_settings)

In [11]:
# Genetic Algorithm run: fit the selector on the training partition and record
# how long the search takes, in seconds.
# perf_counter() is used rather than time.time() because it is monotonic: a
# system clock adjustment during this long-running fit cannot distort ga_time.
start_time = time.perf_counter()
ga_selector.fit(X_train, y_train)
ga_time = time.perf_counter() - start_time

Selecting features with genetic algorithm.
gen	nevals	avg                               	std                               	min                            	max                               
0  	60    	[  0.920776  22.466667   0.000135]	[  0.081585  13.835301   0.000082]	[ 0.719435  1.        0.000001]	[  0.989209  44.         0.000366]
1  	35    	[  0.978678  32.383333   0.000104]	[ 0.024954  8.600565  0.00007 ]   	[ 0.835114  6.        0.000019]	[  0.98965   44.         0.000326]
2  	36    	[  0.988616  36.65       0.000072]	[ 0.001324  4.381115  0.000037]   	[  0.981888  27.         0.000008]	[  0.989653  44.         0.000191]
3  	34    	[  0.988674  36.116667   0.000051]	[ 0.001706  4.1956    0.000035]   	[  0.980858  26.         0.      ]	[  0.989659  43.         0.000185]
4  	38    	[  0.988895  37.333333   0.000038]	[ 0.001852  3.248931  0.00003 ]   	[  0.980636  31.         0.000008]	[  0.989684  43.         0.000163]
5  	29    	[  0.98956   37.983333   0.000023]	[ 0.000178  2.

In [12]:
# Column names kept by the genetic search: `support_` is used as a mask over
# the training columns.
support_mask = ga_selector.support_
selected_features = X_train.columns[support_mask]
print("Features seleccionadas por el Algoritmo Genético:", [*selected_features])

Features seleccionadas por el Algoritmo Genético: ['arp.opcode', 'icmp.checksum', 'icmp.seq_le', 'icmp.transmit_timestamp', 'http.content_length', 'http.response', 'tcp.ack', 'tcp.ack_raw', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack', 'tcp.len', 'tcp.seq', 'udp.port', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.len', 'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.topic_len', 'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id', 'http1_encoded', 'http2_encoded', 'http3_encoded', 'dns_encoded', 'mqtt1_encoded', 'mqtt2_encoded', 'mqtt3_encoded']


In [13]:
# Assess the model with the GA-selected features: retrain, time inference,
# score on the test partition, persist the artefacts and report the results.
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# NOTE: this refits the same `rf` object that held the reference model, so from
# here on `rf` is the reduced-feature model.
# perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for the training and sub-second inference timings below.
start_train = time.perf_counter()
rf.fit(X_train_selected, y_train)
train_time = time.perf_counter() - start_train

# Median elapsed time over repeated predictions on the whole test partition.
num_repetitions = 10
inference_times = []
for _ in range(num_repetitions):
    start_pred = time.perf_counter()
    _ = rf.predict(X_test_selected)
    inference_times.append(time.perf_counter() - start_pred)

inference_time_median = np.median(inference_times)

# Predictions feed the classification report; score() gives the test accuracy.
y_pred = rf.predict(X_test_selected)
accuracy_ga = rf.score(X_test_selected, y_test)

print(f" Number of selected features: {len(selected_features)}\n")
print(f" Accuracy with selected features: {accuracy_ga:.4f}\n")

# Make sure the output directory exists before writing into it; without this
# the open() below raises FileNotFoundError on a fresh checkout.
output_dir = "RaspberryPi"
os.makedirs(output_dir, exist_ok=True)

# Save model
model_filename = os.path.join(output_dir, "GA_model.pkl")
with open(model_filename, "wb") as f:
    pickle.dump(rf, f)

model_size = os.path.getsize(model_filename) / (1024 * 1024)  # bytes -> MB

# Save feature names
# NOTE(review): np.array() of a string Index is likely an object array, which
# np.save stores via pickle; loading it would then need allow_pickle=True.
features_filename = os.path.join(output_dir, "GA_features.npy")
np.save(features_filename, np.array(selected_features))

# Collect results in a one-row summary table
results_ga = pd.DataFrame({
    "Num Features": [len(selected_features)],
    "Selected Features": [list(selected_features)],
    "Accuracy": [accuracy_ga],
    "Train time": [train_time],
    "Inference time": [inference_time_median],
    "Model size": [model_size]  # was mistakenly filled with train_time
})

# Print results
print(f" Test accuracy: {accuracy_ga:.4f}\n")
print(f" Training time: {train_time:.4f}\n")
print(f" Inference_time: {inference_time_median:.4f}\n")
print(f" Model Size: {model_size:.4f}\n")
print(f" Selected Features: {list(selected_features)}\n")

print(classification_report(y_test, y_pred, digits=4))

 Number of selected features: 39

 Accuracy with selected features: 0.9896

 Test accuracy: 0.9896

 Training time: 4.3975

 Inference_time: 0.4197

 Model Size: 48.2006

 Selected Features: ['arp.opcode', 'icmp.checksum', 'icmp.seq_le', 'icmp.transmit_timestamp', 'http.content_length', 'http.response', 'tcp.ack', 'tcp.ack_raw', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack', 'tcp.len', 'tcp.seq', 'udp.port', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.len', 'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.topic_len', 'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id', 'http1_encoded', 'http2_encoded', 'http3_encoded', 'dns_encoded', 'mqtt1_encoded', 'mqtt2_encoded', 'mqtt3_encoded']

              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000    279925
       