In [2]:
# Import das bibliotecas

import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE
from imblearn.under_sampling import ClusterCentroids, CondensedNearestNeighbour, EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours, TomekLinks, AllKNN, InstanceHardnessThreshold
from imblearn.under_sampling import NearMiss, NeighbourhoodCleaningRule, OneSidedSelection, RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve


In [6]:
#Carregamento do dataset
df_iot = pd.read_csv('RT_IOT2022_20240409191653', index_col =0)
df_iot.head()

Unnamed: 0,id.orig_p,id.resp_p,proto,service,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,...,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,Attack_type
0,38667,1883,tcp,mqtt,32.011598,9,5,3,3,0.281148,...,0.0,29729180.0,29729180.0,29729180.0,29729180.0,0.0,64240,26847,502,MQTT_Publish
1,51143,1883,tcp,mqtt,31.883584,9,5,3,3,0.282277,...,0.0,29855280.0,29855280.0,29855280.0,29855280.0,0.0,64240,26847,502,MQTT_Publish
2,44761,1883,tcp,mqtt,32.124053,9,5,3,3,0.280164,...,0.0,29842150.0,29842150.0,29842150.0,29842150.0,0.0,64240,26847,502,MQTT_Publish
3,60893,1883,tcp,mqtt,31.961063,9,5,3,3,0.281593,...,0.0,29913770.0,29913770.0,29913770.0,29913770.0,0.0,64240,26847,502,MQTT_Publish
4,51087,1883,tcp,mqtt,31.902362,9,5,3,3,0.282111,...,0.0,29814700.0,29814700.0,29814700.0,29814700.0,0.0,64240,26847,502,MQTT_Publish


In [7]:
#Dimensões do dataset
df_iot.shape

(123117, 84)

In [8]:
colunas_str = df_iot.select_dtypes(include=['object']).columns
colunas_str

Index(['proto', 'service', 'Attack_type'], dtype='object')

In [9]:
#Remoção das colunas categóricas
df_iot.drop(columns=['proto', 'service'], inplace=True)
df_iot.head()

Unnamed: 0,id.orig_p,id.resp_p,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,bwd_pkts_per_sec,flow_pkts_per_sec,...,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,Attack_type
0,38667,1883,32.011598,9,5,3,3,0.281148,0.156193,0.437341,...,0.0,29729180.0,29729180.0,29729180.0,29729180.0,0.0,64240,26847,502,MQTT_Publish
1,51143,1883,31.883584,9,5,3,3,0.282277,0.156821,0.439097,...,0.0,29855280.0,29855280.0,29855280.0,29855280.0,0.0,64240,26847,502,MQTT_Publish
2,44761,1883,32.124053,9,5,3,3,0.280164,0.155647,0.435811,...,0.0,29842150.0,29842150.0,29842150.0,29842150.0,0.0,64240,26847,502,MQTT_Publish
3,60893,1883,31.961063,9,5,3,3,0.281593,0.15644,0.438033,...,0.0,29913770.0,29913770.0,29913770.0,29913770.0,0.0,64240,26847,502,MQTT_Publish
4,51087,1883,31.902362,9,5,3,3,0.282111,0.156728,0.438839,...,0.0,29814700.0,29814700.0,29814700.0,29814700.0,0.0,64240,26847,502,MQTT_Publish


In [10]:
# Quebra do dataset original pegando uma amostragem maior ou igual a 50%
df_amostra = df_iot.sample(frac=0.5, random_state=42)
df_amostra.shape

(61558, 82)

In [11]:
df_amostra.head(15)

Unnamed: 0,id.orig_p,id.resp_p,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,bwd_pkts_per_sec,flow_pkts_per_sec,...,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,Attack_type
55444,7315,21,1e-06,1,1,1,0,838860.8,838860.8,1677722.0,...,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64,DOS_SYN_Hping
15766,18609,21,4e-06,1,1,1,0,262144.0,262144.0,524288.0,...,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64,DOS_SYN_Hping
63311,14663,21,2e-06,1,1,1,0,466033.8,466033.8,932067.6,...,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64,DOS_SYN_Hping
3245,37827,1883,62.052435,9,5,3,3,0.145039,0.080577,0.225616,...,0.0,59819090.0,59819090.0,59819090.0,59819090.0,0.0,64240,26847,502,MQTT_Publish
94007,22348,21,0.0,1,0,1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64,DOS_SYN_Hping
78072,37150,21,5e-06,1,1,1,0,199728.8,199728.8,399457.5,...,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64,DOS_SYN_Hping
78191,36115,21,5e-06,1,1,1,0,199728.8,199728.8,399457.5,...,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64,DOS_SYN_Hping
39338,42253,21,4e-06,1,1,1,0,246723.8,246723.8,493447.5,...,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64,DOS_SYN_Hping
721,36242,1334,9e-06,1,1,0,0,110376.4,110376.4,220752.8,...,0.0,0.0,0.0,0.0,0.0,0.0,1024,0,1024,NMAP_XMAS_TREE_SCAN
2554,49275,53,0.021367,2,2,2,2,93.60196,93.60196,187.2039,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,Thing_Speak


In [12]:
# Quebra do dataset original pegando uma amostragem diferente do que foi usado no df_amostra
indices = ~df_iot.index.isin(df_amostra.index)
df_teste = df_iot[indices]
df_teste.head(15)

Unnamed: 0,id.orig_p,id.resp_p,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,bwd_pkts_per_sec,flow_pkts_per_sec,...,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,Attack_type
206,43599,1883,31.933666,9,5,3,3,0.281834,0.156575,0.438409,...,0.0,29882260.0,29882260.0,29882260.0,29882260.0,0.0,64240,26847,502,MQTT_Publish
714,48237,1883,21.897215,9,5,3,3,0.411011,0.22834,0.639351,...,0.0,19874780.0,19874780.0,19874780.0,19874780.0,0.0,64240,26847,502,MQTT_Publish
1229,35577,1883,62.049462,10,6,3,4,0.161162,0.096697,0.257859,...,0.0,59815020.0,59815020.0,59815020.0,59815020.0,0.0,64240,26847,502,MQTT_Publish
1353,58135,1883,2.896397,10,6,3,4,3.452566,2.071539,5.524105,...,0.0,0.0,0.0,0.0,0.0,0.0,64240,26847,502,MQTT_Publish
1528,44819,1883,37.092233,10,6,3,4,0.269598,0.161759,0.431357,...,0.0,34764420.0,34764420.0,34764420.0,34764420.0,0.0,64240,26847,502,MQTT_Publish
1552,55493,1883,12.024126,10,6,3,4,0.831661,0.498997,1.330658,...,0.0,9737113.0,9737113.0,9737113.0,9737113.0,0.0,64240,26847,502,MQTT_Publish
1907,48665,1883,14.044804,9,5,3,3,0.640806,0.356004,0.99681,...,0.0,11741940.0,11741940.0,11741940.0,11741940.0,0.0,64240,26847,502,MQTT_Publish
1957,55791,1883,37.915679,10,8,3,4,0.263743,0.210995,0.474738,...,0.0,35884630.0,35884630.0,35884630.0,35884630.0,0.0,64240,26847,502,MQTT_Publish
2000,60069,1883,7.108746,10,6,3,4,1.406718,0.844031,2.250749,...,0.0,0.0,0.0,0.0,0.0,0.0,64240,26847,502,MQTT_Publish
2039,45833,1883,36.093616,10,6,3,4,0.277057,0.166234,0.443292,...,0.0,33770550.0,33770550.0,33770550.0,33770550.0,0.0,64240,26847,502,MQTT_Publish


In [13]:
df_teste.shape

(45334, 82)

In [14]:
#Separação das colunas preditoras (X) da coluna dependente (y) do df_amostra
X = df_amostra.drop(columns=['Attack_type'])
y = df_amostra['Attack_type']

In [15]:
#Separação das colunas preditoras (X) da coluna dependente (y) do df_teste
X_df_teste = df_teste.drop(columns=['Attack_type'])
y_df_teste = df_teste['Attack_type']

In [16]:
#Contagem de ocorrência de cada classe do df_amostra
y.value_counts()

Attack_type
DOS_SYN_Hping                 47425
Thing_Speak                    3982
ARP_poisioning                 3807
MQTT_Publish                   2073
NMAP_UDP_SCAN                  1287
NMAP_OS_DETECTION              1010
NMAP_XMAS_TREE_SCAN             995
NMAP_TCP_scan                   542
DDOS_Slowloris                  268
Wipro_bulb                      140
Metasploit_Brute_Force_SSH       15
NMAP_FIN_SCAN                    14
Name: count, dtype: int64

In [17]:
#Contagem de ocorrência de cada classe do df_teste
y_df_teste.value_counts()

Attack_type
DOS_SYN_Hping          43815
Thing_Speak              717
ARP_poisioning           628
MQTT_Publish             130
NMAP_UDP_SCAN             23
NMAP_XMAS_TREE_SCAN        9
NMAP_OS_DETECTION          8
NMAP_TCP_scan              2
Wipro_bulb                 1
DDOS_Slowloris             1
Name: count, dtype: int64

In [18]:
# Quantidade e percentuais de ocorrência das classes do df_amostra
pd.DataFrame({"Nome": y.value_counts().index,
              "Qte." : y.value_counts().values,
              "% Total": round(y.value_counts(normalize=True) * 100, 2).values})

Unnamed: 0,Nome,Qte.,% Total
0,DOS_SYN_Hping,47425,77.04
1,Thing_Speak,3982,6.47
2,ARP_poisioning,3807,6.18
3,MQTT_Publish,2073,3.37
4,NMAP_UDP_SCAN,1287,2.09
5,NMAP_OS_DETECTION,1010,1.64
6,NMAP_XMAS_TREE_SCAN,995,1.62
7,NMAP_TCP_scan,542,0.88
8,DDOS_Slowloris,268,0.44
9,Wipro_bulb,140,0.23


In [19]:
# Separação dos dados de treino (80%) e teste (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
X_train.head()

Unnamed: 0,id.orig_p,id.resp_p,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,bwd_pkts_per_sec,flow_pkts_per_sec,...,active.avg,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size
1621,58195,1883,4.033791,10,6,3,4,2.479058,1.487435,3.966492,...,4033791.0,0.0,0.0,0.0,0.0,0.0,0.0,64240,26847,502
9428,12270,21,1e-06,1,1,1,0,838860.8,838860.8,1677722.0,...,1.192093,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
26209,29120,21,5e-06,1,1,1,0,199728.761905,199728.761905,399457.5,...,5.00679,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
92966,14147,21,0.0,1,0,1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64
49073,51990,21,4e-06,1,1,1,0,246723.764706,246723.764706,493447.5,...,4.053116,0.0,0.0,0.0,0.0,0.0,0.0,64,0,64


In [21]:
#Normalização dos dados de treino e teste
min_max = MinMaxScaler()
min_max.fit(X_train)
X_train_norm = min_max.transform(X_train)
X_train_norm

array([[8.87998779e-01, 2.87968924e-02, 4.27585262e-04, ...,
        9.80239567e-01, 4.09658961e-01, 7.66002899e-03],
       [1.87228199e-01, 3.21154934e-04, 1.06000847e-10, ...,
        9.76577401e-04, 0.00000000e+00, 9.76577401e-04],
       [4.44342718e-01, 3.21154934e-04, 5.30004235e-10, ...,
        9.76577401e-04, 0.00000000e+00, 9.76577401e-04],
       ...,
       [4.15732052e-01, 3.21154934e-04, 4.24003388e-10, ...,
        9.76577401e-04, 0.00000000e+00, 9.76577401e-04],
       [9.71221485e-01, 3.21154934e-04, 5.30004235e-10, ...,
        9.76577401e-04, 0.00000000e+00, 9.76577401e-04],
       [8.24948501e-01, 2.87968924e-02, 3.40206655e-03, ...,
        9.80239567e-01, 4.09658961e-01, 7.66002899e-03]])

In [22]:
X_test_norm = min_max.transform(X_test)

In [23]:
#Normalização das colunas preditoras do df_teste
X_real = min_max.transform(X_df_teste)

#### Over Sampling

##### SMOTE

In [24]:
# SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_norm, y_train)
X_smote.shape

(453708, 81)

In [25]:
y_smote.value_counts()


Attack_type
MQTT_Publish                  37809
DOS_SYN_Hping                 37809
ARP_poisioning                37809
Thing_Speak                   37809
NMAP_OS_DETECTION             37809
NMAP_TCP_scan                 37809
NMAP_UDP_SCAN                 37809
NMAP_XMAS_TREE_SCAN           37809
Metasploit_Brute_Force_SSH    37809
DDOS_Slowloris                37809
Wipro_bulb                    37809
NMAP_FIN_SCAN                 37809
Name: count, dtype: int64

##### SVM

In [26]:
# modelo_svm = SVC(kernel='rbf', random_state=42)
# modelo_svm.fit(X_smote, y_smote)
# y_pred = modelo_svm.predict(X_test_norm)

In [27]:
# print('Acurácia:', accuracy_score(y_test, y_pred))
# print('Revocação:', recall_score(y_test, y_pred, average='weighted'))
# print('Precisão:', precision_score(y_test, y_pred,average='weighted'))

##### KNN

##### Aplicação do KNN nos dados do df_amostra com sobreamostragem

In [28]:
modelo_knn = KNeighborsClassifier(n_neighbors=3)
modelo_knn.fit(X_smote, y_smote)
y_pred = modelo_knn.predict(X_test_norm)

In [32]:
print('Acurácia:', accuracy_score(y_test, y_pred))
print('Revocação:', recall_score(y_test, y_pred, average='weighted'))
print('Precisão:', precision_score(y_test, y_pred,average='weighted'))

Acurácia: 0.9956952566601689
Revocação: 0.9956952566601689
Precisão: 0.9961469524258828


##### Aplicação do KNN nos dados do df_teste

In [33]:
y_pred2 = modelo_knn.predict(X_real)

In [42]:
print('Acurácia:', accuracy_score(y_df_teste, y_pred2))
#print('Revocação:', recall_score(y_df_teste, y_pred2, average='weighted'))
#print('Precisão:', precision_score(y_df_teste, y_pred2,average='weighted'))

Acurácia: 0.7684298760312348


#### Under sampling

In [35]:
#Random
random_under = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_under, y_under = random_under.fit_resample(X_train_norm, y_train)
X_under.shape

(11448, 81)

In [36]:
y_under.value_counts()


Attack_type
Thing_Speak                   3227
ARP_poisioning                3105
MQTT_Publish                  1650
NMAP_UDP_SCAN                 1058
NMAP_OS_DETECTION              810
NMAP_XMAS_TREE_SCAN            796
NMAP_TCP_scan                  445
DDOS_Slowloris                 216
Wipro_bulb                     106
Metasploit_Brute_Force_SSH      13
DOS_SYN_Hping                   11
NMAP_FIN_SCAN                   11
Name: count, dtype: int64

#### KNN

##### Aplicação do KNN nos dados do df_amostra com subamostragem

In [37]:
modelo_knn2 = KNeighborsClassifier(n_neighbors=3)
modelo_knn2.fit(X_under, y_under)
y_pred = modelo_knn2.predict(X_test_norm)

In [38]:
print('Acurácia:', accuracy_score(y_test, y_pred))
print('Revocação:', recall_score(y_test, y_pred, average='weighted'))
print('Precisão:', precision_score(y_test, y_pred,average='weighted'))

Acurácia: 0.8169265756985056
Revocação: 0.8169265756985056
Precisão: 0.9615980549981731


##### Aplicação do KNN nos dados do df_teste

In [39]:
y_pred2 = modelo_knn2.predict(X_real)

In [43]:
print('Acurácia:', accuracy_score(y_df_teste, y_pred2))
#print('Revocação:', recall_score(y_df_teste, y_pred2, average='weighted'))
#print('Precisão:', precision_score(y_df_teste, y_pred2,average='weighted'))

Acurácia: 0.7684298760312348


In [41]:
logistica = LogisticRegression(max_iter=1000, random_state=42)
logistica.fit(X_under, y_under)
y_pred = logistica.predict(X_test_norm)


In [44]:
print('Acurácia:', accuracy_score(y_test, y_pred))

Acurácia: 0.40505198180636776


# Exercício

- Fazer uma análise nesses dados com o uso de Over sampling com Regressão Logísitca: 
SMOTE, RandomOverSampler, ADASYN

- Fazer uma análise nesses dados com o uso de under sampling com Regressão Logísitca:
OneSidedSelection, RandomUnderSampler

In [41]:
randOn = RandomOverSampler(random_state=42, )