# DBScan - Detecção de Anomalias

Vamos realizar a detecção de anomalias utilizando o algoritmo DBScan. O DBScan é um algoritmo de clustering capaz de detectar outliers e ruídos em um conjunto de dados. O objetivo é identificar outliers como potenciais invasões e, com isso, treinar um modelo de classificação para identificar esses outliers.

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
import polars as pl

## Tratamento dos Dados

In [2]:
# Load the raw connection records and preview five random rows.
DATA_PATH = '../data/data.csv'
df_raw = pl.read_csv(DATA_PATH)
df_raw.sample(5)

duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,str
0,"""tcp""","""pop_3""","""SF""",38,130,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,1,0.0,0.0,0.0,0.0,0.33,0.67,0.0,143,7,0.04,0.05,0.01,0.29,0.01,0.0,0.01,0.0,"""normal"""
0,"""tcp""","""http""","""S1""",163,14600,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1.0,1.0,0.0,0.0,1.0,0.0,0.0,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""normal"""
0,"""tcp""","""smtp""","""SF""",2162,334,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,70,166,0.64,0.07,0.01,0.02,0.0,0.0,0.0,0.0,"""normal"""
0,"""tcp""","""http""","""SF""",294,16407,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,32,0.0,0.0,0.0,0.0,1.0,0.0,0.06,21,253,1.0,0.0,0.05,0.05,0.0,0.0,0.0,0.0,"""normal"""
0,"""tcp""","""discard""","""S0""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,99,2,1.0,1.0,0.0,0.0,0.02,0.08,0.0,255,2,0.01,0.07,0.0,0.0,1.0,1.0,0.0,0.0,"""anomaly"""


Transformação das colunas categóricas `protocol_type` e `service` em variáveis dummy (a coluna `flag` é descartada) e normalização das colunas numéricas usando o MinMaxScaler.

In [8]:
# String-typed columns that must not be fed to the scaler.
string_cols = ["protocol_type", "service", "flag", "class"]
# Categorical columns that are one-hot encoded.
# NOTE(review): "flag" is dropped together with the other string columns but,
# unlike protocol_type and service, it is never one-hot encoded, so it does
# not reach the feature matrix — confirm this is intentional.
one_hot_cols = ["protocol_type", "service"]

# Scale every numeric column to [0, 1]; keep the output as a polars frame.
scaler = MinMaxScaler().set_output(transform="polars")
numeric_part = df_raw.drop(string_cols)
df_scaled = scaler.fit_transform(numeric_part)

df_dummies = df_raw.select(one_hot_cols).to_dummies()

# Binary target: 1 when the record is labelled "anomaly", 0 otherwise.
is_anomaly = pl.col("class") == "anomaly"
df_target = df_raw.select(
    pl.when(is_anomaly).then(1).otherwise(0).alias("target")
)

# Column-wise concatenation: scaled numerics, dummies, then the target.
df_processed = pl.concat(
    [df_scaled, df_dummies, df_target],
    how="horizontal",
)
df_processed.sample(5)


duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,…,service_link,service_login,service_mtp,service_name,service_netbios_dgm,service_netbios_ns,service_netbios_ssn,service_netstat,service_nnsp,service_nntp,service_ntp_u,service_other,service_pm_dump,service_pop_2,service_pop_3,service_printer,service_private,service_red_i,service_remote_job,service_rje,service_shell,service_smtp,service_sql_net,service_ssh,service_sunrpc,service_supdup,service_systat,service_telnet,service_tim_i,service_time,service_urh_i,service_urp_i,service_uucp,service_uucp_path,service_vmnet,service_whois,target
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,i32
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001961,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.082353,1.0,1.0,0.0,0.05,0.17,0.0,0.0,1.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0.0,1.1789e-07,2.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.211765,0.331373,0.0,0.0,0.0,0.0,1.0,0.0,0.02,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0.0,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.87451,0.305882,0.35,0.01,0.35,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
0.0,6.7853e-07,0.000299,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001961,0.001961,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.811765,1.0,1.0,0.0,0.0,0.01,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196078,0.027451,1.0,1.0,0.0,0.0,0.15,0.06,0.0,1.0,0.058824,0.06,0.05,0.0,0.0,1.0,1.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## Usando o DBScan

In [9]:
# Split the processed frame into the feature matrix and the label vector.
X = df_processed.select(pl.exclude("target")).to_numpy()
y = df_processed.get_column("target").to_numpy()

In [10]:
from sklearn.metrics import precision_score, recall_score

def grid_search_dbscan(X, y, min_samples_range, eps_range):
    """Score DBSCAN noise detection against known labels over a parameter grid.

    For every ``(min_samples, eps)`` pair, DBSCAN is fitted on ``X``. Points
    that DBSCAN labels as noise (cluster label ``-1``) are treated as
    predicted anomalies (``1``); every clustered point is treated as normal
    (``0``). Those predictions are then compared with ``y``.

    Args:
        X: Feature matrix passed to ``DBSCAN.fit_predict``.
        y: Array of true labels aligned with the rows of ``X``; it is compared
            element-wise with the 0/1 predictions, so ``1`` must mean anomaly.
        min_samples_range: Sized collection of ``min_samples`` values to try.
        eps_range: Sized collection of ``eps`` values to try.

    Returns:
        A list with one dict per combination, ordered with ``min_samples`` as
        the outer loop and ``eps`` as the inner loop. Each dict has the keys
        ``min_samples``, ``eps``, ``precision``, ``recall``, ``f1``,
        ``accuracy``, ``False Positive``, ``False Negative``,
        ``True Positive`` and ``True Negative``.
    """
    from itertools import product

    total = len(min_samples_range) * len(eps_range)
    print(f"Starting Grid Search - {total} combinations")

    results = []
    grid = product(min_samples_range, eps_range)
    for count, (min_samples, eps) in enumerate(grid, start=1):
        print(f"Combination {count}/{total}: min_samples={min_samples}, eps={eps}")

        dbscan = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1)
        labels = dbscan.fit_predict(X)
        # DBSCAN marks noise with -1: map noise -> 1 (anomaly), clustered -> 0 (normal).
        labels_anomaly = np.where(labels == -1, 1, 0)

        # zero_division=0 keeps the score at 0.0 (without a warning) when a
        # combination predicts no anomalies at all.
        precision = precision_score(y, labels_anomaly, zero_division=0)
        recall = recall_score(y, labels_anomaly, zero_division=0)

        # Guard the harmonic mean: with no true positives both terms are 0 and
        # the unguarded division would give NaN (or raise), which would then
        # corrupt any ranking of the results by F1.
        pr_sum = precision + recall
        f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0

        predicted_pos = labels_anomaly == 1
        actual_pos = y == 1

        results.append(
            {
                "min_samples": min_samples,
                "eps": eps,
                "precision": float(precision),
                "recall": float(recall),
                "f1": float(f1),
                "accuracy": float((labels_anomaly == y).mean()),
                "False Positive": int(np.sum(predicted_pos & ~actual_pos & (y == 0))),
                "False Negative": int(np.sum(~predicted_pos & actual_pos)),
                "True Positive": int(np.sum(predicted_pos & actual_pos)),
                "True Negative": int(np.sum(~predicted_pos & (y == 0))),
            }
        )

    return results

In [11]:
# Candidate DBSCAN hyperparameters: 6 x 6 = 36 combinations.
min_samples_candidates = [5, 100, 500, 1000, 2000, 5000]
eps_candidates = [0.1, 0.3, 0.5, 0.7, 1.0, 1.5]

grid_search = grid_search_dbscan(
    X,
    y,
    min_samples_range=min_samples_candidates,
    eps_range=eps_candidates,
)

Starting Grid Search - 36 combinations
Combination 1/36: min_samples=5, eps=0.1
Combination 2/36: min_samples=5, eps=0.3
Combination 3/36: min_samples=5, eps=0.5
Combination 4/36: min_samples=5, eps=0.7
Combination 5/36: min_samples=5, eps=1.0
Combination 6/36: min_samples=5, eps=1.5
Combination 7/36: min_samples=100, eps=0.1
Combination 8/36: min_samples=100, eps=0.3
Combination 9/36: min_samples=100, eps=0.5
Combination 10/36: min_samples=100, eps=0.7
Combination 11/36: min_samples=100, eps=1.0
Combination 12/36: min_samples=100, eps=1.5
Combination 13/36: min_samples=500, eps=0.1
Combination 14/36: min_samples=500, eps=0.3
Combination 15/36: min_samples=500, eps=0.5
Combination 16/36: min_samples=500, eps=0.7
Combination 17/36: min_samples=500, eps=1.0
Combination 18/36: min_samples=500, eps=1.5
Combination 19/36: min_samples=1000, eps=0.1
Combination 20/36: min_samples=1000, eps=0.3
Combination 21/36: min_samples=1000, eps=0.5
Combination 22/36: min_samples=1000, eps=0.7
Combinatio

Queremos otimizar o recall, mesmo que isso signifique diminuir a precisão, ou seja, aceitar um aumento nos falsos positivos.

In [12]:
# Let polars print up to 20 rows, then show the combinations ranked by F1.
pl.Config.set_tbl_rows(20)
results_table = pl.DataFrame(grid_search)
results_table.sort("f1", descending=True)

min_samples,eps,precision,recall,f1,accuracy,False Positive,False Negative,True Positive,True Negative
i64,f64,f64,f64,f64,f64,i64,i64,i64,i64
5000,1.0,0.64037,0.98365,0.775729,0.734876,6487,192,11551,6962
5000,0.7,0.632499,0.985779,0.770577,0.726381,6726,167,11576,6723
5000,0.5,0.593737,0.996253,0.744046,0.680494,8005,44,11699,5444
1000,1.0,0.71862,0.764455,0.740829,0.750675,3515,2766,8977,9934
1000,0.7,0.645947,0.765477,0.700651,0.695102,4927,2754,8989,8522
500,0.7,0.687428,0.706804,0.696981,0.71352,3774,3443,8300,9675
1000,0.5,0.632937,0.770757,0.695081,0.684781,5249,2692,9051,8200
1000,0.3,0.607592,0.786511,0.68557,0.663703,5965,2507,9236,7484
500,1.0,0.729706,0.633824,0.678394,0.719871,2757,4300,7443,10692
1000,0.1,0.507051,0.998212,0.672499,0.546801,11396,21,11722,2053


Vamos seguir com min_samples = 5000 e eps = 0.5, que, entre as três combinações de maior F1, é a de maior recall (≈ 0,996).

In [13]:
# Refit DBSCAN with the chosen hyperparameters.
CHOSEN_EPS = 0.5
CHOSEN_MIN_SAMPLES = 5000

dbscan = DBSCAN(eps=CHOSEN_EPS, min_samples=CHOSEN_MIN_SAMPLES, n_jobs=-1)
labels = dbscan.fit_predict(X)

# Noise points (label -1) become 1 (anomaly); clustered points become 0.
labels_anomaly = (labels == -1).astype(int)

# Treinando um classificador XGBoost

Vamos treinar um classificador XGBoost (XGBClassifier) para classificar os outliers detectados pelo DBScan.

In [14]:
# Rebuild the feature frame, this time without a "target" column: the
# classifier is trained on the DBSCAN-derived labels instead.
string_cols = ["protocol_type", "service", "flag", "class"]
one_hot_cols = ["protocol_type", "service"]

scaler = MinMaxScaler().set_output(transform="polars")
df_scaled = scaler.fit_transform(df_raw.drop(string_cols))
df_dummies = df_raw.select(one_hot_cols).to_dummies()
df_processed = pl.concat([df_scaled, df_dummies], how="horizontal")

X = df_processed.to_numpy()

# 80/20 split with a fixed seed; labels_anomaly (DBSCAN noise flags) is the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels_anomaly,
    test_size=0.2,
    random_state=13,
)

In [15]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are tuned below.
xgb = XGBClassifier(eval_metric='logloss')

# Search space: 3 * 3 * 3 * 2 * 2 = 108 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Exhaustive search with 3-fold cross-validation, scored by accuracy.
# NOTE: this rebinds `grid_search`, which previously held the list of DBSCAN
# grid-search results.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print("Melhores parâmetros encontrados: ", grid_search.best_params_)
print("Melhor pontuação: ", grid_search.best_score_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  pid = os.fork()
In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  pid = os.fork()


[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estima

In [16]:
# Display the hyperparameter combination selected by the GridSearchCV run above.
grid_search.best_params_

{'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_depth': 7,
 'n_estimators': 300,
 'subsample': 1.0}

Melhores parâmetros encontrados:  {'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_depth': 7,
 'n_estimators': 300,
 'subsample': 1.0}

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Build a fresh classifier from the best hyperparameters found by the search.
best_params = grid_search.best_params_
xgb_best = XGBClassifier(**best_params, eval_metric='logloss')

# Fit on the training split and predict the held-out split.
xgb_best.fit(X_train, y_train)
y_pred = xgb_best.predict(X_test)

# Performance metrics on the test split.
# NOTE: y_test comes from the DBSCAN noise labels (see the train_test_split
# call), so these scores measure agreement with DBSCAN rather than with the
# dataset's "class" column.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value}")

Accuracy: 0.9950386981543957
Precision: 0.9967213114754099
Recall: 0.9969727547931383
F1 Score: 0.9968470172783453
