In [4]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

In [5]:
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")

In [6]:
file_name = "/media/ssd/test/standardized-datasets/combined/combined_netflow_reduced.csv"
data = pd.read_csv(file_name)

In [7]:
data.Label.value_counts()

Label
0    3797826
1    1018415
Name: count, dtype: int64

In [8]:
data.rename(columns=lambda x: x.strip(), inplace=True)
data['IPV4_SRC_ADDR'] = data["IPV4_SRC_ADDR"].apply(str)
data['L4_SRC_PORT'] = data["L4_SRC_PORT"].apply(str)
data['IPV4_DST_ADDR'] = data["IPV4_DST_ADDR"].apply(str)
data['L4_DST_PORT'] = data["L4_DST_PORT"].apply(str)

In [9]:
data.drop(columns=["L4_SRC_PORT", "L4_DST_PORT"], inplace=True)

In [10]:
data.Attack.unique()

array(['DDoS', 'DoS', 'Reconnaissance', 'Benign', 'Theft',
       'DDOS attack-HOIC', 'DoS attacks-Hulk', 'SSH-Bruteforce',
       'Infilteration', 'DDoS attacks-LOIC-HTTP',
       'DoS attacks-SlowHTTPTest', 'Bot', 'FTP-BruteForce',
       'DoS attacks-GoldenEye', 'Brute Force -XSS',
       'DDOS attack-LOIC-UDP', 'SQL Injection', 'DoS attacks-Slowloris',
       'Brute Force -Web', 'Exploits', 'Generic', 'Fuzzers', 'Backdoor',
       'Shellcode', 'Worms', 'Analysis'], dtype=object)

In [11]:
data = data.groupby(by='Attack').sample(frac=0.1, random_state=13)

In [12]:
data.groupby(by="Attack").count()

Unnamed: 0_level_0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,flow_id,dataset_source
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Analysis,230,230,230,230,230,230,230,230,230,230,...,230,230,230,230,230,230,230,230,230,230
Backdoor,217,217,217,217,217,217,217,217,217,217,...,217,217,217,217,217,217,217,217,217,217
Benign,379783,379783,379783,379783,379783,379783,379783,379783,379783,379783,...,379783,379783,379783,379783,379783,379783,379783,379783,379783,379783
Bot,7155,7155,7155,7155,7155,7155,7155,7155,7155,7155,...,7155,7155,7155,7155,7155,7155,7155,7155,7155,7155
Brute Force -Web,107,107,107,107,107,107,107,107,107,107,...,107,107,107,107,107,107,107,107,107,107
Brute Force -XSS,93,93,93,93,93,93,93,93,93,93,...,93,93,93,93,93,93,93,93,93,93
DDOS attack-HOIC,21617,21617,21617,21617,21617,21617,21617,21617,21617,21617,...,21617,21617,21617,21617,21617,21617,21617,21617,21617,21617
DDOS attack-LOIC-UDP,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
DDoS,16499,16499,16499,16499,16499,16499,16499,16499,16499,16499,...,16499,16499,16499,16499,16499,16499,16499,16499,16499,16499
DDoS attacks-LOIC-HTTP,6146,6146,6146,6146,6146,6146,6146,6146,6146,6146,...,6146,6146,6146,6146,6146,6146,6146,6146,6146,6146


In [13]:
X = data.drop(columns=["Attack", "Label"])
y = data[["Attack", "Label"]]

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=13, stratify=y)

In [14]:
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL',
                                  'CLIENT_TCP_FLAGS','SERVER_TCP_FLAGS','ICMP_TYPE',
                                  'ICMP_IPV4_TYPE','DNS_QUERY_ID','DNS_QUERY_TYPE',
                                  'FTP_COMMAND_RET_CODE'])
encoder.fit(X_train, y_train.Label)

# Transform on training set
X_train = encoder.transform(X_train)

# Transform on testing set
X_test = encoder.transform(X_test)

In [15]:
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

In [16]:
if 'dataset_source' in X_train.columns:
    X_train = X_train.drop(columns=['dataset_source'])
if 'dataset_source' in X_test.columns:
    X_test = X_test.drop(columns=['dataset_source'])

In [17]:
scaler = Normalizer()
cols_to_norm = list(set(list(X_train.iloc[:, 2:].columns))) # Ignore first two as the represents IP addresses
scaler.fit(X_train[cols_to_norm])

# Transform on training set
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
X_train['h'] = X_train.iloc[:, 2:].values.tolist()

# Transform on testing set
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test.iloc[:, 2:].values.tolist()

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [18]:
X_train.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,flow_id,h
1678536,18.218.11.51,172.31.69.28,4.716756e-08,1.352251e-07,0.000106,1.018746e-06,0.000234,1.018746e-06,1.51185e-07,1.23652e-07,...,0.013353,0.005477,6.053687e-08,6.053757e-08,5.319932e-08,5.320146e-08,0.0,4.695634e-08,0.342,"[4.7167564923371754e-08, 1.3522507470442058e-0..."
3802830,59.166.0.5,149.171.126.4,1.554257e-08,3.024266e-09,0.0001,1.812756e-06,0.000138,1.947034e-06,8.278529e-10,8.277976e-10,...,0.001167,0.001069,1.331946e-09,1.354066e-09,1.753015e-08,1.753085e-08,0.0,3.073079e-11,0.255319,"[1.5542574692509295e-08, 3.0242658543497837e-0..."
2285590,172.31.64.69,172.31.0.2,4.642899e-08,1.353043e-08,3.8e-05,5.887329e-07,8.4e-05,5.887329e-07,4.870199e-08,4.869631e-08,...,0.0,0.0,8.746057e-08,8.746159e-08,5.414639e-08,1.347604e-08,1.8e-05,6.784011e-08,0.672801,"[4.642898567879007e-08, 1.3530425343873105e-08..."
4792203,59.166.0.5,149.171.126.8,3.231844e-09,6.416912e-10,0.000125,1.954479e-07,4e-06,8.376338e-08,1.392591e-09,1.46815e-09,...,8.1e-05,0.000142,8.263405e-11,8.71675e-11,3.64513e-09,3.645277e-09,0.0,3.217371e-09,0.066902,"[3.2318441136909897e-09, 6.416911557150659e-10..."
518567,13.58.98.64,172.31.69.25,5.84051e-09,1.210137e-08,8.1e-05,6.307297e-07,9.5e-05,5.29813e-07,2.516657e-09,2.653204e-09,...,0.000678,0.000677,7.49596e-09,7.496047e-09,6.58739e-09,6.587655e-09,0.0,5.814355e-09,0.013083,"[5.840509756442225e-09, 1.2101366649131552e-08..."


In [19]:
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(data["Attack"])

# Transform on training set
train["Attack"] = lab_enc.transform(train["Attack"])

# Transform on testing set
test["Attack"] = lab_enc.transform(test["Attack"])

In [20]:
# Training graph

train_g = nx.from_pandas_edgelist(train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

train_g = train_g.to_directed()
train_g = dgl.from_networkx(train_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([train_g.number_of_nodes(),
train_g.edata['h'].shape[1]])
train_g.ndata['h'] = nfeat_weight

# Testing graph
test_g = nx.from_pandas_edgelist(test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

test_g = test_g.to_directed()
test_g = dgl.from_networkx(test_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([test_g.number_of_nodes(),
test_g.edata['h'].shape[1]])
test_g.ndata['h'] = nfeat_weight

In [21]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
      super(SAGELayer, self).__init__()
      self.W_apply = nn.Linear(ndim_in + edims , ndim_out)
      self.activation = F.relu
      self.W_edge = nn.Linear(128 * 2, 256)
      self.reset_parameters()

    def reset_parameters(self):
      gain = nn.init.calculate_gain('relu')
      nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
      return {'m':  edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
      with g_dgl.local_scope():
        g = g_dgl
        g.ndata['h'] = nfeats
        g.edata['h'] = efeats
        g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
        g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

        # Compute edge embeddings
        u, v = g.edges()
        edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
        return g.ndata['h'], edge

In [22]:
class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim,  activation):
      super(SAGE, self).__init__()
      self.layers = nn.ModuleList()
      self.layers.append(SAGELayer(ndim_in, edim, 128, F.relu))

    def forward(self, g, nfeats, efeats, corrupt=False):
      if corrupt:
        e_perm = torch.randperm(g.number_of_edges())
        #n_perm = torch.randperm(g.number_of_nodes())
        efeats = efeats[e_perm]
        #nfeats = nfeats[n_perm]
      for i, layer in enumerate(self.layers):
        #nfeats = layer(g, nfeats, efeats)
        nfeats, e_feats = layer(g, nfeats, efeats)
      #return nfeats.sum(1)
      return nfeats.sum(1), e_feats.sum(1)

In [23]:
class Discriminator(nn.Module):
    def __init__(self, n_hidden):
      super(Discriminator, self).__init__()
      self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
      self.reset_parameters()

    def uniform(self, size, tensor):
      bound = 1.0 / math.sqrt(size)
      if tensor is not None:
        tensor.data.uniform_(-bound, bound)

    def reset_parameters(self):
      size = self.weight.size(0)
      self.uniform(size, self.weight)

    def forward(self, features, summary):
      features = torch.matmul(features, torch.matmul(self.weight, summary))
      return features

In [24]:
class DGI(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation):
      super(DGI, self).__init__()
      self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
      #self.discriminator = Discriminator(128)
      self.discriminator = Discriminator(256)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)

      positive = positive[1]
      negative = negative[1]

      summary = torch.sigmoid(positive.mean(dim=0))

      positive = self.discriminator(positive, summary)
      negative = self.discriminator(negative, summary)

      l1 = self.loss(positive, torch.ones_like(positive))
      l2 = self.loss(negative, torch.zeros_like(negative))

      return l1 + l2

In [25]:
ndim_in = train_g.ndata['h'].shape[1]
hidden_features = 128
ndim_out = 128
num_layers = 1
edim = train_g.edata['h'].shape[1]
learning_rate = 1e-3
epochs = 4000

In [26]:
dgi = DGI(ndim_in,
    ndim_out,
    edim,
    F.relu)

dgi = dgi.to(device)

dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                lr=1e-3,
                weight_decay=0.)

In [27]:
# Format node and edge features for E-GraphSAGE
train_g.ndata['h'] = torch.reshape(train_g.ndata['h'],
                                   (train_g.ndata['h'].shape[0], 1,
                                    train_g.ndata['h'].shape[1]))

train_g.edata['h'] = torch.reshape(train_g.edata['h'],
                                   (train_g.edata['h'].shape[0], 1,
                                    train_g.edata['h'].shape[1]))

In [28]:
# Convert to GPU
train_g = train_g.to(device)

In [29]:
cnt_wait = 0
best = 1e9
best_t = 0
dur = []
node_features = train_g.ndata['h']
edge_features = train_g.edata['h']

for epoch in range(epochs):
    dgi.train()
    if epoch >= 3:
        t0 = time.time()

    dgi_optimizer.zero_grad()
    loss = dgi(train_g, node_features, edge_features)
    loss.backward()
    dgi_optimizer.step()

    if loss < best:
        best = loss
        best_t = epoch
        cnt_wait = 0
        torch.save(dgi.state_dict(), 'best_dgi.pkl')
    else:
        cnt_wait += 1

  # if cnt_wait == patience:
  #     print('Early stopping!')
  #     break

    if epoch >= 3:
        dur.append(time.time() - t0)

    if epoch % 50 == 0:

        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
            "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur),
              loss.item(),
              train_g.num_edges() / np.mean(dur) / 1000))
# ... existing code ...

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Epoch 00000 | Time(s) nan | Loss 1.4016 | ETputs(KTEPS) nan
Epoch 00050 | Time(s) 0.2011 | Loss 1.3670 | ETputs(KTEPS) 3350.63
Epoch 00100 | Time(s) 0.2022 | Loss 1.1483 | ETputs(KTEPS) 3333.44
Epoch 00150 | Time(s) 0.2026 | Loss 0.2514 | ETputs(KTEPS) 3326.89
Epoch 00200 | Time(s) 0.2024 | Loss 0.0968 | ETputs(KTEPS) 3329.12
Epoch 00250 | Time(s) 0.2023 | Loss 0.0770 | ETputs(KTEPS) 3332.10
Epoch 00300 | Time(s) 0.2021 | Loss 0.0652 | ETputs(KTEPS) 3335.03
Epoch 00350 | Time(s) 0.2019 | Loss 0.0584 | ETputs(KTEPS) 3337.47
Epoch 00400 | Time(s) 0.2019 | Loss 0.0538 | ETputs(KTEPS) 3338.83
Epoch 00450 | Time(s) 0.2017 | Loss 0.0485 | ETputs(KTEPS) 3340.62
Epoch 00500 | Time(s) 0.2016 | Loss 0.0437 | ETputs(KTEPS) 3342.89
Epoch 00550 | Time(s) 0.2015 | Loss 0.0401 | ETputs(KTEPS) 3344.86
Epoch 00600 | Time(s) 0.2014 | Loss 0.0362 | ETputs(KTEPS) 3346.54
Epoch 00650 | Time(s) 0.2013 | Loss 0.0323 | ETputs(KTEPS) 3347.44
Epoch 00700 | Time(s) 0.2012 | Loss 0.0286 | ETputs(KTEPS) 3348.86
Ep

In [30]:
dgi.load_state_dict(torch.load('best_dgi.pkl'))

  dgi.load_state_dict(torch.load('best_dgi.pkl'))


<All keys matched successfully>

In [31]:
training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.detach().cpu().numpy()

In [32]:
test_g.ndata['h'] = torch.reshape(test_g.ndata['h'],
                                   (test_g.ndata['h'].shape[0], 1,
                                    test_g.ndata['h'].shape[1]))



test_g.edata['h'] = torch.reshape(test_g.edata['h'],
                                   (test_g.edata['h'].shape[0], 1,
                                    test_g.edata['h'].shape[1]))

In [33]:
# Convert to GPU
test_g = test_g.to(device)

In [34]:
testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.detach().cpu().numpy()

In [35]:
df_train = pd.DataFrame(training_emb, )
df_train["Attack"] = lab_enc.inverse_transform(
        train_g.edata['Attack'].detach().cpu().numpy())
df_train["Label"] = train_g.edata['Label'].detach().cpu().numpy()

df_test = pd.DataFrame(testing_emb, )
df_test["Attack"] = lab_enc.inverse_transform(
        test_g.edata['Attack'].detach().cpu().numpy())
df_test["Label"] = test_g.edata['Label'].detach().cpu().numpy()

In [36]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,Attack,Label
0,0.003993,0.029602,-0.073997,-0.035304,-0.057061,-0.031729,-0.075933,0.048595,0.051252,-0.003702,...,-0.003208,0.003734,-0.027365,-0.000895,0.046468,0.000642,0.013361,0.010224,DDOS attack-HOIC,1
1,0.003993,0.029602,-0.073997,-0.035304,-0.057061,-0.031729,-0.075933,0.048595,0.051252,-0.003702,...,-0.003208,0.003734,-0.027365,-0.000895,0.046468,0.000642,0.013361,0.010224,DDOS attack-HOIC,1
2,0.003993,0.029602,-0.073997,-0.035304,-0.057061,-0.031729,-0.075933,0.048595,0.051252,-0.003702,...,-0.003208,0.003734,-0.027365,-0.000895,0.046468,0.000642,0.013361,0.010224,DDOS attack-HOIC,1
3,0.003993,0.029602,-0.073997,-0.035304,-0.057061,-0.031729,-0.075933,0.048595,0.051252,-0.003702,...,-0.003208,0.003734,-0.027365,-0.000895,0.046468,0.000642,0.013361,0.010224,DDOS attack-HOIC,1
4,0.003993,0.029602,-0.073997,-0.035304,-0.057061,-0.031729,-0.075933,0.048595,0.051252,-0.003702,...,-0.003208,0.003734,-0.027365,-0.000895,0.046468,0.000642,0.013361,0.010224,DDOS attack-HOIC,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673945,0.025786,0.059814,-0.022930,0.037751,-0.032899,0.004213,-0.069948,-0.010040,0.006247,-0.019991,...,0.003293,0.040431,0.012497,0.029120,0.026577,-0.009339,0.033992,-0.011904,Benign,0
673946,0.012710,0.050273,-0.021694,0.040512,-0.040932,-0.005700,-0.066764,-0.005599,0.012955,-0.011116,...,-0.002057,0.045031,0.002866,0.021179,0.023044,-0.005885,0.039423,-0.016999,Benign,0
673947,0.021983,0.056845,-0.023819,0.037691,-0.037772,0.001478,-0.070110,-0.010086,0.008569,-0.017335,...,0.000178,0.041813,0.010638,0.026475,0.025749,-0.009338,0.037241,-0.014834,Benign,0
673948,0.009540,0.052301,-0.016863,0.038848,-0.042323,-0.003629,-0.067921,-0.011576,0.013820,-0.012596,...,0.002484,0.043894,0.002460,0.020876,0.022586,-0.006100,0.045977,-0.014441,Benign,0


# Embeddings CBLOF  Embeddings

In [37]:
import torch
import dgl
import numpy as np
import pandas as pd
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
import gc

from tqdm import tqdm
import itertools

In [38]:
benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label", "Attack"])
normal_train_samples = df_train.drop(columns=["Label", "Attack"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label", "Attack"])

In [39]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [40]:
%pip install pyod

Note: you may need to restart the kernel to use updated packages.


In [41]:

from pyod.models.cblof import CBLOF
n_est = [8, 12, 15, 20, 25, 30]  # Try larger cluster counts
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    # Add alpha and beta parameters with more relaxed values
    clf_if = CBLOF(n_clusters=n_est, contamination=con, alpha=0.7, beta=3)
    try:
        clf_if.fit(benign_train_samples)
        y_pred = clf_if.predict(test_samples)
        test_pred = y_pred

        f1 = f1_score(test_labels, test_pred, average='macro')

        if f1 > score:
            score = f1
            best_params = {'n_estimators': n_est,
                          "con": con
                }
            bs = test_pred
    except Exception as e:
        print(f"Error with n_clusters={n_est}, contamination={con}: {str(e)}")
        continue
    
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

{'n_estimators': 8, 'con': 0.01}
0.9220114987248272
              precision    recall  f1-score   support

           0     0.9541    0.9842    0.9689    227729
           1     0.9334    0.8236    0.8751     61106

    accuracy                         0.9503    288835
   macro avg     0.9438    0.9039    0.9220    288835
weighted avg     0.9497    0.9503    0.9491    288835



In [42]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = CBLOF(n_clusters=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  3%|▎         | 1/36 [00:30<17:33, 30.11s/it]


KeyboardInterrupt: 

In [None]:
###  CBLOF RAW

In [56]:
df_raw_train = pd.concat([X_train.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_train], axis=1)
df_raw_test = pd.concat([X_test.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_test], axis=1)

In [57]:
raw_benign_train_samples = df_raw_train[df_raw_train.Label == 0].drop(columns=["Label", "Attack"])
raw_normal_train_samples = df_raw_train.drop(columns=["Label", "Attack"])

raw_train_labels = df_raw_train["Label"]
raw_test_labels = df_raw_test["Label"]

raw_test_samples = df_raw_test.drop(columns=["Label", "Attack"])

In [58]:
from pyod.models.cblof import CBLOF

n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = CBLOF(n_clusters=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


2
2
2
2
2
2


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

{'n_estimators': 10, 'con': 0.2}
0.3971858513564739
              precision    recall  f1-score   support

           0     0.0294    0.7949    0.0566        39
           1     0.9945    0.5864    0.7378      2478

    accuracy                         0.5896      2517
   macro avg     0.5119    0.6906    0.3972      2517
weighted avg     0.9796    0.5896    0.7272      2517






In [None]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = CBLOF(n_clusters=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [20:42<00:00, 34.51s/it]


benign only
{'n_estimators': 2}
0.8618432811826696
              precision    recall  f1-score   support

           0     0.9591    0.9806    0.9697    499068
           1     0.8287    0.6916    0.7540     67744

    accuracy                         0.9461    566812
   macro avg     0.8939    0.8361    0.8618    566812
weighted avg     0.9435    0.9461    0.9439    566812



In [None]:
# HBOS  Embeddings

In [None]:
benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label", "Attack"])
normal_train_samples = df_train.drop(columns=["Label", "Attack"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label", "Attack"])

In [None]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [None]:
from pyod.models.hbos import HBOS

n_est = [5,10,15,20,25,30]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [19:29<00:00, 32.49s/it]


{'n_estimators': 5, 'con': 0.01}
0.945069359337394
              precision    recall  f1-score   support

           0     0.9845    0.9897    0.9871    996643
           1     0.9213    0.8855    0.9030    135488

    accuracy                         0.9772   1132131
   macro avg     0.9529    0.9376    0.9451   1132131
weighted avg     0.9769    0.9772    0.9770   1132131



In [None]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [21:10<00:00, 35.28s/it]


{'n_estimators': 5, 'con': 0.1}
0.9189314026948445
              precision    recall  f1-score   support

           0     0.9705    0.9945    0.9824    996643
           1     0.9503    0.7779    0.8555    135488

    accuracy                         0.9686   1132131
   macro avg     0.9604    0.8862    0.9189   1132131
weighted avg     0.9681    0.9686    0.9672   1132131



In [None]:
##  HBOS  RAw

In [None]:
from pyod.models.cblof import CBLOF

n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = HBOS(n_bins=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [02:09<00:00,  3.59s/it]


{'n_estimators': 30, 'con': 0.04}
0.8627757960209628
              precision    recall  f1-score   support

           0     0.9715    0.9601    0.9658    499068
           1     0.7294    0.7928    0.7598     67744

    accuracy                         0.9401    566812
   macro avg     0.8505    0.8765    0.8628    566812
weighted avg     0.9426    0.9401    0.9412    566812



In [None]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = HBOS(n_bins=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [02:14<00:00,  3.74s/it]


benign only
{'n_estimators': 5}
0.7882018992795334
              precision    recall  f1-score   support

           0     0.9766    0.8943    0.9337    499068
           1     0.5197    0.8422    0.6427     67744

    accuracy                         0.8881    566812
   macro avg     0.7481    0.8683    0.7882    566812
weighted avg     0.9220    0.8881    0.8989    566812



In [None]:
##  PCA  Emb

In [None]:
from pyod.models.pca import PCA
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [29:47<00:00, 49.66s/it]


{'n_estimators': 10, 'con': 0.001}
0.9442956641768174
              precision    recall  f1-score   support

           0     0.9770    0.9988    0.9878    996643
           1     0.9896    0.8267    0.9008    135488

    accuracy                         0.9782   1132131
   macro avg     0.9833    0.9127    0.9443   1132131
weighted avg     0.9785    0.9782    0.9774   1132131



In [None]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [32:26<00:00, 54.07s/it]


{'n_estimators': 10, 'con': 0.1}
0.9256946497974641
              precision    recall  f1-score   support

           0     0.9723    0.9955    0.9838    996643
           1     0.9598    0.7916    0.8676    135488

    accuracy                         0.9711   1132131
   macro avg     0.9661    0.8935    0.9257   1132131
weighted avg     0.9708    0.9711    0.9699   1132131



In [None]:
##  PCA  RAw

In [None]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples)
   
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [07:28<00:00, 12.47s/it]


{'n_estimators': 20, 'con': 0.1}
0.7684270275042566
              precision    recall  f1-score   support

           0     0.9639    0.9009    0.9313    499068
           1     0.5071    0.7513    0.6055     67744

    accuracy                         0.8830    566812
   macro avg     0.7355    0.8261    0.7684    566812
weighted avg     0.9093    0.8830    0.8924    566812



In [None]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples)

    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [08:04<00:00, 13.46s/it]


benign only
{'n_estimators': 10}
0.7376147683157477
              precision    recall  f1-score   support

           0     0.9622    0.8744    0.9162    499068
           1     0.4466    0.7471    0.5590     67744

    accuracy                         0.8591    566812
   macro avg     0.7044    0.8107    0.7376    566812
weighted avg     0.9006    0.8591    0.8735    566812



In [None]:
##  IF  Emb

In [None]:
from sklearn.ensemble import IsolationForest
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [3:09:47<00:00, 474.46s/it]  


{'n_estimators': 20, 'con': 0.001}
0.9538863735449246
              precision    recall  f1-score   support

           0     0.9805    0.9992    0.9897    996643
           1     0.9928    0.8538    0.9180    135488

    accuracy                         0.9818   1132131
   macro avg     0.9866    0.9265    0.9539   1132131
weighted avg     0.9820    0.9818    0.9812   1132131



In [None]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [3:31:56<00:00, 529.86s/it]  


{'n_estimators': 50, 'con': 0.2}
0.8110535459623227
              precision    recall  f1-score   support

           0     0.9878    0.8952    0.9392    996643
           1     0.5436    0.9184    0.6829    135488

    accuracy                         0.8979   1132131
   macro avg     0.7657    0.9068    0.8111   1132131
weighted avg     0.9346    0.8979    0.9085   1132131



In [None]:
##  IF  Raw

In [None]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples.to_numpy())
   
    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 24/24 [19:12<00:00, 48.03s/it]


{'n_estimators': 20, 'con': 0.05}
0.8176793439704795
              precision    recall  f1-score   support

           0     0.9599    0.9494    0.9547    499068
           1     0.6553    0.7082    0.6807     67744

    accuracy                         0.9206    566812
   macro avg     0.8076    0.8288    0.8177    566812
weighted avg     0.9235    0.9206    0.9219    566812



In [None]:
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples.to_numpy())

    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 24/24 [20:38<00:00, 51.60s/it]


benign only
{'n_estimators': 100}
0.7409370706714709
              precision    recall  f1-score   support

           0     0.9633    0.8755    0.9173    499068
           1     0.4512    0.7539    0.5646     67744

    accuracy                         0.8610    566812
   macro avg     0.7072    0.8147    0.7409    566812
weighted avg     0.9021    0.8610    0.8751    566812

