In [3]:
!pip install  dgl -f https://data.dgl.ai/wheels/cu121/repo.html
!pip install  dglgo -f https://data.dgl.ai/wheels-test/repo.html

!pip install category_encoders

Looking in links: https://data.dgl.ai/wheels/cu121/repo.html
Collecting dgl
  Downloading https://data.dgl.ai/wheels/cu121/dgl-2.0.0%2Bcu121-cp310-cp310-manylinux1_x86_64.whl (926.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.0/926.0 MB[0m [31m925.4 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl
Successfully installed dgl-2.0.0+cu121
Looking in links: https://data.dgl.ai/wheels-test/repo.html
Collecting dglgo
  Downloading dglgo-0.0.2-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 kB[0m [31m496.0 kB/s[0m eta [36m0:00:00[0m
Collecting isort>=5.10.1 (from dglgo)
  Downloading isort-5.13.2-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.3/92.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autopep8>=1.6.0 (from dglgo)
  Downloading autopep8-2.0.4-py2.py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
import socket
import struct
import random
import pathlib
import joblib

import pandas as pd
import dask.dataframe as dd
import numpy as np

import dgl.function as fn
import dgl.nn as dglnn
from dgl import from_networkx
from dgl.data.utils import save_graphs
import torch.nn as nn
import torch as th
import torch.nn.functional as F
import networkx as nx
import category_encoders as ce
#import matplotlib.pyplot as plt
#import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.utils import class_weight

In [26]:
class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        ### force to outut fix dimensions
        self.W_msg = nn.Linear(ndim_in + edims, ndim_out)
        ### apply weight
        self.W_apply = nn.Linear(ndim_in + ndim_out, ndim_out)
        self.activation = activation

    def message_func(self, edges):
        return {'m': self.W_msg(th.cat([edges.src['h'], edges.data['h']], 2))}

    def forward(self, g_dgl, nfeats, efeats):
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Eq4
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            # Eq5
            g.ndata['h'] = F.relu(self.W_apply(th.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))
            return g.ndata['h']


class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super(SAGE, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(SAGELayer(ndim_in, edim, 128, activation))
        self.layers.append(SAGELayer(128, edim, ndim_out, activation))
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, g, nfeats, efeats):
        for i, layer in enumerate(self.layers):
            if i != 0:
                nfeats = self.dropout(nfeats)
            nfeats = layer(g, nfeats, efeats)
        return nfeats.sum(1)

class MLPPredictor(nn.Module):
    def __init__(self, in_features, out_classes):
        super().__init__()
        self.W = nn.Linear(in_features * 2, out_classes)

    def apply_edges(self, edges):
        h_u = edges.src['h']
        h_v = edges.dst['h']
        score = self.W(th.cat([h_u, h_v], 1))
        return {'score': score}

    def forward(self, graph, h):
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(self.apply_edges)
            return graph.edata['score']

class Model(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super().__init__()
        self.gnn = SAGE(ndim_in, ndim_out, edim, activation, dropout)
        self.pred = MLPPredictor(ndim_out, 2)
    def forward(self, g, nfeats, efeats):
        h = self.gnn(g, nfeats, efeats)
        return self.pred(g, h)

In [27]:
def log_progress(sequence, every=None, size=None, name='Epochs'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )
        
        
def compute_accuracy(pred, labels):
    return (pred.argmax(1) == labels).float().mean().item()

def correct_df(df, cols):

    df = df[cols]
    df['IPV4_SRC_ADDR'] = df.IPV4_SRC_ADDR.apply(lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))
    df['IPV4_SRC_ADDR'] = df.IPV4_SRC_ADDR.apply(str)
    df['L4_SRC_PORT'] = df.L4_SRC_PORT.apply(str)
    df['IPV4_DST_ADDR'] = df.IPV4_DST_ADDR.apply(str)
    df['L4_DST_PORT'] = df.L4_DST_PORT.apply(str)
    df['IPV4_SRC_ADDR'] = df['IPV4_SRC_ADDR'] + ':' + df['L4_SRC_PORT']
    df['IPV4_DST_ADDR'] = df['IPV4_DST_ADDR'] + ':' + df['L4_DST_PORT']
    df.drop(columns=['L4_SRC_PORT','L4_DST_PORT'], inplace=True)
    df.rename(columns={"Label": "label"},inplace = True)
    df.drop_duplicates(inplace = True)
    label = df.label
    df.drop(columns=['label'],inplace = True)
    df = pd.concat([df, label], axis=1)

    return df

def train_egraphsage(df_train, model, epochs, cuda, report_epochs):

    print("Building a graph...")
    G = nx.from_pandas_edgelist(df_train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h','label'],create_using=nx.MultiGraph())
    G = G.to_directed()
    G = from_networkx(G,edge_attrs=['h','label'] )

    G.ndata['h'] = th.ones(G.num_nodes(), G.edata['h'].shape[1])
    G.edata['train_mask'] = th.ones(len(G.edata['h']), dtype=th.bool)
    G.ndata['h'] = th.reshape(G.ndata['h'], (G.ndata['h'].shape[0], 1,G.ndata['h'].shape[1]))
    G.edata['h'] = th.reshape(G.edata['h'], (G.edata['h'].shape[0], 1,G.edata['h'].shape[1]))

    class_weights = class_weight.compute_class_weight(class_weight = "balanced",
                                                      classes = np.unique(G.edata['label'].cpu().numpy()),
                                                      y = G.edata['label'].cpu().numpy())

    class_weights = th.FloatTensor(class_weights).cuda() if cuda else th.FloatTensor(class_weights)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    if cuda:
        G = G.to('cuda:0')
        model = model.cuda()

    node_features = G.ndata['h']
    edge_features = G.edata['h']

    edge_label = G.edata['label']
    train_mask = G.edata['train_mask']

    opt = th.optim.Adam(model.parameters())

    print("Training started...")
    print("----------------------")
    #for epoch in range(1,epochs+1):
    for epoch in log_progress(range(1,epochs+1), every=1):
        pred = model(G, node_features,edge_features).cuda() if cuda else model(G, node_features,edge_features)
        loss = criterion(pred[train_mask], edge_label[train_mask])
        opt.zero_grad()
        loss.backward()
        opt.step()
        if epoch % report_epochs == 0:
            print('Training acc',str(epoch),':', compute_accuracy(pred[train_mask], edge_label[train_mask]))
        
    return model, G, node_features, edge_features

def train_in_chunks(model, path, chunk_size, cols, epochs, cuda=False, report_epochs=100, limit=None):

    count = 0
    
    def extract_number(path):
        return int(path.stem.split(".")[1])

    files = sorted(pathlib.Path(path).glob("part.*.parquet"), key=extract_number)

    encoder = None
    scaler = None
    cols_to_norm = None

    df_train = pd.DataFrame()
    for index, path in enumerate(files):
        
        count += df_train.shape[0]
        
        #print(path)
        df_chunk = correct_df(pd.read_parquet(path), cols)
        
        if index == len(files) - 1 or df_train.shape[0] >= chunk_size:
            if encoder == None:
                encoder = ce.TargetEncoder(cols=['TCP_FLAGS','PROTOCOL'])
                encoder.fit(df_train, df_train.label)
            df_train = encoder.transform(df_train)

            if scaler == None:
                scaler = StandardScaler()
                cols_to_norm = list(set(list(df_train.iloc[:, 2:].columns ))  - set(list(['label'])) )
                df_train[cols_to_norm] = scaler.fit_transform(df_train[cols_to_norm])
            df_train[cols_to_norm] = scaler.transform(df_train[cols_to_norm])

            df_train['h'] = df_train[cols_to_norm].values.tolist()

            model, G, node_features, edge_features = train_egraphsage(df_train, model, epochs, cuda, report_epochs)

            df_train = pd.DataFrame()
    
        else:
            df_train = pd.concat([df_train, df_chunk])
            
        if limit != None and limit <= count:
            break

    return model, encoder, scaler, cols_to_norm


def create_graph(df_test, encoder, scaler, cols_to_norm, n_features):
    df_test = encoder.transform(df_test)
    df_test[cols_to_norm] = scaler.transform(df_test[cols_to_norm])
    df_test['h'] = df_test[cols_to_norm].values.tolist()

    G_test = nx.from_pandas_edgelist(df_test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h','label'],create_using=nx.MultiGraph())
    G_test = G_test.to_directed()
    G_test = from_networkx(G_test,edge_attrs=['h','label'] )
    actual = G_test.edata.pop('label')
    G_test.ndata['feature'] = th.ones(G_test.num_nodes(), n_features)

    G_test.ndata['feature'] = th.reshape(G_test.ndata['feature'], (G_test.ndata['feature'].shape[0], 1, G_test.ndata['feature'].shape[1]))

    G_test.edata['h'] = th.reshape(G_test.edata['h'], (G_test.edata['h'].shape[0], 1, G_test.edata['h'].shape[1]))

    return G_test, actual
    

def predict_egraphsage(df_test, model, encoder, scaler, cols_to_norm, n_features, G_test=None, actual=None):

    if G_test == None:
        G_test, actual = create_graph(df_test, encoder, scaler, cols_to_norm, n_features)
    
    node_features_test = G_test.ndata['feature']
    edge_features_test = G_test.edata['h']

    test_pred = model(G_test, node_features_test, edge_features_test)#.cuda()
    test_pred = test_pred.argmax(1)
    test_pred = th.Tensor.cpu(test_pred).detach().numpy()

    return confusion_matrix(actual, test_pred), accuracy_score(actual, test_pred), G_test, actual

def save_model(model, MODEL_PATH, scaler, SCALER_PATH, encoder, ENCODER_PATH):
    th.save(model, MODEL_PATH)
    if scaler != None:
        joblib.dump(scaler, SCALER_PATH)
    if encoder != None:
        joblib.dump(encoder, ENCODER_PATH)

def save_graph(G, GRAPH_PATH):
    save_graphs(GRAPH_PATH, [G], None)

def load_model(MODEL_PATH, SCALER_PATH, ENCODER_PATH):
    model = th.load(MODEL_PATH)
    model.eval()
    scaler = joblib.load(SCALER_PATH)
    encoder = joblib.load(ENCODER_PATH)

    return model, scaler, encoder

def load_graph(G, GRAPH_PATH):
    return

In [28]:
pd.options.mode.copy_on_write = True

**COLAB directories**

In [None]:
dir_data = '/content/drive/MyDrive/csci_e-599a/data/'
dir_model = '/content/drive/MyDrive/csci_e-599a/model/'

**Local directories**

In [18]:
dir_data = '../data/netflow/parquet/original/'
dir_model = 'model/'

In [8]:
netflows = ['NF-BoT-IoT_chunks',
           'NF-ToN-IoT_chunks',
           'NF-UNSW-NB15_chunks',
           'NF-UQ-NIDS_chunks',
           'NF-CSE-CIC-IDS2018_chunks',
           'NF-BoT-IoT-v2_chunks',
           'NF-ToN-IoT-v2_chunks',
           'NF-UNSW-NB15-v2_chunks',
           'NF-UQ-NIDS-v2_chunks',
           'NF-CSE-CIC-IDS2018-v2_chunks']

cols = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'PROTOCOL', 'IN_BYTES', 'OUT_BYTES',
       'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS','Label']

n_features = len(cols) - 4 - 1 #'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'Label'
encoder = None
scaler = None
cols_to_norm = None


chunk_size = 1000000
epochs = 1000
cuda = False
report_epochs = epochs

df_test = correct_df(pd.read_parquet(dir_data + 'Attack-2_chunks'), cols)
G_test = None
actual = None

dropouts = 0.2
model_ = Model(n_features, 128, n_features, F.relu, dropouts)


for i, nf in enumerate(netflows):
    print ("\n=====================================================")
    print ("\n==== " + nf + " ====")
    
    MODEL_PATH = dir_model + nf.lower() + '.pt'
    SCALER_PATH = dir_model + '_scaler_' + nf.lower() + '.pkl'
    ENCODER_PATH = dir_model + '_encoder_' + nf.lower() + '.pkl'
    PATH = dir_data + nf
    
    MODEL_PATH_ = dir_model + str(i)+"_"+nf.lower() + '.pt'
    
    df_train = pd.read_parquet(PATH)
    sample_size = min(chunk_size, df_train.shape[0])
    df_train = df_train.sample(n=sample_size, random_state=1)
    df_train = correct_df(df_train, cols)
    
    if encoder == None:
        encoder = ce.TargetEncoder(cols=['TCP_FLAGS','PROTOCOL'])
        encoder.fit(df_train, df_train.label)
    df_train = encoder.transform(df_train)

    if scaler == None:
        scaler = StandardScaler()
        cols_to_norm = list(set(list(df_train.iloc[:, 2:].columns ))  - set(list(['label'])) )
        df_train[cols_to_norm] = scaler.fit_transform(df_train[cols_to_norm])
    df_train[cols_to_norm] = scaler.transform(df_train[cols_to_norm])

    df_train['h'] = df_train[cols_to_norm].values.tolist()
    
    model = None
    model = Model(n_features, 128, n_features, F.relu, dropouts)
    model, G, node_features, edge_features = train_egraphsage(df_train, model, epochs, cuda, report_epochs)
    print("Saving model: " + MODEL_PATH)
    save_model(model, MODEL_PATH, scaler, SCALER_PATH, encoder, ENCODER_PATH)
    
    cm, acc, G_test, actual = predict_egraphsage(df_test, model, encoder, scaler, cols_to_norm, n_features, G_test, actual)

    print("\nTest results:")
    print(cm)
    print(acc)
    
    print("--------------------------------------------------------------")
    print("Incremental data training...")
    model_, G_, node_features_, edge_features_ = train_egraphsage(df_train, model_, epochs, cuda, report_epochs)
    save_model(model_, MODEL_PATH_, None, None, None, None)
    print("Saving model: " + MODEL_PATH_)
    
    cm, acc, G_test, actual = predict_egraphsage(df_test, model_, encoder, scaler, cols_to_norm, n_features, G_test, actual)

    print("\nTest results:")
    print(cm)
    print(acc)


==== NF-BoT-IoT_chunks ====
Building a graph...
Training started...


VBox(children=(HTML(value=''), IntProgress(value=0, max=1000)))

Training acc 1000 : 0.9248425364494324
Saving model: model/nf-bot-iot_chunks.pt

Test results:
[[  8123    981]
 [   313 580805]]
0.9978076045962366

Incremental data training...
Building a graph...
Training started...


VBox(children=(HTML(value=''), IntProgress(value=0, max=1000)))

Training acc 1000 : 0.9312680959701538
Saving model: model/0_nf-bot-iot_chunks.pt

Test results:
[[  8125    979]
 [   204 580914]]
0.9979956694260803

==== NF-ToN-IoT_chunks ====
Building a graph...
Training started...


VBox(children=(HTML(value=''), IntProgress(value=0, max=1000)))

Training acc 1000 : 0.9956914782524109
Saving model: model/nf-ton-iot_chunks.pt

Test results:
[[  8915    189]
 [411384 169734]]
0.30268102510580763

Incremental data training...
Building a graph...
Training started...


VBox(children=(HTML(value=''), IntProgress(value=0, max=1000)))

Training acc 1000 : 0.9805534482002258
Saving model: model/1_nf-ton-iot_chunks.pt

Test results:
[[  8919    185]
 [482012  99106]]
0.18302435354832589

==== NF-UNSW-NB15_chunks ====
Building a graph...
Training started...


VBox(children=(HTML(value=''), IntProgress(value=0, max=1000)))

Training acc 1000 : 0.9777935147285461
Saving model: model/nf-unsw-nb15_chunks.pt

Test results:
[[  8912    192]
 [402033 179085]]
0.3185191334785894

Incremental data training...
Building a graph...
Training started...


VBox(children=(HTML(value=''), IntProgress(value=0, max=1000)))

Training acc 1000 : 0.9762794971466064
Saving model: model/2_nf-unsw-nb15_chunks.pt

Test results:
[[  8989    115]
 [416169 164949]]
0.2946992826428022

==== NF-UQ-NIDS_chunks ====
Building a graph...
Training started...


VBox(children=(HTML(value=''), IntProgress(value=0, max=1000)))

Training acc 1000 : 0.9429799914360046
Saving model: model/nf-uq-nids_chunks.pt

Test results:
[[  6356   2748]
 [353736 227382]]
0.39601709187390505

Incremental data training...
Building a graph...
Training started...


VBox(children=(HTML(value=''), IntProgress(value=0, max=1000)))

Training acc 1000 : 0.9648990035057068
Saving model: model/3_nf-uq-nids_chunks.pt

Test results:
[[  4848   4256]
 [342937 238181]]
0.4117586264151455

==== NF-CSE-CIC-IDS2018_chunks ====
Building a graph...
Training started...


VBox(children=(HTML(value=''), IntProgress(value=0, max=1000)))

KeyboardInterrupt: 

## BoT Graph from 100,000 edges, only 200 epochs

In [None]:
netflows = ['NF-BoT-IoT_chunks']

cols = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'PROTOCOL', 'IN_BYTES', 'OUT_BYTES',
       'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS','Label']

n_features = len(cols) - 4 - 1 #'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'Label'

chunk_size = 100000
epochs = 200
cuda = False
report_epochs = epochs

df_test = correct_df(pd.read_parquet(dir_data + 'Attack-2_chunks'), cols)

dropouts = 0.2

for i, nf in enumerate(netflows):
    print ("\n==============================")
    print ("==== " + nf + " ====")
    
    MODEL_PATH = dir_model + nf.lower() + '.pt'
    SCALER_PATH = dir_model + '_scaler_' + nf.lower() + '.pkl'
    ENCODER_PATH = dir_model + '_encoder_' + nf.lower() + '.pkl'
    PATH = dir_data + nf
    
    MODEL_PATH_ = dir_model + str(i)+"_"+nf.lower() + '.pt'
    
    df_train = pd.read_parquet(PATH)
    sample_size = min(chunk_size, df_train.shape[0])
    df_train = df_train.sample(n=sample_size, random_state=1)
    df_train = correct_df(df_train, cols)
    
    encoder = ce.TargetEncoder(cols=['TCP_FLAGS','PROTOCOL'])
    encoder.fit(df_train, df_train.label)
    df_train = encoder.transform(df_train)

    scaler = StandardScaler()
    cols_to_norm = list(set(list(df_train.iloc[:, 2:].columns ))  - set(list(['label'])) )
    df_train[cols_to_norm] = scaler.fit_transform(df_train[cols_to_norm])
    df_train[cols_to_norm] = scaler.transform(df_train[cols_to_norm])

    df_train['h'] = df_train[cols_to_norm].values.tolist()
    
    model = None
    model = Model(n_features, 128, n_features, F.relu, dropouts)
    model, G, node_features, edge_features = train_egraphsage(df_train, model, epochs, cuda, report_epochs)
    print("Saving model: " + MODEL_PATH)
    save_model(model, MODEL_PATH, scaler, SCALER_PATH, encoder, ENCODER_PATH)
    
    print("Predicting on 'Attack-2'")
    cm, acc, G_test, actual = predict_egraphsage(df_test, model, encoder, scaler, cols_to_norm, n_features, None, None)

    print("\nTest results:")
    print(cm)
    print(acc)


==== NF-BoT-IoT_chunks ====
Building a graph...


## BoT Graph from 1,000,000 edges, 1000 epochs

In [10]:
netflows = ['NF-BoT-IoT_chunks']

cols = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'PROTOCOL', 'IN_BYTES', 'OUT_BYTES',
       'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS','Label']

n_features = len(cols) - 4 - 1 #'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'Label'

chunk_size = 1000000
epochs = 1000
cuda = False
report_epochs = epochs

df_test = correct_df(pd.read_parquet(dir_data + 'Attack-2_chunks'), cols)

dropouts = 0.2

for i, nf in enumerate(netflows):
    print ("\n==============================")
    print ("==== " + nf + " ====")
    
    MODEL_PATH = dir_model + nf.lower() + '.pt'
    SCALER_PATH = dir_model + '_scaler_' + nf.lower() + '.pkl'
    ENCODER_PATH = dir_model + '_encoder_' + nf.lower() + '.pkl'
    PATH = dir_data + nf
    
    MODEL_PATH_ = dir_model + str(i)+"_"+nf.lower() + '.pt'
    
    df_train = pd.read_parquet(PATH)
    sample_size = min(chunk_size, df_train.shape[0])
    df_train = df_train.sample(n=sample_size, random_state=1)
    df_train = correct_df(df_train, cols)
    
    encoder = ce.TargetEncoder(cols=['TCP_FLAGS','PROTOCOL'])
    encoder.fit(df_train, df_train.label)
    df_train = encoder.transform(df_train)

    scaler = StandardScaler()
    cols_to_norm = list(set(list(df_train.iloc[:, 2:].columns ))  - set(list(['label'])) )
    df_train[cols_to_norm] = scaler.fit_transform(df_train[cols_to_norm])
    df_train[cols_to_norm] = scaler.transform(df_train[cols_to_norm])

    df_train['h'] = df_train[cols_to_norm].values.tolist()
    
    model = None
    model = Model(n_features, 128, n_features, F.relu, dropouts)
    model, G, node_features, edge_features = train_egraphsage(df_train, model, epochs, cuda, report_epochs)
    print("Saving model: " + MODEL_PATH)
    save_model(model, MODEL_PATH, scaler, SCALER_PATH, encoder, ENCODER_PATH)
    
    print("Predicting on 'Attack-2'")
    cm, acc, G_test, actual = predict_egraphsage(df_test, model, encoder, scaler, cols_to_norm, n_features, None, None)

    print("\nTest results:")
    print(cm)
    print(acc)


==== NF-BoT-IoT_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=1000)))

Training acc 1000 : 0.9297717213630676
Saving model: model/nf-bot-iot_chunks.pt
Predicting on 'Attack-2'

Test results:
[[  8123    981]
 [   178 580940]]
0.9980363320919925


## BoT Graph from All edges, 5000 epochs

In [13]:
netflows = ['NF-BoT-IoT_chunks']

cols = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'PROTOCOL', 'IN_BYTES', 'OUT_BYTES',
       'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS','Label']

n_features = len(cols) - 4 - 1 #'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'Label'

chunk_size = 100000000
epochs = 5000
cuda = False
report_epochs = epochs

df_test = correct_df(pd.read_parquet(dir_data + 'Attack-2_chunks'), cols)

dropouts = 0.2

for i, nf in enumerate(netflows):
    print ("\n==============================")
    print ("==== " + nf + " ====")
    
    MODEL_PATH = dir_model + nf.lower() + '.pt'
    SCALER_PATH = dir_model + '_scaler_' + nf.lower() + '.pkl'
    ENCODER_PATH = dir_model + '_encoder_' + nf.lower() + '.pkl'
    PATH = dir_data + nf
    
    MODEL_PATH_ = dir_model + str(i)+"_"+nf.lower() + '.pt'
    
    df_train = pd.read_parquet(PATH)
    sample_size = min(chunk_size, df_train.shape[0])
    df_train = df_train.sample(n=sample_size, random_state=1)
    df_train = correct_df(df_train, cols)
    
    encoder = ce.TargetEncoder(cols=['TCP_FLAGS','PROTOCOL'])
    encoder.fit(df_train, df_train.label)
    df_train = encoder.transform(df_train)

    scaler = StandardScaler()
    cols_to_norm = list(set(list(df_train.iloc[:, 2:].columns ))  - set(list(['label'])) )
    df_train[cols_to_norm] = scaler.fit_transform(df_train[cols_to_norm])
    df_train[cols_to_norm] = scaler.transform(df_train[cols_to_norm])

    df_train['h'] = df_train[cols_to_norm].values.tolist()
    
    model = None
    model = Model(n_features, 128, n_features, F.relu, dropouts)
    model, G, node_features, edge_features = train_egraphsage(df_train, model, epochs, cuda, report_epochs)
    print("Saving model: " + MODEL_PATH)
    save_model(model, MODEL_PATH, scaler, SCALER_PATH, encoder, ENCODER_PATH)
    
    print("Predicting on 'Attack-2'")
    cm, acc, G_test, actual = predict_egraphsage(df_test, model, encoder, scaler, cols_to_norm, n_features, None, None)

    print("\nTest results:")
    print(cm)
    print(acc)


==== NF-BoT-IoT_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=5000)))

Training acc 5000 : 0.9371230006217957
Saving model: model/nf-bot-iot_chunks.pt
Predicting on 'Attack-2'

Test results:
[[  8126    978]
 [   332 580786]]
0.9977804961522952


## Graphs from 100,000 edges all datasets, only 200 epochs

In [11]:
netflows = ['NF-BoT-IoT_chunks',
           'NF-ToN-IoT_chunks',
           'NF-UNSW-NB15_chunks',
           'NF-UQ-NIDS_chunks',
           'NF-CSE-CIC-IDS2018_chunks',
           'NF-BoT-IoT-v2_chunks',
           'NF-ToN-IoT-v2_chunks',
           'NF-UNSW-NB15-v2_chunks',
           'NF-UQ-NIDS-v2_chunks',
           'NF-CSE-CIC-IDS2018-v2_chunks']

cols = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'PROTOCOL', 'IN_BYTES', 'OUT_BYTES',
       'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS','Label']

n_features = len(cols) - 4 - 1 #'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'Label'

chunk_size = 100000
epochs = 200
cuda = False
report_epochs = epochs

df_test = correct_df(pd.read_parquet(dir_data + 'Attack-2_chunks'), cols)

dropouts = 0.2

for i, nf in enumerate(netflows):
    print ("\n==============================")
    print ("==== " + nf + " ====")
    
    MODEL_PATH = dir_model + nf.lower() + '.pt'
    SCALER_PATH = dir_model + '_scaler_' + nf.lower() + '.pkl'
    ENCODER_PATH = dir_model + '_encoder_' + nf.lower() + '.pkl'
    PATH = dir_data + nf
    
    MODEL_PATH_ = dir_model + str(i)+"_"+nf.lower() + '.pt'
    
    df_train = pd.read_parquet(PATH)
    sample_size = min(chunk_size, df_train.shape[0])
    df_train = df_train.sample(n=sample_size, random_state=1)
    df_train = correct_df(df_train, cols)
    
    encoder = ce.TargetEncoder(cols=['TCP_FLAGS','PROTOCOL'])
    encoder.fit(df_train, df_train.label)
    df_train = encoder.transform(df_train)

    scaler = StandardScaler()
    cols_to_norm = list(set(list(df_train.iloc[:, 2:].columns ))  - set(list(['label'])) )
    df_train[cols_to_norm] = scaler.fit_transform(df_train[cols_to_norm])
    df_train[cols_to_norm] = scaler.transform(df_train[cols_to_norm])

    df_train['h'] = df_train[cols_to_norm].values.tolist()
    
    model = None
    model = Model(n_features, 128, n_features, F.relu, dropouts)
    model, G, node_features, edge_features = train_egraphsage(df_train, model, epochs, cuda, report_epochs)
    print("Saving model: " + MODEL_PATH)
    save_model(model, MODEL_PATH, scaler, SCALER_PATH, encoder, ENCODER_PATH)
    
    print("Predicting on 'Attack-2'")
    cm, acc, G_test, actual = predict_egraphsage(df_test, model, encoder, scaler, cols_to_norm, n_features, None, None)

    print("\nTest results:")
    print(cm)
    print(acc)


==== NF-BoT-IoT_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=200)))

Training acc 200 : 0.8572149872779846
Saving model: model/nf-bot-iot_chunks.pt
Predicting on 'Attack-2'

Test results:
[[  5494   3610]
 [   155 580963]]
0.9936210442850317

==== NF-ToN-IoT_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=200)))

Training acc 200 : 0.9157350063323975
Saving model: model/nf-ton-iot_chunks.pt
Predicting on 'Attack-2'

Test results:
[[  8811    293]
 [ 99805 481313]]
0.8304061861469074

==== NF-UNSW-NB15_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=200)))

Training acc 200 : 0.9785199761390686
Saving model: model/nf-unsw-nb15_chunks.pt
Predicting on 'Attack-2'

Test results:
[[     0   9104]
 [    46 581072]]
0.9844973586209934

==== NF-UQ-NIDS_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=200)))

Training acc 200 : 0.9257599711418152
Saving model: model/nf-uq-nids_chunks.pt
Predicting on 'Attack-2'

Test results:
[[  8649    455]
 [500743  80375]]
0.1508313820901288

==== NF-CSE-CIC-IDS2018_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=200)))

Training acc 200 : 0.8702700138092041
Saving model: model/nf-cse-cic-ids2018_chunks.pt
Predicting on 'Attack-2'

Test results:
[[  9104      0]
 [528313  52805]]
0.1048910409981329

==== NF-BoT-IoT-v2_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=200)))

Training acc 200 : 0.9592049717903137
Saving model: model/nf-bot-iot-v2_chunks.pt
Predicting on 'Attack-2'

Test results:
[[  9019     85]
 [318137 262981]]
0.46084354700434754

==== NF-ToN-IoT-v2_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=200)))

Training acc 200 : 0.7136449813842773
Saving model: model/nf-ton-iot-v2_chunks.pt
Predicting on 'Attack-2'

Test results:
[[  4065   5039]
 [   409 580709]]
0.9907695748379424

==== NF-UNSW-NB15-v2_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=200)))

Training acc 200 : 0.9874250292778015
Saving model: model/nf-unsw-nb15-v2_chunks.pt
Predicting on 'Attack-2'

Test results:
[[   569   8535]
 [577601   3517]]
0.00692281887154325

==== NF-UQ-NIDS-v2_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=200)))

Training acc 200 : 0.8045750260353088
Saving model: model/nf-uq-nids-v2_chunks.pt
Predicting on 'Attack-2'

Test results:
[[    13   9091]
 [    16 581102]]
0.9845702125640861

==== NF-CSE-CIC-IDS2018-v2_chunks ====
Building a graph...
Training started...
----------------------


VBox(children=(HTML(value=''), IntProgress(value=0, max=200)))

Training acc 200 : 0.9544699788093567
Saving model: model/nf-cse-cic-ids2018-v2_chunks.pt
Predicting on 'Attack-2'

Test results:
[[  8952    152]
 [567954  13164]]
0.03747064663804467


In [21]:
from datetime import datetime, timedelta
df_attack = pd.read_parquet(dir_data + 'attack2/attack')
df_bgrd = pd.read_parquet(dir_data + 'attack2/background')

df_attack.label = 1
df_bgrd.label = 0

#Imbalanced
df_imb = pd.concat([df_attack, df_bgrd]).reset_index(drop=True)
df_imb = df_imb.sample(frac=1).reset_index(drop=True)

df_imb['FLOW_DURATION_MILLISECONDS'] = ((pd.to_datetime(df_imb['t_last']) - df_imb['t_first'] + timedelta(hours=4, minutes=0)).dt.total_seconds() * 10**3).astype(int)

def flag_to_numeric(flag):
    return int(''.join(['0' if s == '.' else '1' for s in flag]), 2)

df_imb['TCP_FLAGS'] = df_imb.tcp_flags.apply(flag_to_numeric)

df_imb.drop(columns=['t_first','t_last','src6_addr','src_tos','dst6_addr','icmp_code',
                   'icmp_status','sampled','export_sysid','fwd_status','app_latency',
                   'cli_latency','srv_latency', 'tcp_flags'], inplace = True)
df_imb.columns = ['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT',
       'IN_BYTES', 'IN_PKTS', 'Label', 'PROTOCOL', 'OUT_BYTES', 'OUT_PKTS',
       'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS']
df_imb = df_imb.drop_duplicates()


df_b_b = df_imb[df_imb['Label']==0]
df_b_a = df_imb[df_imb['Label']==1]

#balanced
df_b = pd.concat([df_b_a.sample(n=df_b_b.shape[0], random_state=1), df_b_b]).sample(frac=1).reset_index(drop=True)

In [22]:
df_imb = correct_df(df_imb, cols)
df_b = correct_df(df_b, cols)

In [30]:
cm, acc, G_test, actual = predict_egraphsage(df_imb, model, encoder, scaler, cols_to_norm, n_features, None, None)

print("\nTest results (Imbalanced):")
print(cm)
print(acc)


Test results (Imbalanced):
[[  8120    984]
 [   176 580942]]
0.9980346378142462


In [31]:
cm, acc, G_test, actual = predict_egraphsage(df_b, model, encoder, scaler, cols_to_norm, n_features, None, None)

print("\nTest results (Balanced):")
print(cm)
print(acc)


Test results (Balanced):
[[8115  989]
 [   0 9104]]
0.9456832161687171


In [32]:
embeddings = model.gnn(G, node_features, edge_features)

In [33]:
embeddings

tensor([[0.0000, 1.8667, 0.0000,  ..., 0.0514, 6.7053, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 2.4395, 4.5555, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 3.3647, 3.1353, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 1.1056, 0.7814, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 2.7058, 3.9165, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 2.1566, 2.8912, 0.0000]],
       grad_fn=<SumBackward1>)

In [35]:
bot_emb = pd.DataFrame(embeddings.detach().numpy())

In [36]:
bot_emb

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,0.0,1.866689,0.0,4.199146,6.120585,0.0,0.0,8.134766,0.0,8.543159,...,0.000000,0.838766,6.312424,3.163555,0.0,0.0,5.806769,0.051384,6.705304,0.0
1,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,5.444332,0.0,7.505930,...,0.000000,4.460386,0.000000,0.000000,0.0,0.0,0.000000,2.439473,4.555478,0.0
2,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,4.858747,0.0,7.965234,...,0.088603,4.867826,0.000000,0.000000,0.0,0.0,0.000000,3.364663,3.135283,0.0
3,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,5.712359,0.0,8.147270,...,0.000000,4.880036,0.316823,0.000000,0.0,0.0,0.000000,3.137607,5.074939,0.0
4,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,4.632731,0.0,8.214451,...,0.230374,4.064116,0.000000,0.000000,0.0,0.0,0.000000,3.950132,3.276883,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107184,0.0,9.071394,0.0,18.290052,13.043841,0.0,0.0,22.371132,0.0,30.652094,...,0.000000,0.909483,29.564411,8.192632,0.0,0.0,6.259212,0.000000,15.955686,0.0
107185,0.0,0.000000,0.0,0.150230,0.010867,0.0,0.0,0.898572,0.0,1.780759,...,0.000000,0.501232,0.132170,0.000000,0.0,0.0,0.445663,1.369995,0.863999,0.0
107186,0.0,0.000000,0.0,0.310355,0.000000,0.0,0.0,1.025115,0.0,1.711297,...,0.000000,0.710734,0.360713,0.000000,0.0,0.0,0.483158,1.105566,0.781391,0.0
107187,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,5.560455,0.0,8.576202,...,0.000000,4.270420,0.000000,0.000000,0.0,0.0,0.000000,2.705842,3.916513,0.0


In [37]:
df_train = pd.read_parquet(dir_data+'NF-BoT-IoT_chunks')
df_train.shape

(600100, 14)