In [3]:
!pip install  dgl -f https://data.dgl.ai/wheels/cu121/repo.html
!pip install  dglgo -f https://data.dgl.ai/wheels-test/repo.html

!pip install category_encoders

Looking in links: https://data.dgl.ai/wheels/cu121/repo.html
Collecting dgl
  Downloading https://data.dgl.ai/wheels/cu121/dgl-2.0.0%2Bcu121-cp310-cp310-manylinux1_x86_64.whl (926.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.0/926.0 MB[0m [31m925.4 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl
Successfully installed dgl-2.0.0+cu121
Looking in links: https://data.dgl.ai/wheels-test/repo.html
Collecting dglgo
  Downloading dglgo-0.0.2-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 kB[0m [31m496.0 kB/s[0m eta [36m0:00:00[0m
Collecting isort>=5.10.1 (from dglgo)
  Downloading isort-5.13.2-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.3/92.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autopep8>=1.6.0 (from dglgo)
  Downloading autopep8-2.0.4-py2.py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import socket
import struct
import random
import pathlib
import joblib

import pandas as pd
import dask.dataframe as dd
import numpy as np

import dgl.function as fn
import dgl.nn as dglnn
from dgl import from_networkx
from dgl.data.utils import save_graphs
import torch.nn as nn
import torch as th
import torch.nn.functional as F
import networkx as nx
import category_encoders as ce
#import matplotlib.pyplot as plt
#import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.utils import class_weight



In [2]:
class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        ### force to outut fix dimensions
        self.W_msg = nn.Linear(ndim_in + edims, ndim_out)
        ### apply weight
        self.W_apply = nn.Linear(ndim_in + ndim_out, ndim_out)
        self.activation = activation

    def message_func(self, edges):
        return {'m': self.W_msg(th.cat([edges.src['h'], edges.data['h']], 2))}

    def forward(self, g_dgl, nfeats, efeats):
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Eq4
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            # Eq5
            g.ndata['h'] = F.relu(self.W_apply(th.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))
            return g.ndata['h']


class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super(SAGE, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(SAGELayer(ndim_in, edim, 128, activation))
        self.layers.append(SAGELayer(128, edim, ndim_out, activation))
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, g, nfeats, efeats):
        for i, layer in enumerate(self.layers):
            if i != 0:
                nfeats = self.dropout(nfeats)
            nfeats = layer(g, nfeats, efeats)
        return nfeats.sum(1)

class MLPPredictor(nn.Module):
    def __init__(self, in_features, out_classes):
        super().__init__()
        self.W = nn.Linear(in_features * 2, out_classes)

    def apply_edges(self, edges):
        h_u = edges.src['h']
        h_v = edges.dst['h']
        score = self.W(th.cat([h_u, h_v], 1))
        return {'score': score}

    def forward(self, graph, h):
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(self.apply_edges)
            return graph.edata['score']

class Model(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super().__init__()
        self.gnn = SAGE(ndim_in, ndim_out, edim, activation, dropout)
        self.pred = MLPPredictor(ndim_out, 2)
    def forward(self, g, nfeats, efeats):
        h = self.gnn(g, nfeats, efeats)
        return self.pred(g, h)

In [30]:
def compute_accuracy(pred, labels):
    return (pred.argmax(1) == labels).float().mean().item()

def correct_df(df, cols):

    df = df[cols]
    df['IPV4_SRC_ADDR'] = df.IPV4_SRC_ADDR.apply(lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))
    df['IPV4_SRC_ADDR'] = df.IPV4_SRC_ADDR.apply(str)
    df['L4_SRC_PORT'] = df.L4_SRC_PORT.apply(str)
    df['IPV4_DST_ADDR'] = df.IPV4_DST_ADDR.apply(str)
    df['L4_DST_PORT'] = df.L4_DST_PORT.apply(str)
    df['IPV4_SRC_ADDR'] = df['IPV4_SRC_ADDR'] + ':' + df['L4_SRC_PORT']
    df['IPV4_DST_ADDR'] = df['IPV4_DST_ADDR'] + ':' + df['L4_DST_PORT']
    df.drop(columns=['L4_SRC_PORT','L4_DST_PORT'], inplace=True)
    df.rename(columns={"Label": "label"},inplace = True)
    df.drop_duplicates(inplace = True)
    label = df.label
    df.drop(columns=['label'],inplace = True)
    df = pd.concat([df, label], axis=1)

    return df

def train_egraphsage(df_train, model, epochs, cuda, report_epochs):

    print("Building a graph...")
    G = nx.from_pandas_edgelist(df_train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h','label'],create_using=nx.MultiGraph())
    G = G.to_directed()
    G = from_networkx(G,edge_attrs=['h','label'] )

    G.ndata['h'] = th.ones(G.num_nodes(), G.edata['h'].shape[1])
    G.edata['train_mask'] = th.ones(len(G.edata['h']), dtype=th.bool)
    G.ndata['h'] = th.reshape(G.ndata['h'], (G.ndata['h'].shape[0], 1,G.ndata['h'].shape[1]))
    G.edata['h'] = th.reshape(G.edata['h'], (G.edata['h'].shape[0], 1,G.edata['h'].shape[1]))

    class_weights = class_weight.compute_class_weight(class_weight = "balanced",
                                                      classes = np.unique(G.edata['label'].cpu().numpy()),
                                                      y = G.edata['label'].cpu().numpy())

    class_weights = th.FloatTensor(class_weights).cuda() if cuda else th.FloatTensor(class_weights)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    if cuda:
        G = G.to('cuda:0')
        model = model.cuda()

    node_features = G.ndata['h']
    edge_features = G.edata['h']

    edge_label = G.edata['label']
    train_mask = G.edata['train_mask']

    opt = th.optim.Adam(model.parameters())

    print("Training started...")
    for epoch in range(1,epochs+1):
        pred = model(G, node_features,edge_features).cuda() if cuda else model(G, node_features,edge_features)
        loss = criterion(pred[train_mask], edge_label[train_mask])
        opt.zero_grad()
        loss.backward()
        opt.step()
        if epoch % report_epochs == 0:
            print('Training acc',str(epoch),':', compute_accuracy(pred[train_mask], edge_label[train_mask]))

    return model

def train_in_chunks(model, path, chunk_size, cols, epochs, cuda=False, report_epochs=100, limit=None):

    count = 0
    
    def extract_number(path):
        return int(path.stem.split(".")[1])

    files = sorted(pathlib.Path(path).glob("part.*.parquet"), key=extract_number)

    encoder = None
    scaler = None
    cols_to_norm = None

    df_train = pd.DataFrame()
    for index, path in enumerate(files):
        
        count += df_train.shape[0]
        
        print(path)
        df_chunk = correct_df(pd.read_parquet(path), cols)
        
        if index == len(files) - 1 or df_train.shape[0] >= chunk_size:
            if encoder == None:
                encoder = ce.TargetEncoder(cols=['TCP_FLAGS','PROTOCOL'])
                encoder.fit(df_train, df_train.label)
            df_train = encoder.transform(df_train)

            if scaler == None:
                scaler = StandardScaler()
                cols_to_norm = list(set(list(df_train.iloc[:, 2:].columns ))  - set(list(['label'])) )
                df_train[cols_to_norm] = scaler.fit_transform(df_train[cols_to_norm])
            df_train[cols_to_norm] = scaler.transform(df_train[cols_to_norm])

            df_train['h'] = df_train[cols_to_norm].values.tolist()

            model = train_egraphsage(df_train, model, epochs, cuda, report_epochs)

            df_train = pd.DataFrame()
    
        else:
            df_train = pd.concat([df_train, df_chunk])
            
        if limit != None and limit <= count:
            break

    return model, encoder, scaler, cols_to_norm


def predict_egraphsage(df_test, model, encoder, scaler, cols_to_norm, n_features):

    df_test = encoder.transform(df_test)
    df_test[cols_to_norm] = scaler.transform(df_test[cols_to_norm])
    df_test['h'] = df_test[cols_to_norm].values.tolist()

    G_test = nx.from_pandas_edgelist(df_test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h','label'],create_using=nx.MultiGraph())
    G_test = G_test.to_directed()
    G_test = from_networkx(G_test,edge_attrs=['h','label'] )
    actual = G_test.edata.pop('label')
    G_test.ndata['feature'] = th.ones(G_test.num_nodes(), n_features)

    G_test.ndata['feature'] = th.reshape(G_test.ndata['feature'], (G_test.ndata['feature'].shape[0], 1, G_test.ndata['feature'].shape[1]))

    G_test.edata['h'] = th.reshape(G_test.edata['h'], (G_test.edata['h'].shape[0], 1, G_test.edata['h'].shape[1]))

    node_features_test = G_test.ndata['feature']
    edge_features_test = G_test.edata['h']
    test_pred = model(G_test, node_features_test, edge_features_test)#.cuda()

    test_pred = test_pred.argmax(1)

    test_pred = th.Tensor.cpu(test_pred).detach().numpy()

    return confusion_matrix(actual, test_pred), accuracy_score(actual, test_pred)

def save_model(model, MODEL_PATH, scaler, SCALER_PATH, encoder, ENCODER_PATH):
    th.save(model, MODEL_PATH)
    joblib.dump(scaler, SCALER_PATH)
    joblib.dump(encoder, ENCODER_PATH)

def save_graph(G, GRAPH_PATH):
    save_graphs(GRAPH_PATH, [G], None)

def load_model(MODEL_PATH, SCALER_PATH, ENCODER_PATH):
    model = th.load(MODEL_PATH)
    model.eval()
    scaler = joblib.load(SCALER_PATH)
    encoder = joblib.load(ENCODER_PATH)

    return model, scaler, encoder

def load_graph(G, GRAPH_PATH):
    return

In [4]:
pd.options.mode.copy_on_write = True

**COLAB directories**

In [None]:
dir_data = '/content/drive/MyDrive/csci_e-599a/data/'
dir_model = '/content/drive/MyDrive/csci_e-599a/model/'

**Local directories**

In [5]:
dir_data = '../data/netflow/parquet/original/'
dir_model = 'model/'

**Train on 'NF-BoT-IoT-v2_chunks' in chunks...**

In [31]:
path = dir_data + 'NF-BoT-IoT-v2_chunks'

cols = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'PROTOCOL', 'IN_BYTES', 'OUT_BYTES',
       'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS','Label']

n_features = len(cols) - 4 - 1 #'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'Label'
model = Model(n_features, 128, n_features, F.relu, 0.2)

chunk_size = 1000000
epochs = 1000
cuda = False
report_epochs = 100

model, encoder, scaler, cols_to_norm = train_in_chunks(model, path, chunk_size, cols, epochs, cuda, report_epochs)

../data/netflow/parquet/original/NF-BoT-IoT-v2_chunks/part.0.parquet
../data/netflow/parquet/original/NF-BoT-IoT-v2_chunks/part.1.parquet
Building a graph...
Training started...
Training acc 20 : 0.8833523988723755
Training acc 40 : 0.9426321387290955
Training acc 60 : 0.9569600820541382
Training acc 80 : 0.9547532200813293
Training acc 100 : 0.9572961926460266
Training acc 120 : 0.9578973650932312
Training acc 140 : 0.9577405452728271
Training acc 160 : 0.9582371711730957
Training acc 180 : 0.9581326246261597
Training acc 200 : 0.9577181339263916
Training acc 220 : 0.9586591124534607
Training acc 240 : 0.9587375521659851
Training acc 260 : 0.9583604335784912
Training acc 280 : 0.9582931995391846
Training acc 300 : 0.9582446217536926
Training acc 320 : 0.9584910869598389
Training acc 340 : 0.9584164023399353
Training acc 360 : 0.9583902955055237
Training acc 380 : 0.958270788192749
Training acc 400 : 0.9583641290664673
Training acc 420 : 0.958278238773346
Training acc 440 : 0.958614349

**Save model, encoder and scaler**

In [8]:
MODEL_PATH_BOT = dir_model + 'nf_bot_v2.pt'
SCALER_PATH_BOT = dir_model + '_scaler_nf_bot_v2.pkl'
ENCODER_PATH_BOT = dir_model + '_scaler_nf_bot_v2.pkl'

save_model(model, MODEL_PATH_BOT, scaler, SCALER_PATH_BOT, encoder, ENCODER_PATH_BOT)

**Train on 'NF-ToN-IoT_chunks' in chunks...**

In [None]:
path = dir_data + 'NF-ToN-IoT_chunks'

model, encoder, scaler, cols_to_norm = train_in_chunks(model, path, chunk_size, cols, cuda, report_epochs)

**Save model, encoder and scaler**

In [None]:
MODEL_PATH_BOT_TON = dir_model + 'nf_bot_v2_ton.pt'
SCALER_PATH_BOT_TON = dir_model + '_scaler_nf_bot_v2_ton.pkl'
ENCODER_PATH_BOT_TON = dir_model + '_scaler_nf_bot_v2_ton.pkl'

save_model(model, MODEL_PATH_BOT_TON, scaler, SCALER_PATH_BOT_TON, encoder, ENCODER_PATH_BOT_TON)

**Load test data**

In [32]:
df_test = correct_df(pd.read_parquet(dir_data + 'Attack-2_chunks'), cols)

In [33]:
#df_test['h'] = df_test[cols_to_norm].values.tolist()

In [34]:
df_test.columns

Index(['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'PROTOCOL', 'IN_BYTES', 'OUT_BYTES',
       'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS',
       'label'],
      dtype='object')

**Load model, encoder, scaler**

In [35]:
#model, scaler, encoder = load_model(MODEL_PATH, SCALER_PATH, ENCODER_PATH)
#cols_to_norm = list(set(list(df_test.iloc[:, 2:].columns ))  - set(list(['label'])) )

**Test model on 'Attack-2_chunks'**

In [36]:
cm, acc = predict_egraphsage(df_test, model, encoder, scaler, cols_to_norm, n_features)

print("Test results:")
print(cm)
print(acc)

Test results:
[[  8980    124]
 [276953 304165]]
0.5305546048774867
