In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
import gc
import pandas as pd
import dask.dataframe as dd
import numpy as np
import torch
from sklearn import preprocessing
import dgl
import dgl.function as fn
from dgl.data import DGLDataset
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random

from sklearn.model_selection import train_test_split



In [11]:
class SAGELayer(nn.Module):
    """One message-passing layer that folds edge features into node states.

    Every node receives the mean of the features of its incoming edges, the
    result is concatenated with the node's own features and passed through a
    linear layer followed by the activation. An embedding for every edge is
    then produced from the concatenated states of its two endpoints.

    Args:
        ndim_in: width of the incoming node features.
        edims: width of the incoming edge features.
        ndim_out: width of the node states produced by the layer.
        activation: callable applied to the node update. ``None`` falls back
            to ``F.relu``.

    The edge embedding width is fixed at 256.
    """

    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        self.W_apply = nn.Linear(ndim_in + edims, ndim_out)
        # Honour the activation handed in by the caller instead of silently
        # replacing it with F.relu.
        self.activation = F.relu if activation is None else activation
        # The edge head consumes two concatenated node states, so its input
        # width has to follow ndim_out rather than a hard-coded 128.
        self.W_edge = nn.Linear(ndim_out * 2, 256)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the node-update weights (Xavier uniform, ReLU gain)."""
        gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
        """Send each edge's own feature vector as the message 'm'."""
        return {'m':  edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
        """Return ``(node_states, edge_embeddings)`` for graph ``g_dgl``.

        Features are concatenated along dim 2, so ``nfeats`` and ``efeats``
        must both be 3-D tensors.
        """
        # local_scope keeps the temporary 'h' / 'h_neigh' fields from leaking
        # into the caller's graph.
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            g.ndata['h'] = self.activation(
                self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

            # Compute edge embeddings from the updated endpoint states.
            u, v = g.edges()
            edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
            return g.ndata['h'], edge
        
class SAGE(nn.Module):
    """Encoder made of a single SAGELayer.

    ``ndim_out`` and ``activation`` are accepted for signature compatibility
    but are not used: the one layer is always built with an output width of
    128 and ``F.relu``.
    """

    def __init__(self, ndim_in, ndim_out, edim,  activation):
        super().__init__()
        self.layers = nn.ModuleList([SAGELayer(ndim_in, edim, 128, F.relu)])

    def forward(self, g, nfeats, efeats, corrupt=False):
        """Encode the graph and return ``(node_repr, edge_repr)``.

        With ``corrupt=True`` the rows of ``efeats`` are shuffled with a
        random permutation before encoding. Node features are left as they
        are. Both returned tensors are summed over dim 1.
        """
        if corrupt:
            shuffled_order = torch.randperm(g.number_of_edges())
            efeats = efeats[shuffled_order]

        for layer in self.layers:
            nfeats, edge_emb = layer(g, nfeats, efeats)

        return nfeats.sum(1), edge_emb.sum(1)
    
class Discriminator(nn.Module):
    """Bilinear scorer: ``features @ (W @ summary)`` with a square weight W."""

    def __init__(self, n_hidden):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(n_hidden, n_hidden))
        self.reset_parameters()

    def uniform(self, size, tensor):
        """Fill ``tensor`` in place from U(-1/sqrt(size), 1/sqrt(size)).

        A ``None`` tensor is accepted and left alone.
        """
        limit = 1.0 / math.sqrt(size)
        if tensor is None:
            return
        tensor.data.uniform_(-limit, limit)

    def reset_parameters(self):
        """Re-draw the weight matrix, scaled by its first dimension."""
        self.uniform(self.weight.size(0), self.weight)

    def forward(self, features, summary):
        """Score each row of ``features`` against the ``summary`` vector."""
        projected_summary = self.weight @ summary
        return features @ projected_summary
    
class DGI(nn.Module):
    """Contrastive objective over edge embeddings (Deep-Graph-Infomax style).

    The encoder is run once on the real graph and once with shuffled edge
    features. A discriminator scores both sets of edge embeddings against a
    summary of the real ones, and the binary cross-entropy of those scores
    is the training loss.

    Args:
        ndim_in: width of the node features.
        ndim_out: forwarded to the encoder.
        edim: width of the edge features.
        activation: accepted for signature compatibility; the encoder is
            always built with ``F.relu``.
    """

    def __init__(self, ndim_in, ndim_out, edim, activation):
        super(DGI, self).__init__()
        self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
        # 256 has to match the width of the edge embeddings returned by the
        # encoder, because the discriminator is applied to those.
        self.discriminator = Discriminator(256)
        self.loss = nn.BCEWithLogitsLoss()

    # A second, incomplete copy of forward() used to precede this one. It was
    # shadowed by this definition and has been removed.
    def forward(self, g, n_features, e_features):
        """Return the scalar loss for graph ``g``."""
        positive = self.encoder(g, n_features, e_features, corrupt=False)
        negative = self.encoder(g, n_features, e_features, corrupt=True)

        # Only the edge embeddings (second element) take part in the loss.
        positive = positive[1]
        negative = negative[1]

        # Graph-level summary: squashed mean of the real edge embeddings.
        summary = torch.sigmoid(positive.mean(dim=0))

        positive = self.discriminator(positive, summary)
        negative = self.discriminator(negative, summary)

        l1 = self.loss(positive, torch.ones_like(positive))
        l2 = self.loss(negative, torch.zeros_like(negative))

        return l1 + l2

In [18]:
def correct_df(df, cols, with_port=True, max_edges=2000000, random_state=1):
    """Select, normalise and de-duplicate the flow records used as edges.

    Args:
        df: raw flow DataFrame. It is not modified.
        cols: columns to keep. Must contain 'IPV4_SRC_ADDR' and
            'IPV4_DST_ADDR', plus 'L4_SRC_PORT' and 'L4_DST_PORT' when
            ``with_port`` is true.
        with_port: when true, each address becomes ``"<ip>:<port>"``.
        max_edges: upper bound on the number of rows returned.
        random_state: seed used when sampling the returned rows.

    Returns:
        A new DataFrame without the port columns and without duplicate rows,
        sampled (and thereby shuffled) down to at most ``max_edges`` rows.
    """
    # Work on an explicit copy so the assignments below never write into a
    # view of the caller's frame, whatever the copy-on-write setting is.
    df = df[cols].copy()

    df['IPV4_SRC_ADDR'] = df['IPV4_SRC_ADDR'].apply(str)
    df['IPV4_DST_ADDR'] = df['IPV4_DST_ADDR'].apply(str)
    if with_port:
        df['IPV4_SRC_ADDR'] = df['IPV4_SRC_ADDR'] + ':' + df['L4_SRC_PORT'].apply(str)
        df['IPV4_DST_ADDR'] = df['IPV4_DST_ADDR'] + ':' + df['L4_DST_PORT'].apply(str)

    # errors='ignore' lets callers omit the port columns from `cols` when
    # with_port is False instead of hitting a KeyError here.
    df = df.drop(columns=['L4_SRC_PORT', 'L4_DST_PORT'], errors='ignore')
    df = df.drop_duplicates()

    sample_size = min(df.shape[0], max_edges)
    print("Number of EDGEs: " + str(sample_size))

    return df.sample(n=sample_size, random_state=random_state)

#enc_cols=['TCP_FLAGS','PROTOCOL']
def create_graph(data, enc_cols):
    """Turn a flow table into a DGL graph whose edges carry the flow features.

    Args:
        data: DataFrame with a "Label" column. After "Label" is removed the
            first two columns are expected to be the endpoint identifiers
            ("IPV4_SRC_ADDR", "IPV4_DST_ADDR"); everything from the third
            column on is used as an edge feature.
        enc_cols: columns to target-encode against "Label".

    Returns:
        A DGL graph with edge data 'h' (feature vector) and 'Label', and node
        data 'h' filled with ones, one row per node and as wide as the edge
        feature vector.
    """
    labels = data[["Label"]]
    features = data.drop(columns=["Label"])

    # Target-encode the categorical columns using the labels of this same
    # table. Fit and transform are kept as two separate calls.
    target_encoder = ce.TargetEncoder(cols=enc_cols)
    target_encoder.fit(features, labels.Label)
    features = target_encoder.transform(features)

    # Infinite and missing values both become 0.
    features = features.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Everything after the two endpoint columns is normalised.
    feature_cols = list(set(features.columns[2:]))
    normalizer = Normalizer()
    normalizer.fit(features[feature_cols])
    features[feature_cols] = normalizer.transform(features[feature_cols])

    # Pack the per-row feature values into one list-valued column 'h'.
    features['h'] = features.iloc[:, 2:].values.tolist()

    edge_table = pd.concat([features, labels], axis=1)

    # Build an undirected multigraph first, then expand it to directed edges.
    nx_graph = nx.from_pandas_edgelist(
        edge_table,
        source="IPV4_SRC_ADDR",
        target="IPV4_DST_ADDR",
        edge_attr=["h", "Label"],
        create_using=nx.MultiGraph(),
    )
    dgl_graph = dgl.from_networkx(nx_graph.to_directed(), edge_attrs=['h', 'Label'])

    # Nodes have no features of their own: use constant ones.
    node_shape = [dgl_graph.number_of_nodes(), dgl_graph.edata['h'].shape[1]]
    dgl_graph.ndata['h'] = torch.ones(node_shape)

    return dgl_graph

def train_dgi(train_g, epochs, file_name, dir_model):
    """Train a DGI model on ``train_g`` and checkpoint the best epoch.

    Args:
        train_g: DGL graph with node data 'h' and edge data 'h'.
        epochs: number of full-graph optimisation steps.
        file_name: dataset name, used in the checkpoint file name.
        dir_model: directory receiving ``dgi_<file_name>.pkl``. It is created
            if it does not exist.

    Returns:
        The model as it is after the last epoch. The weights of the
        lowest-loss epoch are in the checkpoint file, not necessarily in the
        returned object.

    Side effects:
        ``train_g.ndata['h']`` and ``train_g.edata['h']`` are replaced by 3-D
        versions with a singleton middle axis, and the checkpoint file is
        written to disk.
    """
    import os  # local import: only needed to create the checkpoint directory

    # Insert the singleton middle axis once. The guard makes a second call on
    # the same graph a no-op instead of reading a wrong feature width.
    if train_g.ndata['h'].dim() == 2:
        train_g.ndata['h'] = train_g.ndata['h'].unsqueeze(1)
    if train_g.edata['h'].dim() == 2:
        train_g.edata['h'] = train_g.edata['h'].unsqueeze(1)

    node_features = train_g.ndata['h']
    edge_features = train_g.edata['h']

    ndim_in = node_features.shape[-1]
    ndim_out = 128
    edim = edge_features.shape[-1]

    dgi = DGI(ndim_in, ndim_out, edim, F.relu)

    dgi_optimizer = torch.optim.Adam(dgi.parameters(), lr=1e-3, weight_decay=0.)

    # Same path expression that is used when the checkpoint is read back.
    checkpoint_path = dir_model+'/dgi_'+file_name+'.pkl'
    if dir_model:
        os.makedirs(dir_model, exist_ok=True)

    best = float('inf')

    for epoch in range(epochs):
        dgi.train()

        dgi_optimizer.zero_grad()
        loss = dgi(train_g, node_features, edge_features)
        loss.backward()
        dgi_optimizer.step()

        # Track a plain float so the best value does not keep a tensor (and
        # whatever it references) alive between epochs.
        loss_value = loss.item()
        if loss_value < best:
            best = loss_value
            torch.save(dgi.state_dict(), checkpoint_path)

        if (epoch+1) % 50 == 0:
            print("Epoch {:05d} | Loss {:.4f}".format(epoch, loss_value))

    # Report the best loss seen, not the loss of the final epoch.
    print("Best Loss {:.4f}".format(best))

    return dgi
            
def load_dgi(file_name, dir_model):
    """Read ``<dir_model>/dgi_<file_name>.pkl`` and return its contents.

    The return value is whatever ``torch.load`` deserialises from that file,
    not a constructed model. ``torch.load`` unpickles data, so only use it on
    checkpoint files you trust.
    """
    checkpoint_path = ''.join((dir_model, '/dgi_', file_name, '.pkl'))
    return torch.load(checkpoint_path)


def df_embeddings(dgi, G):
    """Encode ``G`` with the trained model and tabulate the edge embeddings.

    Args:
        dgi: object whose ``encoder(G, node_feats, edge_feats)`` returns a
            pair; the second element is used as the edge embeddings.
        G: graph exposing ``ndata['h']``, ``edata['h']`` and ``edata['Label']``.

    Returns:
        DataFrame with one row per edge: the embedding dimensions as integer
        columns, followed by a "Label" column.
    """
    # Pure inference: skip autograd bookkeeping so no computation graph is
    # built over every edge of the input.
    with torch.no_grad():
        emb = dgi.encoder(G, G.ndata['h'], G.edata['h'])[1]
    emb = emb.detach().cpu().numpy()

    df_emb = pd.DataFrame(emb)
    df_emb["Label"] = G.edata['Label'].detach().cpu().numpy()

    return df_emb

def save_embeddings(df_emb, path):
    """Write the embedding table to ``path`` as gzip-compressed parquet.

    Args:
        df_emb: DataFrame whose last column is the label and whose other
            columns are embedding dimensions. Its columns are renamed in
            place to ``emb_0 .. emb_<k-1>`` plus ``Label``.
        path: destination handed to dask's ``to_parquet``.
    """
    # Derive the number of embedding columns from the frame instead of
    # assuming 256, so other embedding widths can be saved as well.
    n_emb_cols = df_emb.shape[1] - 1
    col_names = ["emb_"+str(col) for col in range(n_emb_cols)]
    col_names.append("Label")
    df_emb.columns = col_names

    # One partition per ~99 MB of in-memory data, and always at least one.
    size99MB = int(df_emb.memory_usage().sum()/1e6/99) + 1
    dd_df = dd.from_pandas(df_emb, npartitions=size99MB)
    dd_df.to_parquet(path, compression="gzip")
    print("Saved to: '"+path)

def create_embeddings(dir_data, file_name, cols, with_port, epochs, emb_prefix, dir_model):
    """Run the full pipeline for one dataset: load, graph, train, embed, save.

    Args:
        dir_data: directory prefix of the input parquet dataset. It is joined
            to ``file_name`` by plain string concatenation.
        file_name: dataset name under ``dir_data``, also used in the
            checkpoint file name.
        cols: columns to keep from the raw data.
        with_port: whether ports are appended to the endpoint addresses.
        epochs: number of training epochs.
        emb_prefix: prefix of the output dataset name written under
            ``dir_data``.
        dir_model: directory for the model checkpoint.

    Returns:
        The edge-embedding DataFrame that was written to disk.
    """
    print("Loading data... '"+dir_data+file_name+"'")
    df = pd.read_parquet(dir_data+file_name)

    print("Correcting DataFrame...")
    df = correct_df(df, cols, with_port)

    print("Creating Graphs...")
    enc_cols = ['TCP_FLAGS', 'PROTOCOL']
    G = create_graph(df, enc_cols)

    print("Training...")
    dgi = train_dgi(G, epochs, file_name, dir_model)
    # Swap in the weights of the best epoch. The path is resolved by the
    # shared load_dgi helper rather than being rebuilt here.
    dgi.load_state_dict(load_dgi(file_name, dir_model))

    print("Generating Embeddings...")
    df_emb = df_embeddings(dgi, G)

    print("Saving Embeddings...")
    save_embeddings(df_emb, dir_data + emb_prefix + file_name)

    return df_emb

In [4]:
#from google.colab import drive
#drive.mount('/content/drive')
#dir_data = '/content/drive/MyDrive/csci_e-599a/data/'
#dir_model = '/content/drive/MyDrive/csci_e-599a/model/'

# Directory prefix of the input parquet datasets. The generated embeddings
# are written back under the same prefix.
dir_data = '../data/netflow/parquet/original/'
# Directory prefix of the DGI checkpoint files (dgi_<dataset>.pkl).
dir_model = 'model/'

In [5]:
# Turn on pandas copy-on-write mode for every DataFrame operation that follows.
pd.options.mode.copy_on_write = True

In [6]:
# Names of all available dataset folders under dir_data. Each entry is passed
# to create_embeddings as its file_name argument.
netflow_data = ['NF-BoT-IoT_chunks',
           'NF-ToN-IoT_chunks',
           'NF-UNSW-NB15_chunks',
           'NF-UQ-NIDS_chunks',
           'NF-CSE-CIC-IDS2018_chunks',
           'NF-BoT-IoT-v2_chunks',
           'NF-ToN-IoT-v2_chunks',
           'NF-UNSW-NB15-v2_chunks',
           'NF-UQ-NIDS-v2_chunks',
           'NF-CSE-CIC-IDS2018-v2_chunks', 
           'Attack-2_chunks']

In [14]:
# Restrict the run to a single dataset. This replaces any longer list that
# was assigned to netflow_data before this cell ran.
netflow_data = ['Attack-2_chunks']

In [15]:
# Columns kept from the raw flow records. Order matters: create_graph treats
# the first two columns as the edge endpoints and every later non-label
# column as an edge feature. The two port columns are dropped by correct_df.
cols = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT','L4_DST_PORT', 'PROTOCOL', 'IN_BYTES', 'OUT_BYTES',
       'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS','Label']

In [16]:
# Number of full-graph training steps passed to train_dgi for every dataset.
epochs = 200

In [17]:
# Embeddings for graphs whose endpoints are "<ip>:<port>" strings.
with_port = True
emb_prefix = "emb_port_"
# The loop index was never used, so iterate over the names directly.
# df_emb holds the result of the last dataset processed.
for data_folder in netflow_data:
    df_emb = create_embeddings(dir_data, data_folder, cols, with_port, epochs, emb_prefix, dir_model)

Loading data... '../data/netflow/parquet/original/Attack-2_chunks'
Correcting DataFrame...
Number of EDGEs: 295111
Creating Graphs...
Training...
Best Loss 0.6214
Generating Embeddings...
Saving Embeddings...
Saved to: '../data/netflow/parquet/original/emb_port_Attack-2_chunks


In [9]:
# Embeddings for graphs whose endpoints are bare IP addresses (no port).
with_port = False
emb_prefix = "emb_wo_port_"

# The loop index was never used, so iterate over the names directly.
# df_emb holds the result of the last dataset processed.
for data_folder in netflow_data:
    df_emb = create_embeddings(dir_data, data_folder, cols, with_port, epochs, emb_prefix, dir_model)


Loading data... '../data/netflow/parquet/original/NF-UQ-NIDS_chunks'
Correcting DataFrame...
Number of EDGEs: 2000000
Creating Graphs...
Training...
Epoch 00000 | Loss 1.3834
Epoch 00050 | Loss 0.1932
Epoch 00100 | Loss 0.0876
Epoch 00150 | Loss 0.0850
Epoch 00200 | Loss 0.0796
Best Loss 0.0796
Generating Embeddings...
Saving Embeddings...
Saved to: '../data/netflow/parquet/original/emb_wo_port_NF-UQ-NIDS_chunks
Loading data... '../data/netflow/parquet/original/NF-CSE-CIC-IDS2018_chunks'
Correcting DataFrame...
Number of EDGEs: 2000000
Creating Graphs...
Training...
Epoch 00000 | Loss 1.4570
Epoch 00050 | Loss 1.3054
Epoch 00100 | Loss 0.5051
Epoch 00150 | Loss 0.2293
Epoch 00200 | Loss 0.2075
Best Loss 0.2075
Generating Embeddings...
Saving Embeddings...
Saved to: '../data/netflow/parquet/original/emb_wo_port_NF-CSE-CIC-IDS2018_chunks
Loading data... '../data/netflow/parquet/original/NF-BoT-IoT-v2_chunks'
Correcting DataFrame...
Number of EDGEs: 92173
Creating Graphs...
Training...
Ep