In [3]:
import dgl.nn as dglnn
from dgl import from_networkx
from dgl.data.utils import save_graphs
import torch.nn as nn
import torch as th
import torch.nn.functional as F
import dgl.function as fn
from dgl.data.utils import load_graphs
import networkx as nx
import pandas as pd
import socket
import struct
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sys

## Process Captured DataFrame to Match GraphSAGE Input

In [None]:
# Read the captured flow records from the parquet directory into one DataFrame.
full_df = pd.read_parquet(path='data/parquet-20231022T155758Z-001/parquet/')

In [None]:
# Preview the first five raw flow records.
full_df.head(n=5)

Unnamed: 0,t_first,t_last,src4_addr,src6_addr,src_port,src_tos,dst4_addr,dst6_addr,dst_port,icmp_code,...,label,proto,sampled,export_sysid,fwd_status,app_latency,cli_latency,srv_latency,out_bytes,out_packets
0,2023-10-10 13:30:54.499,2023-10-10T09:30:54.592,10.200.0.1,,39935,0,10.200.0.102,,80,,...,<none>,6,0,0,0,0.23,0.018,0.132,104518,76
1,2023-10-10 13:45:11.604,2023-10-10T09:45:11.821,10.200.0.1,,61029,0,10.200.0.102,,80,,...,<none>,6,0,0,0,0.231,0.015,0.108,353,4
2,2023-10-10 13:00:25.922,2023-10-10T09:00:25.922,10.200.0.102,,22,0,10.200.0.1,,20562,,...,<none>,6,0,0,0,0.0,0.0,0.132,40,1
3,2023-10-10 13:00:25.923,2023-10-10T09:00:26.68,10.200.0.102,,80,0,10.200.0.1,,22382,,...,<none>,6,0,0,0,0.242,0.008,0.131,301,6
4,2023-10-10 14:30:47.630,2023-10-10T10:30:47.103,10.200.0.102,,80,0,10.200.0.1,,26170,,...,<none>,6,0,0,0,0.001,0.003,0.0,224,7


In [None]:
# Columns removed up front; none of them appear in the final feature set
# assembled later in this notebook.
unused_columns = [
    'src6_addr',
    'dst6_addr',
    'icmp_code',
    'fwd_status',
    'app_latency',
    'cli_latency',
    'srv_latency',
    'src_tos',
    'sampled',
    'export_sysid',
    'icmp_status',
]
sub_df = full_df.drop(columns=unused_columns)

In [None]:
# Parse both flow timestamps so they can be subtracted.
sub_df['t_first'] = pd.to_datetime(sub_df['t_first'])
sub_df['t_last'] = pd.to_datetime(sub_df['t_last'])

# Flow duration in milliseconds = t_last - t_first (assuming t_first/t_last are
# the first- and last-seen timestamps of the flow). The operands were previously
# reversed, which negated every duration.
# NOTE(review): in the head() preview of the raw frame, t_first and t_last differ
# by about four hours (e.g. 13:30:54 vs 09:30:54), which looks like a timezone
# mismatch between the two columns - confirm and normalise, otherwise this
# feature is dominated by that offset.
flow_duration = sub_df['t_last'] - sub_df['t_first']
sub_df['FLOW_DURATION_MILLISECONDS'] = (flow_duration.dt.total_seconds() * 1000).astype(int)

# Randomize src IP: every row gets a random address drawn from
# 0xac100001..0xac1f0001 (172.16.0.1 .. 172.31.0.1). A locally seeded generator
# keeps the mapping reproducible across runs without touching the global
# `random` state.
SRC_IP_SEED = 42
src_ip_rng = random.Random(SRC_IP_SEED)
sub_df['IPV4_SRC_ADDR'] = [
    socket.inet_ntoa(struct.pack('>I', src_ip_rng.randint(0xac100001, 0xac1f0001)))
    for _ in range(len(sub_df))
]

# Transform and append addrs and ports: graph endpoints are "ip:port" strings.
sub_df['IPV4_SRC_ADDR'] = sub_df['IPV4_SRC_ADDR'].map(str)
sub_df['src_port'] = sub_df['src_port'].map(str)
sub_df['dst4_addr'] = sub_df['dst4_addr'].map(str)
sub_df['dst_port'] = sub_df['dst_port'].map(str)

sub_df['IPV4_SRC_ADDR'] = sub_df['IPV4_SRC_ADDR'] + ':' + sub_df['src_port']
sub_df['IPV4_DST_ADDR'] = sub_df['dst4_addr'] + ':' + sub_df['dst_port']

# Convert a TCP flag string to an integer by reading it as a bit mask.
def string_to_integer(input_string):
    """Interpret a flag string as a binary number.

    Each '.' contributes a 0 bit and every other character a 1 bit, most
    significant bit first; e.g. '...A.RSF' -> '00010111' -> 23.
    """
    bit_chars = ('1', '0')  # indexed by "is this position a dot?"
    bits = ''.join(bit_chars[symbol == '.'] for symbol in input_string)
    return int(bits, base=2)

# Encode every flow's flag string as an integer bit mask.
sub_df['TCP_FLAGS'] = sub_df['tcp_flags'].apply(string_to_integer)

# Drop old columns: the raw fields have been replaced by derived ones above.
raw_columns = [
    't_first',
    't_last',
    'src4_addr',
    'dst4_addr',
    'src_port',
    'dst_port',
    'tcp_flags',
]
sub_df = sub_df.drop(columns=raw_columns)

# Adjust naming to match og notebook
new_names = {
    'proto': 'PROTOCOL',
    'in_bytes': 'IN_BYTES',
    'in_packets': 'IN_PKTS',
    'out_bytes': 'OUT_BYTES',
    'out_packets': 'OUT_PKTS',
}
# Final layout: the two endpoint columns first, then the features, then the label.
final_columns = [
    'IPV4_SRC_ADDR',
    'IPV4_DST_ADDR',
    'PROTOCOL',
    'IN_BYTES',
    'OUT_BYTES',
    'IN_PKTS',
    'OUT_PKTS',
    'TCP_FLAGS',
    'FLOW_DURATION_MILLISECONDS',
    'label',
]
sub_df = sub_df.rename(columns=new_names).reindex(columns=final_columns)

#### Converted TCP Flags
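To convert the TCP flags, each flag string is mapped to a bit string (`.` → `0`, any flag letter → `1`), e.g. `'...A.RSF'` becomes `00010111`, and that bit string is then parsed as a binary integer (here 23).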

In [None]:
# Scale with the already-fit scaler
from joblib import load

# NOTE: joblib.load unpickles arbitrary objects - only load scaler files that
# come from a trusted source.
scaler = load('scaler.joblib')

# Feature columns = everything after the two endpoint columns, minus the label.
feature_cols = [col for col in sub_df.columns[2:] if col != 'label']

# Prefer the column order the scaler recorded when it was fit (available when
# it was fit on a DataFrame); otherwise fall back to the DataFrame order. The
# previous set()-based construction produced an arbitrary order that could
# differ between runs.
fit_order = getattr(scaler, 'feature_names_in_', None)
cols_to_norm = [str(col) for col in fit_order] if fit_order is not None else feature_cols

# transform(), not fit_transform(): refitting here would replace the statistics
# learned at training time with statistics of the captured traffic.
sub_df[cols_to_norm] = scaler.transform(sub_df[cols_to_norm])
sub_df = sub_df.drop('label', axis=1)

In [None]:
# Show which feature columns were passed to the scaler, and in what order.
cols_to_norm

['TCP_FLAGS',
 'OUT_BYTES',
 'IN_BYTES',
 'FLOW_DURATION_MILLISECONDS',
 'OUT_PKTS',
 'PROTOCOL',
 'IN_PKTS']

## Build Graphs

In [10]:
# Toggle: True loads a previously saved graph, False rebuilds it from sub_df.
load_from_disk = True

if not load_from_disk:
    # Create edge feature set: one list of scaled feature values per flow.
    sub_df['h'] = sub_df[cols_to_norm].values.tolist()

    # Construct graph based on src and dest with H as edge features
    G_nx = nx.from_pandas_edgelist(
        sub_df, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h'], create_using=nx.MultiGraph()
    )
    G_large = from_networkx(G_nx.to_directed(), edge_attrs=['h'])

    # Dummy node features: all ones, as wide as the edge feature vectors
    # (derived from the edge tensor instead of a hard-coded 7), already in the
    # (num_nodes, 1, num_features) layout.
    n_edge_feats = G_large.edata['h'].shape[1]
    G_large.ndata['feature'] = th.ones(G_large.num_nodes(), 1, n_edge_feats)

    # Edge features get the same middle axis: (num_edges, 1, num_features).
    G_large.edata['h'] = G_large.edata['h'].unsqueeze(1)

    # Save to disk
    # NOTE(review): this writes to 'graphs/...' while the load branch below reads
    # from 'drive/MyDrive/graphs/...' - confirm the two locations are meant to differ.
    save_graphs('graphs/graph_data.bin', [G_large])
else:
    G_large = load_graphs('drive/MyDrive/graphs/graph_data.bin')[0][0]

## Load Fit Model w/ Weights

In [6]:
class MLPPredictor(nn.Module):
    """Edge scorer: a single linear layer over the two endpoint embeddings of each edge."""

    def __init__(self, in_features, out_classes):
        super().__init__()
        # The layer consumes [source embedding | destination embedding],
        # hence twice the per-node width.
        self.W = nn.Linear(2 * in_features, out_classes)

    def apply_edges(self, edges):
        """Return {'score': ...} for an edge batch exposing src['h'] and dst['h']."""
        endpoint_pair = th.cat((edges.src['h'], edges.dst['h']), dim=1)
        return {'score': self.W(endpoint_pair)}

    def forward(self, graph, h):
        """Store `h` as node data 'h', score every edge, and return edata['score']."""
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(self.apply_edges)
            edge_scores = graph.edata['score']
        return edge_scores


class SAGELayer(nn.Module):
    """One message-passing layer that mixes edge features into node embeddings.

    Parameters
    ----------
    ndim_in : width of the incoming node features.
    edims : width of the edge features.
    ndim_out : width of the node embeddings this layer produces.
    activation : callable applied to the updated node embeddings. ``None``
        falls back to ``F.relu``, which is what this layer always applied
        before the argument was honoured.
    """

    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        # Projects [source node features | edge features] to a fixed output width.
        self.W_msg = nn.Linear(ndim_in + edims, ndim_out)
        # Combines a node's own features with its aggregated neighbourhood message.
        self.W_apply = nn.Linear(ndim_in + ndim_out, ndim_out)
        self.activation = activation

    def message_func(self, edges):
        """Build message 'm' from source node 'h' and edge 'h', concatenated on axis 2."""
        return {'m': self.W_msg(th.cat([edges.src['h'], edges.data['h']], 2))}

    def forward(self, g_dgl, nfeats, efeats):
        """Return updated node embeddings for graph `g_dgl`.

        `nfeats` and `efeats` are concatenated along axis 2, so both must be
        3-D tensors (the notebook reshapes them to (count, 1, features)).
        """
        # Previously F.relu was hard-coded and the constructor argument ignored.
        activation = self.activation if self.activation is not None else F.relu
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Eq4: mean of incoming messages per destination node.
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            # Eq5: combine own features with the neighbourhood summary.
            g.ndata['h'] = activation(self.W_apply(th.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))
            return g.ndata['h']


class SAGE(nn.Module):
    """Two stacked SAGELayer blocks with dropout applied between them."""

    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super().__init__()
        hidden_dim = 128  # width of the intermediate node embedding
        self.layers = nn.ModuleList([
            SAGELayer(ndim_in, edim, hidden_dim, activation),
            SAGELayer(hidden_dim, edim, ndim_out, activation),
        ])
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, g, nfeats, efeats):
        """Run every layer in order and return the result summed over axis 1."""
        hidden = nfeats
        for depth, sage_layer in enumerate(self.layers):
            # Dropout is applied to the input of every layer except the first.
            if depth > 0:
                hidden = self.dropout(hidden)
            hidden = sage_layer(g, hidden, efeats)
        return hidden.sum(1)


class Model(nn.Module):
    """SAGE encoder followed by an MLPPredictor that emits two scores per edge."""

    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super().__init__()
        self.gnn = SAGE(ndim_in, ndim_out, edim, activation, dropout)
        self.pred = MLPPredictor(ndim_out, 2)

    def forward(self, g, nfeats, efeats):
        """Embed the nodes with the GNN, then score every edge of `g`."""
        node_embeddings = self.gnn(g, nfeats, efeats)
        edge_scores = self.pred(g, node_embeddings)
        return edge_scores

# Modeling functions
def compute_accuracy(pred, labels):
    """Return, as a Python float, the fraction of rows whose argmax over axis 1 equals `labels`."""
    predicted_class = pred.argmax(dim=1)
    is_correct = predicted_class.eq(labels)
    return is_correct.float().mean().item()

In [8]:
# The checkpoint holds the whole pickled module (model.eval() below prints a
# Model repr), so no Model(...) instance needs to be built first; the class
# definitions above must exist for unpickling to succeed.
# Signature for reference: Model(nodefeat_dim_in, ndim_out, edgefeat_dim, activation, dropout)
# NOTE: th.load unpickles arbitrary objects - only load checkpoints from a trusted source.
device = th.device('cuda:0' if th.cuda.is_available() else 'cpu')  # fall back to CPU when no GPU is present
model = th.load('drive/MyDrive/models/nf-bot-iot-torch.pth', map_location=device)
model.eval()

Model(
  (gnn): SAGE(
    (layers): ModuleList(
      (0): SAGELayer(
        (W_msg): Linear(in_features=14, out_features=128, bias=True)
        (W_apply): Linear(in_features=135, out_features=128, bias=True)
      )
      (1): SAGELayer(
        (W_msg): Linear(in_features=135, out_features=128, bias=True)
        (W_apply): Linear(in_features=256, out_features=128, bias=True)
      )
    )
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (pred): MLPPredictor(
    (W): Linear(in_features=256, out_features=2, bias=True)
  )
)

## Generate Predictions

In [11]:
# Put the graph on whichever device the model's weights live on.
G_large = G_large.to(next(model.parameters()).device)

# Inference only: disable autograd so no computation graph is kept for the
# forward pass over every edge.
with th.no_grad():
    predictions = model(
        G_large,                      # Graph
        G_large.ndata['feature'],     # Dummy Node Features (ones)
        G_large.edata['h']            # Edge Features (dataframe)
    )

In [13]:
# Inspect the shape of the model output.
predictions.shape

torch.Size([3857724, 2])

In [16]:
# Index of the highest-scoring class for every row of `predictions`.
pred_max = th.argmax(predictions, dim=1)

### Percent Predicted as Attack
Based on the output below, it seems that the model predicted 72.25% of the flows as attacks. That seems high to me.

In [17]:
# Fraction of rows whose predicted class index is 1. A vectorised mean replaces
# the builtin sum(), which iterated over the tensor one element at a time.
pred_max.float().mean()

tensor(0.7225, device='cuda:0')