In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import networkx as nx
import torch
from torch_geometric.data import Data

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Auto-adjust width
pd.set_option('display.colheader_justify', 'left')  # Align headers properly
pd.set_option('display.max_colwidth', None)  # Don't truncate column values

# Folder containing CSV files
ddos2018_df_cleaned = pd.read_csv('/Users/emmalim/Desktop/DSA4263/ddos2018_cleaned.csv')


  from pandas.core import (


In [2]:
# Remove Ininformative features: 
uninformative_features = [
    'Bwd Avg Bulk Rate', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate',  
    'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Fwd Avg Bytes/Bulk',  
    'CWE Flag Count', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Timestamp','Total Length of Fwd Packets', 'Fwd Header Length', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 
    'act_data_pkt_fwd', 'Total Length of Bwd Packets', 'Bwd Header Length', 'Subflow Bwd Packets', 
    'Subflow Bwd Bytes', 'Fwd Packets/s', 'Flow IAT Min', 'Fwd IAT Mean', 'Fwd IAT Min',
    'Idle Min', 'Idle Max', 'Min Packet Length', 'min_seg_size_forward', 'Fwd Packet Length Std',
    'Fwd IAT Total', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Avg Fwd Segment Size', 
    'Idle Mean', 'Average Packet Size', 'Avg Bwd Segment Size', 'Active Max', 'Active Min', 
    'Fwd IAT Max', 'SYN Flag Count', 'ECE Flag Count'
]
ddos2018_df_cleaned.drop(columns=uninformative_features, inplace=True, errors='ignore')

# Create a mapping dictionary
label_mapping = {
    'Benign': 0,
    'DDoS attacks-LOIC-HTTP': 1
}

# Apply the mapping to the 'label' column
ddos2018_df_cleaned['Label'] = ddos2018_df_cleaned['Label'].map(label_mapping)

print(ddos2018_df_cleaned.info())
print(ddos2018_df_cleaned.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1918684 entries, 0 to 1918683
Data columns (total 44 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Flow ID                  object 
 1   Source IP                object 
 2   Source Port              int64  
 3   Destination IP           object 
 4   Destination Port         int64  
 5   Protocol                 int64  
 6   Flow Duration            int64  
 7   Total Fwd Packets        int64  
 8   Total Backward Packets   int64  
 9   Fwd Packet Length Max    float64
 10  Fwd Packet Length Min    float64
 11  Fwd Packet Length Mean   float64
 12  Bwd Packet Length Max    float64
 13  Bwd Packet Length Min    float64
 14  Bwd Packet Length Mean   float64
 15  Bwd Packet Length Std    float64
 16  Flow Bytes/s             float64
 17  Flow Packets/s           float64
 18  Flow IAT Mean            float64
 19  Flow IAT Std             float64
 20  Flow IAT Max             float64
 21  Fwd IAT 

## Create Graph Structure

In [None]:
# Create a directed multigraph (allows multiple edges between same nodes)
G = nx.MultiDiGraph()

# Add edges (each flow is a separate edge)
for _, row in ddos2018_df_cleaned.iterrows():
    src, dst = row['Source IP'], row['Destination IP']
    
    # Edge attributes (flow stats, choose features with top importance from RF )
    flow_features = {
        'Flow IAT Mean': row['Flow IAT Mean'],
        'Fwd Packet Length Mean': row['Fwd Packet Length Mean'],
        'Fwd Packet Length Max': row['Fwd Packet Length Max'],
        'Flow IAT Max': row['Flow IAT Max'],
        'Flow Duration': row['Flow Duration'], 
        'Label': row['Label']
    }
    # Add an edge with a unique key (MultiDiGraph allows multiple edges)
    G.add_edge(src, dst, **flow_features)
    
    
# Print general info about the graph (nodes, edges)
print(f"Nodes: {len(G.nodes())}")
print(f"Edges: {len(G.edges())}")

# If you want to get more details about nodes, edges, or graph attributes:
print("Node details:", list(G.nodes(data=True))[:5])  # Print first 5 nodes with attributes
print("Edge details:", list(G.edges(data=True))[:5])  # Print first 5 edges with attributes


Nodes: 13848
Edges: 1918684
Node details: [('8.6.0.1', {}), ('8.0.6.4', {}), ('221.194.47.233', {}), ('172.31.69.25', {}), ('131.202.242.193', {})]
Edge details: [('8.6.0.1', '8.0.6.4', {'Flow IAT Mean': 56300000.0, 'Fwd Packet Length Mean': 0.0, 'Fwd Packet Length Max': 0.0, 'Flow IAT Max': 56300000.0, 'Flow Duration': 112640715, 'Label': 0}), ('8.6.0.1', '8.0.6.4', {'Flow IAT Mean': 56300000.0, 'Fwd Packet Length Mean': 0.0, 'Fwd Packet Length Max': 0.0, 'Flow IAT Max': 56300000.0, 'Flow Duration': 112640744, 'Label': 0}), ('8.6.0.1', '8.0.6.4', {'Flow IAT Mean': 56300000.0, 'Fwd Packet Length Mean': 0.0, 'Fwd Packet Length Max': 0.0, 'Flow IAT Max': 56300000.0, 'Flow Duration': 112640793, 'Label': 0}), ('8.6.0.1', '8.0.6.4', {'Flow IAT Mean': 56300000.0, 'Fwd Packet Length Mean': 0.0, 'Fwd Packet Length Max': 0.0, 'Flow IAT Max': 56300000.0, 'Flow Duration': 112640616, 'Label': 0}), ('8.6.0.1', '8.0.6.4', {'Flow IAT Mean': 56300000.0, 'Fwd Packet Length Mean': 0.0, 'Fwd Packet Lengt

### Hypothesis: edge betweenness centrality is statistically significant in predicting the label

In [None]:

# Convert the MultiDiGraph G into PyTorch Geometric format
# 1. Convert nodes (IP addresses) to indices
ip_to_idx = {ip: i for i, ip in enumerate(set([node for edge in G.edges() for node in edge]))}

# 2. Convert edges and edge attributes into a suitable format for PyTorch Geometric
edge_index = []
edge_attr = []

for u, v, data in G.edges(data=True):
    # Add edge from u to v
    edge_index.append([ip_to_idx[u], ip_to_idx[v]])
    
    # Collect edge attributes (flow features)
    flow_features = [
        data['Flow IAT Max'],
        data['Fwd Packet Length Mean'],
        data['Fwd Packet Length Max'],
        data['Flow IAT Mean'],
        data['Flow Duration'], 
        data['Label']
    ]
    edge_attr.append(flow_features)

# Convert edge_index and edge_attr to PyTorch tensors
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
edge_attr = torch.tensor(edge_attr, dtype=torch.float)

# 3. Optionally, you can add node features or labels
# Here, you can set a dummy tensor for node features (if no additional node-specific features exist)
num_nodes = len(ip_to_idx)
x = torch.ones((num_nodes, 1), dtype=torch.float)  # Example node features (dummy)

# Optionally, if the node labels (e.g., class) are present, you can add them as a target.
# y = torch.tensor([data['Label'] for node, data in G.nodes(data=True)], dtype=torch.long)  # If node labels exist

# 4. Create a PyTorch Geometric Data object
graph_data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

# Print the graph data
print(graph_data)


Data(x=[33176, 1], edge_index=[2, 7948748], edge_attr=[7948748, 6])


In [23]:
# Convert edge_attr to a tensor of labels for classification
labels = graph_data.edge_attr[:, -1]  # Assuming the last column is the label

# Split the edges into training and test sets
train_idx, test_idx = train_test_split(range(graph_data.edge_index.size(1)), test_size=0.2, random_state=42)

# Create train and test Data objects
train_data = Data(
    edge_index=graph_data.edge_index[:, train_idx],
    edge_attr=graph_data.edge_attr[train_idx],
    y=labels[train_idx],
)

test_data = Data(
    edge_index=graph_data.edge_index[:, test_idx],
    edge_attr=graph_data.edge_attr[test_idx],
    y=labels[test_idx],
)

print(f"Training data: {train_data}")
print(f"Test data: {test_data}")

Training data: Data(edge_index=[2, 6358998], edge_attr=[6358998, 6], y=[6358998])
Test data: Data(edge_index=[2, 1589750], edge_attr=[1589750, 6], y=[1589750])


In [24]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNNEdgeClassifier(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GNNEdgeClassifier, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        edge_index = data.edge_index
        edge_attr = data.edge_attr
        
        # Initial feature transformation (using edge features here)
        x = edge_attr
        
        # Apply GCN layers
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        
        return x

# Hyperparameters
in_channels = train_data.edge_attr.size(1)  # Number of features in edge_attr
hidden_channels = 64
out_channels = 2  # Binary classification (benign or attack)

# Instantiate the model
model = GNNEdgeClassifier(in_channels, hidden_channels, out_channels)


In [30]:
import torch.optim as optim
from torch_geometric.data import DataLoader

# Create DataLoader (this is optional, but helps in batching)
train_loader = DataLoader([train_data], batch_size=1)
test_loader = DataLoader([test_data], batch_size=1)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()  # Use CrossEntropy for classification
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch)
        
        # Loss computation: Edge-level labels
        loss = criterion(out, batch.y.long())  # Convert labels to Long
        total_loss += loss.item()
        
        # Backpropagation
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")



Epoch 1, Loss: 36420.0391
Epoch 2, Loss: 165480.1562
Epoch 3, Loss: 221658.3906
Epoch 4, Loss: 222318.6562
Epoch 5, Loss: 188175.5938
Epoch 6, Loss: 127622.4219
Epoch 7, Loss: 50301.8086
Epoch 8, Loss: 110937.4688
Epoch 9, Loss: 99628.0703
Epoch 10, Loss: 12828.0000
Epoch 11, Loss: 38216.6367
Epoch 12, Loss: 34259.5859
Epoch 13, Loss: 32127.2402
Epoch 14, Loss: 18309.7637
Epoch 15, Loss: 4262.0044
Epoch 16, Loss: 69623.4688
Epoch 17, Loss: 20594.8340
Epoch 18, Loss: 48214.5703
Epoch 19, Loss: 77328.9844
Epoch 20, Loss: 91619.5703


In [31]:
# Evaluation loop
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        out = model(batch)
        
        # Get predictions (choose the class with the highest probability)
        pred = out.argmax(dim=1)
        
        # Calculate accuracy
        correct += (pred == batch.y).sum().item()
        total += batch.y.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.9274
