# HAI-20.07 Graph Neural Network Model

Anomaly detection using Graph Neural Networks with NetworkX graph data.

In [None]:
import sys
sys.path.append('..')

import polars as pl
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
import networkx as nx
from pathlib import Path
import json

from utils.model_utils import ModelManager
from utils.evaluation import Evaluator
from utils.visualization import Visualizer

## 1. Load Data

In [None]:
# Load preprocessed data
processed_dir = Path('processed_data')

train_df1 = pl.read_parquet(processed_dir / 'train1.parquet')
train_df2 = pl.read_parquet(processed_dir / 'train2.parquet')
test_df1 = pl.read_parquet(processed_dir / 'test1.parquet')
test_df2 = pl.read_parquet(processed_dir / 'test2.parquet')

# Load the node-link JSON descriptions of the two boiler graphs.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# parsing does not depend on the platform's locale default.
graph_dir = Path('../hai-security-dataset/graph/boiler')
with open(graph_dir / 'phy_boiler.json', 'r', encoding='utf-8') as f:
    physical_graph_data = json.load(f)

with open(graph_dir / 'dcs_1001h.json', 'r', encoding='utf-8') as f:
    dcs_graph_data = json.load(f)

# Create NetworkX graphs from the node-link dictionaries
physical_graph = nx.node_link_graph(physical_graph_data)
dcs_graph = nx.node_link_graph(dcs_graph_data)

# Combine graphs: nx.compose takes the union of nodes and edges; where both
# graphs define the same node/edge, attributes from dcs_graph take precedence.
combined_graph = nx.compose(physical_graph, dcs_graph)

## 2. Graph Feature Engineering

In [None]:
def extract_graph_features(G):
    """Compute a structural feature vector for every node of ``G``.

    Args:
        G: NetworkX graph to analyse.

    Returns:
        dict mapping each node of ``G`` to a 1-D NumPy array holding, in
        this order: degree, betweenness centrality, closeness centrality
        and eigenvector centrality (NumPy-based solver).
    """
    # Each entry is a node -> value lookup; the tuple order fixes the
    # column order of the resulting feature vectors.
    degree_of = dict(G.degree())
    betweenness_of = nx.betweenness_centrality(G)
    closeness_of = nx.closeness_centrality(G)
    eigenvector_of = nx.eigenvector_centrality_numpy(G)
    metric_lookups = (degree_of, betweenness_of, closeness_of, eigenvector_of)

    return {
        node: np.array([lookup[node] for lookup in metric_lookups])
        for node in G.nodes()
    }

# Compute the per-node structural features (degree, betweenness, closeness
# and eigenvector centrality) for the combined graph; the result is a dict
# keyed by node.
node_features = extract_graph_features(combined_graph)

# Create PyTorch Geometric data
def create_pyg_data(G, node_features):
    """Convert a NetworkX graph plus per-node features into a PyG ``Data``.

    Args:
        G: NetworkX graph. Node identifiers may be any hashable (e.g. the
            string ids typically found in node-link JSON).
        node_features: Mapping from node to a 1-D feature array; every node
            of ``G`` must be present and all arrays must share one length.

    Returns:
        ``Data`` whose ``x`` has one float32 row per node, in ``G.nodes()``
        order, and whose ``edge_index`` is a contiguous int64 tensor of
        shape ``(2, num_edges)`` expressed in those row positions.

    Raises:
        KeyError: If a node of ``G`` is missing from ``node_features``.
    """
    nodes = list(G.nodes())

    # PyG addresses nodes by their row position in ``x``, so the graph's own
    # node ids must be translated to 0..N-1 before building ``edge_index``.
    index_of = {node: position for position, node in enumerate(nodes)}

    # Stack into one ndarray first; building a tensor from a list of arrays
    # is slow and triggers a PyTorch warning.
    x = torch.as_tensor(
        np.array([node_features[node] for node in nodes]),
        dtype=torch.float,
    )

    pairs = [(index_of[u], index_of[v]) for u, v in G.edges()]
    if not G.is_directed():
        # NetworkX lists each undirected edge once, but PyG treats every
        # column of edge_index as a directed edge, so add the reverse
        # direction (self-loops need no mirror).
        pairs += [(v, u) for u, v in pairs if u != v]

    # reshape(-1, 2) keeps the (2, E) layout even for an edgeless graph.
    edge_index = (
        torch.tensor(pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()
    )

    return Data(x=x, edge_index=edge_index)

# Build the PyG Data object (node feature matrix `x` and `edge_index`) for
# the combined graph. Note that no label tensor `y` is attached here.
pyg_data = create_pyg_data(combined_graph, node_features)

## 3. Model Definition

In [None]:
class GNNModel(nn.Module):
    """Three stacked GCN layers followed by a linear per-node output head.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output width of every GCN layer.
        out_channels: Number of output scores produced per node.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()

        # Message-passing stack; only the first layer sees the raw features.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)

        # Maps the final hidden representation to the output scores.
        self.lin = nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return raw (un-normalised) output scores for every node.

        Args:
            x: Node feature tensor passed to the first GCN layer.
            edge_index: Edge index tensor passed to every GCN layer.

        Returns:
            Tensor produced by the linear head, with ``out_channels``
            entries along its last dimension.
        """
        hidden = x

        # The first two layers share the same recipe: conv -> ReLU -> dropout.
        # Dropout (p=0.5) is only active while the module is in training mode.
        for conv in (self.conv1, self.conv2):
            hidden = F.relu(conv(hidden, edge_index))
            hidden = F.dropout(hidden, p=0.5, training=self.training)

        # The last GCN layer is activated but not followed by dropout.
        hidden = F.relu(self.conv3(hidden, edge_index))

        return self.lin(hidden)

# Build the network: 4 input features per node (matching the 4 structural
# features extracted above), 64 hidden units, 2 output classes.
model = GNNModel(in_channels=4, hidden_channels=64, out_channels=2)

# Adam over all model parameters, trained against a cross-entropy objective.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

## 4. Model Training

In [None]:
# Training parameters
n_epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
pyg_data = pyg_data.to(device)

# The loss below compares the per-node outputs against `pyg_data.y`.
# Fail early with an actionable message instead of an opaque error from
# inside the loss function when no labels have been attached.
if getattr(pyg_data, 'y', None) is None:
    raise ValueError(
        'pyg_data.y is not set: assign a LongTensor holding one class label '
        'per row of pyg_data.x before running the training loop.'
    )

# Training history (one loss value per epoch)
history = {'train_loss': []}

# Full-batch training: every epoch is a single optimisation step over the
# whole graph.
model.train()
for epoch in range(n_epochs):
    optimizer.zero_grad()

    # Forward pass
    out = model(pyg_data.x, pyg_data.edge_index)
    loss = criterion(out, pyg_data.y)

    # Backward pass
    loss.backward()
    optimizer.step()

    # Read the scalar once so it is not pulled from the tensor twice when
    # the epoch is also logged.
    epoch_loss = loss.item()
    history['train_loss'].append(epoch_loss)

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {epoch_loss:.4f}')

## 5. Attack Detection and Propagation Analysis

In [None]:
def analyze_attack_propagation(model, data, G):
    """Predict per-node classes and group the nodes predicted as class 1.

    Args:
        model: Trained model exposing ``conv1``, ``conv2`` and ``conv3``
            layers and callable as ``model(x, edge_index)``.
        data: Object with ``x`` and ``edge_index`` tensors whose rows are in
            ``G.nodes()`` order.
        G: NetworkX graph the tensors were built from.

    Returns:
        Tuple ``(node_embeddings, pred, attack_paths)``:
            node_embeddings: NumPy array with the activated output of the
                third conv layer for every node.
            pred: NumPy array with the arg-max class index of every node.
            attack_paths: list of node sets, one per connected group of
                nodes predicted as class 1.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    with torch.no_grad():
        # Reproduce the hidden representation that forward() feeds into the
        # output layer: each conv layer is followed by a ReLU (dropout is
        # inactive in eval mode). Chaining the conv layers without the
        # activations would yield values the model never computes.
        hidden = data.x
        for conv in (model.conv1, model.conv2, model.conv3):
            hidden = F.relu(conv(hidden, data.edge_index))
        node_embeddings = hidden.cpu().numpy()

        # Get predictions
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1).cpu().numpy()

    # Nodes predicted as class 1 are treated as attacked.
    attack_nodes = [
        node for position, node in enumerate(G.nodes()) if pred[position] == 1
    ]
    attack_subgraph = G.subgraph(attack_nodes)

    # nx.connected_components is not implemented for directed graphs; use
    # weak connectivity there so edge direction is ignored in the same way.
    if G.is_directed():
        components = nx.weakly_connected_components(attack_subgraph)
    else:
        components = nx.connected_components(attack_subgraph)
    attack_paths = list(components)

    return node_embeddings, pred, attack_paths

# Run the trained model over the combined graph. Returns the conv-layer
# embeddings, the predicted class index of every node, and the connected
# groups of nodes that were predicted as class 1.
node_embeddings, predictions, attack_paths = analyze_attack_propagation(
    model, pyg_data, combined_graph
)

## 6. Visualization

In [None]:
# `go` is used below but is not imported anywhere in the notebook's import
# cell, which makes this cell fail with a NameError; import it here.
import plotly.graph_objects as go

# Initialize visualizer
visualizer = Visualizer(save_dir='figures')

# Plot training history: one point per epoch of the recorded training loss
fig = go.Figure()
fig.add_trace(go.Scatter(y=history['train_loss'], name='Training Loss'))
fig.update_layout(title='Training History', xaxis_title='Epoch', yaxis_title='Loss')
fig.show()

# Plot attack propagation paths: one figure per connected group of nodes
# predicted as attacked. Each group is a set, so it is converted to a list
# before being handed to the project's Visualizer.
for i, path in enumerate(attack_paths):
    fig = visualizer.plot_attack_propagation(
        combined_graph,
        list(path),
        title=f'Attack Propagation Path {i+1}'
    )
    fig.show()

## 7. Save Model

In [None]:
# Manager that stores models under the 'models' directory
model_manager = ModelManager(base_dir='models')

# Hyper-parameters the network was built and trained with
hyperparameters = {
    'in_channels': 4,
    'hidden_channels': 64,
    'out_channels': 2,
    'n_epochs': n_epochs
}

# Size of the graph and number of detected attack groups
graph_info = {
    'n_nodes': combined_graph.number_of_nodes(),
    'n_edges': combined_graph.number_of_edges(),
    'n_attack_paths': len(attack_paths)
}

# Everything stored alongside the weights
metadata = {
    'model_type': 'gnn',
    'dataset_version': '20.07',
    'parameters': hyperparameters,
    'graph_info': graph_info,
    'training_history': history
}

# Persist the trained model together with its metadata
model_manager.save_torch_model(
    model=model, model_name='gnn', version='v1', metadata=metadata
)