# Create PyG Dataset

This notebook is meant to be run in Google Colab. It uses the output files produced by running `graph_generator` to construct a PyG dataset, where each `torch_geometric.data.Data` object is a graph snapshot of the system simulated by the generator.

# Import libraries

In [7]:
pip install torch-geometric



In [8]:
import torch
import networkx as nx
import numpy as np
import os
import os.path as osp
import pandas as pd
from torch_geometric.data import Data, Dataset, InMemoryDataset, download_url
import matplotlib.pyplot as plt

# Connect to Google drive and establish directories

In [9]:
# Mount Google Drive at /content/drive; the data paths defined in the next
# cell all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Single place to change when pointing the notebook at a different simulator run.
DATA_DIR = '/content/drive/MyDrive/CS224 Project/2024-11-04 data'

# Files produced by `graph_generator`:
edge_idx_path = f'{DATA_DIR}/edge'                # one "source,target" edge per line
node_features_path = f'{DATA_DIR}/out'            # one graph snapshot per line
fault_label_path = f'{DATA_DIR}/fault_label'      # single "node,timestamp" line

## PyG Dataset implementation

In [11]:
# Number of integer features that follow each node name in a snapshot line
# of the node-features file.
FEATURE_DIMENSION = 9
# Position of the timestamp inside a node's feature vector.
TIMESTAMP_FEATURE_INDEX = 3
# Per-node class labels stored in `Data.y`.
HEALTHY_NODE_LABEL = 0
ROOT_CAUSE_NODE_LABEL = 1

class GraphDataset(InMemoryDataset):
    """In-memory PyG dataset with one `Data` object per simulator snapshot.

    Every snapshot shares the same `edge_index` (read once from the edge
    file). Node features `x` are parsed from one line of the node-features
    file per snapshot, and `y` holds one label per node with shape
    `[num_nodes, 1]`: `ROOT_CAUSE_NODE_LABEL` for the faulty node once the
    snapshot's latest timestamp has reached the fault timestamp,
    `HEALTHY_NODE_LABEL` otherwise.
    """

    # Directory that receives one standalone `data_<idx>.pt` file per snapshot.
    SNAPSHOT_DIR = '/content/drive/MyDrive/CS224 Project/2024-11-04 data/folder'

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        """Build (or reuse) the processed dataset under `root` and load it."""
        super().__init__(root, transform, pre_transform, pre_filter)
        self.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Edge, node-feature and fault-label files, in that order.

        These are the absolute paths held in the module-level path variables.
        NOTE(review): this relies on PyG building `raw_paths` with
        `os.path.join(raw_dir, name)`, which returns an absolute `name`
        unchanged -- confirm against the installed PyG version.
        """
        return [edge_idx_path, node_features_path, fault_label_path]

    @property
    def processed_file_names(self):
        """Name of the single collated file written by `process`."""
        return ['data.pt']

    def download(self):
        """No-op: the raw files are read directly from the configured paths."""
        pass

    def get_edge_index(self, path):
        """Read "source,target" lines from `path` into a COO edge index.

        Returns:
            A contiguous `torch.long` tensor of shape `[2, num_edges]`
            (`[2, 0]` when the file contains no edges).
        """
        edges = []
        with open(path, "r") as edge_file:
            for line in edge_file:
                line = line.strip()
                if not line:
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(',')
                edges.append([int(fields[0]), int(fields[1])])
        # reshape(-1, 2) keeps the result 2-D even when `edges` is empty.
        return torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()

    def get_node_at_fault(self, path):
        """Read the single "node,timestamp" record stored in `path`.

        Returns:
            `(node_at_fault, timestamp_of_fault)` as ints.

        Raises:
            ValueError: if the file does not contain exactly one non-blank line.
        """
        # Read from `path` rather than the module-level `fault_label_path`,
        # so the argument supplied by the caller is actually honoured.
        with open(path, "r") as fault_file:
            records = [line.strip() for line in fault_file if line.strip()]
        if len(records) != 1:
            raise ValueError(
                'expected exactly one fault record in {}, found {}'.format(path, len(records)))
        node_at_fault, timestamp_of_fault = records[0].split(',')
        return int(node_at_fault), int(timestamp_of_fault)

    def get_node_features(self, node_features_path, fault_label_path):
        """Parse every snapshot line into a feature tensor and a label tensor.

        Each line is a flat comma-separated sequence of records, one per node:
        a node name followed by `FEATURE_DIMENSION` integer features.

        Returns:
            `(node_features, node_labels)`: two lists with one entry per
            snapshot. Features have shape `[num_nodes, FEATURE_DIMENSION]`;
            labels have shape `[num_nodes, 1]`.
        """
        node_features, node_labels = [], []
        node_at_fault, timestamp_of_fault = self.get_node_at_fault(fault_label_path)
        record_width = FEATURE_DIMENSION + 1  # node name + its features
        with open(node_features_path, "r") as feature_file:
            for line in feature_file:
                line = line.strip()
                if not line:
                    continue  # tolerate blank / trailing empty lines
                items = line.split(',')
                # Skip the node name at offset `start`; keep the features after it.
                features = [list(map(int, items[start + 1 : start + record_width]))
                            for start in range(0, len(items), record_width)]
                # The snapshot's time is the latest timestamp carried by any node.
                snapshot_time = max(node[TIMESTAMP_FEATURE_INDEX] for node in features)
                features = torch.tensor(features)
                node_features.append(features)

                labels = [HEALTHY_NODE_LABEL] * features.shape[0]
                if snapshot_time >= timestamp_of_fault:
                    labels[node_at_fault] = ROOT_CAUSE_NODE_LABEL
                node_labels.append(torch.tensor(labels).reshape(features.shape[0], 1))
        return node_features, node_labels

    def process(self):
        """Build one `Data` per snapshot, save each one, then save the collated set.

        Each unfiltered, untransformed snapshot is written to
        `SNAPSHOT_DIR/data_<idx>.pt`. `pre_filter` and `pre_transform`, when
        given, are applied only to the list collated into `processed_paths[0]`.
        """
        edge_index = self.get_edge_index(self.raw_paths[0])
        node_features_per_graph, node_labels_per_graph = self.get_node_features(
            self.raw_paths[1], self.raw_paths[2])

        # torch.save does not create missing parent directories.
        os.makedirs(self.SNAPSHOT_DIR, exist_ok=True)

        data_list = []
        for idx, (features, labels) in enumerate(
                zip(node_features_per_graph, node_labels_per_graph)):
            graph_data = Data(x=features, edge_index=edge_index, y=labels)
            data_list.append(graph_data)
            torch.save(graph_data, osp.join(self.SNAPSHOT_DIR, 'data_{}.pt'.format(idx)))

        # Honour the hooks accepted by __init__; both default to None.
        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        self.save(data_list, self.processed_paths[0])

In [13]:
# Instantiate the dataset rooted at ./graphs and inspect its first snapshot.
pyg_dataset = GraphDataset(root='./graphs')
graph_0 = pyg_dataset[0]
print(graph_0) # Data(x=[5, 9], edge_index=[2, 4], y=[5, 1])

# Uncomment to print the label tensor of every snapshot:
# for graph in pyg_dataset:
#   print(graph.y)

Data(x=[5, 9], edge_index=[2, 4], y=[5, 1])


# Load features and edges from output files
The following code is scratchwork that was used to inspect files and to test preprocessing logic that is used in the PyG dataset implementation above.

In [None]:
# Scratch check of the edge-file parsing: one "source,target" pair per line.
edge_pairs = []
with open(edge_idx_path, "r") as edge_file:
  for line in edge_file:
    line = line.strip()
    if not line:
      continue  # tolerate blank / trailing empty lines
    edge = line.split(',')
    edge_pairs.append([int(edge[0]), int(edge[1])])
# Put edges into COO format; reshape(-1, 2) keeps the shape [2, 0] for an empty file.
edges = torch.tensor(edge_pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()
print(edges)

tensor([[0, 0, 1, 2],
        [1, 2, 3, 4]])


In [None]:
# Scratch check of the fault-label parsing: the file holds one "node,timestamp" line.
node_at_fault, timestamp_of_fault = 0, 0
with open(fault_label_path, "r") as fault_file:
  line_count = 0
  for line_count, raw_line in enumerate(fault_file, start=1):
    node_at_fault, timestamp_of_fault = map(int, raw_line.strip().split(','))
  assert line_count == 1 # there should only be one line in this file
print(node_at_fault, timestamp_of_fault)

0 20


In [None]:
FEATURE_DIMENSION = 9

# Scratch check of the node-feature parsing. Each line of the file is one
# snapshot: a flat comma-separated run of records, where a record is a node
# name followed by FEATURE_DIMENSION integer features.
graphs, node_features, node_labels = [], [], []
with open(node_features_path, "r") as feature_file:
  for line in feature_file:
    items = line.strip().split(',')
    formatted_data = [(items[i],
                       list(map(int, items[i + 1 : i + FEATURE_DIMENSION + 1])))
                      for i in range(0, len(items), FEATURE_DIMENSION + 1)]
    # Reuse the already-parsed feature lists instead of parsing the line twice.
    features = [node_values for _, node_values in formatted_data]
    time = features[node_at_fault][TIMESTAMP_FEATURE_INDEX]
    snapshot_time = max(node[TIMESTAMP_FEATURE_INDEX] for node in features)
    # Show both candidate timestamps side by side for comparison.
    print(features, time, snapshot_time)
    features = torch.tensor(features)
    node_features.append(features)
    # Label exactly as GraphDataset.get_node_features does: every node is
    # healthy unless the snapshot's latest timestamp has reached the fault
    # timestamp, in which case the faulty node is marked as root cause.
    labels = [HEALTHY_NODE_LABEL] * features.shape[0]
    if snapshot_time >= timestamp_of_fault:
      labels[node_at_fault] = ROOT_CAUSE_NODE_LABEL
    labels = torch.tensor(labels).reshape(features.shape[0], 1)
    node_labels.append(labels)
    graphs.append(formatted_data)
print(graphs[0])
print(node_features[0])
print("Number of graphs: ", len(graphs))

# NOTE(review): duplicate of the top-of-notebook import; not used in this cell.
import numpy as np
# Map each node name to its row index, using the last snapshot parsed above.
NODES_IDX = {node_name: row for row, (node_name, _) in enumerate(formatted_data)}
print(NODES_IDX)
# Node names must be unique: a duplicate would collapse into one dict key.
assert len(NODES_IDX) == len(formatted_data)
assert node_features[0].shape == (len(NODES_IDX), FEATURE_DIMENSION)

[[0, 1, 10, 0, 2, 0, 1, 0, 1], [1, 3, 0, 0, 0, 0, 0, 0, 0], [1, 3, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0]] 0 0
[[0, 1, 10, 0, 2, 0, 1, 0, 1], [1, 3, 0, 0, 0, 0, 0, 0, 0], [1, 3, 0, 0, 3, 2, 0, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0]] 0 0
[[0, 1, 10, 0, 2, 0, 1, 0, 1], [1, 3, 0, 0, 0, 0, 0, 0, 0], [1, 3, 0, 0, 3, 2, 0, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 3, 0, 0, 1, 0]] 0 0
[[0, 1, 10, 0, 2, 0, 1, 0, 1], [1, 3, 0, 2, 3, 2, 0, 1, 1], [1, 3, 0, 0, 3, 2, 0, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 3, 0, 0, 1, 0]] 0 2
[[0, 1, 10, 0, 2, 0, 1, 0, 1], [1, 3, 0, 2, 3, 2, 0, 1, 1], [1, 3, 0, 0, 3, 2, 0, 1, 1], [1, 0, 0, 2, 3, 0, 0, 1, 0], [1, 0, 0, 0, 3, 0, 0, 1, 0]] 0 2
[[0, 1, 10, 10, 2, 0, 2, 0, 2], [1, 3, 0, 2, 3, 2, 0, 1, 1], [1, 3, 0, 0, 3, 2, 0, 1, 1], [1, 0, 0, 2, 3, 0, 0, 1, 0], [1, 0, 0, 0, 3, 0, 0, 1, 0]] 10 10
[[0, 1, 10, 10, 2, 0, 2, 0, 2], [1, 3, 0, 2, 3, 2, 0, 1, 1], [1, 3, 0, 10, 3, 2, 0, 2, 2], 