<a href="https://colab.research.google.com/github/tejareddytadi/Efficient-Graph-Partitioning-Approaches-for-Enhanced-Accident-Prediction/blob/main/Research_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


TAP: A Comprehensive Data Repository for Traffic Accident Prediction in Transportation Networks. Baixiang Huang, Bryan Hooi, Kai Shu. [[link]](https://arxiv.org/pdf/2304.08640)

In [None]:
from IPython.display import clear_output
# Install the pinned dependencies into the runtime.
!pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu116
# NOTE(review): this requests a different torch version than the line above, so it
# presumably replaces the 1.12.1+cu116 build -- confirm which pin is intended.
!pip install torch==1.13.0
!pip install torch_geometric
!pip install xgboost==0.90
# Remove the installer output from the cell.
clear_output()

In [None]:
import os
import torch
import torch_geometric
import xgboost as xgb
# Publish the installed PyTorch version through the TORCH environment variable.
os.environ['TORCH'] = torch.__version__

# Report the version of each main library, one line per library.
for label, version in (
    ('XGBoost version:', xgb.__version__),
    ('CUDA version:', torch.version.cuda),
    ('PyTorch version:', torch.__version__),
    ('PyG version:', torch_geometric.__version__),
):
    print(label, version)

XGBoost version: 0.90
CUDA version: 11.7
PyTorch version: 1.13.0+cu117
PyG version: 2.4.0


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data file loaded later is read from below this path.
drive.mount('/content/drive')

In [None]:
import os
import math
import time
import shutil
import pickle
import numpy as np
import pandas as pd
import os.path as osp
import networkx as nx
import xgboost as xgb
import matplotlib.pyplot as plt

import torch
import torch_geometric
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch_geometric.nn as pyg_nn

from torch import Tensor
from torch.nn import Parameter
from torch_geometric.io import read_npz
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.nn.inits import reset, uniform, zeros
from torch_geometric.typing import OptTensor, OptPairTensor, Adj, Size
from torch_geometric.data import Data, DataLoader, InMemoryDataset, download_url

from pylab import cm
from matplotlib import colors
from IPython.display import clear_output
from xgboost.sklearn import XGBClassifier
from typing import Union, Tuple, Callable, Optional
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, roc_auc_score

# Use the "ggplot" matplotlib style for the figures drawn below.
plt.style.use("ggplot")

# Seed NumPy's global random number generator; the PyTorch seed stays commented out.
np.random.seed(7)
# torch.manual_seed(7)

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not raise on CPU-only runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Show the selected device together with the active CUDA device index (None on CPU).
device, (torch.cuda.current_device() if device.type == 'cuda' else None)

In [None]:
def read_npz(path):
    """Open the ``.npz`` archive at ``path`` and return ``parse_npz`` applied to it.

    The archive is loaded with ``allow_pickle=True`` and is closed again once
    parsing has finished.
    """
    archive = np.load(path, allow_pickle=True)
    with archive:
        parsed = parse_npz(archive)
    return parsed


def parse_npz(f):
    """Build a ``Data`` object from the arrays stored in the mapping ``f``.

    Every array except ``crash_time`` is wrapped as a tensor: feature-like
    arrays become ``torch.float`` and label/index arrays become ``torch.long``.
    ``edge_index`` is additionally transposed and made contiguous. The number
    of edges (size of dimension 1 of the transposed ``edge_index``) is printed.

    Args:
        f: Mapping from array name to NumPy array (e.g. an opened ``.npz`` file).

    Returns:
        Data: ``occur_labels`` is stored as ``y``; the remaining fields keep
        descriptive names (``severity_labels``, ``edge_attr_dir``, ...).
    """
    def _as_tensor(key, dtype):
        # Wrap the stored NumPy array and cast it to the requested dtype.
        return torch.from_numpy(f[key]).to(dtype)

    # crash_time is passed through exactly as it was loaded.
    crash_time = f['crash_time']
    x = _as_tensor('x', torch.float)
    coords = _as_tensor('coordinates', torch.float)
    edge_attr = _as_tensor('edge_attr', torch.float)
    cnt_labels = _as_tensor('cnt_labels', torch.long)
    occur_labels = _as_tensor('occur_labels', torch.long)
    edge_attr_dir = _as_tensor('edge_attr_dir', torch.float)
    edge_attr_ang = _as_tensor('edge_attr_ang', torch.float)
    severity_labels = _as_tensor('severity_8labels', torch.long)
    edge_index = _as_tensor('edge_index', torch.long).t().contiguous()

    num_edges = edge_index.size(1)
    print(f"Number of edges read: {num_edges}")

    fields = dict(
        x=x,
        y=occur_labels,
        severity_labels=severity_labels,
        edge_index=edge_index,
        edge_attr=edge_attr,
        edge_attr_dir=edge_attr_dir,
        edge_attr_ang=edge_attr_ang,
        coords=coords,
        cnt_labels=cnt_labels,
        crash_time=crash_time,
    )
    return Data(**fields)




In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Load the graph from the .npz file stored on the mounted Drive.
# NOTE(review): the path is hard-coded; the file must exist at this location.
data = read_npz('/content/drive/MyDrive/miami_fl.npz')

def create_networkx_graph(data):
    """Build an undirected ``nx.MultiGraph`` from a graph data object.

    Args:
        data: Object exposing ``y`` (tensor with one label per node) and
            ``edge_index`` (tensor whose first two rows hold the source and
            the target node index of every edge).

    Returns:
        nx.MultiGraph: One node per entry of ``data.y`` carrying a ``label``
        attribute, and one edge per column of ``data.edge_index``. Node ids
        are plain Python ``int`` objects, both as nodes and as edge endpoints.

    Raises:
        IndexError: If an edge refers to a node index outside
            ``[0, len(data.y))``.
    """
    g = nx.MultiGraph()

    # Convert the labels once instead of calling .item() on the tensor for
    # every node and again for both endpoints of every edge.
    labels = data.y.tolist()
    for node_id, label in enumerate(labels):
        g.add_node(node_id, label=label)

    # Only the first two rows (source, target) are used. tolist() yields plain
    # ints, so no NumPy scalar types end up as edge endpoints in the graph.
    sources, targets = data.edge_index[:2].cpu().tolist()
    num_nodes = len(labels)
    for source, target in zip(sources, targets):
        if not (0 <= source < num_nodes and 0 <= target < num_nodes):
            raise IndexError(
                f"edge ({source}, {target}) refers to a node outside [0, {num_nodes})"
            )
        # Both endpoints already exist with their label, so only the edge is added.
        g.add_edge(source, target)

    return g

# Create MultiGraph from parsed data
g = create_networkx_graph(data)

# Report the size of the graph next to the number of edges that were read.
node_total = g.number_of_nodes()
edges_read = len(data.edge_index.T)
edge_total = g.number_of_edges()
print(f"Number of nodes in the graph: {node_total}")
print(f"Number of edges read: {edges_read}")
print(f"Number of edges in the graph: {edge_total}")

# Place every node at its stored coordinate pair; labels are collected per
# node but not rendered because with_labels is False.
pos = {i: (xy[0].item(), xy[1].item()) for i, xy in enumerate(data.coords)}
labels = {i: label.item() for i, label in enumerate(data.y)}

draw_options = dict(
    with_labels=False,
    labels=labels,
    node_size=0.5,
    node_color='lightblue',
    width=0.5,
    alpha=0.7,
    edge_color='black',
    font_size=10,
    font_color='black',
)
nx.draw(g, pos, **draw_options)
plt.title('Miami')
plt.show()


In [None]:
import os
# Point METIS_DLL at the shared library installed by the apt package below.
# NOTE(review): presumably read by the `metis` Python package when it is imported -- confirm.
os.environ['METIS_DLL'] = '/usr/lib/x86_64-linux-gnu/libmetis.so'
# Install the METIS system library, then the Python wrapper.
!apt-get install -y libmetis-dev
!pip install metis
import metis

normal metis

In [None]:
import time
# Seed NumPy's global random number generator.
np.random.seed(42)
num_partitions = 10 # Change this to the desired number of partitions

# Time only the METIS call.
start_time = time.time()
edgecuts, parts = metis.part_graph(g, num_partitions)
partitioning_time = time.time() - start_time

# Node ids of every partition, collected once and reused below; a node is
# identified by its position in `parts`.
partition_members = [
    [node for node, part in enumerate(parts) if part == partition_id]
    for partition_id in range(num_partitions)
]

# Print the number of nodes in each partition
for partition_id, partition_nodes in enumerate(partition_members):
    print(f"Partition {partition_id}: {len(partition_nodes)} nodes")
total_partitioned_nodes = sum(map(len, partition_members))
print(f"Total partitioned nodes: {total_partitioned_nodes} out of {len(g.nodes)} total nodes")

# One subgraph of `g` per partition
subgraphs = [g.subgraph(partition_nodes) for partition_nodes in partition_members]

# Draw every partition in its own figure with a spring layout
for i, subgraph in enumerate(subgraphs):
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(subgraph)  # You can adjust the layout algorithm as needed
    nx.draw(subgraph, pos, node_size=0.5)
    plt.title('k=10')
    plt.show()

# Edge and node counts per partition
num_nodes = [len(part_graph.nodes) for part_graph in subgraphs]
edges = np.array([len(part_graph.edges) for part_graph in subgraphs])
nodes = np.array(num_nodes)
g_nodes = len(g.nodes)
g_edges = len(g.edges)

# Fraction of the original nodes / edges missing from the partition subgraphs
sum_nodes = nodes.sum()
sum_edges = edges.sum()
print(f"Sum of nodes : {sum_nodes}")
print(f"Node loss : {(g_nodes - sum_nodes) / g_nodes}")
print(f"Edge loss : {(g_edges - sum_edges) / g_edges}")

# Balance = largest partition divided by the average partition
max_edges = edges.max()
avg_edges = edges.mean()
balance = max_edges / avg_edges
print(f"Edge Balance : {balance}")
max_n = nodes.max()
avg_n = nodes.mean()
Bal_n = max_n / avg_n
print(f"Node Balance :{Bal_n}")
print(f"average nodes in partition :{avg_n}")
print(f"average edges in partition :{avg_edges}")
print(f"Partitioning time: {partitioning_time} seconds")
print(f"Cut Size: {edgecuts}")

In [None]:
import time
# Seed NumPy's global random number generator.
np.random.seed(42)
num_partitions = 30 # Change this to the desired number of partitions

# Time only the METIS call.
start_time = time.time()
edgecuts, parts = metis.part_graph(g, num_partitions)
partitioning_time = time.time() - start_time

# Node ids of every partition, collected once and reused below; a node is
# identified by its position in `parts`.
partition_members = [
    [node for node, part in enumerate(parts) if part == partition_id]
    for partition_id in range(num_partitions)
]

# Print the number of nodes in each partition
for partition_id, partition_nodes in enumerate(partition_members):
    print(f"Partition {partition_id}: {len(partition_nodes)} nodes")
total_partitioned_nodes = sum(map(len, partition_members))
print(f"Total partitioned nodes: {total_partitioned_nodes} out of {len(g.nodes)} total nodes")

# One subgraph of `g` per partition
subgraphs = [g.subgraph(partition_nodes) for partition_nodes in partition_members]

# Draw every partition in its own figure with a spring layout
for i, subgraph in enumerate(subgraphs):
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(subgraph)  # You can adjust the layout algorithm as needed
    nx.draw(subgraph, pos, node_size=0.5)
    plt.title(f'Partition {i} Subgraph')
    plt.show()

# Edge and node counts per partition
num_nodes = [len(part_graph.nodes) for part_graph in subgraphs]
edges = np.array([len(part_graph.edges) for part_graph in subgraphs])
nodes = np.array(num_nodes)
g_nodes = len(g.nodes)
g_edges = len(g.edges)

# Fraction of the original nodes / edges missing from the partition subgraphs
sum_nodes = nodes.sum()
sum_edges = edges.sum()
print(f"Sum of nodes : {sum_nodes}")
print(f"Node loss : {(g_nodes - sum_nodes) / g_nodes}")
print(f"Edge loss : {(g_edges - sum_edges) / g_edges}")

# Balance = largest partition divided by the average partition
max_edges = edges.max()
avg_edges = edges.mean()
balance = max_edges / avg_edges
print(f"Edge Balance : {balance}")
max_n = nodes.max()
avg_n = nodes.mean()
Bal_n = max_n / avg_n
print(f"Node Balance :{Bal_n}")
print(f"average nodes in partition :{avg_n}")
print(f"average edges in partition :{avg_edges}")
print(f"Partitioning time: {partitioning_time} seconds")
print(f"Cut Size: {edgecuts}")

In [None]:
import time
# Seed NumPy's global random number generator.
np.random.seed(42)
num_partitions = 50 # Change this to the desired number of partitions

# Time only the METIS call.
start_time = time.time()
edgecuts, parts = metis.part_graph(g, num_partitions)
partitioning_time = time.time() - start_time

# Node ids of every partition, collected once and reused below; a node is
# identified by its position in `parts`.
partition_members = [
    [node for node, part in enumerate(parts) if part == partition_id]
    for partition_id in range(num_partitions)
]

# Print the number of nodes in each partition
for partition_id, partition_nodes in enumerate(partition_members):
    print(f"Partition {partition_id}: {len(partition_nodes)} nodes")
total_partitioned_nodes = sum(map(len, partition_members))
print(f"Total partitioned nodes: {total_partitioned_nodes} out of {len(g.nodes)} total nodes")

# One subgraph of `g` per partition
subgraphs = [g.subgraph(partition_nodes) for partition_nodes in partition_members]

# Draw every partition in its own figure with a spring layout
for i, subgraph in enumerate(subgraphs):
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(subgraph)  # You can adjust the layout algorithm as needed
    nx.draw(subgraph, pos, node_size=0.5)
    plt.title(f'Partition {i} Subgraph')
    plt.show()

# Edge and node counts per partition
num_nodes = [len(part_graph.nodes) for part_graph in subgraphs]
edges = np.array([len(part_graph.edges) for part_graph in subgraphs])
nodes = np.array(num_nodes)
g_nodes = len(g.nodes)
g_edges = len(g.edges)

# Fraction of the original nodes / edges missing from the partition subgraphs
sum_nodes = nodes.sum()
sum_edges = edges.sum()
print(f"Sum of nodes : {sum_nodes}")
print(f"Node loss : {(g_nodes - sum_nodes) / g_nodes}")
print(f"Edge loss : {(g_edges - sum_edges) / g_edges}")

# Balance = largest partition divided by the average partition
max_edges = edges.max()
avg_edges = edges.mean()
balance = max_edges / avg_edges
print(f"Edge Balance : {balance}")
max_n = nodes.max()
avg_n = nodes.mean()
Bal_n = max_n / avg_n
print(f"Node Balance :{Bal_n}")
print(f"average nodes in partition :{avg_n}")
print(f"average edges in partition :{avg_edges}")
print(f"Partitioning time: {partitioning_time} seconds")
print(f"Cut Size: {edgecuts}")

In [None]:
import time
# Seed NumPy's global random number generator.
np.random.seed(42)
num_partitions = 100 # Change this to the desired number of partitions

# Time only the METIS call.
start_time = time.time()
edgecuts, parts = metis.part_graph(g, num_partitions)
partitioning_time = time.time() - start_time

# Node ids of every partition, collected once and reused below; a node is
# identified by its position in `parts`.
partition_members = [
    [node for node, part in enumerate(parts) if part == partition_id]
    for partition_id in range(num_partitions)
]

# Print the number of nodes in each partition
for partition_id, partition_nodes in enumerate(partition_members):
    print(f"Partition {partition_id}: {len(partition_nodes)} nodes")
total_partitioned_nodes = sum(map(len, partition_members))
print(f"Total partitioned nodes: {total_partitioned_nodes} out of {len(g.nodes)} total nodes")

# One subgraph of `g` per partition
subgraphs = [g.subgraph(partition_nodes) for partition_nodes in partition_members]

# Draw every partition in its own figure with a spring layout
for i, subgraph in enumerate(subgraphs):
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(subgraph)  # You can adjust the layout algorithm as needed
    nx.draw(subgraph, pos, node_size=0.5)
    plt.title(f'Partition {i} Subgraph')
    plt.show()

# Edge and node counts per partition
num_nodes = [len(part_graph.nodes) for part_graph in subgraphs]
edges = np.array([len(part_graph.edges) for part_graph in subgraphs])
nodes = np.array(num_nodes)
g_nodes = len(g.nodes)
g_edges = len(g.edges)

# Fraction of the original nodes / edges missing from the partition subgraphs
sum_nodes = nodes.sum()
sum_edges = edges.sum()
print(f"Sum of nodes : {sum_nodes}")
print(f"Node loss : {(g_nodes - sum_nodes) / g_nodes}")
print(f"Edge loss : {(g_edges - sum_edges) / g_edges}")

# Balance = largest partition divided by the average partition
max_edges = edges.max()
avg_edges = edges.mean()
balance = max_edges / avg_edges
print(f"Edge Balance : {balance}")
max_n = nodes.max()
avg_n = nodes.mean()
Bal_n = max_n / avg_n
print(f"Node Balance :{Bal_n}")
print(f"average nodes in partition :{avg_n}")
print(f"average edges in partition :{avg_edges}")
print(f"Partitioning time: {partitioning_time} seconds")
print(f"Cut Size: {edgecuts}")

for visualization purposes

In [None]:
import time
# Seed NumPy's global random number generator.
np.random.seed(42)
num_partitions = 70 # Change this to the desired number of partitions

# Time only the METIS call.
start_time = time.time()
edgecuts, parts = metis.part_graph(g, num_partitions)
partitioning_time = time.time() - start_time

# Node ids of every partition, collected once and reused below; a node is
# identified by its position in `parts`.
partition_members = [
    [node for node, part in enumerate(parts) if part == partition_id]
    for partition_id in range(num_partitions)
]

# Print the number of nodes in each partition
for partition_id, partition_nodes in enumerate(partition_members):
    print(f"Partition {partition_id}: {len(partition_nodes)} nodes")
total_partitioned_nodes = sum(map(len, partition_members))
print(f"Total partitioned nodes: {total_partitioned_nodes} out of {len(g.nodes)} total nodes")

# One subgraph of `g` per partition (this cell draws nothing)
subgraphs = [g.subgraph(partition_nodes) for partition_nodes in partition_members]

# Edge and node counts per partition
edges, num_nodes = [], []
for subgraph in subgraphs:
    num_nodes.append(len(subgraph.nodes))
    edges.append(len(subgraph.edges))

edges = np.array(edges)
nodes = np.array(num_nodes)
g_nodes = len(g.nodes)
g_edges = len(g.edges)

# Fraction of the original nodes / edges missing from the partition subgraphs
sum_nodes = nodes.sum()
sum_edges = edges.sum()
print(f"Sum of nodes : {sum_nodes}")
print(f"Node loss : {(g_nodes - sum_nodes) / g_nodes}")
print(f"Edge loss : {(g_edges - sum_edges) / g_edges}")

# Balance = largest partition divided by the average partition
max_edges = edges.max()
avg_edges = edges.mean()
balance = max_edges / avg_edges
print(f"Edge Balance : {balance}")
max_n = nodes.max()
avg_n = nodes.mean()
Bal_n = max_n / avg_n
print(f"Node Balance :{Bal_n}")
print(f"average nodes in partition :{avg_n}")
print(f"average edges in partition :{avg_edges}")
print(f"Partitioning time: {partitioning_time} seconds")
print(f"Cut Size: {edgecuts}")

dgl edge balanced metis parmetis

In [None]:
# Install DGL into the runtime (no version pin) and import it.
!pip install dgl
#!pip install torch==1.13.0
import dgl

for visualization purposes

In [None]:
# Seed DGL's random number generator before partitioning.
dgl.random.seed(42)
D = dgl.from_networkx(g)
D_sub = dgl.metis_partition(D, reshuffle=True, balance_edges=True, k=70)  # edge balanced
sub_arr = []
num_nodes = []
num_edges = []
array_of_elements = []
# D_sub is indexed by partition number; every partition is converted back to an
# undirected NetworkX graph so its node and edge counts can be compared with g.
for i in range(len(D_sub)):
    subgraph = D_sub[i]
    subgraph_nx = dgl.to_networkx(subgraph)
    subgraph_nx = nx.to_undirected(subgraph_nx)
    num_nodes.append(len(subgraph_nx.nodes))
    num_edges.append(len(subgraph_nx.edges))
    sub_arr.append(subgraph_nx)
edges = np.array(num_edges)
nodes = np.array(num_nodes)

# Fraction of the original nodes / edges missing from the partition subgraphs
sum_nodes = np.sum(nodes)
print(f"sum of nodes : {sum_nodes}")
sum_edges = np.sum(edges)
print(f"sum of edges : {sum_edges}")
G_nodes = len(g.nodes)
G_edges = len(g.edges)
print(f"node loss : {(G_nodes-sum_nodes)/G_nodes}")
print(f"edge loss : {(G_edges-sum_edges)/G_edges}")

# Balance = largest partition divided by the average partition. The maximum is
# stored as max_edges, not `max`, so the builtin max() stays usable in later cells.
max_edges = np.max(edges)
avg = np.average(edges)
Bal = max_edges / avg
print(f"edge Balance : {Bal}")
max_n = np.max(nodes)
avg_n = np.average(nodes)
Bal_n = max_n/avg_n
print(f"node Balance : {Bal_n}")
print(f"average nodes in partitions : {avg_n}")
print(f"average edges in partition :{avg}")


In [None]:
# Seed DGL's random number generator before partitioning.
dgl.random.seed(42)
D = dgl.from_networkx(g)
D_sub = dgl.metis_partition(D, reshuffle=True, balance_edges=True, k=10)  # edge balanced
sub_arr = []
num_nodes = []
num_edges = []
array_of_elements = []
# D_sub is indexed by partition number; every partition is converted back to an
# undirected NetworkX graph so its node and edge counts can be compared with g.
for i in range(len(D_sub)):
    subgraph = D_sub[i]
    subgraph_nx = dgl.to_networkx(subgraph)
    subgraph_nx = nx.to_undirected(subgraph_nx)
    num_nodes.append(len(subgraph_nx.nodes))
    num_edges.append(len(subgraph_nx.edges))
    sub_arr.append(subgraph_nx)
edges = np.array(num_edges)
nodes = np.array(num_nodes)

# Fraction of the original nodes / edges missing from the partition subgraphs
sum_nodes = np.sum(nodes)
print(f"sum of nodes : {sum_nodes}")
sum_edges = np.sum(edges)
print(f"sum of edges : {sum_edges}")
G_nodes = len(g.nodes)
G_edges = len(g.edges)
print(f"node loss : {(G_nodes-sum_nodes)/G_nodes}")
print(f"edge loss : {(G_edges-sum_edges)/G_edges}")

# Balance = largest partition divided by the average partition. The maximum is
# stored as max_edges, not `max`, so the builtin max() stays usable in later cells.
max_edges = np.max(edges)
avg = np.average(edges)
Bal = max_edges / avg
print(f"edge Balance : {Bal}")
max_n = np.max(nodes)
avg_n = np.average(nodes)
Bal_n = max_n/avg_n
print(f"node Balance : {Bal_n}")
print(f"average nodes in partitions : {avg_n}")
print(f"average edges in partition :{avg}")

# Draw every partition once; an extra bare plt.figure() call per partition
# would only leave an empty figure behind.
for i, sub_eb in enumerate(sub_arr):
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(sub_eb)
    plt.title(f"k=10")
    nx.draw(sub_eb, pos=pos, node_size=0.5, with_labels=False, node_color='red', edge_color='black')
    plt.show()

In [None]:
# Seed DGL's random number generator before partitioning.
dgl.random.seed(42)
D = dgl.from_networkx(g)
D_sub = dgl.metis_partition(D, reshuffle=True, balance_edges=True, k=30)  # edge balanced
sub_arr = []
num_nodes = []
num_edges = []
array_of_elements = []
# D_sub is indexed by partition number; every partition is converted back to an
# undirected NetworkX graph so its node and edge counts can be compared with g.
for i in range(len(D_sub)):
    subgraph = D_sub[i]
    subgraph_nx = dgl.to_networkx(subgraph)
    subgraph_nx = nx.to_undirected(subgraph_nx)
    num_nodes.append(len(subgraph_nx.nodes))
    num_edges.append(len(subgraph_nx.edges))
    sub_arr.append(subgraph_nx)
edges = np.array(num_edges)
nodes = np.array(num_nodes)

# Fraction of the original nodes / edges missing from the partition subgraphs
sum_nodes = np.sum(nodes)
print(f"sum of nodes : {sum_nodes}")
sum_edges = np.sum(edges)
print(f"sum of edges : {sum_edges}")
G_nodes = len(g.nodes)
G_edges = len(g.edges)
print(f"node loss : {(G_nodes-sum_nodes)/G_nodes}")
print(f"edge loss : {(G_edges-sum_edges)/G_edges}")

# Balance = largest partition divided by the average partition. The maximum is
# stored as max_edges, not `max`, so the builtin max() stays usable in later cells.
max_edges = np.max(edges)
avg = np.average(edges)
Bal = max_edges / avg
print(f"edge Balance : {Bal}")
max_n = np.max(nodes)
avg_n = np.average(nodes)
Bal_n = max_n/avg_n
print(f"node Balance : {Bal_n}")
print(f"average nodes in partitions : {avg_n}")
print(f"average edges in partition :{avg}")

# Draw every partition once; an extra bare plt.figure() call per partition
# would only leave an empty figure behind.
for i, sub_eb in enumerate(sub_arr):
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(sub_eb)
    plt.title(f"subgraph {i}")
    nx.draw(sub_eb, pos=pos, node_size=0.5, with_labels=False, node_color='red', edge_color='black')
    plt.show()

In [None]:
# Seed DGL's random number generator before partitioning.
dgl.random.seed(42)
D = dgl.from_networkx(g)
D_sub = dgl.metis_partition(D, reshuffle=True, balance_edges=True, k=50)  # edge balanced
sub_arr = []
num_nodes = []
num_edges = []
array_of_elements = []
# D_sub is indexed by partition number; every partition is converted back to an
# undirected NetworkX graph so its node and edge counts can be compared with g.
for i in range(len(D_sub)):
    subgraph = D_sub[i]
    subgraph_nx = dgl.to_networkx(subgraph)
    subgraph_nx = nx.to_undirected(subgraph_nx)
    num_nodes.append(len(subgraph_nx.nodes))
    num_edges.append(len(subgraph_nx.edges))
    sub_arr.append(subgraph_nx)
edges = np.array(num_edges)
nodes = np.array(num_nodes)

# Fraction of the original nodes / edges missing from the partition subgraphs
sum_nodes = np.sum(nodes)
print(f"sum of nodes : {sum_nodes}")
sum_edges = np.sum(edges)
print(f"sum of edges : {sum_edges}")
G_nodes = len(g.nodes)
G_edges = len(g.edges)
print(f"node loss : {(G_nodes-sum_nodes)/G_nodes}")
print(f"edge loss : {(G_edges-sum_edges)/G_edges}")

# Balance = largest partition divided by the average partition. The maximum is
# stored as max_edges, not `max`, so the builtin max() stays usable in later cells.
max_edges = np.max(edges)
avg = np.average(edges)
Bal = max_edges / avg
print(f"edge Balance : {Bal}")
max_n = np.max(nodes)
avg_n = np.average(nodes)
Bal_n = max_n/avg_n
print(f"node Balance : {Bal_n}")
print(f"average nodes in partitions : {avg_n}")
print(f"average edges in partition :{avg}")

# Draw every partition once; an extra bare plt.figure() call per partition
# would only leave an empty figure behind.
for i, sub_eb in enumerate(sub_arr):
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(sub_eb)
    plt.title(f"subgraph {i}")
    nx.draw(sub_eb, pos=pos, node_size=0.5, with_labels=False, node_color='red', edge_color='black')
    plt.show()

In [None]:
# Seed DGL's random number generator before partitioning.
dgl.random.seed(42)
D = dgl.from_networkx(g)
D_sub = dgl.metis_partition(D, reshuffle=True, balance_edges=True, k=100)  # edge balanced
sub_arr = []
num_nodes = []
num_edges = []
array_of_elements = []
# D_sub is indexed by partition number; every partition is converted back to an
# undirected NetworkX graph so its node and edge counts can be compared with g.
for i in range(len(D_sub)):
    subgraph = D_sub[i]
    subgraph_nx = dgl.to_networkx(subgraph)
    subgraph_nx = nx.to_undirected(subgraph_nx)
    num_nodes.append(len(subgraph_nx.nodes))
    num_edges.append(len(subgraph_nx.edges))
    sub_arr.append(subgraph_nx)
edges = np.array(num_edges)
nodes = np.array(num_nodes)

# Fraction of the original nodes / edges missing from the partition subgraphs
sum_nodes = np.sum(nodes)
print(f"sum of nodes : {sum_nodes}")
sum_edges = np.sum(edges)
print(f"sum of edges : {sum_edges}")
G_nodes = len(g.nodes)
G_edges = len(g.edges)
print(f"node loss : {(G_nodes-sum_nodes)/G_nodes}")
print(f"edge loss : {(G_edges-sum_edges)/G_edges}")

# Balance = largest partition divided by the average partition. The maximum is
# stored as max_edges, not `max`, so the builtin max() stays usable in later cells.
max_edges = np.max(edges)
avg = np.average(edges)
Bal = max_edges / avg
print(f"edge Balance : {Bal}")
max_n = np.max(nodes)
avg_n = np.average(nodes)
Bal_n = max_n/avg_n
print(f"node Balance : {Bal_n}")
print(f"average nodes in partitions : {avg_n}")
print(f"average edges in partition :{avg}")

# Draw every partition once; an extra bare plt.figure() call per partition
# would only leave an empty figure behind.
for i, sub_eb in enumerate(sub_arr):
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(sub_eb)
    plt.title(f"subgraph {i}")
    nx.draw(sub_eb, pos=pos, node_size=0.5, with_labels=False, node_color='red', edge_color='black')
    plt.show()

random partitioning using dgl

In [None]:
import os
import dgl.distributed
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# Assuming 'g' is your original NetworkX graph
# DGL version of the NetworkX graph `g`
D = dgl.from_networkx(g)

# Directory that receives the partition files
out_path = '/content/partitioned_graphs/'

# Random partitioning into 15 parts, written below out_path under the name 'PARTITION'
D_sub = dgl.distributed.partition_graph(
    D,
    'PARTITION',
    num_parts=15,
    out_path=out_path,
    part_method='random',
    balance_edges=True,
)


In [None]:
import os
import dgl.distributed
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# Assuming 'g' is your original NetworkX graph
D = dgl.from_networkx(g)

# Output path for saving partitioned subgraphs
out_path = '/content/partitioned_graphs/'
graph_name = 'PARTITION'
num_parts = 15

# partition_graph() writes the partitions below out_path. Without
# return_mapping=True it returns None, so D_sub cannot be iterated or indexed;
# the partitions are loaded back from disk below instead.
D_sub = dgl.distributed.partition_graph(D, graph_name, num_parts=num_parts, part_method='random',
                                        balance_edges=True, out_path=out_path)

# Partition config file written by partition_graph(): <out_path>/<graph_name>.json
part_config = os.path.join(out_path, f'{graph_name}.json')

# Initialize lists for metrics
edges = []
num_nodes = []

# Loop through the partitions
for i in range(num_parts):
    # The first element of the tuple returned by load_partition() is the partition graph.
    part_graph = dgl.distributed.load_partition(part_config, i)[0]
    # Keep only the nodes owned by this partition; copies of neighbouring
    # nodes that belong to other partitions are dropped.
    owned_nodes = part_graph.ndata['inner_node'].bool()
    subgraph = dgl.node_subgraph(part_graph, owned_nodes)
    # Undirected view, so that edge counts are comparable with the undirected graph g.
    sub_nx = nx.to_undirected(dgl.to_networkx(subgraph))

    # Visualize the subgraph
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(sub_nx)
    nx.draw(sub_nx, pos, node_size=0.5)
    plt.title(f'Partition {i} Subgraph')
    plt.show()

    # Append the number of edges and nodes to the lists
    edges.append(len(sub_nx.edges()))
    num_nodes.append(len(sub_nx.nodes()))

# Convert lists to arrays
edges = np.array(edges)
nodes = np.array(num_nodes)

# Calculate sums and print results
g_nodes = len(g.nodes())
g_edges = len(g.edges())

sum_nodes = np.sum(nodes)
print(f"Sum of nodes: {sum_nodes}")
print(f"Node loss: {(g_nodes - sum_nodes) / g_nodes}")

sum_edges = np.sum(edges)
print(f"Sum of edges: {sum_edges}")
print(f"Edge loss: {(g_edges - sum_edges) / g_edges}")

# Calculate max, avg, and balance
max_edge = np.max(edges)
avg_edge = np.average(edges)
balance = max_edge / avg_edge
print(f"Balance: {balance}")

# Edge cut: edges of g that are in no partition subgraph, i.e. edges whose
# endpoints were assigned to different partitions.
edgecuts = g_edges - sum_edges
print(f"Edge cuts: {edgecuts}")



In [None]:
import os
import dgl.distributed
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# Assuming 'g' is your original NetworkX graph
D = dgl.from_networkx(g)

# Output path for saving partitioned subgraphs
out_path = '/content/partitioned_graphs/'
graph_name = 'PARTITION'
num_parts = 15

# partition_graph() writes the partitions below out_path. Without
# return_mapping=True it returns None, so a check such as `D_sub is None` is
# always true; the partitions are loaded back from disk below instead.
D_sub = dgl.distributed.partition_graph(D, graph_name, num_parts=num_parts, part_method='random',
                                        balance_edges=True, out_path=out_path)

# Partition config file written by partition_graph(): <out_path>/<graph_name>.json
part_config = os.path.join(out_path, f'{graph_name}.json')

# Initialize lists for metrics
edges = []
num_nodes = []

# Loop through the partitions
for i in range(num_parts):
    # The first element of the tuple returned by load_partition() is the partition graph.
    part_graph = dgl.distributed.load_partition(part_config, i)[0]
    # Keep only the nodes owned by this partition; copies of neighbouring
    # nodes that belong to other partitions are dropped.
    owned_nodes = part_graph.ndata['inner_node'].bool()
    subgraph = dgl.node_subgraph(part_graph, owned_nodes)
    # Undirected view, so that edge counts are comparable with the undirected graph g.
    sub_nx = nx.to_undirected(dgl.to_networkx(subgraph))

    # Visualize the subgraph
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(sub_nx)
    nx.draw(sub_nx, pos, node_size=0.5)
    plt.title(f'Partition {i} Subgraph')
    plt.show()

    # Append the number of edges and nodes to the lists
    edges.append(len(sub_nx.edges()))
    num_nodes.append(len(sub_nx.nodes()))

# Convert lists to arrays
edges = np.array(edges)
nodes = np.array(num_nodes)

# Calculate sums and print results
g_nodes = len(g.nodes())
g_edges = len(g.edges())

sum_nodes = np.sum(nodes)
print(f"Sum of nodes: {sum_nodes}")
print(f"Node loss: {(g_nodes - sum_nodes) / g_nodes}")

sum_edges = np.sum(edges)
print(f"Sum of edges: {sum_edges}")
print(f"Edge loss: {(g_edges - sum_edges) / g_edges}")

# Balance = largest partition (by edges) divided by the average partition
max_edge = np.max(edges)
avg_edge = np.average(edges)
balance = max_edge / avg_edge
print(f"Balance: {balance}")

# Edge cut: edges of g that are in no partition subgraph, i.e. edges whose
# endpoints were assigned to different partitions.
edgecuts = g_edges - sum_edges
print(f"Edge cuts: {edgecuts}")

In [None]:
# DGL version of the NetworkX graph `g`
D = dgl.from_networkx(g)
# Directory that receives the partition files
out_path = '/content/partitioned_graphs/'

# Random partitioning into 15 parts, written below out_path under the name 'PARTITION'
D_sub = dgl.distributed.partition_graph(
    D,
    'PARTITION',
    num_parts=15,
    out_path=out_path,
    part_method='random',
    balance_edges=True,
)


random partitioning

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

# Your existing graph
# or nx.DiGraph() if your graph is directed
# Add nodes and edges to your existing graph g (not shown in the example)

# Parameters for the random partition graph
partition_sizes = [190, 500, 700, 800, 300, 1000, 900, 600, 800, 561]  # Adjust the sizes as needed
p_in = 0.25  # Probability of edges within groups
p_out = 0.01  # Probability of edges between groups

# Create a new graph based on the random partition model
G_partitioned = nx.random_partition_graph(partition_sizes, p_in, p_out)

# Add 'partition' attribute to nodes in G_partitioned: block i owns the
# consecutive node ids [block_start, block_start + partition_sizes[i]).
# One pass over the nodes replaces re-summing partition_sizes for every
# (block, node) pair.
partition_info = G_partitioned.graph["partition"]
block_start = 0
for i, block_size in enumerate(partition_sizes):
    for node in range(block_start, block_start + block_size):
        G_partitioned.nodes[node]['partition'] = i
    block_start += block_size

# Now, you can add nodes and edges from your existing graph g to G_partitioned.
# Nodes of g whose id is not covered by partition_sizes get no 'partition' attribute.
G_partitioned.add_nodes_from(g.nodes(data=True))
G_partitioned.add_edges_from(g.edges())

# Calculate metrics
g_nodes = len(g.nodes())
g_edges = len(g.edges())
partitioned_nodes = sum(partition_sizes)

# Node loss
node_loss = (g_nodes - partitioned_nodes) / g_nodes

# Edge loss
edge_loss = (g_edges - G_partitioned.number_of_edges()) / g_edges

# Calculate balance
max_partition_size = max(partition_sizes)
avg_partition_size = np.mean(partition_sizes)
balance = max_partition_size / avg_partition_size

# Edge cut: number of edges whose two endpoints lie in different partitions.
# .get() is used because nodes merged in from g may have no 'partition'
# attribute; such nodes are treated as one extra, unassigned group (None)
# instead of raising KeyError.
edge_cut = sum(
    1
    for u, v in G_partitioned.edges()
    if G_partitioned.nodes[u].get('partition') != G_partitioned.nodes[v].get('partition')
)

# Print metrics
print(f"Node loss: {node_loss}")
print(f"Edge loss: {edge_loss}")
print(f"Balance: {balance}")
print(f"Edge cut: {edge_cut}")


KeyError: ignored

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Your existing graph
# or nx.DiGraph() if your graph is directed
# Add nodes and edges to your existing graph g (not shown in the example)

# Parameters for the random partition graph
partition_sizes = [190, 500, 700, 800, 300, 1000, 900, 600, 800, 561]  # Adjust the sizes as needed
p_in = 0.25  # Probability of edges within groups
p_out = 0.01  # Probability of edges between groups

# Create a new graph based on the random partition model
G_partitioned = nx.random_partition_graph(partition_sizes, p_in, p_out)

# Add 'partition' attribute to nodes in G_partitioned: block i owns the
# consecutive node ids [block_start, block_start + partition_sizes[i]).
# One pass over the nodes replaces re-summing partition_sizes for every
# (block, node) pair.
partition_info = G_partitioned.graph["partition"]
block_start = 0
for i, block_size in enumerate(partition_sizes):
    for node in range(block_start, block_start + block_size):
        G_partitioned.nodes[node]['partition'] = i
    block_start += block_size

# Now, you can add nodes and edges from your existing graph g to G_partitioned.
# Nodes of g whose id is not covered by partition_sizes get no 'partition' attribute.
G_partitioned.add_nodes_from(g.nodes(data=True))
G_partitioned.add_edges_from(g.edges())

# Print the number of nodes in each partition. Every entry of partition_info is
# the collection of nodes of one block, so its length is the size to report
# (formatting the entry itself would print every node id).
print("Number of partitions:", len(partition_info))
for i, members in enumerate(partition_info):
    print(f"Partition {i+1} size: {len(members)} nodes")

# Visualize the partitioned graph
pos = nx.spring_layout(G_partitioned)
nx.draw(G_partitioned, pos, with_labels=True, node_size=50, font_size=8)
plt.show()

# Visualize each subgraph separately
for i in range(len(partition_info)):
    # Nodes without a 'partition' attribute are never selected here.
    subgraph_nodes = [node for node, data in G_partitioned.nodes(data=True) if data.get('partition') == i]
    subgraph = G_partitioned.subgraph(subgraph_nodes)

    # Plot subgraph
    plt.figure()
    pos_subgraph = nx.spring_layout(subgraph)
    nx.draw(subgraph, pos_subgraph, with_labels=True, node_size=50, font_size=8)
    plt.title(f"Subgraph {i+1}")
    plt.show()


In [None]:
class TRAVELDataset(InMemoryDataset):
    r"""Traffic Accident Prediction (TAP) dataset, as introduced in the
    `"TAP: A Comprehensive Data Repository for Traffic Accident Prediction in Road Networks"
    <https://arxiv.org/pdf/2304.08640>`_ paper.
    Nodes represent intersections and edges are roads.
    Node and edge features represent embeddings of geospatial features.
    The task is to predict the occurrence and severity of accidents on roadways.
    For further information, please refer to the TAP repository.
    `TAP
    <https://github.com/baixianghuang/travel>`_

    The raw ``<name>.npz`` file is fetched from :attr:`url` and stored under
    ``<root>/<name>/raw``; the processed graph is stored as ``data.pt`` under
    ``<root>/<name>/processed``.

    Args:
        root (string): Root directory where the dataset should be saved.
        name (string): The name of the dataset (lower-cased internally).
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
    """
    url = 'https://github.com/baixianghuang/travel/raw/main/TAP-city/{}.npz'
    # url = 'https://github.com/baixianghuang/travel/raw/main/TAP-state/{}.npz'

    def __init__(self, root: str, name: str,
                 transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None):
        # `name` has to exist before the parent constructor runs, because the
        # directory and file-name properties below are derived from it.
        self.name = name.lower()
        super().__init__(root, transform, pre_transform)
        processed_file = self.processed_paths[0]
        self.data, self.slices = torch.load(processed_file)

    def _dataset_dir(self, leaf: str) -> str:
        # <root>/<name>/<leaf>
        return osp.join(self.root, self.name, leaf)

    @property
    def raw_dir(self) -> str:
        return self._dataset_dir('raw')

    @property
    def processed_dir(self) -> str:
        return self._dataset_dir('processed')

    @property
    def raw_file_names(self) -> str:
        return '{}.npz'.format(self.name)

    @property
    def processed_file_names(self) -> str:
        return 'data.pt'

    def download(self):
        # Fill the dataset name into the URL template and fetch into raw_dir.
        source_url = self.url.format(self.name)
        download_url(source_url, self.raw_dir)

    def process(self):
        # Read the single graph, optionally pre-transform it, then store the
        # collated (data, slices) pair at the processed path.
        graph = read_npz(self.raw_paths[0])
        if self.pre_transform is not None:
            graph = self.pre_transform(graph)
        collated, slice_dict = self.collate([graph])
        torch.save((collated, slice_dict), self.processed_paths[0])

    def __repr__(self) -> str:
        return self.name.capitalize() + 'Full()'

In [None]:
class TRAVELConv(MessagePassing):
    r"""Message-passing layer whose messages are produced by a user-supplied
    network applied to the concatenation of the source-node features and the
    edge features. Aggregated messages are optionally combined with a linear
    transform of the target-node features (``root``) and an additive ``bias``.

    Args:
        in_channels (int or tuple): Size of each input sample, or :obj:`-1` to
            derive the size from the first input(s) to the forward method.
            A tuple corresponds to the sizes of source and target
            dimensionalities.
        out_channels (int): Size of each output sample.
        nn (torch.nn.Module): Multiple layers of non-linear transformations
            that maps feature data of shape :obj:`[-1,
            num_node_features + num_edge_features]` to shape
            :obj:`[-1, new_dimension]`, *e.g.*, defined by
            :class:`torch.nn.Sequential`.
        aggr (string, optional): The aggregation scheme to use
            (:obj:`"add"`, :obj:`"mean"`, :obj:`"max"`).
            (default: :obj:`"add"`)
        root_weight (bool, optional): If set to :obj:`False`, the layer will
            not add the transformed root node features to the output.
            (default: :obj:`True`)
        bias (bool, optional): If set to :obj:`False`, the layer will not learn
            an additive bias. (default: :obj:`True`)
        **kwargs (optional): Additional arguments of
            :class:`torch_geometric.nn.conv.MessagePassing`.
    """
    def __init__(self, in_channels: Union[int, Tuple[int, int]],
                 out_channels: int, nn: Callable, aggr: str = 'add',
                 root_weight: bool = True, bias: bool = True, **kwargs):
        super().__init__(aggr=aggr, **kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.nn = nn
        self.aggr = aggr

        # Work with a (source, target) pair from here on; the attribute
        # `in_channels` above keeps whatever the caller passed.
        channel_pair = (in_channels, in_channels) if isinstance(in_channels, int) else in_channels
        self.in_channels_l = channel_pair[0]

        # Both parameters are always registered (possibly as None) so that
        # `self.root` / `self.bias` exist regardless of the flags.
        root_param = Parameter(torch.Tensor(channel_pair[1], out_channels)) if root_weight else None
        self.register_parameter('root', root_param)

        bias_param = Parameter(torch.Tensor(out_channels)) if bias else None
        self.register_parameter('bias', bias_param)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the message network, the root weight and the bias."""
        reset(self.nn)
        root = self.root
        if root is not None:
            uniform(root.size(0), root)
        zeros(self.bias)


    def forward(self, x: Union[Tensor, OptPairTensor], edge_index: Adj,
                edge_attr: OptTensor = None, size: Size = None) -> Tensor:
        """Aggregate edge-conditioned messages, then add root term and bias."""
        # A single tensor serves as both source and target features.
        if isinstance(x, Tensor):
            x: OptPairTensor = (x, x)

        out = self.propagate(edge_index, x=x, edge_attr=edge_attr, size=size)

        target_feats = x[1]
        if self.root is not None and target_feats is not None:
            out += target_feats @ self.root

        if self.bias is not None:
            out += self.bias
        return out


    def message(self, x_i: Tensor, x_j: Tensor, edge_attr: Tensor) -> Tensor:
        """Per-edge message: ``nn`` applied to ``[x_j, edge_attr]``."""
        # x_i is part of the signature but is not used in the message.
        return self.nn(torch.cat((x_j, edge_attr), dim=1))

    def __repr__(self):
        return (f'{self.__class__.__name__}({self.in_channels}, '
                f'{self.out_channels}, aggr="{self.aggr}", nn={self.nn})')

In [None]:
def train(model, data, optimizer):
    """Run a single full-batch optimisation step on the training nodes.

    ``model`` is called without arguments and must return per-node
    log-probabilities; the negative log-likelihood is evaluated only on the
    rows selected by ``data.train_mask`` against ``data.y``.
    Returns nothing; the model parameters are updated through ``optimizer``.
    """
    model.train()
    optimizer.zero_grad()
    log_probs = model()
    train_rows = data.train_mask
    loss = F.nll_loss(log_probs[train_rows], data.y[train_rows])
    loss.backward()
    optimizer.step()


@torch.no_grad()
def test(model, data):
    """Evaluate ``model`` on the train / validation / test node masks.

    ``model`` is called without arguments; its output is treated as per-node
    class scores and the predicted label is the arg-max over dimension 1.

    Returns:
        measures: binary F1 for each of ``train_mask``, ``val_mask`` and
            ``test_mask``, in the order yielded by ``data(...)``.
        label_pred: predicted label for every node.
        test_acc: fraction of correctly predicted test nodes.
        test_map: average precision on the test nodes, scored with column 1
            of the model output.
        test_auc: ROC AUC on the test nodes, scored the same way.
    """
    model.eval()
    logits = model().detach()

    measures = []
    for _, split_mask in data('train_mask', 'val_mask', 'test_mask'):
        split_pred = torch.max(logits[split_mask], 1)[1]
        split_f1 = f1_score(data.y[split_mask].cpu(), split_pred.cpu(), average='binary')
        measures.append(split_f1)

    label_pred = torch.max(logits, 1)[1]

    # Test-split metrics: column 1 of the output is the positive-class score.
    mask = data.test_mask
    test_logits = logits[mask]
    scores = test_logits[:, 1]
    pred = torch.max(test_logits, 1)[1]
    test_y = data.y[mask]

    n_correct = pred.eq(test_y).sum().item()
    test_acc = n_correct / mask.sum().item()
    y_true_cpu, scores_cpu = test_y.cpu(), scores.cpu()
    test_map = average_precision_score(y_true_cpu, scores_cpu)
    test_auc = roc_auc_score(y_true_cpu, scores_cpu)
    return measures, label_pred, test_acc, test_map, test_auc


def train_loop(model, data, optimizer, num_epochs, model_name='', city_name=''):
    """Train for ``num_epochs`` epochs, plotting progress every 5th epoch.

    After every epoch the model is evaluated with ``test``. Every 5th epoch the
    output is cleared and a two-panel figure is drawn: predicted labels on the
    node map (left) and the train/valid/test measure curves (right).

    Reads the notebook-level names ``gdf_pred``, ``class_num``, ``pos_dict`` and
    ``color_ls``, and overwrites ``gdf_pred['label']`` with the current
    predictions whenever a figure is drawn.

    The reported epoch is the one with the highest validation measure among
    the second half of the epochs.

    Returns:
        ``(F1, AUC, accuracy, MAP)`` of the test split at the selected epoch,
        each multiplied by 100 and rounded to two decimals.
    """
    log = 'Epoch: {:03d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
    epochs, train_measures, valid_measures, test_measures = [], [], [], []
    test_accs, test_maps, test_aucs = [], [], []

    for epoch in range(num_epochs):
        train(model, data, optimizer)
        measures, label_pred, test_acc, test_map, test_auc = test(model, data)
        train_mea, valid_mea, test_mea = measures

        # Keep the full per-epoch history; it feeds both the plot and the
        # final model selection.
        epochs.append(epoch)
        for history, value in ((train_measures, train_mea),
                               (valid_measures, valid_mea),
                               (test_measures, test_mea),
                               (test_aucs, test_auc),
                               (test_accs, test_acc),
                               (test_maps, test_map)):
            history.append(value)

        if epoch % 5 != 0:
            continue

        clear_output(True)
        fig, (ax1, ax) = plt.subplots(1, 2, figsize=(30, 12))

        # Left panel: one node-only graph per predicted class, drawn at the
        # node coordinates in that class's colour.
        gdf_pred['label'] = label_pred.cpu().numpy()
        for class_id in range(class_num):
            class_graph = nx.MultiGraph()
            class_graph.add_nodes_from(gdf_pred[gdf_pred['label'] == class_id].index)
            nx.draw(class_graph, pos=pos_dict, ax=ax1, node_color=color_ls[class_id], node_size=10)

        # Right panel: measure curves plus the latest values as text.
        status = log.format(epoch, train_measures[-1], valid_measures[-1], test_measures[-1])
        ax.text(1, 1, status, fontsize=18)
        ax.plot(epochs, train_measures, "r", epochs, valid_measures, "g", epochs, test_measures, "b")
        ax.set_ylim([0, 1])
        ax.legend(["train", "valid", "test"])
        ax1.legend(["Negative", "Positive"])
        ax1.set_title(city_name+' '+model_name, y=-0.01)
        plt.show()

    # Pick the best validation epoch, considering only the second half.
    half = num_epochs//2
    select_idx = np.argmax(valid_measures[half:]) + half
    final_test_mea = np.array(test_measures)[select_idx]
    final_test_auc = np.array(test_aucs)[select_idx]
    final_test_acc = np.array(test_accs)[select_idx]
    final_test_map = np.array(test_maps)[select_idx]

    print('Selected epoch {}'.format(select_idx))
    print('F1 {:.5f} | AUC {:.5f} | Test Acc {:.5f} | MAP {:.5f}'.format(final_test_mea, final_test_auc, final_test_acc, final_test_map))
    return (round(final_test_mea*100, 2), round(final_test_auc*100, 2), round(final_test_acc*100, 2), round(final_test_map*100, 2))

In [None]:
# cities_sorted_by_accident.pkl is available in the directory `util` (https://github.com/baixianghuang/travel/tree/main/util)
# with open('cities_sorted_by_accident.pkl', 'rb') as fp:
#     all_city_ls = pickle.load(fp)

# print('# cities:', len(all_city_ls))
# for e in all_city_ls[:50]:
#     print(e[0]+' ('+e[1]+')', end = ', ')
# len(all_city_ls)

## Training

In [None]:
def draw_with_labels(df_nodes, model_name='test'):
    """Scatter the nodes of ``df_nodes`` on a map, coloured by ``'label'``.

    For each class id below the notebook-level ``class_num`` an edge-less graph
    holding that class's node indices is drawn at the positions in
    ``pos_dict`` using the colour ``color_ls[class_id]``. ``model_name`` is
    used as the figure title.
    """
    plt.figure(figsize=(6, 5))
    for class_id in range(class_num):
        members = df_nodes.index[df_nodes['label'] == class_id]
        class_graph = nx.MultiGraph()
        class_graph.add_nodes_from(members)
        nx.draw(class_graph, pos=pos_dict, node_color=color_ls[class_id], node_size=3, label=class_id)
    plt.legend(labels=["Negative", "Positive"], loc="upper right", fontsize='small')
    plt.title(model_name, y=-0.01)
    plt.show()


# Shared experiment configuration, read as globals by the cells below.
d = 16             # default hidden width of the model classes
p = 0.5            # dropout probability used in the models' forward passes
all_res = []       # result rows appended by the training cell
class_num = 2      # number of label classes to colour / plot
num_epochs = 301   # epochs passed to train_loop for every model
file_path = 'exp/' # root directory handed to TRAVELDataset

# One hex colour per class, sampled from the 'cool' colormap.
# plt.get_cmap(name, lut) is used instead of cm.get_cmap, which was deprecated
# in Matplotlib 3.7 and removed in 3.9; both return the colormap resampled to
# `class_num` entries.
cmap = plt.get_cmap('cool', class_num)
color_ls = [colors.rgb2hex(cmap(i)) for i in range(class_num)]

In [None]:
class MLP(nn.Module):
    """Three-layer perceptron baseline that uses node features only.

    Input and output widths are taken from the notebook-level ``dataset``.
    ``forward`` takes no arguments: it reads the notebook-level ``data`` and
    dropout rate ``p`` and returns per-node log-probabilities.
    """
    def __init__(self, hidden_dim=d):
        super().__init__()
        self.fc1 = nn.Linear(dataset.num_features, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, dataset.num_classes)

    def forward(self):
        hidden = F.relu(self.fc1(data.x))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return F.log_softmax(logits, dim=1)


class GCN(torch.nn.Module):
    """Two ``GCNConv`` layers followed by a linear classifier.

    Sizes come from the notebook-level ``dataset``; ``forward`` takes no
    arguments and reads the notebook-level ``data`` and dropout rate ``p``.
    Returns per-node log-probabilities.
    """
    def __init__(self, hidden_dim=d):
        super().__init__()
        self.conv1 = pyg_nn.GCNConv(dataset.num_features, hidden_dim)
        self.conv2 = pyg_nn.GCNConv(hidden_dim, hidden_dim)
        self.fc1 = nn.Linear(hidden_dim, dataset.num_classes)

    def forward(self):
        edge_index = data.edge_index
        hidden = F.relu(self.conv1(data.x, edge_index))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index))
        logits = self.fc1(hidden)
        return F.log_softmax(logits, dim=1)


class ChebNet(torch.nn.Module):
    """Two ``ChebConv`` layers (``K=2``) followed by a linear classifier.

    Sizes come from the notebook-level ``dataset``; ``forward`` takes no
    arguments and reads the notebook-level ``data`` and dropout rate ``p``.
    Returns per-node log-probabilities.
    """
    def __init__(self, hidden_dim=d):
        super().__init__()
        self.conv1 = pyg_nn.ChebConv(dataset.num_features, hidden_dim, K=2)
        self.conv2 = pyg_nn.ChebConv(hidden_dim, hidden_dim, K=2)
        self.fc1 = nn.Linear(hidden_dim, dataset.num_classes)

    def forward(self):
        edge_index = data.edge_index
        hidden = F.relu(self.conv1(data.x, edge_index))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index))
        logits = self.fc1(hidden)
        return F.log_softmax(logits, dim=1)


class ARMANet(torch.nn.Module):
    """Two ``ARMAConv`` layers followed by a linear classifier.

    Sizes come from the notebook-level ``dataset``; ``forward`` takes no
    arguments and reads the notebook-level ``data`` and dropout rate ``p``.
    Returns per-node log-probabilities.
    """
    def __init__(self, hidden_dim=d):
        super().__init__()
        self.conv1 = pyg_nn.ARMAConv(dataset.num_features, hidden_dim)
        self.conv2 = pyg_nn.ARMAConv(hidden_dim, hidden_dim)
        self.fc1 = nn.Linear(hidden_dim, dataset.num_classes)

    def forward(self):
        edge_index = data.edge_index
        hidden = F.relu(self.conv1(data.x, edge_index))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index))
        logits = self.fc1(hidden)
        return F.log_softmax(logits, dim=1)


class GraphSAGE(torch.nn.Module):
    """Two ``SAGEConv`` layers followed by a linear classifier.

    The second convolution widens the representation to ``dim*2`` and is
    created with ``normalize=True``. Sizes come from the notebook-level
    ``dataset``; ``forward`` takes no arguments and reads the notebook-level
    ``data`` and dropout rate ``p``. Returns per-node log-probabilities.
    """
    def __init__(self, dim=d):
        super().__init__()
        wide_dim = dim*2
        self.conv1 = pyg_nn.SAGEConv(dataset.num_features, dim)
        self.conv2 = pyg_nn.SAGEConv(dim, wide_dim, normalize=True)
        self.fc1 = nn.Linear(wide_dim, dataset.num_classes)

    def forward(self):
        edge_index = data.edge_index
        hidden = F.relu(self.conv1(data.x, edge_index))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index))
        logits = self.fc1(hidden)
        return F.log_softmax(logits, dim=1)


class TAGCN(torch.nn.Module):
    """Two ``TAGConv`` layers followed by a linear classifier.

    Sizes come from the notebook-level ``dataset``; ``forward`` takes no
    arguments and reads the notebook-level ``data`` and dropout rate ``p``.
    Returns per-node log-probabilities.
    """
    def __init__(self, hidden_dim=d):
        super().__init__()
        self.conv1 = pyg_nn.TAGConv(dataset.num_features, hidden_dim)
        self.conv2 = pyg_nn.TAGConv(hidden_dim, hidden_dim)
        self.fc1 = nn.Linear(hidden_dim, dataset.num_classes)

    def forward(self):
        edge_index = data.edge_index
        hidden = F.relu(self.conv1(data.x, edge_index))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index))
        logits = self.fc1(hidden)
        return F.log_softmax(logits, dim=1)


class GIN(torch.nn.Module):
    """Two ``GINConv`` layers followed by a linear classifier.

    Each convolution wraps a Linear -> ReLU -> Linear network with a hidden
    width of ``dim*2`` and an output width of ``dim``. Sizes come from the
    notebook-level ``dataset``; ``forward`` takes no arguments and reads the
    notebook-level ``data`` and dropout rate ``p``. Returns per-node
    log-probabilities.
    """
    def __init__(self, dim=d):
        super().__init__()
        # The inner networks are built in the same order as they are wrapped.
        first_mlp = nn.Sequential(nn.Linear(dataset.num_features, dim*2), nn.ReLU(), nn.Linear(dim*2, dim))
        second_mlp = nn.Sequential(nn.Linear(dim, dim*2), nn.ReLU(), nn.Linear(dim*2, dim))
        self.conv1 = pyg_nn.GINConv(first_mlp)
        self.conv2 = pyg_nn.GINConv(second_mlp)
        self.fc1 = nn.Linear(dim, dataset.num_classes)

    def forward(self):
        edge_index = data.edge_index
        hidden = F.relu(self.conv1(data.x, edge_index))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index))
        logits = self.fc1(hidden)
        return F.log_softmax(logits, dim=1)


class GAT(torch.nn.Module):
    """Two ``GATConv`` layers with edge features, then a linear classifier.

    ``edge_dim`` is the column count of the notebook-level ``edge_attr_all``,
    which is also the edge-feature tensor passed to both convolutions.
    ``forward`` takes no arguments and reads the notebook-level ``data`` and
    dropout rate ``p``. Returns per-node log-probabilities.
    """
    def __init__(self, dim=d):
        super().__init__()
        self.conv1 = pyg_nn.GATConv(dataset.num_features, dim, edge_dim=edge_attr_all.shape[1])
        self.conv2 = pyg_nn.GATConv(dim, dim, edge_dim=edge_attr_all.shape[1])
        self.fc1 = nn.Linear(dim, dataset.num_classes)

    def forward(self):
        edge_index, edge_feats = data.edge_index, edge_attr_all
        hidden = F.relu(self.conv1(data.x, edge_index, edge_feats))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index, edge_feats))
        logits = self.fc1(hidden)
        return F.log_softmax(logits, dim=1)


class MPNN(torch.nn.Module):
    """Two ``NNConv`` layers followed by a linear classifier.

    Each convolution is given a Linear(…, 16) -> ReLU -> Linear network over
    the edge features whose output width is ``in_channels * out_channels`` of
    that convolution. The edge features are the notebook-level
    ``edge_attr_all``. ``forward`` takes no arguments and reads the
    notebook-level ``data`` and dropout rate ``p``. Returns per-node
    log-probabilities.
    """
    def __init__(self, dim=d):
        super().__init__()
        # Construction order (edge net, conv, edge net, conv, classifier) is
        # kept so that seeded initialisation is unchanged.
        edge_net_1 = nn.Sequential(nn.Linear(edge_attr_all.shape[1], 16), nn.ReLU(), nn.Linear(16, dataset.num_features*dim))
        self.conv1 = pyg_nn.NNConv(dataset.num_features, dim, edge_net_1)
        edge_net_2 = nn.Sequential(nn.Linear(edge_attr_all.shape[1], 16), nn.ReLU(), nn.Linear(16, dim*dim))
        self.conv2 = pyg_nn.NNConv(dim, dim, edge_net_2)
        self.fc1 = nn.Linear(dim, dataset.num_classes)

    def forward(self):
        edge_index, edge_feats = data.edge_index, edge_attr_all
        hidden = F.relu(self.conv1(data.x, edge_index, edge_feats))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index, edge_feats))
        logits = self.fc1(hidden)
        return F.log_softmax(logits, dim=1)


class CGC(torch.nn.Module):
    """Two ``CGConv`` layers followed by a linear classifier.

    Both convolutions are built from the node-feature count of the
    notebook-level ``dataset`` and the last-dimension size of the
    notebook-level ``edge_attr_all``; the classifier therefore also takes
    ``dataset.num_features`` inputs. The ``dim`` argument is accepted but not
    used. ``forward`` takes no arguments and reads the notebook-level ``data``
    and dropout rate ``p``. Returns per-node log-probabilities.
    """
    def __init__(self, dim=d):
        super().__init__()
        self.conv1 = pyg_nn.CGConv(dataset.num_features, edge_attr_all.size(-1))
        self.conv2 = pyg_nn.CGConv(dataset.num_features, edge_attr_all.size(-1))
        self.fc1 = nn.Linear(dataset.num_features, dataset.num_classes)

    def forward(self):
        edge_index, edge_feats = data.edge_index, edge_attr_all
        hidden = F.relu(self.conv1(data.x, edge_index, edge_feats))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index, edge_feats))
        logits = self.fc1(hidden)
        return F.log_softmax(logits, dim=-1)


class GraphTransformer(torch.nn.Module):
    """Two ``TransformerConv`` layers with edge features, then a classifier.

    ``edge_dim`` is the column count of the notebook-level ``edge_attr_all``,
    which is also the edge-feature tensor passed to both convolutions.
    ``forward`` takes no arguments and reads the notebook-level ``data`` and
    dropout rate ``p``. Returns per-node log-probabilities.
    """
    def __init__(self, dim=d):
        super().__init__()
        self.conv1 = pyg_nn.TransformerConv(dataset.num_features, dim, edge_dim=edge_attr_all.shape[1])
        self.conv2 = pyg_nn.TransformerConv(dim, dim, edge_dim=edge_attr_all.shape[1])
        self.fc1 = nn.Linear(dim, dataset.num_classes)

    def forward(self):
        edge_index, edge_feats = data.edge_index, edge_attr_all
        hidden = F.relu(self.conv1(data.x, edge_index, edge_feats))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index, edge_feats))
        logits = self.fc1(hidden)
        return F.log_softmax(logits, dim=1)


class GEN(torch.nn.Module):
    """Two ``GENConv`` layers over linearly encoded node and edge features.

    Node features (``data.x``) and edge features (``edge_attr_all``) are each
    projected to ``dim`` by a linear encoder before the convolutions; a linear
    classifier follows. ``forward`` takes no arguments and reads the
    notebook-level ``data``, ``edge_attr_all`` and dropout rate ``p``.
    Returns per-node log-probabilities.
    """
    def __init__(self, dim=d):
        super().__init__()
        self.node_encoder = nn.Linear(data.x.size(-1), dim)
        self.edge_encoder = nn.Linear(edge_attr_all.size(-1), dim)
        self.conv1 = pyg_nn.GENConv(dim, dim)
        self.conv2 = pyg_nn.GENConv(dim, dim)
        self.fc1 = nn.Linear(dim, dataset.num_classes)

    def forward(self):
        hidden = self.node_encoder(data.x)
        edge_index = data.edge_index
        edge_feats = self.edge_encoder(edge_attr_all)
        hidden = F.relu(self.conv1(hidden, edge_index, edge_feats))
        hidden = F.dropout(hidden, p=p, training=self.training)
        hidden = F.relu(self.conv2(hidden, edge_index, edge_feats))
        logits = self.fc1(hidden)
        return F.log_softmax(logits, dim=1)


class TRAVELNet(torch.nn.Module):
    """Two-branch network built from ``TRAVELConv`` layers.

    Node features and the two edge-feature sets ``data.component_dir`` and
    ``data.component_ang`` are each encoded to ``dim``. In both stages one
    ``TRAVELConv`` runs on the first edge-feature set and one on the second;
    their outputs are concatenated. Batch norm and dropout are applied between
    the stages and a linear layer maps the final concatenation to class scores.

    ``forward`` takes no arguments and reads the notebook-level ``data`` and
    dropout rate ``p``; ``data.component_dir`` / ``data.component_ang`` must be
    set before the model is constructed. Returns per-node log-probabilities.
    """
    def __init__(self, dim=d):
        super().__init__()
        convdim = 8
        n_classes = dataset.num_classes

        def encoder(in_dim):
            # Linear -> LeakyReLU -> Linear projection to `dim`.
            return nn.Sequential(nn.Linear(in_dim, dim), nn.LeakyReLU(), nn.Linear(dim, dim))

        def message_net(in_dim, out_dim):
            # Three-layer message network handed to a TRAVELConv.
            return nn.Sequential(nn.Linear(in_dim, dim), nn.LeakyReLU(),
                                 nn.Linear(dim, dim), nn.LeakyReLU(),
                                 nn.Linear(dim, out_dim))

        # NOTE: modules are created in the same order as before so that a
        # seeded initialisation and the state_dict layout stay the same.
        self.node_encoder = encoder(data.x.size(-1))
        self.edge_encoder_dir = encoder(data.component_dir.size(-1))
        self.edge_encoder_ang = encoder(data.component_ang.size(-1))

        # First branch: message input is [encoded node, encoded edge].
        self.conv1 = TRAVELConv(dim, convdim, message_net(dim + dim, convdim))
        # Second stage sees the concatenation of both branches (2*convdim).
        self.conv2 = TRAVELConv(2*convdim, n_classes, message_net(2*convdim + dim, n_classes))
        self.bn1 = nn.BatchNorm1d(convdim*2)

        # Second branch, same layout as the first.
        self.conv1_2 = TRAVELConv(dim, convdim, message_net(dim + dim, convdim))
        self.conv2_2 = TRAVELConv(2*convdim, n_classes, message_net(2*convdim + dim, n_classes))
        # bn2 is registered but not applied in forward().
        self.bn2 = nn.BatchNorm1d(n_classes*2)
        self.fc = nn.Linear(n_classes*2, n_classes)

    def forward(self):
        edge_index = data.edge_index
        nodes = self.node_encoder(data.x)
        edges_dir = self.edge_encoder_dir(data.component_dir)
        edges_ang = self.edge_encoder_ang(data.component_ang)

        # Stage 1: one convolution per edge-feature set, then concatenate.
        branch_dir = F.relu(self.conv1(nodes, edge_index, edges_dir))
        branch_ang = F.relu(self.conv1_2(nodes, edge_index, edges_ang))
        hidden = torch.cat([branch_dir, branch_ang], dim=1)
        hidden = self.bn1(hidden)
        hidden = F.dropout(hidden, p=p, training=self.training)

        # Stage 2: same pattern on the merged representation.
        branch_dir = F.relu(self.conv2(hidden, edge_index, edges_dir))
        branch_ang = F.relu(self.conv2_2(hidden, edge_index, edges_ang))
        merged = torch.cat([branch_dir, branch_ang], dim=1)
        return F.log_softmax(self.fc(merged), dim=1)

In [None]:
%%time
for e in [('Miami', 'Florida'),('Los Angeles', 'California'),('Orlando', 'Florida'),
          ('Dallas', 'Texas'),('Houston', 'Texas'),('New York', 'New York')]:
    city_name, state_abbrev = e[0].lower().replace(" ", "_"), us_state_to_abbrev[e[1]].lower()
    city_format = e[0]+' ('+us_state_to_abbrev[e[1]]+')'
    if os.path.exists(file_path+city_name+'_'+state_abbrev+'/processed'):
        shutil.rmtree(file_path+city_name+'_'+state_abbrev+'/processed')
    dataset = TRAVELDataset(file_path, city_name+'_'+state_abbrev)
    data = dataset[0]
    class_num = dataset.num_classes
    # print(f'Number of graphs: {len(dataset)}')
    # print(f'Number of node features: {dataset.num_features}')
    # print(f'Number of edge features: {dataset.num_edge_features}')
    # print(f'Number of classes: {dataset.num_classes}')
    # print(f'Number of nodes: {data.num_nodes}')
    # print(f'Number of edges: {data.num_edges}')
    # print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
    # print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
    # print(f'Contains self-loops: {data.has_self_loops()}')
    # print(f'Is undirected: {data.is_undirected()}')

    # 60%, 20% and 20% for training, validation and test
    data.train_mask, data.val_mask, data.test_mask = train_test_split_stratify(dataset, train_ratio=0.6, val_ratio=0.2, class_num=class_num)
    sc = MinMaxScaler()
    data.x[data.train_mask] = torch.tensor(sc.fit_transform(data.x[data.train_mask]), dtype=torch.float)
    data.x[data.val_mask] = torch.tensor(sc.transform(data.x[data.val_mask]), dtype=torch.float)
    data.x[data.test_mask] = torch.tensor(sc.transform(data.x[data.test_mask]), dtype=torch.float)

    edge_attr_all = MinMaxScaler().fit_transform(data.edge_attr.cpu())
    edge_attr_all = torch.tensor(edge_attr_all).float().to(device)

    coords = data.coords.numpy()
    gdf_pred = pd.DataFrame({'x': coords[:, 0], 'y': coords[:, 1], 'label': data.y.numpy()})
    zip_iterator = zip(gdf_pred.index, gdf_pred[['x', 'y']].values)
    pos_dict = dict(zip_iterator)
    draw_with_labels(gdf_pred, 'Ground Truth')

    X_train, X_test, y_train, y_test = data.x[data.train_mask].cpu().numpy(), data.x[data.test_mask].cpu().numpy(), data.y[data.train_mask].cpu().numpy(), data.y[data.test_mask].cpu().numpy()
    start_time = time.time()
    xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    xgb_clf.fit(X_train, y_train)
    y_pred = xgb_clf.predict(X_test)
    test_acc, test_f1, test_map, test_auc = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='binary'), average_precision_score(y_test, y_pred), roc_auc_score(y_test, y_pred)
    print('f1_score {:.5f} | AUC {:.5f} | Test Accuracy {:.5f} | MAP {:.5f}'.format(test_f1, test_auc, test_acc, test_map))
    res = (round(test_f1*100, 2), round(test_auc*100, 2), round(test_acc*100, 2), round(test_map*100, 2))
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('XGBoost',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    data = data.to(device)

    start_time = time.time()
    model = MLP().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'MLP', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('MLP',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    start_time = time.time()
    model = GCN().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'GCN', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('GCN',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    start_time = time.time()
    model = ChebNet().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.02, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'ChebNet', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('ChebNet',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    start_time = time.time()
    model = ARMANet().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.02, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'ARMANet', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('ARMANet',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    start_time = time.time()
    model = GraphSAGE().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'GraphSAGE', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('GraphSAGE',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    start_time = time.time()
    model = TAGCN().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'TAGCN', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('TAGCN',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    start_time = time.time()
    model = GIN().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'GIN', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('GIN',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    start_time = time.time()
    model = GAT().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.007, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'GAT', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('GAT',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    start_time = time.time()
    model = MPNN().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'MPNN', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('MPNN',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    start_time = time.time()
    model = CGC().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.015, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'CGC', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('CGC',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    start_time = time.time()
    model = GraphTransformer().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'Transformer', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('Transformer',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    start_time = time.time()
    model = GEN().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.002, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'GEN', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('GEN',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

    # Note that directional and angular edge features are precomputed in our datasets
    component_dir = np.concatenate((data.edge_attr.cpu(), data.edge_attr_dir.cpu()), axis=1)
    component_ang = np.concatenate((data.edge_attr.cpu(), data.edge_attr_ang.cpu()), axis=1)
    component_dir = StandardScaler().fit_transform(component_dir)
    component_ang = StandardScaler().fit_transform(component_ang)
    data.component_dir = torch.tensor(component_dir).float().to(device)
    data.component_ang = torch.tensor(component_ang).float().to(device)

    start_time = time.time()
    model = TRAVELNet().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    res = train_loop(model, data, optimizer, num_epochs, 'TRAVEL', city_name)
    t = round(time.time() - start_time, 2)
    all_res.append((city_format,) + ('TRAVEL',) + res + (t,))
    print("Execution time: %.4f seconds" % t)

In [None]:
# Gather all (city, method) result rows into one table and report how many
# datasets it covers (rows divided by the number of distinct methods).
result_columns = ['City', 'Method', 'F1', 'AUC', 'Acc', 'MAP', 'Time']
df = pd.DataFrame(all_res, columns=result_columns)
n_methods = len(df['Method'].unique())
print('# datasets:', df.shape[0] // n_methods)
df