## Import Library

In [15]:
# Import library
import time
import metis
import torch
import random
import torch_geometric
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm_notebook
from torch.autograd import Variable
from sklearn.metrics import f1_score
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
import torch_geometric.transforms as T
from torch_geometric.utils.convert import to_networkx
from torch_geometric.transforms import RandomNodeSplit

# Inline command
%matplotlib inline

In [None]:
# VK DATA SET
import os
import os.path as osp

import torch
import numpy as np
import scipy.sparse as sp
from torch_sparse import coalesce
from torch_geometric.data import (InMemoryDataset, Data, download_url,
                                  extract_zip)


class VK(InMemoryDataset):
    """In-memory PyG dataset assembled from two local raw files.

    ``process`` reads ``vk_data.npz`` (arrays ``feature``, ``label`` and
    ``node_types``) and ``vk_graph.npz`` (a SciPy sparse adjacency matrix)
    from ``self.raw_dir`` and saves one ``Data`` object whose train / val /
    test masks are the nodes with ``node_types`` equal to 1 / 2 / 3.

    No ``download`` method is defined, so the raw files must already be
    present locally.

    Args:
        root: Dataset root directory, forwarded to ``InMemoryDataset``.
        transform: Optional transform, forwarded to ``InMemoryDataset``.
        pre_transform: Optional transform applied once inside ``process``
            before the data is written to disk.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        # NOTE(review): recent PyTorch releases changed the default of
        # torch.load's `weights_only` flag, which may reject pickled Data
        # objects -- confirm against the installed torch / PyG versions.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Names of the raw files that ``process`` reads."""
        return ['vk_data.npz', 'vk_graph.npz']

    @property
    def processed_file_names(self):
        """Name of the processed file written by ``process``."""
        return 'data.pt'

    def process(self):
        """Convert the raw ``.npz`` files into a single saved ``Data`` object."""
        # Close the archive deterministically once the arrays are extracted.
        with np.load(osp.join(self.raw_dir, 'vk_data.npz')) as raw:
            x = torch.from_numpy(raw['feature']).to(torch.float)
            y = torch.from_numpy(raw['label']).to(torch.long)
            split = torch.from_numpy(raw['node_types'])

        # load_npz returns the matrix in whatever format it was saved in, and
        # only the COO format exposes `.row` / `.col`; convert explicitly.
        adj = sp.load_npz(osp.join(self.raw_dir, 'vk_graph.npz')).tocoo()
        row = torch.from_numpy(adj.row).to(torch.long)
        col = torch.from_numpy(adj.col).to(torch.long)
        edge_index = torch.stack([row, col], dim=0)
        edge_index, _ = coalesce(edge_index, None, x.size(0), x.size(0))

        graph = Data(x=x, edge_index=edge_index, y=y)
        graph.train_mask = split == 1
        graph.val_mask = split == 2
        graph.test_mask = split == 3

        if self.pre_transform is not None:
            graph = self.pre_transform(graph)

        torch.save(self.collate([graph]), self.processed_paths[0])

# Build the VK dataset rooted at ./datasets/VK
dataset = VK('./datasets/VK')

## Load Dataset using PyG Dataset Loader

### Reddit Dataset

In [None]:
# Load dataset library
from torch_geometric.datasets import Reddit
# Download dataset and attach the NormalizeFeatures transform in one step
dataset = Reddit(root='./datasets/Reddit', transform=T.NormalizeFeatures())

### Cora Dataset

In [2]:
# Load dataset library
from torch_geometric.datasets import Planetoid
# Download dataset and attach the NormalizeFeatures transform in one step
dataset = Planetoid(root='./datasets/Cora', name='Cora', transform=T.NormalizeFeatures())

### Karate Club Dataset

In [None]:
# Load dataset library
from torch_geometric.datasets import KarateClub
# Download dataset and attach the NormalizeFeatures transform in one step
dataset = KarateClub(transform=T.NormalizeFeatures())

## Dataset Information

In [3]:
# Summarise the dataset object that is currently loaded
print('Dataset: {}'.format(dataset))
print('===========================')
dataset_stats = (
    ('Number of graphs: {}', len(dataset)),
    ('Number of features: {}', dataset.num_features),
    ('Number of classes: {}', dataset.num_classes),
)
for stat_template, stat_value in dataset_stats:
    print(stat_template.format(stat_value))

Dataset: Cora()
Number of graphs: 1
Number of features: 1433
Number of classes: 7


In [4]:
# Print dataset detailed information
graph_data = dataset[0]
print('Graph details')
print('===========================')
print('Number of nodes: {}'.format(graph_data.num_nodes))
print('Number of edges: {}'.format(graph_data.num_edges))
print('Average node degree: {:.2f}'.format(graph_data.num_edges / graph_data.num_nodes))

# Only report the split masks the graph actually carries, so this cell does not
# raise AttributeError when the selected dataset ships fewer than three masks.
# NOTE(review): KarateClub (offered above) appears to provide only a train mask.
for split_label, mask_name in (('training', 'train_mask'),
                               ('validation', 'val_mask'),
                               ('test', 'test_mask')):
    split_mask = getattr(graph_data, mask_name, None)
    if split_mask is not None:
        print('Number of {} nodes: {}'.format(split_label, int(split_mask.sum())))

train_mask = getattr(graph_data, 'train_mask', None)
if train_mask is not None:
    print('Training node label rate: {:.2f}'.format(int(train_mask.sum()) / graph_data.num_nodes))

print('Contains isolated nodes: {}'.format(graph_data.has_isolated_nodes()))
print('Contains self loops: {}'.format(graph_data.has_self_loops()))
print('Is undirected: {}'.format(graph_data.is_undirected()))

Graph details
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Number of training nodes: 140
Number of validation nodes: 500
Number of test nodes: 1000
Training node label rate: 0.05
Contains isolated nodes: False
Contains self loops: False
Is undirected: True


## Dataset Preprocessing

### Graph Tools

In [5]:
class GraphTools(object):
    """Plot a graph and split it into METIS clusters for mini-batch training.

    Typical use is ``decompose_graph()`` -> ``generate_subgraph()`` ->
    ``subgraph_to_tensor()``; afterwards the ``subgraph_*`` dictionaries map
    each cluster id to that cluster's nodes, edges, features and labels.

    Args:
        graph: Graph object exposing ``y``, ``num_node_features`` and
            ``num_edge_features`` (e.g. a PyG ``Data``).
        partition_num: Number of partitions requested from METIS.
    """

    def __init__(self, graph, partition_num=10):
        self.graph = graph
        self.node_label = graph.y
        self.partition_num = partition_num
        self.node_feature_num = graph.num_node_features
        self.graph_class_num = len(np.unique(graph.y))

    def visualize_graph(self, color=None, epoch=None, loss=None):
        """Draw ``self.graph`` with matplotlib.

        Args:
            color: Node colours; a random RGB colour is used when ``None``.
            epoch: Optional epoch number for the x-axis label (tensor input only).
            loss: Optional loss exposing ``.item()`` for the x-axis label
                (tensor input only).
        """
        print("[Graph tools] Plotting graph")
        # Define plot properties
        plt.figure(figsize=(7,7))
        plt.xticks([])
        plt.yticks([])
        rgb = color if color is not None else np.random.rand(3,).reshape(1,-1)

        # Check whether input is in tensor or NX graph representation
        # NOTE(review): __init__ reads graph.y, so a raw tensor cannot currently
        # be passed to the constructor; this branch is only reachable if
        # self.graph is reassigned afterwards.
        if torch.is_tensor(self.graph):
            # Convert tensor to numpy array
            graph_numpy = self.graph.detach().cpu().numpy()
            # Plot the first column against the SECOND one (the original used
            # column 0 for both axes, collapsing every point onto a diagonal).
            # `c=` (rather than `color=`) lets numeric labels go through `cmap`.
            plt.scatter(graph_numpy[:, 0], graph_numpy[:, 1], s=140, c=rgb, cmap='Set2')
            # Print additional label
            if (epoch is not None) and (loss is not None):
                plt.xlabel('Epoch: {}, Loss: {:.4f}'.format(epoch, loss.item()), fontsize=16)
        else:
            # Convert graph to networkx format
            graph_nx = self._to_networkx()
            nx.draw_networkx(graph_nx, pos=nx.spring_layout(graph_nx, seed=42), with_labels=False, node_size=100, node_color=rgb, cmap='Set2')

        # Show graph
        plt.show()

    def _to_networkx(self):
        """Convert ``self.graph`` to an undirected networkx graph, keeping the
        node / edge attributes only when the graph has such features."""
        return to_networkx(
            self.graph,
            to_undirected=True,
            node_attrs=['x'] if self.graph.num_node_features else None,
            edge_attrs=['edge_attr'] if self.graph.num_edge_features else None,
        )

    def decompose_graph(self):
        """Partition the graph with METIS.

        Sets ``self.nx_graph``, ``self.clusters`` (distinct partition ids) and
        ``self.cluster_members`` (position in the METIS result -> partition id).
        """
        # Convert graph into networkX representation
        print("[Graph tools] Converting graph from tensor to networkX")
        self.nx_graph = self._to_networkx()
        # Partition graph
        print("[Graph tools] Partitioning graph using metis")
        (_, parts) = metis.part_graph(self.nx_graph, self.partition_num)
        # Create cluster membership list
        self.clusters = list(set(parts))
        self.cluster_members = {node: member for node, member in enumerate(parts)}

    def generate_subgraph(self):
        """Collect nodes, re-indexed edges, features and labels per cluster.

        Requires ``decompose_graph`` to have been called. Each undirected
        edge is stored in both directions, using node indices local to the
        cluster (position in the sorted node list).
        """
        self.subgraph_nodes = {}
        self.subgraph_edges = {}
        self.subgraph_node_features = {}
        self.subgraph_edge_features = {}
        self.subgraph_node_labels = {}

        # Bucket the nodes in a single pass instead of rescanning every node
        # once per cluster; iterating in sorted order keeps each bucket sorted.
        members_by_cluster = {cluster: [] for cluster in self.clusters}
        for node in sorted(self.nx_graph.nodes()):
            members_by_cluster[self.cluster_members[node]].append(node)

        for cluster in self.clusters:
            nodes = members_by_cluster[cluster]
            subgraph = self.nx_graph.subgraph(nodes)
            # Global node id -> index local to this cluster
            mapper = {node: i for i, node in enumerate(nodes)}
            edges = list(subgraph.edges())
            self.subgraph_nodes[cluster] = nodes
            self.subgraph_edges[cluster] = (
                [[mapper[u], mapper[v]] for u, v in edges]
                + [[mapper[v], mapper[u]] for u, v in edges]
            )
            self.subgraph_node_features[cluster] = [subgraph.nodes[node]['x'] for node in nodes]
            self.subgraph_node_labels[cluster] = self.node_label[nodes]

    def subgraph_to_tensor(self):
        """Convert the per-cluster Python containers into torch tensors.

        Edges are always shaped ``(num_edges, 2)``, including ``(0, 2)`` for a
        cluster without internal edges, so ``.t()`` yields a valid
        ``(2, num_edges)`` edge index.
        """
        for cluster in self.clusters:
            self.subgraph_nodes[cluster] = torch.as_tensor(self.subgraph_nodes[cluster], dtype=torch.long)
            self.subgraph_edges[cluster] = torch.as_tensor(self.subgraph_edges[cluster], dtype=torch.long).reshape(-1, 2)
            self.subgraph_node_features[cluster] = torch.as_tensor(self.subgraph_node_features[cluster], dtype=torch.float)
            self.subgraph_node_labels[cluster] = torch.as_tensor(self.subgraph_node_labels[cluster], dtype=torch.long)

In [6]:
# Wrap the full graph (10 partitions is the class default), then run the
# preprocessing stages in order: partition -> extract subgraphs -> tensorise.
graph_tools = GraphTools(graph=graph_data, partition_num=10)
# graph_tools.visualize_graph()
graph_tools.decompose_graph()
graph_tools.generate_subgraph()
graph_tools.subgraph_to_tensor()

[Graph tools] Converting graph from tensor to networkX
[Graph tools] Partitioning graph using metis


## Graph Attention Network (GAT)

### Define Network Model

In [7]:
class GAT(nn.Module):
    """Two-layer graph attention network that outputs per-node log-probabilities.

    Args:
        input_channels: Size of each input node feature vector.
        output_channels: Number of output classes.
        hidden_features: Output size of each attention head in the first layer.
        input_head: Number of attention heads in the first layer (concatenated).
        output_head: Number of attention heads in the second layer (averaged,
            because ``concat=False``).
        dropout: Dropout probability used both on the layer inputs and inside
            the attention layers. Defaults to 0.6, the value that was
            previously hard-coded.
    """

    def __init__(self, input_channels, output_channels, hidden_features=8, input_head=8, output_head=1, dropout=0.6):
        # Define class properties
        super().__init__()
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.hidden_features = hidden_features
        self.input_head = input_head
        self.output_head = output_head
        self.dropout = dropout

        # Define attention mechanism
        self.attention1 = GATConv(self.input_channels, self.hidden_features, heads=self.input_head, dropout=self.dropout)
        # The first layer concatenates its heads, hence hidden_features * input_head inputs.
        self.attention2 = GATConv((self.hidden_features*self.input_head), self.output_channels, concat=False, heads=self.output_head, dropout=self.dropout)

    def forward(self, data):
        """Run the network on ``data`` (must expose ``x`` and ``edge_index``).

        Returns:
            Log-softmax over dimension 1 of the second layer's output.
        """
        # Define feedforward process
        x, edge_index = data.x, data.edge_index
        # First layer
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.attention1(x, edge_index)
        x = F.elu(x)
        # Second layer
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.attention2(x, edge_index)
        # Output value
        return F.log_softmax(x, dim=1)

### Define Trainer Class

In [25]:
class GATTrainer(object):
    """Train and evaluate a ``GAT`` one graph cluster at a time.

    Every cluster is turned into a ``Data`` object and split into
    train / val / test nodes exactly once; the same split is then reused for
    all training steps and for testing, so test nodes are never trained on.

    Args:
        graph_cluster: Object exposing ``clusters``, ``subgraph_node_features``,
            ``subgraph_edges``, ``subgraph_node_labels``, ``node_feature_num``
            and ``graph_class_num`` (e.g. a prepared ``GraphTools``).
        epochs: Number of passes over all clusters.
        learning_rate: Adam learning rate.
        num_train_per_class: Training nodes drawn per class in each cluster.
    """

    def __init__(self, graph_cluster, epochs=100, learning_rate=0.005, num_train_per_class=10):
        # Define class properties
        self.graph_cluster = graph_cluster
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.transform = RandomNodeSplit(split='random', num_train_per_class=num_train_per_class, num_val=0.1, num_test=0.2)
        self.total_train_loss = 0
        self.total_node_count = 0
        # cluster id -> Data with fixed split masks, resident on self.device
        self._cluster_data = {}
        self.create_network()

    def create_network(self):
        """Instantiate the GAT sized for the full graph and move it to the device."""
        self.model = GAT(self.graph_cluster.node_feature_num, self.graph_cluster.graph_class_num)
        self.model = self.model.to(self.device)

    def update_avg_loss(self, batch_avg_loss, node_count):
        """Fold one batch into the node-weighted running mean loss.

        The running totals are never reset, so the returned value averages
        over every batch seen since the trainer was created.

        Args:
            batch_avg_loss: Mean loss of the batch (object exposing ``.item()``).
            node_count: Number of training nodes in the batch (int or 0-dim tensor).

        Returns:
            The running mean loss as a Python float.
        """
        # Coerce to int so the totals stay plain Python numbers instead of
        # (possibly GPU-resident) tensors.
        node_count = int(node_count)
        self.total_train_loss = self.total_train_loss + batch_avg_loss.item() * node_count
        self.total_node_count = self.total_node_count + node_count
        average_loss = self.total_train_loss / self.total_node_count
        return average_loss

    def _get_cluster_data(self, cluster):
        """Return the ``Data`` object of ``cluster``, building and splitting it on first use."""
        if cluster not in self._cluster_data:
            cluster_data = Data(
                x=self.graph_cluster.subgraph_node_features[cluster],
                edge_index=self.graph_cluster.subgraph_edges[cluster].t().contiguous(),
                y=self.graph_cluster.subgraph_node_labels[cluster],
            )
            # Use the transform's return value: depending on the PyG version the
            # transform may operate on a copy rather than mutating its argument.
            cluster_data = self.transform(cluster_data)
            self._cluster_data[cluster] = cluster_data.to(self.device)
        return self._cluster_data[cluster]

    def inference(self, cluster):
        """Predict on the test nodes of ``cluster``.

        Returns:
            Tuple ``(prediction, ground_truth)`` restricted to the test mask.
        """
        cluster_data = self._get_cluster_data(cluster)
        # No gradients are needed for evaluation.
        with torch.no_grad():
            prediction = self.model(cluster_data)
        prediction = prediction[cluster_data.test_mask]
        ground_truth = cluster_data.y[cluster_data.test_mask]
        return prediction, ground_truth

    def forward_propagation(self, cluster):
        """Compute the NLL loss on the training nodes of ``cluster``.

        Returns:
            Tuple ``(avg_loss, node_count)``: the mean loss tensor and the
            number of training nodes (0-dim tensor).
        """
        cluster_data = self._get_cluster_data(cluster)
        prediction = self.model(cluster_data)
        avg_loss = F.nll_loss(prediction[cluster_data.train_mask], cluster_data.y[cluster_data.train_mask])
        node_count = cluster_data.train_mask.sum()
        return avg_loss, node_count

    def train(self):
        """Optimise the model with Adam, visiting the clusters in random order each epoch."""
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate, weight_decay=5e-4)
        self.model.train()
        for epoch in tqdm_notebook(range(self.epochs), desc= 'Training progress: '):
            # Shuffle a copy so the caller's cluster list is left untouched.
            cluster_order = list(self.graph_cluster.clusters)
            random.shuffle(cluster_order)
            for cluster in cluster_order:
                self.optimizer.zero_grad()
                batch_avg_loss, node_count = self.forward_propagation(cluster)
                batch_avg_loss.backward()
                self.optimizer.step()
                average_loss = self.update_avg_loss(batch_avg_loss, node_count)

            if (epoch % 10) == 0:
                print("Epoch: {} - Average Loss: {}".format(epoch, average_loss))

    def test(self):
        """Evaluate on the test nodes of every cluster and print the micro F1-score."""
        self.predictions = []
        self.ground_truths = []
        self.model.eval()
        for cluster in self.graph_cluster.clusters:
            prediction, ground_truth = self.inference(cluster)
            self.predictions.append(prediction.cpu().detach().numpy())
            self.ground_truths.append(ground_truth.cpu().detach().numpy())
        self.predictions = np.concatenate(self.predictions).argmax(1)
        self.ground_truths = np.concatenate(self.ground_truths)
        model_score = f1_score(self.ground_truths, self.predictions, average='micro')
        print("F1-score: {}\n".format(model_score))
        

### Train Model

In [29]:
# Fit the GAT on the prepared clusters for 1000 epochs, then score it on the test nodes
gat_trainer = GATTrainer(graph_cluster=graph_tools, epochs=1000)
gat_trainer.train()
gat_trainer.test()

Training progress:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch: 0 - Average Loss: 1.9419031143188477
Epoch: 10 - Average Loss: 1.83107590675354
Epoch: 20 - Average Loss: 1.7063993215560913
Epoch: 30 - Average Loss: 1.6120610237121582
Epoch: 40 - Average Loss: 1.539084553718567
Epoch: 50 - Average Loss: 1.483109951019287
Epoch: 60 - Average Loss: 1.442809820175171
Epoch: 70 - Average Loss: 1.4083218574523926
Epoch: 80 - Average Loss: 1.379614233970642
Epoch: 90 - Average Loss: 1.3556207418441772
Epoch: 100 - Average Loss: 1.3356729745864868
Epoch: 110 - Average Loss: 1.319382905960083
Epoch: 120 - Average Loss: 1.307982325553894
Epoch: 130 - Average Loss: 1.2965439558029175
Epoch: 140 - Average Loss: 1.2863901853561401
Epoch: 150 - Average Loss: 1.27568519115448
Epoch: 160 - Average Loss: 1.2673760652542114
Epoch: 170 - Average Loss: 1.259655475616455
Epoch: 180 - Average Loss: 1.2528756856918335
Epoch: 190 - Average Loss: 1.2478652000427246
Epoch: 200 - Average Loss: 1.2436083555221558
Epoch: 210 - Average Loss: 1.2387456893920898
Epoch: 220

--------------------------

------------------------------------------

## Model Playground

### Subgraph Generation

In [None]:
# Assemble a standalone Data object from the tensors of cluster 0
new_subgraph = Data(
    x=graph_tools.subgraph_node_features[0],
    # Stored as (num_edges, 2); transpose to the (2, num_edges) edge_index layout
    edge_index=graph_tools.subgraph_edges[0].t().contiguous(),
    y=graph_tools.subgraph_node_labels[0],
)

In [None]:
# Summarise the freshly built subgraph
print('Graph details')
print('===========================')
subgraph_stats = (
    ('Number of nodes: {}', new_subgraph.num_nodes),
    ('Number of edges: {}', new_subgraph.num_edges),
    ('Number of node features: {}', new_subgraph.num_node_features),
    ('Number of edge features: {}', new_subgraph.num_edge_features),
    ('Number of node labels: {}', len(new_subgraph.y)),
    ('Average node degree: {:.2f}', new_subgraph.num_edges / new_subgraph.num_nodes),
    ('Contains isolated nodes: {}', new_subgraph.has_isolated_nodes()),
    ('Contains self loops: {}', new_subgraph.has_self_loops()),
    ('Is undirected: {}', new_subgraph.is_undirected()),
)
for stat_template, stat_value in subgraph_stats:
    print(stat_template.format(stat_value))

In [None]:
# Wrap the subgraph in its own helper and draw it
subgraph_tools = GraphTools(graph=new_subgraph)
subgraph_tools.visualize_graph()

In [None]:
# Reserve 10% / 20% of the nodes for validation / test and train on the rest.
# The previous setting (split='random', num_train_per_class=2000) asks for more
# training nodes per class than a single cluster may contain.
# NOTE(review): in that case every node appears to end up in the training set,
# leaving the validation and test masks empty -- confirm against the
# RandomNodeSplit documentation for the installed PyG version.
transform = RandomNodeSplit(split='train_rest', num_val=0.1, num_test=0.2)
# Keep the returned object: depending on the PyG version the transform may
# work on a copy instead of mutating its argument.
new_subgraph = transform(new_subgraph)

In [None]:
# Summarise the subgraph again, now including the training split
print('Graph details')
print('===========================')
split_stats = (
    ('Number of nodes: {}', new_subgraph.num_nodes),
    ('Number of edges: {}', new_subgraph.num_edges),
    ('Number of node features: {}', new_subgraph.num_node_features),
    ('Number of edge features: {}', new_subgraph.num_edge_features),
    ('Average node degree: {:.2f}', new_subgraph.num_edges / new_subgraph.num_nodes),
    ('Number of training nodes: {}', new_subgraph.train_mask.sum()),
    ('Training node label rate: {:.2f}', int(new_subgraph.train_mask.sum()) / new_subgraph.num_nodes),
    ('Contains isolated nodes: {}', new_subgraph.has_isolated_nodes()),
    ('Contains self loops: {}', new_subgraph.has_self_loops()),
    ('Is undirected: {}', new_subgraph.is_undirected()),
)
for stat_template, stat_value in split_stats:
    print(stat_template.format(stat_value))

### Training Process

In [None]:
# Configure GPU for training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: {}".format(device))

# Initialize model and dataset
# GAT has two required arguments (input and output sizes); calling GAT() with
# none raises TypeError. Size the network from the full graph, as
# GATTrainer.create_network does, so the output layer covers every class even
# if this particular cluster does not contain all of them.
model = GAT(graph_tools.node_feature_num, graph_tools.graph_class_num).to(device)
data = new_subgraph.to(device)

# Define training parameters
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
epochs  = 1000

# Start training process
model.train()  # the mode does not change inside the loop, so set it once
for epoch in range(epochs):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])

    if (epoch % 10) == 0:
        print("Epoch: {} - Loss: {}".format(epoch, loss.item()))

    loss.backward()
    optimizer.step()

### Evaluation Process

In [None]:
# Evaluate on the held-out test nodes; no gradients are needed here.
model.eval()
with torch.no_grad():
    pred = model(data).argmax(dim=1)

test_node_count = int(data.test_mask.sum())
if test_node_count == 0:
    # Avoid a ZeroDivisionError when the split left no test nodes.
    print('Accuracy: n/a (the test split is empty)')
else:
    correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
    acc = correct / test_node_count
    print('Accuracy: {:.4f}'.format(acc))