# Graph Classification with Graph Neural Networks

* Graph classification refers to the problem of classifying entire graphs

* TASK
    * molecular property prediction
    * molecules는 그래프로 표현됨.
    * task는 분자가 HIV 바이러스 복제를 억제하는지 여부를 추론하는 것일 수 있습니다.

* dataset
    * [TUDatasets](https://chrsmrrs.github.io/datasets/)

In [1]:
import torch
from torch_geometric.datasets import TUDataset

# Load the MUTAG collection of the TUDataset benchmark into `data/TUDataset`.
dataset = TUDataset(root='data/TUDataset', name='MUTAG')

# Dataset-level summary.
print()
print(f'Dataset: {dataset}:')
print('====================')
print(
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    sep='\n',
)

data = dataset[0]  # Only the first graph is inspected below.

print()
print(data)
print('=============================================================')

# Statistics of that first graph, one per output line.
print(
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Contains isolated nodes: {data.contains_isolated_nodes()}',
    f'Contains self-loops: {data.contains_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
    sep='\n',
)

Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Extracting data/TUDataset/MUTAG/MUTAG.zip
Processing...
Done!

Dataset: MUTAG(188):
Number of graphs: 188
Number of features: 7
Number of classes: 2

Data(edge_attr=[38, 4], edge_index=[2, 38], x=[17, 7], y=[1])
Number of nodes: 17
Number of edges: 38
Average node degree: 2.24
Contains isolated nodes: False
Contains self-loops: False
Is undirected: True


* 특징
    * 188개의 그래프를 가짐
    * 각각의 그래프를 이진 분류하는 것이 문제임
    * node features (7)
    * edge 수: 첫 번째 그래프 기준 38개 (edge_index=[2, 38])
    * 추가로 4차원 edge feature(edge_attr=[38, 4])를 제공합니다. 그러나 단순함을 위해 여기서는 사용하지 않습니다.


In [2]:
# Seed torch's global RNG before shuffling (presumably so the split is
# reproducible between runs).
torch.manual_seed(12345)
dataset = dataset.shuffle()

# The first 150 shuffled graphs are used for training, the rest for testing.
train_dataset, test_dataset = dataset[:150], dataset[150:]

print(
    f'Number of training graphs: {len(train_dataset)}',
    f'Number of test graphs: {len(test_dataset)}',
    sep='\n',
)

Number of training graphs: 150
Number of test graphs: 38


# Mini-batching of graphs

* 그래프 분류에 쓰이는 그래프들은 보통 크기가 작기 때문에, GPU를 충분히 활용하려면 그래프를 GNN에 입력하기 전에 배치(batch)로 묶는 것이 좋다.
* 이미지 또는 언어 영역에서 이 절차는 일반적으로 각 예제를 동일한 크기의 모양 세트로 다시 조정하거나 패딩하여 달성되며, 그런 다음 예제는 추가 차원으로 그룹화됩니다.
    * 차원의 길이는 미니 배치로 그룹화된 수와 같으며 batch_size라고 한다
* 그러나 GNN의 경우 위에서 설명한 두 가지 접근 방식이 실현 가능하지 않거나 불필요한 메모리 소비가 많이 발생할 수 있습니다.
    * 따라서 PyTorch Geometric는 여러 예제에서 병렬화를 달성하기 위해 다른 접근 방식을 선택합니다.
    * 여기에서 인접 행렬은 대각선 방향으로 쌓이고(서로 분리된 여러 개의 부분 그래프를 담은 하나의 거대한 그래프 생성), 노드 feature와 타깃(레이블)은 단순히 노드 차원에서 이어 붙여집니다(concatenate).
        * 즉 서로 다른 그래프 사이에는 edge를 연결시키지 않는다.
    * 배치에 따라서 길이가 달라지지 않을까라는 의심
        * 실제로 배치마다 전체 노드 수가 다르다 (아래 출력의 `x=[1185, 7]`, `x=[1146, 7]`, `x=[383, 7]` 참고).


In [3]:
from torch_geometric.data import DataLoader

# Mini-batch loaders with 64 graphs per batch; only the training loader
# is asked to shuffle.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Walk through the training loader once and show each mini-batch.
for step, data in enumerate(train_loader):
    print(
        f'Step {step + 1}:',
        '=======',
        f'Number of graphs in the current batch: {data.num_graphs}',
        sep='\n',
    )
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
Batch(batch=[1185], edge_attr=[2624, 4], edge_index=[2, 2624], ptr=[65], x=[1185, 7], y=[64])

Step 2:
Number of graphs in the current batch: 64
Batch(batch=[1146], edge_attr=[2538, 4], edge_index=[2, 2538], ptr=[65], x=[1146, 7], y=[64])

Step 3:
Number of graphs in the current batch: 22
Batch(batch=[383], edge_attr=[832, 4], edge_index=[2, 832], ptr=[23], x=[383, 7], y=[22])



Here, we opt for a `batch_size` of 64, leading to 3 (randomly shuffled) mini-batches, containing all $2 \cdot 64+22 = 150$ graphs.

Furthermore, each `Batch` object is equipped with a **`batch` vector**, which maps each node to its respective graph in the batch:

$$
\textrm{batch} = [ 0, \ldots, 0, 1, \ldots, 1, 2, \ldots ]
$$

# GNN

1. Embed each node by performing multiple rounds of message passing
2. Aggregate node embeddings into a unified graph embedding (readout layer)
3. Train a final classifier on the graph embedding

There exist multiple readout layers in the literature, but the most common one is to simply take the average of node embeddings:

$$
\mathbf{x}_{\mathcal{G}} = \frac{1}{|\mathcal{V}|} \sum_{v \in \mathcal{V}} \mathbf{x}^{(L)}_v
$$

-> `global_mean_pool`이라는 함수가 있음
    * 미니 배치에 있는 모든 노드의 노드 임베딩과 할당 벡터 배치를 받아 배치의 각 그래프에 대해 [batch_size, hidden_channels] 크기의 그래프 임베딩을 계산합니다.
* GNN을 그래프 분류 작업에 적용하기 위한 최종 아키텍처는 다음과 같으며, 전체를 end-to-end로 학습할 수 있습니다.


In [4]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    """Graph classifier: three ``GCNConv`` layers, mean readout, linear head.

    The input width and the number of output classes are read from the
    module-level ``dataset`` (``num_node_features`` / ``num_classes``).
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Re-seed torch's RNG right before the layers are created.
        torch.manual_seed(12345)
        in_channels = dataset.num_node_features
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class scores per graph in the mini-batch."""
        # Node embeddings: ReLU follows the first two convolutions only.
        h = self.conv1(x, edge_index).relu()
        h = self.conv2(h, edge_index).relu()
        h = self.conv3(h, edge_index)

        # Readout: average the node embeddings of each graph.
        pooled = global_mean_pool(h, batch)  # [batch_size, hidden_channels]

        # Classifier head; dropout is only active while `self.training` is set.
        pooled = F.dropout(pooled, p=0.5, training=self.training)
        return self.lin(pooled)

# Build the GCN with 64 hidden channels and print its layer summary.
model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(7, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [6]:
# Fresh GCN plus the loss and optimizer that train()/test() read as globals.
model = GCN(hidden_channels=64)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train():
    """Run one pass over ``train_loader``, updating ``model`` after each batch.

    Relies on the module-level ``model``, ``criterion``, ``optimizer`` and
    ``train_loader``; returns nothing.
    """
    model.train()

    for batch in train_loader:
        logits = model(batch.x, batch.edge_index, batch.batch)  # Forward pass.
        loss = criterion(logits, batch.y)
        loss.backward()
        optimizer.step()
        # Gradients are cleared after the update, ready for the next batch.
        optimizer.zero_grad()

def test(loader):
    """Return the accuracy of the module-level ``model`` on ``loader``.

    Args:
        loader: Iterable of mini-batches exposing ``x``, ``edge_index``,
            ``batch`` and ``y``. It must also provide ``loader.dataset``,
            whose length is used as the denominator.

    Returns:
        The number of correct predictions divided by ``len(loader.dataset)``.
    """
    model.eval()

    correct = 0
    # Evaluation never calls backward(), so skip autograd bookkeeping
    # instead of building a graph that is thrown away.
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the given dataset.
            out = model(data.x, data.edge_index, data.batch)
            pred = out.argmax(dim=1)  # Index of the highest class score.
            correct += int((pred == data.y).sum())  # Compare with labels.
    return correct / len(loader.dataset)

class Printer():
    """Print progress messages, overwriting the current terminal line.

    The first call, and every ``num_period``-th call after it, prints the
    message on a fresh line (newline, message, newline). All other calls
    write a carriage return and the ANSI erase-line sequence followed by
    the message, so the message replaces whatever is on the current line.

    Args:
        num_period: Number of calls between two fresh lines.
    """

    def __init__(self, num_period=10):
        self.num_period = num_period
        # Position inside the current period; it is 0 only before the
        # very first call.
        self.init_value = 0

    def __call__(self, data):
        """Write ``str(data)`` to stdout according to the period rule."""
        # Imported locally: nothing else in this file imports `sys`, so the
        # overwrite branch below would otherwise raise NameError.
        import sys

        text = str(data)
        if self.init_value % self.num_period == 0:
            print('\n' + text)
            self.init_value = 1
        else:
            sys.stdout.write("\r\x1b[K" + text)
            sys.stdout.flush()
            self.init_value += 1

# Progress reporter: starts a new output line every 50 calls and
# overwrites the current line in between.
printf = Printer(num_period=50)

# 200 epochs; after each one, accuracy is measured on the training loader
# first and then on the test loader.
for epoch in range(1, 201):
    train()
    train_acc, test_acc = test(train_loader), test(test_loader)
    printf(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')


Epoch: 001, Train Acc: 0.6467, Test Acc: 0.7368
[KEpoch: 050, Train Acc: 0.7600, Test Acc: 0.7632
Epoch: 051, Train Acc: 0.7667, Test Acc: 0.7632
[KEpoch: 100, Train Acc: 0.7667, Test Acc: 0.7368
Epoch: 101, Train Acc: 0.7667, Test Acc: 0.7632
[KEpoch: 150, Train Acc: 0.8133, Test Acc: 0.7105
Epoch: 151, Train Acc: 0.8133, Test Acc: 0.7105
[KEpoch: 200, Train Acc: 0.8467, Test Acc: 0.7368

# 대략 모델의 결과는 76%이다


An alternative formulation ([Morris et al. (2018)](https://arxiv.org/abs/1810.02244)) omits neighborhood normalization completely and adds a simple skip-connection to the GNN layer in order to preserve central node information

$$
\mathbf{x}_v^{(\ell+1)} = \mathbf{W}^{(\ell + 1)}_1 \mathbf{x}_v^{(\ell)} + \mathbf{W}^{(\ell + 1)}_2 \sum_{w \in \mathcal{N}(v)} \mathbf{x}_w^{(\ell)}
$$



In [10]:
from torch_geometric.nn import GraphConv
class GNN(torch.nn.Module):
    """Graph classifier built from three ``GraphConv`` layers.

    The layout is conv -> ReLU -> conv -> ReLU -> conv -> mean pooling ->
    dropout -> linear. Input and output sizes come from the module-level
    ``dataset``.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Re-seed torch's RNG right before the layers are created.
        torch.manual_seed(12345)
        num_inputs = dataset.num_node_features
        self.conv1 = GraphConv(num_inputs, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class scores per graph in the mini-batch."""
        # The first two convolutions are followed by ReLU; the third is not.
        node_emb = self.conv1(x, edge_index).relu()
        node_emb = self.conv2(node_emb, edge_index).relu()
        node_emb = self.conv3(node_emb, edge_index)

        # One embedding per graph: the mean of its node embeddings.
        graph_emb = global_mean_pool(node_emb, batch)

        # Dropout is only active while `self.training` is set.
        graph_emb = F.dropout(graph_emb, p=0.5, training=self.training)
        return self.lin(graph_emb)

# Build the GraphConv-based model with 64 hidden channels and print its layers.
model = GNN(hidden_channels=64)
print(model)

GNN(
  (conv1): GraphConv(7, 64)
  (conv2): GraphConv(64, 64)
  (conv3): GraphConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [12]:
# Rebind the globals used by train()/test(): a fresh GNN and an optimizer
# over its parameters. `criterion` and `printf` are reused as they are.
model = GNN(hidden_channels=64)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Same schedule as before: 200 epochs, accuracy on both loaders per epoch.
for epoch in range(1, 201):
    train()
    train_acc, test_acc = test(train_loader), test(test_loader)
    printf(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

GNN(
  (conv1): GraphConv(7, 64)
  (conv2): GraphConv(64, 64)
  (conv3): GraphConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)

Epoch: 001, Train Acc: 0.3533, Test Acc: 0.2632
[KEpoch: 050, Train Acc: 0.9200, Test Acc: 0.8158
Epoch: 051, Train Acc: 0.9133, Test Acc: 0.7895
[KEpoch: 100, Train Acc: 0.9467, Test Acc: 0.8684
Epoch: 101, Train Acc: 0.9333, Test Acc: 0.8158
[KEpoch: 150, Train Acc: 0.9533, Test Acc: 0.8158
Epoch: 151, Train Acc: 0.9467, Test Acc: 0.8158
[KEpoch: 200, Train Acc: 0.9200, Test Acc: 0.8158

In [None]:
# test() requires a loader argument; evaluate the final model on the
# held-out test split.
test(test_loader)

# 결론

*  You have learned how graphs can be batched together for better GPU utilization, and how to apply readout layers for obtaining graph embeddings rather than node embeddings.