# Chapter 3: Graph Classification

Main question: Given a network with labels on some nodes, how do we assign labels to all other nodes in the network?

This notebook guides you through iterative classification for this task.

---

## Step 1: Import necessary libraries and import the dataset

In [2]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.data import Data

# Load the Cora dataset through the Planetoid loader; files are stored under ./data/Cora
dataset = Planetoid(root='./data/Cora', name='Cora')

# Take the first graph of the dataset; it carries x, edge_index, y and the train/val/test masks
data = dataset[0]
# Bare expression: displays the Data summary as the notebook cell output
data

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

---

## Step 2: Implement your classifiers

In [14]:
import torch_geometric as pyg
# Classifier 1: node features -> node labels
class Phi1(torch.nn.Module):
    """Classifier 1: predicts a node's label from its own features only.

    A two-layer MLP (Linear -> ReLU -> Dropout -> Linear) that returns
    unnormalised class scores (logits).

    Args:
        in_channels: Size of each input feature vector.
        out_channels: Number of classes to score.
        hidden_channels: Width of the hidden layer. Defaults to 64, the
            value that was previously hard-coded.
        dropout: Dropout probability applied after the hidden activation.
            Defaults to 0.5, the value that was previously hard-coded.
    """

    def __init__(self, in_channels: int, out_channels: int, hidden_channels: int = 64, dropout: float = 0.5):
        super().__init__()
        # Layer order is unchanged so that existing state_dict keys
        # (classifier.0.*, classifier.3.*) stay valid.
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(in_channels, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=dropout),
            torch.nn.Linear(hidden_channels, out_channels),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the class logits for the feature rows in ``x``."""
        return self.classifier(x)
    
# Classifier 2: (node features, labels of neighbors) -> node labels
class Phi2(torch.nn.Module):
    """Classifier 2: predicts a node's label from (node features, labels of neighbors).

    Structurally the same two-layer MLP as the first classifier
    (Linear -> ReLU -> Dropout -> Linear). The caller is expected to pass
    input rows that already contain the node features concatenated with the
    neighbor-label summary, so ``in_channels`` must cover both parts.

    Args:
        in_channels: Size of each (concatenated) input vector.
        out_channels: Number of classes to score.
        hidden_channels: Width of the hidden layer. Defaults to 64, the
            value that was previously hard-coded.
        dropout: Dropout probability applied after the hidden activation.
            Defaults to 0.5, the value that was previously hard-coded.
    """

    def __init__(self, in_channels: int, out_channels: int, hidden_channels: int = 64, dropout: float = 0.5):
        super().__init__()
        # Layer order is unchanged so that existing state_dict keys
        # (classifier.0.*, classifier.3.*) stay valid.
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(in_channels, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=dropout),
            torch.nn.Linear(hidden_channels, out_channels),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the class logits for the input rows in ``x``."""
        return self.classifier(x)

---

## Step 3: Training phase 1 (node features only for Phi 1)

In [18]:
def train_phi(model: torch.nn.Module, x: torch.Tensor, data: Data, optimizer: torch.optim.Optimizer) -> float:
    """
    Perform one optimisation step of a classifier on the training nodes.

    The model is run on every row of `x`, but only the rows selected by `data.train_mask` enter the cross-entropy loss.
    Args:
        model: The classifier to be trained
        x: The input features, one row per node
        data: The graph data; `data.y` and `data.train_mask` are read
        optimizer: The optimizer that updates the parameters of `model`
    Returns:
        The loss of this step (computed before the parameter update) as a Python float
    """
    train_nodes = data.train_mask
    targets = data.y[train_nodes]

    model.train()  # switch to training mode before the forward pass
    optimizer.zero_grad()  # drop gradients left over from the previous step

    logits = model(x)
    step_loss = F.cross_entropy(logits[train_nodes], targets)
    step_loss.backward()
    optimizer.step()

    return step_loss.item()

def test_phi(model: torch.nn.Module, x: torch.Tensor, data: Data) -> None:
    """
    Implement the testing loop for the first classifier. Note that you should use the training mask to ensure that only training nodes are used.
    Args:
        model: The classifier to be tested
        x: The input features with shape (num_nodes, num_node_features)
        data: The graph data
    """
    model.eval()
    out = model(x)
    pred = out.argmax(dim=1)
    train_correct = pred[data.train_mask] == data.y[data.train_mask]
    train_acc = int(train_correct.sum()) / int(data.train_mask.sum())
    val_correct = pred[data.val_mask] == data.y[data.val_mask]
    val_acc = int(val_correct.sum()) / int(data.val_mask.sum())
    print("Train accuracy: ", train_acc)
    print("Val accuracy: ", val_acc)


# Initialize the first classifier and the optimizer
# Row-normalize the features in place: every row is divided by its own sum.
# clamp(min=1) makes rows whose sum is below 1 (e.g. all-zero rows) divide by 1 instead.
data.x = data.x / data.x.sum(1, keepdim=True).clamp(min=1)
phi1 = Phi1(dataset.num_node_features, dataset.num_classes)
# Adam with learning rate 0.01 and weight decay 5e-4
optimizer1 = torch.optim.Adam(phi1.parameters(), lr=0.01, weight_decay=5e-4)

# Run 1000 training steps on the node features alone, logging the loss every 100 epochs
for epoch in range(1000):
    loss = train_phi(phi1, data.x, data, optimizer1)
    if epoch % 100 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

# Print train/validation accuracy of the feature-only classifier
test_phi(phi1, data.x, data)

Epoch 0, Loss: 1.9471
Epoch 100, Loss: 0.1587
Epoch 200, Loss: 0.0932
Epoch 300, Loss: 0.1096
Epoch 400, Loss: 0.0851
Epoch 500, Loss: 0.0815
Epoch 600, Loss: 0.0845
Epoch 700, Loss: 0.0688
Epoch 800, Loss: 0.0839
Epoch 900, Loss: 0.0939
Train accuracy:  1.0
Val accuracy:  0.588


---

## Step 4: Training phase 1 (node features and relational features for Phi 2)

In [26]:
def cat_relational_features(data: Data, y_pred: torch.Tensor, num_classes: "int | None" = None) -> torch.Tensor:
    """
    Append a summary of the neighbors' labels to the features of each node.

    For every edge (src, dst) in `data.edge_index`, the one-hot label of `src` is added to the row of `dst`; each row is then divided by the number of edges arriving at that node, giving the label distribution of its neighbors. Nodes without incoming edges keep an all-zero summary.
    Args:
        data: The graph data; `data.x`, `data.edge_index` and `data.num_nodes` are read
        y_pred: The predicted labels of the nodes with shape (num_nodes); note that y_pred[data.train_mask] is the ground truth labels
        num_classes: The number of classes; when omitted, the module-level `dataset.num_classes` is used as before
    Returns:
        The node features concatenated with the relational features, with shape (num_nodes, num_node_features + num_classes)
    """
    if num_classes is None:
        # Backward-compatible default: fall back to the notebook's global dataset.
        num_classes = dataset.num_classes

    y_onehot = F.one_hot(y_pred, num_classes).float()
    src, dst = data.edge_index

    # Sum the one-hot labels of the source nodes at each destination node.
    neighbor_summary = torch.zeros(data.num_nodes, num_classes, device=data.x.device)
    neighbor_summary.index_add_(0, dst, y_onehot[src])

    # Every one-hot row sums to 1, so the row sum equals the number of incoming edges.
    # clamp(min=1) avoids dividing by zero for nodes without incoming edges.
    degree = neighbor_summary.sum(dim=1, keepdim=True).clamp(min=1)
    neighbor_summary = neighbor_summary / degree

    return torch.cat([data.x, neighbor_summary], dim=1)

def get_y_pred(x: torch.Tensor, data: Data, model: torch.nn.Module) -> torch.Tensor:
    """
    Get the predicted labels of the nodes; set the labels of training nodes to the ground truth labels.

    The model is switched to evaluation mode and run without gradient tracking, since the result is only used as hard labels.
    Args:
        x: The input features, one row per node
        data: The graph data; `data.y` and `data.train_mask` are read
        model: The classifier
    Returns:
        The predicted labels with shape (num_nodes)
    """
    model.eval()
    # Inference only: no autograd graph is needed for an argmax over the outputs.
    with torch.no_grad():
        y_pred = model(x).argmax(dim=1)
    y_pred[data.train_mask] = data.y[data.train_mask]  # fix the labels of training nodes
    return y_pred

In [28]:
# Initialize the second classifier and the optimizer
# The input width is the node features plus one relational feature per class
phi2 = Phi2(dataset.num_features + dataset.num_classes, dataset.num_classes)
# NOTE(review): unlike optimizer1, no weight_decay is set here -- confirm this is intentional
optimizer2 = torch.optim.Adam(phi2.parameters(), lr=0.01)

# Initial training with predicted labels from phi1
y_pred = get_y_pred(data.x, data, phi1)
# Concatenate the relational features to the node features
# (xz is computed once, so phi2 is trained against a fixed set of neighbor labels)
xz = cat_relational_features(data, y_pred)
for epoch in range(1000):
    loss = train_phi(phi2, xz, data, optimizer2)
    if epoch % 100 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

# Print train/validation accuracy of phi2 on the same inputs it was trained with
test_phi(phi2, xz, data)

Epoch 0, Loss: 1.9500
Epoch 100, Loss: 0.0208
Epoch 200, Loss: 0.0078
Epoch 300, Loss: 0.0050
Epoch 400, Loss: 0.0031
Epoch 500, Loss: 0.0037
Epoch 600, Loss: 0.0025
Epoch 700, Loss: 0.0018
Epoch 800, Loss: 0.0023
Epoch 900, Loss: 0.0008
Train accuracy:  1.0
Val accuracy:  0.642


---

## Step 5: Phase 2; iterate until convergence

In [None]:
def iterative_classification(data: Data, phi1: torch.nn.Module, phi2: torch.nn.Module, max_iters: int = 10) -> None:
    """
    Implement the iterative classification algorithm (phase 2).

    Labels are bootstrapped with phi1, then repeatedly re-predicted by phi2 from the node features and the current labels of the neighbors. All nodes are updated at once from the labels of the previous iteration, and training nodes are reset to their ground truth after every update. The loop stops early when no label changes; validation accuracy is printed per iteration and test accuracy at the end.
    Args:
        data: The graph data
        phi1: The first classifier
        phi2: The second classifier
        max_iters: The maximum number of iterations
    """
    # Bootstrap labels: phi1 predictions, with ground truth on the training nodes.
    y_pred = get_y_pred(data.x, data, phi1)

    # A model left in training mode (e.g. one with dropout, like Phi2) would give
    # stochastic predictions, so the labels could keep changing forever.
    phi2.eval()

    for i in range(max_iters):
        # Rebuild the relational features from the current label assignment.
        xz = cat_relational_features(data, y_pred)
        with torch.no_grad():
            new_y_pred = phi2(xz).argmax(dim=1)
        new_y_pred[data.train_mask] = data.y[data.train_mask]  # fix the labels of training nodes

        change_count = (new_y_pred != y_pred).sum().item()
        val_correct = new_y_pred[data.val_mask] == data.y[data.val_mask]
        val_acc = int(val_correct.sum()) / int(data.val_mask.sum())
        print(f'Iter {i+1}: Val Acc: {val_acc:.4f}, Labels Changed: {change_count}')
        y_pred = new_y_pred

        if change_count == 0:
            print("Converged!")
            break

    # Test accuracy of the last label assignment (converged or not).
    acc = (y_pred[data.test_mask] == data.y[data.test_mask]).float().mean().item()
    print(f'Test Accuracy: {acc:.4f}')


# Run phase 2 with the two trained classifiers, for at most 10 iterations
iterative_classification(data, phi1, phi2,max_iters=10)

--- Starting Iterative Classification ---
Iter 1: Val Acc: 0.6420, Labels Changed: 981
Iter 2: Val Acc: 0.7040, Labels Changed: 561
Iter 3: Val Acc: 0.6740, Labels Changed: 394
Iter 4: Val Acc: 0.6920, Labels Changed: 338
Iter 5: Val Acc: 0.6800, Labels Changed: 315
Iter 6: Val Acc: 0.6920, Labels Changed: 299
Iter 7: Val Acc: 0.6820, Labels Changed: 284
Iter 8: Val Acc: 0.6940, Labels Changed: 276
Iter 9: Val Acc: 0.6820, Labels Changed: 275
Iter 10: Val Acc: 0.6940, Labels Changed: 275
Final Test Accuracy: 0.7090


---
# Discussions

这个任务揭示了一个很有趣的现象：仅依赖节点自身特征的单点预测（Phi1）效果确实较差（验证集准确率约 0.59），但把这些并不准确的预测作为邻居标签输入 Phi2，并在网络上迭代之后，最终却能得到明显更好的结果（验证集约 0.69，测试集约 0.71）。乍看之下，“garbage in, garbage out”的规律似乎不成立了。但实际上，这个迭代过程并不是简单地把低质量的预测再输入一遍，而是额外引入了高质量的网络结构信息（邻居的标签分布）。很可能正是这种网络信息提升了模型的表现。