In [2]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

# Load the Cora dataset through Planetoid (files are kept under /tmp/Cora),
# with the NormalizeFeatures transform attached.
dataset = Planetoid(
    root='/tmp/Cora',
    name='Cora',
    transform=NormalizeFeatures(),
)

# Emit the four summary lines with a single print call, one item per line.
print(
    f'Dataset: {dataset}:',
    '======================',
    f'Number of graphs: {len(dataset)}',
    f'Number of classes: {dataset.num_classes}',  # number of node classes
    sep='\n',
)

Dataset: Cora():
Number of graphs: 1
Number of classes: 7


In [3]:
# Print a summary of every graph in the dataset.
# NOTE: `graph` stays bound to the last graph once the loop ends, and the
# cells further down (mask inspection, train(), test()) rely on that binding.
for i, graph in enumerate(dataset):
    print(f'{i}-th graph: {graph}')
    print('------------')
    print(f'Number of nodes: {graph.num_nodes}')
    print(f'Number of node features: {graph.num_node_features}')
    print(f'Number of edges: {graph.num_edges}')
    print(f'Number of edge features: {graph.num_edge_features}')
    # Edges per node, computed directly from the two counts printed above.
    print(f'Average node degree: {graph.num_edges / graph.num_nodes:.2f}')
    # Each mask is summed to count how many nodes it selects.
    print(f'Number of training nodes: {graph.train_mask.sum()}')
    print(f'Number of validation nodes: {graph.val_mask.sum()}')
    print(f'Number of test nodes: {graph.test_mask.sum()}')
    print(f'Has isolated nodes(nodes without edges): {graph.has_isolated_nodes()}')
    print(f'Has self-loops: {graph.has_self_loops()}')
    print(f'Is undirected: {graph.is_undirected()}')

0-th graph: Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
------------
Number of nodes: 2708
Number of node features: 1433
Number of edges: 10556
Number of edge features: 0
Average node degree: 3.90
Number of training nodes: 140
Number of validation nodes: 500
Number of test nodes: 1000
Has isolated nodes(nodes without edges): False
Has self-loops: False
Is undirected: True


In [8]:
# Boolean mask with one entry per node; True marks a node used for training.
graph.train_mask

tensor([ True,  True,  True,  ..., False, False, False])

In [9]:
# Length of the training mask (compare with the node count printed earlier).
len(graph.train_mask)

2708

In [10]:
# Summing the boolean mask counts its True entries, i.e. the training nodes.
graph.train_mask.sum()


tensor(140)

In [11]:
# Boolean mask with one entry per node; True marks a validation node.
graph.val_mask

tensor([False, False, False,  ..., False, False, False])

In [12]:
# Length of the validation mask (compare with the node count printed earlier).
len(graph.val_mask)

2708

In [13]:
# Summing the boolean mask counts its True entries, i.e. the validation nodes.
graph.val_mask.sum()

tensor(500)

In [14]:
# Boolean mask with one entry per node; True marks a held-out test node.
graph.test_mask

tensor([False, False, False,  ...,  True,  True,  True])

In [15]:
# Length of the test mask (compare with the node count printed earlier).
len(graph.test_mask)

2708

In [16]:
# Summing the boolean mask counts its True entries, i.e. the test nodes.
graph.test_mask.sum()

tensor(1000)

## Training an MLP on Cora

**Let's construct a simple MLP that operates solely on the input node features (using shared weights across all nodes)...**

In [6]:
import torch
from torch.nn import Linear
import torch.nn.functional as F


class MLP(torch.nn.Module):
    """Two-layer perceptron that maps each row of the input to class logits.

    Every row of ``x`` is processed with the same two linear layers; no graph
    connectivity is used.

    Args:
        hidden_channels: Width of the hidden layer.
        in_channels: Size of each input feature vector. When ``None`` (the
            default) it is read from the notebook-global ``dataset.num_features``.
        out_channels: Number of output logits per row. When ``None`` (the
            default) it is read from the notebook-global ``dataset.num_classes``.
        dropout: Dropout probability applied after the hidden ReLU while the
            module is in training mode. Defaults to 0.5.
    """

    def __init__(self, hidden_channels, in_channels=None, out_channels=None, dropout=0.5):
        super().__init__()
        # Only touch the global dataset when the caller did not supply sizes,
        # so the class can also be built without the notebook's state.
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes
        # Re-seeds torch's global RNG right before the layers are created so
        # the initial weights are the same on every construction.
        torch.manual_seed(12345)
        self.lin1 = Linear(in_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, out_channels)
        self.dropout = dropout

    def forward(self, x):
        """Return the output of ``lin2`` for ``x`` (no softmax is applied)."""
        x = self.lin1(x)
        x = x.relu()
        # Dropout is only active when self.training is True.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lin2(x)
        return x

# Build the MLP with a 16-unit hidden layer and show its layer layout.
model = MLP(hidden_channels=16)
print(model)

# Cross-entropy loss on the model outputs, optimised by Adam with weight decay.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

def train():
    """Run one optimisation step on the training nodes and return its loss.

    Works on the notebook-level ``model``, ``optimizer``, ``criterion`` and
    ``graph``; the forward pass covers all nodes, but only the rows selected
    by ``graph.train_mask`` contribute to the loss.
    """
    model.train()
    optimizer.zero_grad()  # drop gradients left over from the previous step
    logits = model(graph.x)
    mask = graph.train_mask
    loss = criterion(logits[mask], graph.y[mask])
    loss.backward()
    optimizer.step()
    return loss

def test():
    """Return the fraction of test nodes the current ``model`` classifies correctly.

    Works on the notebook-level ``model`` and ``graph``. The model is switched
    to eval mode and the forward pass runs without autograd tracking, since no
    gradients are needed for evaluation.
    """
    model.eval()
    with torch.no_grad():  # evaluation only: skip building the autograd graph
        out = model(graph.x)
    pred = out.argmax(dim=1)  # Use the class with the highest score.
    mask = graph.test_mask
    test_correct = pred[mask] == graph.y[mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(mask.sum())  # Derive ratio of correct predictions.
    return test_acc

# Train for 200 epochs (1..200), printing the loss returned by each train() call.
for epoch in range(1, 201):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    
# Evaluate once, after all training epochs, on the nodes selected by test_mask.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

MLP(
  (lin1): Linear(in_features=1433, out_features=16, bias=True)
  (lin2): Linear(in_features=16, out_features=7, bias=True)
)
Epoch: 001, Loss: 1.9615
Epoch: 002, Loss: 1.9557
Epoch: 003, Loss: 1.9505
Epoch: 004, Loss: 1.9423
Epoch: 005, Loss: 1.9327
Epoch: 006, Loss: 1.9279
Epoch: 007, Loss: 1.9144
Epoch: 008, Loss: 1.9087
Epoch: 009, Loss: 1.9023
Epoch: 010, Loss: 1.8893
Epoch: 011, Loss: 1.8776
Epoch: 012, Loss: 1.8594
Epoch: 013, Loss: 1.8457
Epoch: 014, Loss: 1.8365
Epoch: 015, Loss: 1.8280
Epoch: 016, Loss: 1.7965
Epoch: 017, Loss: 1.7984
Epoch: 018, Loss: 1.7832
Epoch: 019, Loss: 1.7495
Epoch: 020, Loss: 1.7441
Epoch: 021, Loss: 1.7188
Epoch: 022, Loss: 1.7124
Epoch: 023, Loss: 1.6785
Epoch: 024, Loss: 1.6660
Epoch: 025, Loss: 1.6119
Epoch: 026, Loss: 1.6236
Epoch: 027, Loss: 1.5827
Epoch: 028, Loss: 1.5784
Epoch: 029, Loss: 1.5524
Epoch: 030, Loss: 1.5020
Epoch: 031, Loss: 1.5065
Epoch: 032, Loss: 1.4742
Epoch: 033, Loss: 1.4581
Epoch: 034, Loss: 1.4246
Epoch: 035, Loss: 1.

## Converting the Above MLP into a GNN

**We can convert the MLP implemented above into a GNN by simply substituting the `torch.nn.Linear()` layers with `GCNConv()` layers from PyTorch Geometric's `torch_geometric.nn`.**

I have also included the MLP code (commented out) for comparison. The changes or differences have the comment ***DIFFERENT*** next to them.

In [10]:
from torch_geometric.nn import GCNConv

#class MLP(torch.nn.Module):
#    def __init__(self, hidden_channels):
#        super().__init__()
#        torch.manual_seed(12345)
#        self.lin1 = Linear(dataset.num_features, hidden_channels)
#        self.lin2 = Linear(hidden_channels, dataset.num_classes)

#    def forward(self, x):
#        x = self.lin1(x)
#        x = x.relu()
#        x = F.dropout(x, p=0.5, training=self.training)
#        x = self.lin2(x)
#        return x

class GCN(torch.nn.Module):
    """Two-layer network built from ``GCNConv`` layers.

    Mirrors the MLP above, except that both layers also receive ``edge_index``.

    Args:
        hidden_channels: Width of the hidden layer.
        in_channels: Size of each input feature vector. When ``None`` (the
            default) it is read from the notebook-global ``dataset.num_features``.
        out_channels: Number of output logits per node. When ``None`` (the
            default) it is read from the notebook-global ``dataset.num_classes``.
        dropout: Dropout probability applied after the hidden ReLU while the
            module is in training mode. Defaults to 0.5.
    """

    def __init__(self, hidden_channels, in_channels=None, out_channels=None, dropout=0.5):
        super().__init__()
        # Only touch the global dataset when the caller did not supply sizes.
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes
        # Re-seeds torch's global RNG right before the layers are created so
        # the initial weights are the same on every construction.
        torch.manual_seed(12345)
        self.conv1 = GCNConv(in_channels, hidden_channels) #DIFFERENT
        self.conv2 = GCNConv(hidden_channels, out_channels)  #DIFFERENT
        self.dropout = dropout

    def forward(self, x, edge_index):  #DIFFERENT
        """Return the output of ``conv2`` for features ``x`` and ``edge_index``."""
        x = self.conv1(x, edge_index)  #DIFFERENT
        x = x.relu()
        # Dropout is only active when self.training is True.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index) #DIFFERENT
        return x

#model = MLP(hidden_channels=16)
# Same hidden width as the MLP run; only the model class changes.
model = GCN(hidden_channels=16)
print(model)


#optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
#criterion = torch.nn.CrossEntropyLoss()
# Loss and optimiser settings match the commented-out MLP lines above.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

#def train():
#      model.train()
#      optimizer.zero_grad()
#      out = model(graph.x)
#      loss = criterion(out[graph.train_mask], graph.y[graph.train_mask])
#      loss.backward()
#      optimizer.step()
#      return loss


def train():
    """Run one optimisation step on the training nodes and return its loss.

    Works on the notebook-level ``model``, ``optimizer``, ``criterion`` and
    ``graph``; the forward pass covers all nodes, but only the rows selected
    by ``graph.train_mask`` contribute to the loss.
    """
    model.train()
    optimizer.zero_grad()  # drop gradients left over from the previous step
    logits = model(graph.x, graph.edge_index) #DIFFERENT
    mask = graph.train_mask
    loss = criterion(logits[mask], graph.y[mask])
    loss.backward()
    optimizer.step()
    return loss
    
#def test():
#      model.eval()
#      out = model(graph.x)
#      pred = out.argmax(dim=1)
#      test_correct = pred[graph.test_mask] == graph.y[graph.test_mask]
#      test_acc = int(test_correct.sum()) / int(graph.test_mask.sum())
#      return test_acc


def test():
    """Return the fraction of test nodes the current ``model`` classifies correctly.

    Works on the notebook-level ``model`` and ``graph``. The model is switched
    to eval mode and the forward pass runs without autograd tracking, since no
    gradients are needed for evaluation.
    """
    model.eval()
    with torch.no_grad():  # evaluation only: skip building the autograd graph
        out = model(graph.x, graph.edge_index)  #DIFFERENT
    pred = out.argmax(dim=1)  # index of the highest-scoring class per node
    mask = graph.test_mask
    test_correct = pred[mask] == graph.y[mask]
    test_acc = int(test_correct.sum()) / int(mask.sum())
    return test_acc


#for epoch in range(1, 201):
#    loss = train()
#    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

# Train the GCN, printing the loss returned by each train() call.
for epoch in range(1, 101):  #DIFFERENT: 100 epochs here, versus 200 for the MLP
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    
#test_acc = test()
#print(f'Test Accuracy: {test_acc:.4f}')

# Evaluate once, after all training epochs, on the nodes selected by test_mask.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

GCN(
  (conv1): GCNConv(1433, 16)
  (conv2): GCNConv(16, 7)
)
Epoch: 001, Loss: 1.9463
Epoch: 002, Loss: 1.9409
Epoch: 003, Loss: 1.9343
Epoch: 004, Loss: 1.9275
Epoch: 005, Loss: 1.9181
Epoch: 006, Loss: 1.9086
Epoch: 007, Loss: 1.9015
Epoch: 008, Loss: 1.8933
Epoch: 009, Loss: 1.8808
Epoch: 010, Loss: 1.8685
Epoch: 011, Loss: 1.8598
Epoch: 012, Loss: 1.8482
Epoch: 013, Loss: 1.8290
Epoch: 014, Loss: 1.8233
Epoch: 015, Loss: 1.8057
Epoch: 016, Loss: 1.7966
Epoch: 017, Loss: 1.7825
Epoch: 018, Loss: 1.7617
Epoch: 019, Loss: 1.7491
Epoch: 020, Loss: 1.7310
Epoch: 021, Loss: 1.7147
Epoch: 022, Loss: 1.7056
Epoch: 023, Loss: 1.6954
Epoch: 024, Loss: 1.6697
Epoch: 025, Loss: 1.6538
Epoch: 026, Loss: 1.6312
Epoch: 027, Loss: 1.6161
Epoch: 028, Loss: 1.5899
Epoch: 029, Loss: 1.5711
Epoch: 030, Loss: 1.5576
Epoch: 031, Loss: 1.5393
Epoch: 032, Loss: 1.5137
Epoch: 033, Loss: 1.4948
Epoch: 034, Loss: 1.4913
Epoch: 035, Loss: 1.4698
Epoch: 036, Loss: 1.3998
Epoch: 037, Loss: 1.4041
Epoch: 038, L

**The `train()` and `test()` functions for this GNN are almost the same as those for the MLP, except that the GNN takes as input not only the node features `x` (as the MLP does) but also the graph connectivity `edge_index`. This is because the `GCNConv()` layer takes neighboring node information into account, whereas the `Linear()` layer does not.**

1. `GCNConv()` is defined as:

$$
\mathbf{x}_v^{(\ell + 1)} = \mathbf{W}^{(\ell + 1)} \sum_{w \in \mathcal{N}(v) \, \cup \, \{ v \}} \frac{1}{c_{w,v}} \cdot \mathbf{x}_w^{(\ell)}
$$

where $\mathbf{W}^{(\ell + 1)}$ denotes a trainable weight matrix of shape `[num_output_features, num_input_features]` and $c_{w,v}$ refers to a fixed normalization coefficient for each edge.

2. `Linear()` is defined as:

$$
\mathbf{x}_v^{(\ell + 1)} = \mathbf{W}^{(\ell + 1)} \mathbf{x}_v^{(\ell)}
$$

By simply substituting the `Linear()` layers with `GCNConv()` layers, we obtained a test accuracy of **81.5%**. This is in stark contrast to the **59%** test accuracy obtained by our MLP, indicating that neighbourhood information plays a crucial role in obtaining better performance. Neighbourhood information is relevant and useful for the Cora dataset because **a paper and the papers it cites are very likely to belong to the same category**.