# Solution 2: Node classification with PyG

In [1]:
import numpy as np

## Load dataset

The Cora dataset we will use is available in the `torch_geometric` package.

In [2]:
from torch_geometric.datasets import Planetoid

# Load the Cora citation dataset through PyG's Planetoid wrapper, using ./data/Cora as its root directory.
dataset = Planetoid(
    name='Cora',
    root='./data/Cora',
)

  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  return torch.load(f, map_location)


### Show statistics of the network

In [3]:
# Print the dataset summary table: per-graph statistics of node and edge counts.
dataset.print_summary()

Planetoid (#graphs=1):
+------------+----------+----------+
|            |   #nodes |   #edges |
|------------+----------+----------|
| mean       |     2708 |    10556 |
| std        |      nan |      nan |
| min        |     2708 |    10556 |
| quantile25 |     2708 |    10556 |
| median     |     2708 |    10556 |
| quantile75 |     2708 |    10556 |
| max        |     2708 |    10556 |
+------------+----------+----------+


  std=data.std().item(),


In [4]:
# Number of distinct node classes (the size of the label set the classifier must predict).
print(dataset.num_classes)

7


In [5]:
# Dimensionality of each node's feature vector (used below as the input size of the first GCN layer).
print(dataset.num_node_features)

1433


### Train and test split

Unlike our previous cases, here the `train_mask` and the `test_mask` are predefined. Note: not all nodes appear in the two sets.

You can alternatively redefine them.

In [6]:
# The dataset holds a single graph; fetch it once and report how many nodes each predefined mask selects.
graph = dataset[0]
train_mask, test_mask = graph.train_mask, graph.test_mask
print('Train mask: ', train_mask, '. Training set size: ', np.count_nonzero(train_mask))
print('Test mask: ', test_mask, ' . Test set size: ', np.count_nonzero(test_mask))

Train mask:  tensor([ True,  True,  True,  ..., False, False, False]) . Training set size:  140
Test mask:  tensor([False, False, False,  ...,  True,  True,  True])  . Test set size:  1000


## Define and train a simple Graph Convolutional Network model

In [7]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    The network applies ``GCNConv -> ReLU -> dropout -> GCNConv`` and returns
    log-probabilities over the output classes.

    Args:
        in_channels: Size of each input node feature vector. When ``None``
            (the default) it is read from the notebook-level ``dataset``
            (``dataset.num_node_features``).
        hidden_channels: Width of the hidden representation produced by the
            first convolution.
        out_channels: Number of output classes. When ``None`` (the default)
            it is read from the notebook-level ``dataset``
            (``dataset.num_classes``).
        dropout: Dropout probability applied to the hidden representation
            while the module is in training mode.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None, dropout=0.5):
        super().__init__()
        # Fall back to the globally loaded dataset so that a bare ``GCN()`` keeps working.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if out_channels is None:
            out_channels = dataset.num_classes

        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Compute per-node class log-probabilities.

        Args:
            data: Graph object exposing ``x`` (node features) and
                ``edge_index`` (graph connectivity).

        Returns:
            The output of ``log_softmax`` taken over dimension 1 of the
            second convolution's result.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active in training mode; ``model.eval()`` turns it off.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [8]:
# Choose the compute backend in order of preference: Apple MPS, then CUDA, then plain CPU.
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = 'cuda'
else:
    device_name = 'cpu'
device = torch.device(device_name)

In [9]:
# Seed PyTorch's random number generator before the model is built, so that the
# weight initialisation and the dropout masks drawn during training are repeatable.
# NOTE: some GPU kernels may still be nondeterministic, so small run-to-run
# differences remain possible on accelerators.
SEED = 42
torch.manual_seed(SEED)

# move the model and data to the device
model = GCN().to(device)
data = dataset[0].to(device)

In [10]:
# Full-batch training: every epoch runs the whole graph through the model, but the
# loss is computed only on the nodes selected by the training mask.
train_idx = data.train_mask
train_labels = data.y[train_idx]

model.train()  # training mode, so the dropout in GCN.forward is active
optimizer = torch.optim.Adam(model.parameters(), weight_decay=5e-4, lr=0.01)

for epoch in range(200):
    optimizer.zero_grad()
    log_probs = model(data)
    train_loss = F.nll_loss(log_probs[train_idx], train_labels)
    train_loss.backward()
    optimizer.step()

## Node classification and evaluation

To evaluate the Graph Convolutional Network model we just trained, we use it to predict the labels of the nodes in the test set, and compare the results with the ground truth.

In [11]:
# Evaluation mode switches dropout off; no_grad() avoids building an autograd
# graph, which is not needed for inference.
model.eval()
with torch.no_grad():
    # Predicted class = index of the largest log-probability for each node.
    pred = model(data).argmax(dim=1)

# Keep only the test-set nodes and move them to NumPy on the CPU for scikit-learn.
y_true = data.y[data.test_mask].cpu().numpy()
y_pred = pred[data.test_mask].cpu().numpy()

We print different metrics using the `classification_report` function in `sklearn`.

In [12]:
from sklearn.metrics import classification_report

# Build the per-class precision / recall / F1 report for the test nodes, then print it.
gcn_report = classification_report(y_true, y_pred)
print('Performance (GCN): \n', gcn_report)

Performance (GCN): 
               precision    recall  f1-score   support

           0       0.67      0.73      0.70       130
           1       0.78      0.90      0.84        91
           2       0.87      0.92      0.90       144
           3       0.90      0.75      0.82       319
           4       0.79      0.83      0.81       149
           5       0.78      0.75      0.76       103
           6       0.67      0.84      0.74        64

    accuracy                           0.80      1000
   macro avg       0.78      0.82      0.80      1000
weighted avg       0.81      0.80      0.80      1000

