# Ch 5. Including Node Features with Vanilla Neural Networks

This notebook compares the performance of a vanilla neural network (VNN, aka multilayer perceptron) with a graph neural network (GNN) on the Cora and Facebook Page-Page datasets.

The VNN treats the data as being essentially tabular, ignoring network topology (edges). The GNN accounts for network topology.

## 5.1 Datasets

In [1]:
from torch_geometric.datasets import FacebookPagePage, Planetoid

In [2]:
def summarize_dataset(dataset):
    data = dataset[0]
    
    print(f'Dataset: {dataset}')
    print('--------------------')
    print(f'# graphs   : {len(dataset)}')
    print(f'# nodes    : {data.x.shape[0]}')
    print(f'# features : {dataset.num_features}')
    print(f'# classes  : {dataset.num_classes}')
    print('')
    print(f'Graph:')
    print('--------------------')
    print(f'Edges are directed       : {data.is_directed()}')
    print(f'Graph has isolated nodes : {data.has_isolated_nodes()}')
    print(f'Graph has loops          : {data.has_self_loops()}')

### 5.1.1 Cora

In [3]:
cora_ds = Planetoid(root=".", name="Cora")
cora_data = cora_ds[0]

In [4]:
summarize_dataset(cora_ds)

Dataset: Cora()
--------------------
# graphs   : 1
# nodes    : 2708
# features : 1433
# classes  : 7

Graph:
--------------------
Edges are directed       : False
Graph has isolated nodes : False
Graph has loops          : False


### 5.1.2 Facebook Page-Page

In [5]:
fb_ds = FacebookPagePage(root="./FacebookPagePage")
fb_data = fb_ds[0]

In [6]:
summarize_dataset(fb_ds)

Dataset: FacebookPagePage()
--------------------
# graphs   : 1
# nodes    : 22470
# features : 128
# classes  : 4

Graph:
--------------------
Edges are directed       : False
Graph has isolated nodes : False
Graph has loops          : True


In [7]:
fb_data.train_mask = range(18000)        # Training
fb_data.val_mask = range(18001, 20000)   # Validation
fb_data.test_mask = range(20001, 22470)  # Test

## 5.2 Classifying nodes with vanilla neural networks

In [8]:
import pandas as pd
import torch
from torch.nn import Linear
import torch.nn.functional as F

In [9]:
# Simple accuracy measure -- not intended for production use
def accuracy(y_pred, y_true):
    return torch.sum(y_pred == y_true) / len(y_true)

In [10]:
# Multilayer Perceptron (MLP)
class MLP(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out):
        # input, hidden, output
        super().__init__()
        self.linear1 = Linear(dim_in, dim_h)
        self.linear2 = Linear(dim_h, dim_out)

    def forward(self, x):
        x = self.linear1(x)
        x = torch.relu(x)
        x = self.linear2(x)
        return F.log_softmax(x, dim=1)

    def fit(self, data, epochs):
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=5e-4)
        self.train()
        for epoch in range(epochs+1):
            optimizer.zero_grad()
            out = self(data.x)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = accuracy(out[data.train_mask].argmax(dim=1), data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            if epoch % 20 == 0:
                val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
                val_acc = accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc: {acc*100:>5.2f}% | Val Loss: {val_loss:.2f} | Val Acc: {val_acc*100:.2f}%')

    def test(self, data):
        self.eval()
        out = self(data.x)
        acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
        return acc

### 5.2.1 Cora MLP

In [11]:
cora_df_x = pd.DataFrame(cora_data.x.numpy())
cora_df_x['label'] = pd.DataFrame(cora_data.y)

In [12]:
cora_df_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1424,1425,1426,1427,1428,1429,1430,1431,1432,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2704,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [13]:
cora_mlp = MLP(cora_ds.num_features, 16, cora_ds.num_classes)
print(cora_mlp)
cora_mlp.fit(cora_data, epochs=100)
cora_acc = cora_mlp.test(cora_data)
print(f'Cora MLP test accuracy: {cora_acc*100:.2f}%')

MLP(
  (linear1): Linear(in_features=1433, out_features=16, bias=True)
  (linear2): Linear(in_features=16, out_features=7, bias=True)
)
Epoch   0 | Train Loss: 1.954 | Train Acc: 16.43% | Val Loss: 1.99 | Val Acc: 8.00%
Epoch  20 | Train Loss: 0.205 | Train Acc: 100.00% | Val Loss: 1.52 | Val Acc: 48.20%
Epoch  40 | Train Loss: 0.016 | Train Acc: 100.00% | Val Loss: 1.48 | Val Acc: 51.80%
Epoch  60 | Train Loss: 0.008 | Train Acc: 100.00% | Val Loss: 1.48 | Val Acc: 51.40%
Epoch  80 | Train Loss: 0.008 | Train Acc: 100.00% | Val Loss: 1.43 | Val Acc: 53.20%
Epoch 100 | Train Loss: 0.009 | Train Acc: 100.00% | Val Loss: 1.38 | Val Acc: 55.40%
Cora MLP test accuracy: 53.70%


### 5.2.2 Facebook Page-Page MLP

In [14]:
fb_df_x = pd.DataFrame(fb_data.x.numpy())
fb_df_x['label'] = pd.DataFrame(fb_data.y)

In [15]:
fb_df_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,label
0,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.223700,-0.284379,-0.224216,-0.209509,-0.255755,-0.215140,-0.375903,-0.223836,0
1,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.234818,-0.223700,-0.284379,-0.197935,-0.147256,-0.255755,-0.215140,-0.364134,-0.128634,2
2,-0.262576,-0.265053,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.210461,-0.25101,3.222161,...,-0.273229,-0.223700,-0.284379,-0.224216,-0.209509,-0.255755,-0.215140,-0.375903,-0.223836,1
3,-0.246378,-0.276483,-0.241991,-0.299327,-0.299159,-0.270681,-0.307051,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.223700,-0.265534,-0.080353,-0.209509,-0.250560,-0.180260,-0.375903,-0.223836,2
4,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.175312,-0.272613,-0.224216,-0.181153,-0.255755,-0.215140,-0.370639,-0.223836,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22465,-0.262576,-0.276483,-0.262350,-0.296955,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.223700,-0.284379,-0.224216,-0.209509,-0.255755,-0.196685,-0.370115,-0.223836,3
22466,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.221643,-0.284379,-0.224216,-0.209509,-0.255755,-0.215140,-0.375903,-0.223836,1
22467,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307757,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.223700,-0.284379,-0.224216,-0.146793,-0.255755,-0.180389,-0.372097,-0.222613,2
22468,-0.262576,-0.276483,-0.262350,-0.299327,-0.299159,-0.270681,-0.307668,-0.269733,-0.25101,-0.308343,...,-0.273229,-0.223700,-0.284379,-0.224216,-0.209509,-0.252456,-0.215140,-0.375903,-0.218148,1


In [16]:
fb_mlp = MLP(fb_ds.num_features, 16, fb_ds.num_classes)
print(fb_mlp)
fb_mlp.fit(fb_data, epochs=100)
fb_acc = fb_mlp.test(fb_data)
print(f'FBPP MLP test accuracy: {fb_acc*100:.2f}%')

MLP(
  (linear1): Linear(in_features=128, out_features=16, bias=True)
  (linear2): Linear(in_features=16, out_features=4, bias=True)
)
Epoch   0 | Train Loss: 1.370 | Train Acc: 31.36% | Val Loss: 1.37 | Val Acc: 31.02%
Epoch  20 | Train Loss: 0.659 | Train Acc: 74.22% | Val Loss: 0.67 | Val Acc: 72.99%
Epoch  40 | Train Loss: 0.578 | Train Acc: 77.03% | Val Loss: 0.62 | Val Acc: 74.84%
Epoch  60 | Train Loss: 0.549 | Train Acc: 78.33% | Val Loss: 0.60 | Val Acc: 75.44%
Epoch  80 | Train Loss: 0.529 | Train Acc: 78.99% | Val Loss: 0.60 | Val Acc: 75.74%
Epoch 100 | Train Loss: 0.513 | Train Acc: 79.79% | Val Loss: 0.60 | Val Acc: 75.39%
FBPP MLP test accuracy: 75.90%


## 5.3 Classifying nodes with vanilla graph neural networks

The **graph linear layer** is given by

$$
h_A = \sum_{i \in \mathscr{N}_A} {x_i W^T}
$$

where $A$ is a node and $\mathscr{N}_A$ is the set of neighbors of node $A$.

In [17]:
from torch_geometric.utils import to_dense_adj

def to_dense_adjacency_matrix(data):
    # https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/utils/_to_dense_adj.html
    adjacency = to_dense_adj(data.edge_index)[0]
    
    # Identity matrix for self loops so the central node is also considered.
    # https://pytorch.org/docs/stable/generated/torch.eye.html#torch.eye
    adjacency += torch.eye(len(adjacency))

    return adjacency

In [18]:
class VanillaGNNLayer(torch.nn.Module):
    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.linear = Linear(dim_in, dim_out, bias=False)

    def forward(self, x, adjacency):
        x = self.linear(x)

        # Matrix multiplication (multiply two sparse matrices)
        # https://pytorch.org/docs/stable/generated/torch.sparse.mm.html
        x = torch.sparse.mm(adjacency, x)
        return x

In [19]:
class VanillaGNN(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.gnn1 = VanillaGNNLayer(dim_in, dim_h)
        self.gnn2 = VanillaGNNLayer(dim_h, dim_out)

    def forward(self, x, adjacency):
        h = self.gnn1(x, adjacency)
        h = torch.relu(h)
        h = self.gnn2(h, adjacency)
        return F.log_softmax(h, dim=1)

    def fit(self, data, adjacency, epochs):
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=5e-4)
        self.train()
        for epoch in range(epochs+1):
            optimizer.zero_grad()
            out = self(data.x, adjacency)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = accuracy(out[data.train_mask].argmax(dim=1), data.y[data.train_mask])
            loss.backward()
            optimizer.step()
            if epoch % 20 == 0:
                val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
                val_acc = accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc: {acc*100:>5.2f}% | Val Loss: {val_loss:.2f} | Val Acc: {val_acc*100:.2f}%')

    def test(self, data, adjacency):
        self.eval()
        out = self(data.x, adjacency)
        acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
        return acc

### 5.3.1 Cora GNN

In [20]:
cora_dense_adj = to_dense_adjacency_matrix(cora_data)
cora_dense_adj

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 1.],
        [0., 0., 0.,  ..., 0., 1., 1.]])

In [21]:
cora_gnn = VanillaGNN(cora_ds.num_features, 16, cora_ds.num_classes)
print(cora_gnn)
cora_gnn.fit(cora_data, cora_dense_adj, epochs=100)
cora_acc = cora_gnn.test(cora_data, cora_dense_adj)
print(f'\nGNN test accuracy: {cora_acc*100:.2f}%')

VanillaGNN(
  (gnn1): VanillaGNNLayer(
    (linear): Linear(in_features=1433, out_features=16, bias=False)
  )
  (gnn2): VanillaGNNLayer(
    (linear): Linear(in_features=16, out_features=7, bias=False)
  )
)
Epoch   0 | Train Loss: 2.154 | Train Acc: 11.43% | Val Loss: 2.19 | Val Acc: 6.40%
Epoch  20 | Train Loss: 0.122 | Train Acc: 99.29% | Val Loss: 1.26 | Val Acc: 75.20%
Epoch  40 | Train Loss: 0.010 | Train Acc: 100.00% | Val Loss: 1.80 | Val Acc: 78.40%
Epoch  60 | Train Loss: 0.003 | Train Acc: 100.00% | Val Loss: 1.96 | Val Acc: 77.80%
Epoch  80 | Train Loss: 0.002 | Train Acc: 100.00% | Val Loss: 1.97 | Val Acc: 77.00%
Epoch 100 | Train Loss: 0.002 | Train Acc: 100.00% | Val Loss: 1.95 | Val Acc: 77.20%

GNN test accuracy: 77.00%


### 5.3.2 Facebook Page-Page GNN

In [22]:
fb_dense_adj = to_dense_adjacency_matrix(fb_data)
fb_dense_adj

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])

In [23]:
fb_gnn = VanillaGNN(fb_ds.num_features, 16, fb_ds.num_classes)
print(fb_gnn)
fb_gnn.fit(fb_data, fb_dense_adj, epochs=100)
fb_acc = fb_gnn.test(fb_data, fb_dense_adj)
print(f'\nGNN test accuracy: {fb_acc*100:.2f}%')

VanillaGNN(
  (gnn1): VanillaGNNLayer(
    (linear): Linear(in_features=128, out_features=16, bias=False)
  )
  (gnn2): VanillaGNNLayer(
    (linear): Linear(in_features=16, out_features=4, bias=False)
  )
)
Epoch   0 | Train Loss: 100.693 | Train Acc: 27.42% | Val Loss: 102.66 | Val Acc: 27.51%
Epoch  20 | Train Loss: 4.304 | Train Acc: 79.36% | Val Loss: 3.34 | Val Acc: 79.39%
Epoch  40 | Train Loss: 1.786 | Train Acc: 83.02% | Val Loss: 1.54 | Val Acc: 83.94%
Epoch  60 | Train Loss: 1.957 | Train Acc: 83.81% | Val Loss: 1.42 | Val Acc: 84.64%
Epoch  80 | Train Loss: 1.077 | Train Acc: 84.26% | Val Loss: 1.05 | Val Acc: 84.24%
Epoch 100 | Train Loss: 0.687 | Train Acc: 84.77% | Val Loss: 0.71 | Val Acc: 84.69%

GNN test accuracy: 85.26%


## 5.4 Discussion

The GNN includes topological information (node neighborhood), whereas the MLP doesn't. Including topology with the node features boosts classification performance roughly 10-20% over using node features alone.