## Chapter 5: Include Node Features with Vanilla Neural Networks

## here we will see node classification ;
## first with vanilla NNs
## second with vanilla GNNs

## the CORA dataset
### 2708 pubs - nodes
### node feature length 1433 = binary bag or words as per NLP
### 7 classes of nodes - Node classification task
### Edges = 10556 -  as per [Pytorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.Planetoid.html)

In [1]:
from torch_geometric.datasets import Planetoid

In [6]:
dataset_CORA = Planetoid(root=".", name="Cora") 
### I added a suffix to dataset because
### I plan to use other datasets to do the same analysis and learn more

In [20]:
data_graph_CORA = dataset_CORA[0] # get the cora graph

In [21]:
data_graph_CORA

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [22]:
# Print information about the dataset
print(f'Dataset: {dataset_CORA}')
print('---------------')
print(f'Number of graphs: {len(dataset_CORA)}')
print(f'Number of nodes: {data_graph_CORA.x.shape[0]}')
print(f'Number of features: {dataset_CORA.num_features}')
print(f'Number of classes: {dataset_CORA.num_classes}')

# Print information about the graph
print(f'\nGraph:')
print('------')
print(f'Edges are directed: {data_graph_CORA.is_directed()}')
print(f'Graph has isolated nodes: {data_graph_CORA.has_isolated_nodes()}')
print(f'Graph has loops: {data_graph_CORA.has_self_loops()}')

Dataset: Cora()
---------------
Number of graphs: 1
Number of nodes: 2708
Number of features: 1433
Number of classes: 7

Graph:
------
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: False


## The FB - Page-Page dataset

### nodes = 22470
### edges = ???
### node feature dim = 128
### node classification task with 4 classes

In [15]:
from torch_geometric.datasets import FacebookPagePage

In [17]:
dataset_FBPP = FacebookPagePage(root="./facebook_pagepage")

Downloading https://graphmining.ai/datasets/ptg/facebook.npz
Processing...
Done!


In [55]:
dataset_FBPP.data

Data(x=[22470, 128], edge_index=[2, 342004], y=[22470])

In [23]:
data_graph_FBPP = dataset_FBPP[0] # the graph

In [24]:
# Print information about the dataset
print(f'Dataset: {dataset_FBPP}')
print('---------------')
print(f'Number of graphs: {len(dataset_FBPP)}')
print(f'Number of nodes: {data_graph_FBPP.x.shape[0]}')
print(f'Number of features: {dataset_FBPP.num_features}')
print(f'Number of classes: {dataset_FBPP.num_classes}')

# Print information about the graph
print(f'\nGraph:')
print('------')
print(f'Edges are directed: {data_graph_FBPP.is_directed()}')
print(f'Graph has isolated nodes: {data_graph_FBPP.has_isolated_nodes()}')
print(f'Graph has loops: {data_graph_FBPP.has_self_loops()}')

Dataset: FacebookPagePage()
---------------
Number of graphs: 1
Number of nodes: 22470
Number of features: 128
Number of classes: 4

Graph:
------
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: True


In [25]:
## FBPP doesn't have training, evaluation, test by default so we make them


In [26]:
data_graph_FBPP.train_mask = range(18000)
data_graph_FBPP.val_mask = range(18001, 20000)
data_graph_FBPP.test_mask = range(20001, 22470)

In [59]:
dataset_FBPP.data.train_mask = range(18000)
dataset_FBPP.data.val_mask = range(18001, 20000)
dataset_FBPP.data.test_mask = range(20001, 22470)

### MLP on Cora

In [28]:
import pandas as pd

df_x = pd.DataFrame(data_graph_CORA.x.numpy())

df_x['label'] = pd.DataFrame(data_graph_CORA.y)

In [29]:
df_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1424,1425,1426,1427,1428,1429,1430,1431,1432,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2704,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [30]:
import torch
torch.manual_seed(0)
from torch.nn import Linear
import torch.nn.functional as F

In [31]:
def accuracy(y_pred, y_true):
    return torch.sum(y_pred == y_true) / len(y_true)

In [46]:
class MLP(torch.nn.Module):
    
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.linear1 = Linear(dim_in, dim_h)
        self.linear2 = Linear(dim_h, dim_out)
        
    def forward(self, x):
        x = self.linear1(x)
        x = torch.relu(x)
        x = self.linear2(x)
        
        return F.log_softmax(x, dim=1)
    
    def fit(self, data, epochs):
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=5e-4)
        self.train()
        
        for epoch in range(epochs+1):
            optimizer.zero_grad()
            out = self(data.x)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = accuracy(out[data.train_mask].argmax(dim=1),
                          data.y[data.train_mask])
            
            loss.backward()
            optimizer.step()
            
            if epoch % 20 ==0:
                val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
                val_acc = accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc:'
                      f' {acc*100:>5.2f}% | Val Loss: {val_loss:.2f} | '
                      f'Val Acc: {val_acc*100:.2f}%')

    @torch.no_grad()      
    def test(self, data):
        self.eval()
        out = self(data.x)
        acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
        return acc
                


In [47]:
mlp = MLP(dataset_CORA.num_features, 16, dataset_CORA.num_classes)

In [48]:
print(mlp)

MLP(
  (linear1): Linear(in_features=1433, out_features=16, bias=True)
  (linear2): Linear(in_features=16, out_features=7, bias=True)
)


In [49]:
mlp.fit(dataset_CORA.data, epochs=100)

Epoch   0 | Train Loss: 1.950 | Train Acc: 17.86% | Val Loss: 1.94 | Val Acc: 12.80%
Epoch  20 | Train Loss: 0.097 | Train Acc: 100.00% | Val Loss: 1.47 | Val Acc: 54.40%
Epoch  40 | Train Loss: 0.012 | Train Acc: 100.00% | Val Loss: 1.60 | Val Acc: 53.20%
Epoch  60 | Train Loss: 0.007 | Train Acc: 100.00% | Val Loss: 1.62 | Val Acc: 50.20%
Epoch  80 | Train Loss: 0.008 | Train Acc: 100.00% | Val Loss: 1.51 | Val Acc: 50.40%
Epoch 100 | Train Loss: 0.009 | Train Acc: 100.00% | Val Loss: 1.45 | Val Acc: 53.20%


In [51]:
acc = mlp.test(dataset_CORA.data)

In [52]:
print(f'\nMLP test accuracy: {acc*100:.2f}%')


MLP test accuracy: 53.10%


## FB Page-Page graph

In [53]:
mlp_fb = MLP(dataset_FBPP.num_features, 16, dataset_FBPP.num_classes)

In [61]:
mlp_fb.fit(dataset_FBPP.data, epochs=100)

Epoch   0 | Train Loss: 1.400 | Train Acc: 26.78% | Val Loss: 1.41 | Val Acc: 26.81%
Epoch  20 | Train Loss: 0.658 | Train Acc: 73.81% | Val Loss: 0.66 | Val Acc: 72.79%
Epoch  40 | Train Loss: 0.579 | Train Acc: 76.53% | Val Loss: 0.61 | Val Acc: 75.29%
Epoch  60 | Train Loss: 0.549 | Train Acc: 78.32% | Val Loss: 0.60 | Val Acc: 76.19%
Epoch  80 | Train Loss: 0.531 | Train Acc: 78.89% | Val Loss: 0.60 | Val Acc: 75.34%
Epoch 100 | Train Loss: 0.519 | Train Acc: 79.47% | Val Loss: 0.60 | Val Acc: 75.24%


In [None]:
acc = mlp_fb.test(dataset_FBPP.data)
print(f'\nGNN test accuracy: {acc*100:.2f}%')

## Vanilla GNN

In [62]:
class VanillaGNNLayer(torch.nn.Module):
    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.linear = Linear(dim_in, dim_out, bias=False)

    def forward(self, x, adjacency):
        x = self.linear(x)
        x = torch.sparse.mm(adjacency, x)
        return x

In [77]:
from torch_geometric.utils import to_dense_adj

adjacency_cora = to_dense_adj(dataset_CORA.data.edge_index)[0]
adjacency_cora += torch.eye(len(adjacency_cora))
adjacency_cora



adjacency_fbpp = to_dense_adj(dataset_FBPP.data.edge_index)[0]
adjacency_fbpp += torch.eye(len(adjacency_fbpp))
adjacency_fbpp

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])

In [78]:
adjacency_cora.shape, adjacency_fbpp.shape

(torch.Size([2708, 2708]), torch.Size([22470, 22470]))

In [79]:
del adjacency

In [80]:
class VanillaGNN(torch.nn.Module):
    """Vanilla Graph Neural Network"""
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.gnn1 = VanillaGNNLayer(dim_in, dim_h)
        self.gnn2 = VanillaGNNLayer(dim_h, dim_out)

    def forward(self, x, adjacency):
        h = self.gnn1(x, adjacency)
        h = torch.relu(h)
        h = self.gnn2(h, adjacency)
        return F.log_softmax(h, dim=1)

    def fit(self, data, epochs):
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(),
                                      lr=0.01,
                                      weight_decay=5e-4)

        self.train()
        for epoch in range(epochs+1):
            optimizer.zero_grad()
            out = self(data.x, adjacency)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = accuracy(out[data.train_mask].argmax(dim=1),
                          data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            if(epoch % 20 == 0):
                val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
                val_acc = accuracy(out[data.val_mask].argmax(dim=1),
                                  data.y[data.val_mask])
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc:'
                      f' {acc*100:>5.2f}% | Val Loss: {val_loss:.2f} | '
                      f'Val Acc: {val_acc*100:.2f}%')

    @torch.no_grad()
    def test(self, data):
        self.eval()
        out = self(data.x, adjacency)
        acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
        return acc

In [83]:
# Create the Vanilla GNN model

adjacency = adjacency_cora
gnn = VanillaGNN(dataset_CORA.num_features, 16, dataset_CORA.num_classes)
print(gnn)

VanillaGNN(
  (gnn1): VanillaGNNLayer(
    (linear): Linear(in_features=1433, out_features=16, bias=False)
  )
  (gnn2): VanillaGNNLayer(
    (linear): Linear(in_features=16, out_features=7, bias=False)
  )
)


In [84]:
# Train
gnn.fit(dataset_CORA.data, epochs=100)

Epoch   0 | Train Loss: 2.357 | Train Acc: 22.14% | Val Loss: 2.27 | Val Acc: 17.40%
Epoch  20 | Train Loss: 0.060 | Train Acc: 100.00% | Val Loss: 2.00 | Val Acc: 71.20%
Epoch  40 | Train Loss: 0.006 | Train Acc: 100.00% | Val Loss: 2.61 | Val Acc: 73.20%
Epoch  60 | Train Loss: 0.002 | Train Acc: 100.00% | Val Loss: 2.79 | Val Acc: 73.60%
Epoch  80 | Train Loss: 0.002 | Train Acc: 100.00% | Val Loss: 2.76 | Val Acc: 74.20%
Epoch 100 | Train Loss: 0.001 | Train Acc: 100.00% | Val Loss: 2.71 | Val Acc: 74.80%


In [85]:
# Test
acc = gnn.test(dataset_CORA.data)
print(f'\nGNN test accuracy: {acc*100:.2f}%')


GNN test accuracy: 74.10%


## GNN on FB

In [88]:
adjacency=adjacency_fbpp
gnn_fb = VanillaGNN(dataset_FBPP.num_features, 16, dataset_FBPP.num_classes)

In [89]:
print(gnn_fb)

VanillaGNN(
  (gnn1): VanillaGNNLayer(
    (linear): Linear(in_features=128, out_features=16, bias=False)
  )
  (gnn2): VanillaGNNLayer(
    (linear): Linear(in_features=16, out_features=4, bias=False)
  )
)


In [90]:
gnn_fb.fit(dataset_FBPP.data, epochs=100)

Epoch   0 | Train Loss: 32.356 | Train Acc: 36.23% | Val Loss: 29.68 | Val Acc: 36.42%
Epoch  20 | Train Loss: 3.962 | Train Acc: 81.55% | Val Loss: 2.56 | Val Acc: 82.59%
Epoch  40 | Train Loss: 1.712 | Train Acc: 85.48% | Val Loss: 1.10 | Val Acc: 86.09%
Epoch  60 | Train Loss: 0.788 | Train Acc: 84.09% | Val Loss: 0.66 | Val Acc: 84.49%
Epoch  80 | Train Loss: 0.815 | Train Acc: 85.42% | Val Loss: 0.65 | Val Acc: 85.19%
Epoch 100 | Train Loss: 1.175 | Train Acc: 85.94% | Val Loss: 0.87 | Val Acc: 85.14%


In [92]:
acc = gnn_fb.test(dataset_FBPP.data)
print(f'\nGNN test accuracy: {acc*100:.2f}%')


GNN test accuracy: 85.66%
