# Pytorch_Geometric : Learning to use PyGeo

#### This notebook is adapted from a series of videos by Antonio Longa (links: https://github.com/AntonioLonga/PytorchGeometricTutorial and https://www.youtube.com/watch?v=JtDgmmQ60x8&list=PLGMXrbDNfqTzqxB1IGgimuhtfAhGd8lHF&pp=iAQB)

## Part 1 : Introduction and Graph Attention Network

### Import Libraries

In [1]:
# for Dataset exploration
import torch_geometric
from torch_geometric.datasets import Planetoid

# for Basic GNN
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

#for GAT
import numpy as np
import torch.nn as nn
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
import torch_geometric.transforms as T

import matplotlib.pyplot as plt

import session_info

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): presumably reports the versions of the imported packages for this session — confirm against the session_info docs
session_info.show()

### Dataset exploration from PyGeo

#### Loading the dataset

In [2]:
# Load the Cora dataset from the Planetoid collection into the `exploration_cora` folder.
# The files are downloaded on the first call and are not downloaded again when this cell is re-run.
dataset= Planetoid(root='exploration_cora',name="Cora")

#### Dataset properties

In [3]:
# Summary of the dataset: how many graphs it holds, how many target classes,
# and how many features are attached to each node and each edge
print(dataset)
print(f'Number of graphs: {len(dataset)}')
print(f'Number of classes: {dataset.num_classes}')
print(f'Number of node features: {dataset.num_node_features}')
print(f'Number of edge features: {dataset.num_edge_features}')

Cora()
Number of graphs: 1
Number of classes: 7
Number of node features: 1433
Number of edge features: 0


In [4]:
# Show the underlying Data object: the name and shape of every tensor stored for the graph
print(dataset._data)

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


In [5]:
# x is the node feature matrix: one row per node, one column per feature.
# (The label previously read "train_mask", a copy-paste slip: this is the shape of x.)
print(f'x: {dataset._data.x.shape}')
print(dataset._data.x)

train_mask: torch.Size([2708, 1433])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [6]:
# edge_index has two parallel rows with 10556 entries each: one column per edge (edge endpoints, not nodes)
print(f'edge_index: {dataset._data.edge_index.shape}')
# the first row gives the node where each edge starts, the second row gives the node at its other end
print(dataset._data.edge_index)

edge_index: torch.Size([2, 10556])
tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]])


In [7]:
# y holds the class label of each node (one entry per row of x).
# (The label previously read "train_mask", a copy-paste slip: this is the shape of y.)
print(f'y: {dataset._data.y.shape}')
print(dataset._data.y)

train_mask: torch.Size([2708])
tensor([3, 4, 4,  ..., 3, 3, 3])


In [8]:
# train_mask is a boolean vector with one entry per node: True marks the nodes used for training
print(f'train_mask: {dataset._data.train_mask.shape}')
print(dataset._data.train_mask)

train_mask: torch.Size([2708])
tensor([ True,  True,  True,  ..., False, False, False])


### Basic GNN with PyGeo

#### Taking the only graph of the dataset to feed our GNN

In [9]:
# The dataset contains a single graph (len(dataset) == 1), so index 0 is the whole graph
data= dataset[0]
data

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

#### Define the model and optimizer

In [10]:
class Net(torch.nn.Module):
    """Single-layer GraphSAGE node classifier.

    One ``SAGEConv`` layer maps the node features directly to class scores,
    and ``forward`` returns per-node log-probabilities.

    Args:
        in_channels: number of input features per node. Defaults to
            ``dataset.num_features`` (the notebook-level dataset) when omitted.
        out_channels: number of classes. Defaults to ``dataset.num_classes``
            when omitted.
        aggr: neighbourhood aggregation passed to ``SAGEConv``
            (e.g. 'max', 'min', 'mean', 'add', or a custom aggregation).
    """

    def __init__(self, in_channels=None, out_channels=None, aggr='max'):
        super().__init__()

        # Fall back to the notebook-level dataset so that `Net()` keeps working,
        # while still allowing the model to be built for any other dataset.
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes

        self.gnn1 = SAGEConv(in_channels, out_channels, aggr=aggr)

    def forward(self, graph=None):
        """Return log-softmax class scores for every node.

        Args:
            graph: object exposing ``x`` and ``edge_index``. When omitted, the
                notebook-level ``data`` graph is used (looked up at call time),
                so existing ``model()`` calls behave as before.
        """
        if graph is None:
            graph = data
        x = self.gnn1(graph.x, graph.edge_index)
        return F.log_softmax(x, dim=1)

In [11]:
# Instantiate the model and move both the model and the graph to the selected device
model= Net().to(device)
data= data.to(device)

# Adam optimizer; weight_decay adds a penalty on the weights. num_epoch is the number of training steps.
optimizer= torch.optim.Adam(model.parameters(), lr= 0.001, weight_decay=0.0005)
num_epoch= 100

In [12]:
# Training loop: one full-graph gradient step per epoch, followed by an
# evaluation on the train / validation / test masks.
best_val_acc = 0  # best validation accuracy seen so far
test_acc = 0      # test accuracy measured at the epoch of the best validation accuracy
for epoch in range(num_epoch):
    # --- optimisation step on the training nodes only ---
    model.train()
    optimizer.zero_grad()
    loss = F.nll_loss(model()[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # --- evaluation ---
    # Switch to eval mode so that evaluation never runs with training-time
    # behaviour; the next iteration switches back with model.train().
    model.eval()
    with torch.no_grad():
        logits, accs = model(), []
        for _, mask in data('train_mask', 'val_mask', 'test_mask'):
            pred = logits[mask].argmax(dim=1)  # predicted class = highest log-probability
            acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
            accs.append(acc)
        _, val_acc, tmp_test_acc = accs
        # Model selection on the validation set: only update the reported test
        # accuracy when the validation accuracy improves.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            test_acc = tmp_test_acc

    if (epoch+1) % 10 ==0:
        print (f'epoch {epoch+1}/{num_epoch}, Val = {best_val_acc:.3f}, Test = {test_acc:.3f}')

epoch 10/100, Val = 0.534, Test = 0.545
epoch 20/100, Val = 0.654, Test = 0.685
epoch 30/100, Val = 0.694, Test = 0.711
epoch 40/100, Val = 0.696, Test = 0.712
epoch 50/100, Val = 0.696, Test = 0.712
epoch 60/100, Val = 0.704, Test = 0.712
epoch 70/100, Val = 0.712, Test = 0.716
epoch 80/100, Val = 0.712, Test = 0.716
epoch 90/100, Val = 0.714, Test = 0.710
epoch 100/100, Val = 0.714, Test = 0.710


### Graph Attention Network (GAT)

#### How to build a GAT layer from scratch

##### first, build the layers we want to implement with an attention mechanism

In [13]:
# Toy dimensions for the walkthrough: 3 nodes, 5 input features, 2 output features
in_features = 5
out_feature = 2
nb_nodes = 3

# Learnable weight matrix of the linear transformation (Xavier-initialised)
W = nn.Parameter(torch.zeros(in_features, out_feature))
nn.init.xavier_uniform_(W.data, gain=1.414)

# Random node features standing in for a real input
inputs = torch.rand(nb_nodes, in_features)

# Project every node into the output feature space
h = inputs.mm(W)
N = h.size(0)  # number of nodes in the output
print(h.shape)


torch.Size([3, 2])


##### second, initialize the attention mechanism 

In [14]:
# The attention vector scores a concatenated pair of node embeddings [h_i || h_j],
# so it needs twice as many rows as the layer has output features
a = nn.Parameter(torch.zeros(2 * out_feature, 1))
nn.init.xavier_uniform_(a.data, gain=1.414)
print(a.shape)

# LeakyReLU with a slope of 0.2 for negative inputs
leakRelu = nn.LeakyReLU(negative_slope=0.2)

torch.Size([4, 1])


##### Operation that builds every possible pair of nodes, including each node paired with itself

In [15]:
# Build the (N, N, 2*out_feature) tensor whose entry [i, j] is the concatenation [h_i || h_j]
left = h.repeat(1, N).view(N * N, -1)   # row i*N + j holds h_i
right = h.repeat(N, 1)                  # row i*N + j holds h_j
a_input = torch.cat([left, right], dim=1).view(N, -1, 2 * out_feature)
print(a_input.shape)

torch.Size([3, 3, 4])


##### combine the output of the layer and of the attention mechanism in the activation layer

In [16]:
# Raw score of every node pair: project [h_i || h_j] onto the attention vector,
# drop the trailing singleton dimension, then apply the LeakyReLU
pair_scores = torch.matmul(a_input, a)
e = leakRelu(pair_scores.squeeze(2))
print(pair_scores.shape)
print(e.shape)
print(e)

torch.Size([3, 3, 1])
torch.Size([3, 3])
tensor([[-0.2809, -0.1466, -0.3159],
        [-0.2783, -0.1441, -0.3133],
        [-0.3298, -0.1956, -0.3648]], grad_fn=<LeakyReluBackward0>)


##### For the moment, the attention is computed between all pairs of nodes in the graph, without taking into account which nodes are actually adjacent


##### Let's add a mask to include only the adjacent nodes

In [17]:
# Random 0/1 adjacency matrix for the toy graph
adj = torch.randint(2, size=(nb_nodes, nb_nodes))
print(adj)

# Same shape as e, filled with a very large negative value that acts as -inf in the softmax
zero_vec = torch.full_like(e, -9e15)

# Keep the computed score where two nodes are connected; otherwise use the
# -inf-like value so that the pair receives (almost) zero weight after the softmax
attention = torch.where(adj > 0, e, zero_vec)
print(attention)

# Normalise each row so that the weights a node gives to its neighbours sum to 1.
# Note: a row of adj made only of zeros would yield a uniform row here.
attention = F.softmax(attention, dim=1)
print(attention)


tensor([[1, 1, 1],
        [1, 1, 0],
        [1, 0, 1]])
tensor([[-2.8088e-01, -1.4665e-01, -3.1587e-01],
        [-2.7831e-01, -1.4408e-01, -9.0000e+15],
        [-3.2985e-01, -9.0000e+15, -3.6484e-01]], grad_fn=<WhereBackward0>)
tensor([[0.3216, 0.3678, 0.3106],
        [0.4665, 0.5335, 0.0000],
        [0.5087, 0.0000, 0.4913]], grad_fn=<SoftmaxBackward0>)


##### Apply the masked attention to the previously calculated outputs of the layer

In [18]:
# New embedding of each node = attention-weighted sum of its neighbours' embeddings
h_prime = attention.matmul(h)
print(h_prime)
print(h)  # embeddings before aggregation, for comparison

tensor([[-0.9707, -0.8190],
        [-0.8175, -0.6210],
        [-1.1527, -1.1179]], grad_fn=<MmBackward0>)
tensor([[-0.9999, -0.9819],
        [-0.6580, -0.3054],
        [-1.3109, -1.2588]], grad_fn=<MmBackward0>)


#####  define the GATLayer from the previous work

In [19]:
class GAT_layer(nn.Module):
    """Single-head graph attention layer operating on a dense adjacency matrix.

    Args:
        in_features: number of input features per node.
        out_features: number of output features per node.
        dropout: dropout probability applied to the attention weights
            (only active in training mode).
        alpha: negative slope of the LeakyReLU applied to the raw scores.
        concat: when True the output goes through an ELU non-linearity;
            when False the aggregated features are returned as they are.
    """

    def __init__(self,in_features, out_features, dropout,alpha,concat=True):
        super().__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha  # how LeakyReLU handles negative values
        self.concat = concat

        # Weight matrix of the shared linear transformation
        self.W = nn.Parameter(torch.zeros(size=(self.in_features, self.out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)

        # Attention vector: the first out_features rows weigh the source node h_i,
        # the last out_features rows weigh the target node h_j
        self.a = nn.Parameter(torch.zeros(size=(2 * self.out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self,inputs,adj):
        """Compute the attention-weighted node embeddings.

        Args:
            inputs: node feature matrix with one row per node and
                ``in_features`` columns.
            adj: square matrix with one row/column per node; an entry > 0
                means that the two nodes are connected.

        Returns:
            Tensor with one row per node and ``out_features`` columns.
        """
        # Linear transformation
        h = torch.mm(inputs, self.W)

        # Attention scores e[i, j] = LeakyReLU(a^T [h_i || h_j]).
        # Because a^T [h_i || h_j] = a_src^T h_i + a_dst^T h_j, the two halves
        # are computed separately and combined by broadcasting. This avoids
        # materialising the (N, N, 2*out_features) tensor of all node pairs.
        src_score = torch.matmul(h, self.a[:self.out_features])   # (N, 1)
        dst_score = torch.matmul(h, self.a[self.out_features:])   # (N, 1)
        e = self.leakyrelu(src_score + dst_score.t())             # (N, N)

        # Masked attention: non-connected pairs get a -inf-like score, so they
        # receive (almost) zero weight after the row-wise softmax
        zero_vec = torch.full_like(e, -9e15)
        attention = torch.where(adj > 0, e, zero_vec)
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)

        # Aggregate the neighbours' embeddings with the attention weights
        h_prime = torch.matmul(attention, h)

        if self.concat:
            return F.elu(h_prime)
        return h_prime

##### This is just an implementation to understand how GAT works; the pre-built PyGeo class is faster and will be used next

#### Use GAT on an example (with the pre-built `GATConv` layer)

In [20]:
# The Cora dataset has already been loaded above; here it is reloaded with a transform that normalizes the node features
dataset= Planetoid(root='exploration_cora',name="Cora", transform= T.NormalizeFeatures())

##### Define the model

In [21]:
class GAT(nn.Module):
    """Two-layer graph attention network for node classification.

    The first ``GATConv`` uses ``in_head`` attention heads of ``hidden``
    features each (concatenated); the second maps them to class scores with
    ``out_head`` heads and ``concat=False``.

    Args:
        in_channels: number of input features per node. Defaults to
            ``dataset.num_features`` (the notebook-level dataset) when omitted.
        num_classes: number of output classes. Defaults to
            ``dataset.num_classes`` when omitted.
        hidden: number of features produced by each head of the first layer.
        in_head: number of attention heads in the first layer.
        out_head: number of attention heads in the second layer.
        dropout: dropout probability used on the layer inputs and inside both
            ``GATConv`` layers.
    """

    def __init__(self, in_channels=None, num_classes=None, hidden=8, in_head=8, out_head=1, dropout=0.6):
        super().__init__()

        # Fall back to the notebook-level dataset so that `GAT()` keeps working
        if in_channels is None:
            in_channels = dataset.num_features
        if num_classes is None:
            num_classes = dataset.num_classes

        self.hidden = hidden
        self.in_head = in_head
        self.out_head = out_head
        self.dropout = dropout

        self.conv1 = GATConv(in_channels, self.hidden, heads=self.in_head, dropout=self.dropout)
        # The first layer outputs hidden * in_head features per node
        self.conv2 = GATConv(self.hidden * self.in_head, num_classes, concat=False,
                             heads=self.out_head, dropout=self.dropout)

    def forward(self,data):
        """Return log-softmax class scores for every node of ``data``.

        Args:
            data: object exposing the node features ``x`` and the
                connectivity ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index

        # Dropout is only active in training mode (self.training)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

##### Define the optimizer and move everything to the selected device (GPU if available)

In [22]:
# Instantiate the GAT and take the (feature-normalized) graph; both are moved to the selected device.
# Note: this rebinds `model` and `data`, replacing the objects used in the GraphSAGE section above.
model= GAT().to(device)
data= dataset[0].to(device)

# Adam optimizer; weight_decay adds a penalty on the weights. num_epoch is the number of training steps.
optimizer=  torch.optim.Adam(model.parameters(), lr=5e-3, weight_decay= 8e-4)
num_epoch=1000

##### Training

In [23]:
#Training loop:

for epoch in range(num_epoch):  # one gradient step on the whole graph per epoch
    model.train()  # training mode: the dropout calls in GAT.forward are active
    optimizer.zero_grad()
    outputs= model(data)
    # the loss is computed on the training nodes only
    loss = F.nll_loss(outputs[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    
    # report the training loss every 100 epochs
    if (epoch+1) % 100 ==0:
        print (f'epoch {epoch+1}/{num_epoch}, Loss= {loss:.3f}')
    

epoch 100/1000, Loss= 0.994
epoch 200/1000, Loss= 0.775
epoch 300/1000, Loss= 0.739
epoch 400/1000, Loss= 0.623
epoch 500/1000, Loss= 0.693
epoch 600/1000, Loss= 0.698
epoch 700/1000, Loss= 0.581
epoch 800/1000, Loss= 0.703
epoch 900/1000, Loss= 0.578
epoch 1000/1000, Loss= 0.669


In [24]:
# Evaluate the trained model on the test nodes
model.eval()  # disable dropout for inference
with torch.no_grad():  # no gradients are needed for evaluation
    pred = model(data).argmax(dim=1)  # predicted class = highest log-probability
correct = pred[data.test_mask].eq(data.y[data.test_mask]).sum().item()
acc = correct / data.test_mask.sum().item()
print(f'Accuracy = {acc:.3f}')

Accuracy = 0.835
