## Graphs

Transductive - you have a single graph (like Cora) you split some nodes (and not graphs) into train/val/test training sets. While you're training you'll be using only the labels from your training nodes. BUT. During the forward prop, by the nature of how spatial GNNs work, you'll be aggregating the feature vectors from your neighbors and some of them may belong to val or even test sets! The main point is - you ARE NOT using their label information but you ARE using the structural information and their features.

Inductive - you're probably much more familiar with this one if you come from the computer vision or NLP background. You have a set of training graphs, a separate set of val graphs and of course a separate set of test graphs.

## 1. https://github.com/phlippe/uvadlc_notebooks/blob/master/docs/tutorial_notebooks/tutorial7/GNN_overview.ipynb

In [None]:
## Standard libraries
import os
import json
import math
import numpy as np 
import time

## Imports for plotting
import matplotlib.pyplot as plt
%matplotlib inline 
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgb
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0
#import seaborn as sns
#sns.reset_orig()
#sns.set()

## Progress bar
from tqdm.notebook import tqdm

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
# Torchvision
import torchvision
from torchvision.datasets import CIFAR10
from torchvision import transforms

In [None]:
import torch.nn as nn
import torch

class GATLayer(nn.Module):
    
    def __init__(self, c_in, c_out, num_heads=1, concat_heads=True, alpha=0.2):
        """
        Inputs:
            c_in - Dimensionality of input features
            c_out - Dimensionality of output features
            num_heads - Number of heads, i.e. attention mechanisms to apply in parallel. The 
                        output features are equally split up over the heads if concat_heads=True.
            concat_heads - If True, the output of the different heads is concatenated instead of averaged.
            alpha - Negative slope of the LeakyReLU activation.
        """
        
        super().__init__()
        self.num_heads = num_heads
        self.concat_heads = concat_heads
        
        if self.concat_heads:
            assert c_out % num_heads == 0, "Number of output features must be a multiple of the count of heads."
            c_out = c_out // num_heads
        
        # Sub-modules and parameters needed in the layer
        self.projection = nn.Linear(c_in, c_out * num_heads)
        
        self.a = nn.Parameter(torch.Tensor(num_heads, 2 * c_out)) # One per head
        
        self.leakyrelu = nn.LeakyReLU(alpha)
        
        # Initialization from the original implementation
        nn.init.xavier_uniform_(self.projection.weight.data, gain=1.414)
        nn.init.xavier_uniform_(self.a.data, gain=1.414)
        
    def forward(self, node_feats, adj_matrix, print_attn_probs=False):
        """
        Inputs:
            node_feats - Input features of the node. Shape: [batch_size, c_in]
            adj_matrix - Adjacency matrix including self-connections. Shape: [batch_size, num_nodes, num_nodes]
            print_attn_probs - If True, the attention weights are printed during the forward pass (for debugging purposes)
        """
        batch_size, num_nodes = node_feats.size(0), node_feats.size(1)
        
        # Apply linear layer and sort nodes by head
        node_feats = self.projection(node_feats)
        node_feats = node_feats.view(batch_size, num_nodes, self.num_heads, -1)
        
        # We need to calculate the attention logits for every edge in the adjacency matrix 
        # Doing this on all possible combinations of nodes is very expensive
        # => Create a tensor of [W*h_i||W*h_j] with i and j being the indices of all edges
        
        edges = adj_matrix.nonzero(as_tuple=False) # Returns indices where the adjacency matrix is not 0 => edges
        node_feats_flat = node_feats.view(batch_size * num_nodes, self.num_heads, -1)
        edge_indices_row = edges[:,0] * num_nodes + edges[:,1]
        edge_indices_col = edges[:,0] * num_nodes + edges[:,2]
        
        a_input = torch.cat([
            torch.index_select(input=node_feats_flat, index=edge_indices_row, dim=0),
            torch.index_select(input=node_feats_flat, index=edge_indices_col, dim=0)
        ], dim=-1) # Index select returns a tensor with node_feats_flat being indexed at the desired positions along dim=0
        
        # Calculate attention MLP output (independent for each head)
        attn_logits = torch.einsum('bhc,hc->bh', a_input, self.a) 
        attn_logits = self.leakyrelu(attn_logits)
        
        # Map list of attention values back into a matrix
        attn_matrix = attn_logits.new_zeros(adj_matrix.shape+(self.num_heads,)).fill_(-9e15)
        attn_matrix[adj_matrix[...,None].repeat(1,1,1,self.num_heads) == 1] = attn_logits.reshape(-1)
        
        # Weighted average of attention
        attn_probs = F.softmax(attn_matrix, dim=2)
        if print_attn_probs:
            print("Attention probs\n", attn_probs.permute(0, 3, 1, 2))
        node_feats = torch.einsum('bijh,bjhc->bihc', attn_probs, node_feats)
        
        # If heads should be concatenated, we can do this by reshaping. Otherwise, take mean
        if self.concat_heads:
            node_feats = node_feats.reshape(batch_size, num_nodes, -1)
        else:
            node_feats = node_feats.mean(dim=2)
        
        return node_feats 

In [None]:
node_feats = torch.arange(8, dtype=torch.float32).view(1, 4, 2)
adj_matrix = torch.Tensor([[[1, 1, 0, 0],
                            [1, 1, 1, 1],
                            [0, 1, 1, 1],
                            [0, 1, 1, 1]]])

In [None]:
layer = GATLayer(2, 2, num_heads=2)
layer.projection.weight.data = torch.Tensor([[1., 0.], [0., 1.]])
layer.projection.bias.data = torch.Tensor([0., 0.])
layer.a.data = torch.Tensor([[-0.2, 0.3], [0.1, -0.1]])

with torch.no_grad():
    out_feats = layer(node_feats, adj_matrix, print_attn_probs=True)

print("Adjacency matrix", adj_matrix)
print("Input features", node_feats)
print("Output features", out_feats)

## 2 Basic GAT

### 2.1 https://towardsdatascience.com/graph-attention-networks-under-the-hood-3bd70dc7a87 


In [None]:
def leaky_relu(z):
    return np.where(z > 0, z, z * 0.01)

def softmax(z):
    if len(z.shape) > 1:
        # Softmax for matrix
        max_matrix = np.max(z, axis=0)
        stable_z = z - max_matrix
        e = np.exp(stable_z)
        a = e / np.sum(e, axis=0, keepdims=True)
    else:
        # Softmax for vector
        vector_max_value = np.max(z)
        a = (np.exp(z - vector_max_value)) / sum(np.exp(z - vector_max_value))

    assert a.shape == z.shape

    return a

In [None]:
print('\n\n----- One-hot vector representation of nodes. Shape(n,n)\n')
X = np.eye(5, 5)
n = X.shape[0]
np.random.shuffle(X)
print(X)

print('\n\n----- Embedding dimension\n')
emb = 3
print(emb)

print('\n\n----- Weight Matrix. Shape(emb, n)\n')
W = np.random.uniform(-np.sqrt(1. / emb), np.sqrt(1. / emb), (emb, n))
print(W)

print('\n\n----- Adjacency Matrix (undirected graph). Shape(n,n)\n')
A = np.random.randint(2, size=(n, n))
np.fill_diagonal(A, 1)  
A = (A + A.T)
A[A > 1] = 1
print(A)

In [None]:
# equation (1)
print('\n\n----- Linear Transformation. Shape(n, emb)\n')
z1 = X.dot(W.T)
print(z1)

In [None]:
# equation (2) - First part
print('\n\n----- Concat hidden features to represent edges. Shape(len(emb.concat(emb)), number of edges)\n')
edge_coords = np.where(A==1)
h_src_nodes = z1[edge_coords[0]]
h_dst_nodes = z1[edge_coords[1]]
z2 = np.concatenate((h_src_nodes, h_dst_nodes), axis=1)

In [None]:
# equation (2) - Second part
print('\n\n----- Attention coefficients. Shape(1, len(emb.concat(emb)))\n')
att = np.random.rand(1, z2.shape[1])
print(att)

print('\n\n----- Edge representations combined with the attention coefficients. Shape(1, number of edges)\n')
z2_att = z2.dot(att.T)
print(z2_att)

print('\n\n----- Leaky Relu. Shape(1, number of edges)')
e = leaky_relu(z2_att)
print(e)

In [None]:

# equation (3)
print('\n\n----- Edge scores as matrix. Shape(n,n)\n')
e_matr = np.zeros(A.shape)
e_matr[edge_coords[0], edge_coords[1]] = e.reshape(-1,)
print(e_matr)

print('\n\n----- For each node, normalize the edge (or neighbor) contributions using softmax\n')
alpha0 = softmax(e_matr[:,0][e_matr[:,0] != 0]) 
alpha1 = softmax(e_matr[:,1][e_matr[:,1] != 0])
alpha2 = softmax(e_matr[:,2][e_matr[:,2] != 0])
alpha3 = softmax(e_matr[:,3][e_matr[:,3] != 0])
alpha4 = softmax(e_matr[:,4][e_matr[:,4] != 0])
alpha = np.concatenate((alpha0, alpha1, alpha2, alpha3, alpha4))
print(alpha)

print('\n\n----- Normalized edge score matrix. Shape(n,n)\n')
A_scaled = np.zeros(A.shape)
A_scaled[edge_coords[0], edge_coords[1]] = alpha.reshape(-1,)
print(A_scaled)

In [None]:
# equation (4)
print('\n\nNeighborhood aggregation (GCN) scaled with attention scores (GAT). Shape(n, emb)\n')
ND_GAT = A_scaled.dot(z1)
print(ND_GAT)

### 2.2 https://github.com/johncava/pytorch-GAT

In [None]:
import numpy as np 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [None]:
def get_neighbors(graph):
    neighbors = []
    for node in graph:
        node_neighbors = []
        for index, val in enumerate(node):
            if val == 1:
                node_neighbors.append(index)
        neighbors.append(node_neighbors)
    return neighbors

In [None]:
# Toy graph structure as a symmetric graph
graph = [[1,0,0,0,0,0,0,0,0,0],
         [0,1,0,0,0,0,0,0,0,0],
         [1,0,1,0,0,0,0,0,0,0],
         [0,1,1,1,0,0,0,0,0,0],
         [1,1,0,0,1,0,0,0,0,0],
         [0,0,1,0,0,1,0,0,0,0],
         [1,1,0,0,0,0,1,0,0,0],
         [0,0,1,0,1,1,0,1,0,0],
         [1,0,0,1,0,1,0,0,1,0],
         [0,0,0,1,0,0,1,0,0,1]]

# Toy label
label = [[1,1],[1,1],[1,1],[1,1],[1,1],[0,0],[0,0],[0,0],[0,0],[0,0]]

# Turn array into numpy array
graph = np.array(graph)

# Turn symmetric graph into an adjacency graph
graph = graph + graph.T - np.eye(10)

# Random feature matrix for the graph
features = np.random.rand(10,10) * 10

# Turn features into pytorch Variable
features = Variable(torch.Tensor(features))

# Turn label into pytorch Variable
label = Variable(torch.Tensor(label))

# Get neighbors for attention model
neighbors = get_neighbors(graph)

# Define W_out which would be equal in this case to the number of features of the label dataset => 2
W_out = 2


In [None]:
# Define Graph Attention Model
class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()
        # Linear Function that takes the features (h_i) and turns it into new features (new_h_i)
        self.W = nn.Linear(10,W_out)
        # Note: Attention Mechanism takes twice the output of Linear Function (W) because of the concatentation of Wh_i and Wh_j (Wh_i || Wh_j)
        self.a = nn.Linear(2*W_out,1)
        self.leaky_relu = nn.LeakyReLU(0.2)

    def forward(self,x):
        # List to hold the new h_i values calculated from the attention mechanism
        new_h_list = []
        # Go through each node and perform attention in respect to its neighbors (which has been computed previously)
        for primary_index,primary_node in enumerate(neighbors):
            h = []
            W_hjs = []
            e = torch.Tensor([])
            # Reference Equation (1),(3) : e_ij = a(Wh_i, Wh_j) = Leaky_Relu(attention(Wh_i, Wh_j)) => Neural_Network( Wh_i || Wh_j )
            for neighbor in primary_node:
                # Neighbor node features matrix multiplied with W. Also stored for future use when multiplying against alphas in line 75
                W_hj = self.W(features[neighbor])
                # Note: concatenation of e_ij into a single torch tensor such that there is one line to do F.softmax(e) in line 70
                e = torch.cat((e,self.leaky_relu(self.a(torch.cat((self.W(x[primary_index]),W_hj))))))
                W_hjs.append(W_hj)
            # Softmax(e_ij) Reference: Equation (2)
            a = F.softmax(e)
            # Reference: Equation (4)
            new_h = torch.Tensor([0.0]*W_out)
            for a_ij, w_hj in zip(a,W_hjs):
                new_h += a_ij * w_hj
            new_h_list.append(F.leaky_relu(new_h))
            ######################################
        return torch.stack(new_h_list)


In [None]:
# Initialize Attention Model
attention = Attention()

loss_function = nn.MSELoss()
optimizer = optim.Adam(attention.parameters(), lr=1e-3)
max_iterations = 10

for iteration in range(max_iterations):
    prediction = attention(features)
    optimizer.zero_grad()
    loss = loss_function(prediction,label)
    print(loss.item())
    loss.backward()
    optimizer.step()

print("Done")

## 3. https://dsgiitr.com/blogs/gat/
### Implementing GAT Layer in PyTorch

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(2020) # seed for reproducible numbers

In [None]:
class GATLayer(nn.Module):
    """
    Simple PyTorch Implementation of the Graph Attention layer.
    """

    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super(GATLayer, self).__init__()
        self.dropout       = dropout        # drop prob = 0.6
        self.in_features   = in_features    # 
        self.out_features  = out_features   # 
        self.alpha         = alpha          # LeakyReLU with negative input slope, alpha = 0.2
        self.concat        = concat         # conacat = True for all layers except the output layer.

        # Xavier Initialization of Weights
        # Alternatively use weights_init to apply weights of choice 
        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        self.a = nn.Parameter(torch.zeros(size=(2*out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)
        
        # LeakyReLU
        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, input, adj):
        # Linear Transformation
        h = torch.mm(input, self.W)
        N = h.size()[0]

        # Attention Mechanism
        a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features)
        e       = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))

        # Masked Attention
        zero_vec  = -9e15*torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime   = torch.matmul(attention, h)

        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime

In [None]:
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

name_data = 'Cora'
dataset = Planetoid(root= 'data/' + name_data, name = name_data)
dataset.transform = T.NormalizeFeatures()

print(f"Number of Classes in {name_data}:", dataset.num_classes)
print(f"Number of Node Features in {name_data}:", dataset.num_node_features)

In [None]:
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

class GAT(torch.nn.Module):
    def __init__(self):
        super(GAT, self).__init__()
        self.hid = 8
        self.in_head = 8
        self.out_head = 1
        
        self.conv1 = GATConv(dataset.num_features, self.hid, heads=self.in_head, dropout=0.6)
        self.conv2 = GATConv(self.hid*self.in_head, dataset.num_classes, concat=False,
                             heads=self.out_head, dropout=0.6)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        # Dropout before the GAT layer is used to avoid overfitting in small datasets like Cora.
        # One can skip them if the dataset is sufficiently large.
        
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GAT().to(device)

data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

model.train()
for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    
    if epoch%200 == 0:
        print(loss)
    
    loss.backward()
    optimizer.step()

In [None]:
model.eval()
_, pred = model(data).max(dim=1)
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Accuracy: {:.4f}'.format(acc))

## 4. https://github.com/raunakkmr/Graph-Attention-Networks

In [None]:
import math
import torch
import torch.nn as nn
import numpy as np
import os
import scipy.sparse as sp
from torch.utils.data import DataLoader, Dataset
import argparse
import json
import sys
from math import ceil
import torch.optim as optim

In [None]:
class GraphAttention(nn.Module):

    def __init__(self, input_dim, output_dim, num_heads, dropout=0.5):
        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_heads = num_heads

        self.fcs = nn.ModuleList([nn.Linear(input_dim, output_dim) for _ in range(num_heads)])
        self.a = nn.ModuleList([nn.Linear(2*output_dim, 1) for _ in range(num_heads)])

        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.Softmax(dim=0)
        self.leakyrelu = nn.LeakyReLU()

    def forward(self, features, nodes, mapping, rows):
 

        nprime = features.shape[0]
        
        rows = [np.array([mapping[v] for v in row], dtype=np.int64) for row in rows]
        sum_degs = np.hstack(([0], np.cumsum([len(row) for row in rows])))
        
        mapped_nodes = [mapping[v] for v in nodes]
        indices = torch.LongTensor([[v, c] for (v, row) in zip(mapped_nodes, rows) for c in row]).t()


        out = []
        for k in range(self.num_heads):
            h = self.fcs[k](features)

            nbr_h = torch.cat(tuple([h[row] for row in rows]), dim=0) # Neighbour
            self_h = torch.cat(tuple([h[mapping[nodes[i]]].repeat(len(row), 1) for (i, row) in enumerate(rows)]), dim=0)
            
            cat_h = torch.cat((self_h, nbr_h), dim=1)

            e = self.leakyrelu(self.a[k](cat_h))

            alpha = [self.softmax(e[lo : hi]) for (lo, hi) in zip(sum_degs, sum_degs[1:])]
            alpha = torch.cat(tuple(alpha), dim=0)
            
            alpha = alpha.squeeze(1)
            alpha = self.dropout(alpha)

            adj = torch.sparse.FloatTensor(indices, alpha, torch.Size([nprime, nprime]))
            out.append(torch.sparse.mm(adj, h)[mapped_nodes])

        return out

In [None]:
class GAT(nn.Module):

    def __init__(self, input_dim, hidden_dims, output_dim, num_heads,
                 dropout=0.5, device='cpu'):

        super().__init__()

        self.input_dim = input_dim
        self.hidden_dims = hidden_dims
        self.output_dim = output_dim
        self.num_heads = num_heads
        self.device = device
        self.num_layers = len(hidden_dims) + 1

        dims = [input_dim] + [d*nh for (d, nh) in zip(hidden_dims, num_heads[:-1])] + [output_dim*num_heads[-1]]
        in_dims = dims[:-1]
        out_dims = [d // nh for (d, nh) in zip(dims[1:], num_heads)]

        self.attn = nn.ModuleList([GraphAttention(i, o, nh, dropout) for (i, o, nh) in zip(in_dims, out_dims, num_heads)])

        self.bns = nn.ModuleList([nn.BatchNorm1d(dim) for dim in dims[1:-1]])

        self.dropout = nn.Dropout(dropout)
        
        self.elu = nn.ELU()

    def forward(self, features, node_layers, mappings, rows):

        out = features
        for k in range(self.num_layers):
            nodes = node_layers[k+1]
            mapping = mappings[k]
            
            init_mapped_nodes = np.array([mappings[0][v] for v in nodes], dtype=np.int64)
            
            cur_rows = rows[init_mapped_nodes]
            out = self.dropout(out)
            
            out = self.attn[k](out, nodes, mapping, cur_rows)
            
            if k+1 < self.num_layers:
                out = [self.elu(o) for o in out]
                out = torch.cat(tuple(out), dim=1)
                out = self.bns[k](out)
                
            else:
                out = torch.cat(tuple([x.flatten().unsqueeze(0) for x in out]), dim=0)
                out = out.mean(dim=0).reshape(len(nodes), self.output_dim)

        return out

In [None]:
json_dict = {
    "stats_per_batch" : 3,
    "dataset" : "cora",
    "dataset_path" : "data/cora/",
    "mode" : "train",
    "task" : "node_classification",
    "cuda" : "True",
    "hidden_dims" : [8],
    "num_heads" : [8, 1],
    "dropout" : 0.6,
    "batch_size" : 140,
    "epochs" : 200,
    "lr" : 5e-2,
    "weight_decay" : 5e-4,
    "transductive" : "True",
    "self_loop" : "True"
}

In [None]:
class Cora(Dataset):

    def __init__(self, path, mode, num_layers,
                 self_loop=False, normalize_adj=False, transductive=False):

        super(Cora, self).__init__()

        self.path = path
        self.mode = mode
        self.num_layers = num_layers
        self.self_loop = self_loop
        self.normalize_adj = normalize_adj
        self.transductive = transductive
        self.idx = {
            'train' : np.array(range(140)),
            'val' : np.array(range(200, 500)),
            'test' : np.array(range(500, 1500))
        }

        print('--------------------------------')
        print('Reading cora dataset from {}'.format(path))
        citations = np.loadtxt(os.path.join(path, 'cora.cites'), dtype=np.int64)
        content = np.loadtxt(os.path.join(path, 'cora.content'), dtype=str)
        print('Finished reading data.')

        print('Setting up data structures.')
        if transductive:
            idx = np.arange(content.shape[0])
        else:
            if mode == 'train':
                idx = self.idx['train']
                
            elif mode == 'val':
                idx = np.hstack((self.idx['train'], self.idx['val']))
                
            elif mode == 'test':
                idx = np.hstack((self.idx['train'], self.idx['test']))
                
        features, labels = content[idx, 1:-1].astype(np.float32), content[idx, -1]
        d = {j : i for (i,j) in enumerate(sorted(set(labels)))}
        labels = np.array([d[l] for l in labels])

        vertices = np.array(content[idx, 0], dtype=np.int64)
        d = {j : i for (i,j) in enumerate(vertices)}
        
        edges = np.array([e for e in citations if e[0] in d.keys() and e[1] in d.keys()])
        edges = np.array([d[v] for v in edges.flatten()]).reshape(edges.shape)
        
        n, m = labels.shape[0], edges.shape[0]
        u, v = edges[:, 0], edges[:, 1]
        
        adj = sp.coo_matrix((np.ones(m), (u, v)),
                            shape=(n, n),
                            dtype=np.float32)
        
        adj += adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) # Building a symmetric adjacency matrix of a given sparse matrix
        
        if self_loop:
            adj += sp.eye(n)
            
        if normalize_adj:
            degrees = np.power(np.array(np.sum(adj, axis=1)), -0.5).flatten()
            degrees = sp.diags(degrees)
            adj = (degrees.dot(adj.dot(degrees)))
        print('Finished setting up data structures.')
        print('--------------------------------')

        self.features = features
        self.labels = labels
        self.adj = adj.tolil()

    def __len__(self):
        return len(self.idx[self.mode])

    def __getitem__(self, idx):
        
        if self.transductive:
            idx += int(self.idx[self.mode][0])
        else:
            if self.mode != 'train':
                idx += len(self.idx['train'])
                
                
        node_layers, mappings = self._form_computation_graph(idx)
        rows = self.adj.rows[node_layers[0]]
        
        features = self.features[node_layers[0], :]
        labels = self.labels[node_layers[-1]]
        
        features = torch.FloatTensor(features)
        labels = torch.LongTensor(labels)

        return features, node_layers, mappings, rows, labels

    def collate_wrapper(self, batch):

        idx = [node_layers[-1][0] for node_layers in [sample[1] for sample in batch]]

        
        node_layers, mappings = self._form_computation_graph(idx)
        
        rows = self.adj.rows[node_layers[0]]       
        features = self.features[node_layers[0], :]
        labels = self.labels[node_layers[-1]]
        
        features = torch.FloatTensor(features)
        labels = torch.LongTensor(labels)

        return features, node_layers, mappings, rows, labels

    def get_dims(self):

        return self.features.shape[1], len(set(self.labels))

    def _form_computation_graph(self, idx):

        _list, _set = list, set
        rows = self.adj.rows
        
        if type(idx) is int:
            node_layers = [np.array([idx], dtype=np.int64)]
        elif type(idx) is list:
            node_layers = [np.array(idx, dtype=np.int64)]
            
        for _ in range(self.num_layers):
            prev = node_layers[-1]
            arr = [node for node in prev]
            arr.extend([v for node in arr for v in rows[node]])
            arr = np.array(_list(_set(arr)), dtype=np.int64)
            node_layers.append(arr)
        node_layers.reverse()

        mappings = [{j : i for (i,j) in enumerate(arr)} for arr in node_layers]

        return node_layers, mappings

In [None]:
# from datasets import node_classification

def get_criterion(task):

    if task == 'node_classification':
        criterion = nn.CrossEntropyLoss()

    return criterion

def get_dataset(args):

    task, dataset_name, *dataset_args = args
    if task == 'node_classification':
        if dataset_name == 'cora':
            dataset = Cora(*dataset_args)

    return dataset

def get_fname(config):

    hidden_dims_str = '_'.join([str(x) for x in config['hidden_dims']])
    num_heads_str = '_'.join([str(x) for x in config['num_heads']])
    batch_size = config['batch_size']
    epochs = config['epochs']
    lr = config['lr']
    weight_decay = config['weight_decay']
    dropout = config['dropout']
    transductive = str(config['transductive'])
    fname = 'gat_hidden_dims_{}_num_heads_{}_batch_size_{}_epochs_{}_lr_{}_weight_decay_{}_dropout_{}_transductive_{}.pth'.format(
        hidden_dims_str, num_heads_str, batch_size, epochs, lr,
        weight_decay, dropout, transductive)

    return fname

def parse_args():

    parser = argparse.ArgumentParser()

    parser.add_argument('--json', type=str, default='config.json',
                        help='path to json file with arguments, default: config.json')

    parser.add_argument('--print_every', type=int, default=16,
                        help='print loss and accuracy after how many batches, default: 16')

    parser.add_argument('--dataset', type=str, choices=['cora'], default='cora',
                        help='name of the dataset, default=cora')
    parser.add_argument('--dataset_path', type=str,
                        default='/Users/raunak/Documents/Datasets/Cora', 
                        help='path to dataset')
    parser.add_argument('--self_loop', action='store_true',
                        help='whether to add self loops to adjacency matrix, default=False')
    parser.add_argument('--normalize_adj', action='store_true',
                        help='whether to normalize adj like in gcn, default=False')
    parser.add_argument('--transductive', action='store_true',
                        help='whether to use all nodes while training, default=False')

    parser.add_argument('--task', type=str,
                        choices=['unsupervised', 'node_classification'],
                        default='node_classification',
                        help='type of task, default=node_classification')

    parser.add_argument('--dropout', type=float, default=0.5,
                        help='dropout parameter, default=0.5.')
    parser.add_argument('--cuda', action='store_true',
                        help='whether to use GPU, default: False')
    parser.add_argument('--hidden_dims', type=int, nargs="*",
                        help='dimensions of hidden layers, specify through config.json')
    parser.add_argument('--num_heads', type=int, nargs="*",
                        help='number of attention heads in each layer, length should be equal to len(hidden_dims)+1, specify through config.json')

    parser.add_argument('--batch_size', type=int, default=8,
                        help='training batch size, default=8')
    parser.add_argument('--epochs', type=int, default=10,
                        help='number of training epochs, default=10')
    parser.add_argument('--lr', type=float, default=1e-3,
                        help='learning rate, default=1e-3')
    parser.add_argument('--weight_decay', type=float, default=5e-4,
                        help='weight decay, default=5e-4')

    parser.add_argument('--save', action='store_true',
                        help='whether to save model in trained_models/ directory, default: False')
    parser.add_argument('--load', action='store_true',
                        help='whether to load model in trained_models/ directory')

    args = parser.parse_args([])
    config = vars(args)

    config.update(json_dict)

    config['num_layers'] = len(config['hidden_dims']) + 1

    print('--------------------------------')
    print('Config:')
    for (k, v) in config.items():
        print("    '{}': '{}'".format(k, v))
    print('--------------------------------')

    return config

###### https://pytorch.org/docs/stable/data.html#dataloader-collate-fn

In [None]:
def main():
    config = parse_args()

    #if config['cuda'] and torch.cuda.is_available():
    #    device = 'cuda:0'
    #else:
    device = 'cpu'

    dataset_args = (config['task'], config['dataset'], config['dataset_path'],
                    'train', config['num_layers'], config['self_loop'],
                    config['normalize_adj'], config['transductive'])
    
    dataset = get_dataset(dataset_args)
    
    loader = DataLoader(dataset=dataset, batch_size=config['batch_size'],
                        shuffle=True, collate_fn=dataset.collate_wrapper)
    
    input_dim, output_dim = dataset.get_dims()

    model = GAT(input_dim, config['hidden_dims'], output_dim,
                       config['num_heads'], config['dropout'], device)
    model.to(device)

    if not config['load']:
        criterion = get_criterion(config['task'])
        optimizer = optim.Adam(model.parameters(), lr=config['lr'],
                               weight_decay=config['weight_decay'])
        
        epochs = config['epochs']
        stats_per_batch = config['stats_per_batch']
        num_batches = int(ceil(len(dataset) / config['batch_size']))
        
        model.train()
        print('--------------------------------')
        print('Training.')
        for epoch in range(epochs):
            print('Epoch {} / {}'.format(epoch+1, epochs))
            running_loss = 0.0
            num_correct, num_examples = 0, 0
            
            for (idx, batch) in enumerate(loader):
                features, node_layers, mappings, rows, labels = batch
                features, labels = features.to(device), labels.to(device)
                optimizer.zero_grad()
                out = model(features, node_layers, mappings, rows)
                loss = criterion(out, labels)
                loss.backward()
                optimizer.step()
                
                with torch.no_grad():
                    running_loss += loss.item()
                    predictions = torch.max(out, dim=1)[1]
                    num_correct += torch.sum(predictions == labels).item()
                    num_examples += len(labels)
                    
                if (idx + 1) % stats_per_batch == 0:
                    running_loss /= stats_per_batch
                    accuracy = num_correct / num_examples
                    print('    Batch {} / {}: loss {}, accuracy {}'.format(
                        idx+1, num_batches, running_loss, accuracy))
                    running_loss = 0.0
                    num_correct, num_examples = 0, 0
        print('Finished training.')
        print('--------------------------------')

        if config['save']:
            print('--------------------------------')
            directory = os.path.join(os.path.dirname(os.getcwd()),
                                    'trained_models')
            if not os.path.exists(directory):
                os.makedirs(directory)
            fname = get_fname(config)
            path = os.path.join(directory, fname)
            print('Saving model at {}'.format(path))
            torch.save(model.state_dict(), path)
            print('Finished saving model.')
            print('--------------------------------')
        
        
    dataset_args = (config['task'], config['dataset'], config['dataset_path'],
                    'test', config['num_layers'], config['self_loop'],
                    config['normalize_adj'], config['transductive'])
    
    dataset = get_dataset(dataset_args)
    
    
    loader = DataLoader(dataset=dataset, batch_size=config['batch_size'],
                        shuffle=False, collate_fn=dataset.collate_wrapper)
    criterion = get_criterion(config['task'])
    stats_per_batch = config['stats_per_batch']
    
    num_batches = int(ceil(len(dataset) / config['batch_size']))
    model.eval()
    print('--------------------------------')
    print('Testing.')
    
    
    running_loss, total_loss = 0.0, 0.0
    num_correct, num_examples = 0, 0
    total_correct, total_examples = 0, 0
    
    for (idx, batch) in enumerate(loader):
        features, node_layers, mappings, rows, labels = batch
        features, labels = features.to(device), labels.to(device)
        
        out = model(features, node_layers, mappings, rows)
        loss = criterion(out, labels)
        
        running_loss += loss.item()
        total_loss += loss.item()
        
        predictions = torch.max(out, dim=1)[1]
        
        num_correct += torch.sum(predictions == labels).item()
        total_correct += torch.sum(predictions == labels).item()
        
        num_examples += len(labels)
        total_examples += len(labels)
        
        if (idx + 1) % stats_per_batch == 0:
            running_loss /= stats_per_batch
            accuracy = num_correct / num_examples
            print('    Batch {} / {}: loss {}, accuracy {}'.format(
                idx+1, num_batches, running_loss, accuracy))
            running_loss = 0.0
            num_correct, num_examples = 0, 0
            
    total_loss /= num_batches
    total_accuracy = total_correct / total_examples
    print('Loss {}, accuracy {}'.format(total_loss, total_accuracy))
    print('Finished testing.')
    print('--------------------------------')


In [None]:
if __name__ == '__main__':
    main()

## 5.1 https://github.com/PetarV-/GAT
#### pytorch implementatuon of GAT https://arxiv.org/abs/1710.10903

In [None]:
from __future__ import division
from __future__ import print_function

import os
import glob
import time
import random
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [None]:
class GraphAttentionLayer(nn.Module):
    """
    Simple GAT layer, similar to https://arxiv.org/abs/1710.10903
    """
    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super(GraphAttentionLayer, self).__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat

        self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        self.a = nn.Parameter(torch.empty(size=(2*out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, h, adj):
        Wh = torch.mm(h, self.W) # h.shape: (N, in_features), Wh.shape: (N, out_features)
        e = self._prepare_attentional_mechanism_input(Wh)

        zero_vec = -9e15*torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime = torch.matmul(attention, Wh)

        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime

    def _prepare_attentional_mechanism_input(self, Wh):
        # Wh.shape (N, out_feature)
        # self.a.shape (2 * out_feature, 1)
        # Wh1&2.shape (N, 1)
        # e.shape (N, N)
        Wh1 = torch.matmul(Wh, self.a[:self.out_features, :])
        Wh2 = torch.matmul(Wh, self.a[self.out_features:, :])
        # broadcast add
        e = Wh1 + Wh2.T
        return self.leakyrelu(e)

    def __repr__(self):
        return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'


class SpecialSpmmFunction(torch.autograd.Function):
    """Special function for only sparse region backpropataion layer."""
    @staticmethod
    def forward(ctx, indices, values, shape, b):
        assert indices.requires_grad == False
        a = torch.sparse_coo_tensor(indices, values, shape)
        ctx.save_for_backward(a, b)
        ctx.N = shape[0]
        return torch.matmul(a, b)

    @staticmethod
    def backward(ctx, grad_output):
        a, b = ctx.saved_tensors
        grad_values = grad_b = None
        if ctx.needs_input_grad[1]:
            grad_a_dense = grad_output.matmul(b.t())
            edge_idx = a._indices()[0, :] * ctx.N + a._indices()[1, :]
            grad_values = grad_a_dense.view(-1)[edge_idx]
        if ctx.needs_input_grad[3]:
            grad_b = a.t().matmul(grad_output)
        return None, grad_values, None, grad_b


class SpecialSpmm(nn.Module):
    def forward(self, indices, values, shape, b):
        return SpecialSpmmFunction.apply(indices, values, shape, b)

    
class SpGraphAttentionLayer(nn.Module):
    """
    Sparse version GAT layer, similar to https://arxiv.org/abs/1710.10903
    """

    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super(SpGraphAttentionLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat

        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        nn.init.xavier_normal_(self.W.data, gain=1.414)
                
        self.a = nn.Parameter(torch.zeros(size=(1, 2*out_features)))
        nn.init.xavier_normal_(self.a.data, gain=1.414)

        self.dropout = nn.Dropout(dropout)
        self.leakyrelu = nn.LeakyReLU(self.alpha)
        self.special_spmm = SpecialSpmm()

    def forward(self, input, adj):
        dv = 'cuda' if input.is_cuda else 'cpu'

        N = input.size()[0]
        edge = adj.nonzero().t()

        h = torch.mm(input, self.W)
        # h: N x out
        assert not torch.isnan(h).any()

        # Self-attention on the nodes - Shared attention mechanism
        edge_h = torch.cat((h[edge[0, :], :], h[edge[1, :], :]), dim=1).t()
        # edge: 2*D x E

        edge_e = torch.exp(-self.leakyrelu(self.a.mm(edge_h).squeeze()))
        assert not torch.isnan(edge_e).any()
        # edge_e: E

        e_rowsum = self.special_spmm(edge, edge_e, torch.Size([N, N]), torch.ones(size=(N,1), device=dv))
        # e_rowsum: N x 1

        edge_e = self.dropout(edge_e)
        # edge_e: E

        h_prime = self.special_spmm(edge, edge_e, torch.Size([N, N]), h)
        assert not torch.isnan(h_prime).any()
        # h_prime: N x out
        
        h_prime = h_prime.div(e_rowsum)
        # h_prime: N x out
        assert not torch.isnan(h_prime).any()

        if self.concat:
            # if this layer is not last layer,
            return F.elu(h_prime)
        else:
            # if this layer is last layer,
            return h_prime

    def __repr__(self):
        return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F



class GAT(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout, alpha, nheads):
        """Dense version of GAT."""
        super(GAT, self).__init__()
        self.dropout = dropout

        self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in range(nheads)]
        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)

        self.out_att = GraphAttentionLayer(nhid * nheads, nclass, dropout=dropout, alpha=alpha, concat=False)

    def forward(self, x, adj):
        x = F.dropout(x, self.dropout, training=self.training)
        x = torch.cat([att(x, adj) for att in self.attentions], dim=1)
        x = F.dropout(x, self.dropout, training=self.training)
        x = F.elu(self.out_att(x, adj))
        return F.log_softmax(x, dim=1)


class SpGAT(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout, alpha, nheads):
        """Sparse version of GAT."""
        super(SpGAT, self).__init__()
        self.dropout = dropout

        self.attentions = [SpGraphAttentionLayer(nfeat, 
                                                 nhid, 
                                                 dropout=dropout, 
                                                 alpha=alpha, 
                                                 concat=True) for _ in range(nheads)]
        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)

        self.out_att = SpGraphAttentionLayer(nhid * nheads, 
                                             nclass, 
                                             dropout=dropout, 
                                             alpha=alpha, 
                                             concat=False)

    def forward(self, x, adj):
        x = F.dropout(x, self.dropout, training=self.training)
        x = torch.cat([att(x, adj) for att in self.attentions], dim=1)
        x = F.dropout(x, self.dropout, training=self.training)
        x = F.elu(self.out_att(x, adj))
        return F.log_softmax(x, dim=1)

## Load Data

In [None]:
import numpy as np
import scipy.sparse as sp
import torch


def encode_onehot(labels):
    # The classes must be sorted before encoding to enable static class encoding.
    # In other words, make sure the first class always maps to index 0.
    classes = sorted(list(set(labels)))
    classes_dict = {c: np.identity(len(classes))[i, :] for i, c in enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32)
    return labels_onehot


def load_data(path="./data/cora/", dataset="cora"):
    """Load citation network dataset (cora only for now)"""
    print('Loading {} dataset...'.format(dataset))

    idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset), dtype=np.dtype(str))
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
    labels = encode_onehot(idx_features_labels[:, -1])

    # build graph
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset), dtype=np.int32)
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])), shape=(labels.shape[0], labels.shape[0]), dtype=np.float32)

    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    features = normalize_features(features)
    adj = normalize_adj(adj + sp.eye(adj.shape[0]))

    idx_train = range(140)
    idx_val = range(200, 500)
    idx_test = range(500, 1500)

    adj = torch.FloatTensor(np.array(adj.todense()))
    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(np.where(labels)[1])

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    return adj, features, labels, idx_train, idx_val, idx_test


def normalize_adj(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv_sqrt = np.power(rowsum, -0.5).flatten()
    r_inv_sqrt[np.isinf(r_inv_sqrt)] = 0.
    r_mat_inv_sqrt = sp.diags(r_inv_sqrt)
    return mx.dot(r_mat_inv_sqrt).transpose().dot(r_mat_inv_sqrt)


def normalize_features(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx


def accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

In [None]:
# Training settings
parser = argparse.ArgumentParser()
parser.add_argument('--no-cuda', action='store_true', default=False, help='Disables CUDA training.')
parser.add_argument('--fastmode', action='store_true', default=False, help='Validate during training pass.')
parser.add_argument('--sparse', action='store_true', default=False, help='GAT with sparse version or not.')
parser.add_argument('--seed', type=int, default=72, help='Random seed.')
parser.add_argument('--epochs', type=int, default=10000, help='Number of epochs to train.')
parser.add_argument('--lr', type=float, default=0.005, help='Initial learning rate.')
parser.add_argument('--weight_decay', type=float, default=5e-4, help='Weight decay (L2 loss on parameters).')
parser.add_argument('--hidden', type=int, default=8, help='Number of hidden units.')
parser.add_argument('--nb_heads', type=int, default=8, help='Number of head attentions.')
parser.add_argument('--dropout', type=float, default=0.6, help='Dropout rate (1 - keep probability).')
parser.add_argument('--alpha', type=float, default=0.2, help='Alpha for the leaky_relu.')
parser.add_argument('--patience', type=int, default=100, help='Patience')



In [None]:
args = parser.parse_args([])
args.cuda = not args.no_cuda and torch.cuda.is_available()

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Load data
adj, features, labels, idx_train, idx_val, idx_test = load_data()

In [None]:
# Model and optimizer
if args.sparse:
    model = SpGAT(nfeat=features.shape[1], 
                nhid=args.hidden, 
                nclass=int(labels.max()) + 1, 
                dropout=args.dropout, 
                nheads=args.nb_heads, 
                alpha=args.alpha)
else:
    model = GAT(nfeat=features.shape[1], 
                nhid=args.hidden, 
                nclass=int(labels.max()) + 1, 
                dropout=args.dropout, 
                nheads=args.nb_heads, 
                alpha=args.alpha)
optimizer = optim.Adam(model.parameters(), 
                       lr=args.lr, 
                       weight_decay=args.weight_decay)

if args.cuda:
    model.cuda()
    features = features.cuda()
    adj = adj.cuda()
    labels = labels.cuda()
    idx_train = idx_train.cuda()
    idx_val = idx_val.cuda()
    idx_test = idx_test.cuda()

In [None]:
features, adj, labels = Variable(features), Variable(adj), Variable(labels)


def train(epoch):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    acc_train = accuracy(output[idx_train], labels[idx_train])
    loss_train.backward()
    optimizer.step()

    if not args.fastmode:
        # Evaluate validation set performance separately,
        # deactivates dropout during validation run.
        model.eval()
        output = model(features, adj)

    loss_val = F.nll_loss(output[idx_val], labels[idx_val])
    acc_val = accuracy(output[idx_val], labels[idx_val])
    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.data.item()),
          'acc_train: {:.4f}'.format(acc_train.data.item()),
          'loss_val: {:.4f}'.format(loss_val.data.item()),
          'acc_val: {:.4f}'.format(acc_val.data.item()),
          'time: {:.4f}s'.format(time.time() - t))

    return loss_val.data.item()

In [None]:
def compute_test():
    model.eval()
    output = model(features, adj)
    loss_test = F.nll_loss(output[idx_test], labels[idx_test])
    acc_test = accuracy(output[idx_test], labels[idx_test])
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.data.item()),
          "accuracy= {:.4f}".format(acc_test.data.item()))

In [None]:
# Train model
t_total = time.time()
loss_values = []
bad_counter = 0
best = args.epochs + 1
best_epoch = 0
for epoch in range(args.epochs):
    loss_values.append(train(epoch))

    torch.save(model.state_dict(), '{}.pkl'.format(epoch))
    if loss_values[-1] < best:
        best = loss_values[-1]
        best_epoch = epoch
        bad_counter = 0
    else:
        bad_counter += 1

    if bad_counter == args.patience:
        break

    files = glob.glob('*.pkl')
    for file in files:
        epoch_nb = int(file.split('.')[0])
        if epoch_nb < best_epoch:
            os.remove(file)

files = glob.glob('*.pkl')
for file in files:
    epoch_nb = int(file.split('.')[0])
    if epoch_nb > best_epoch:
        os.remove(file)

print("Optimization Finished!")
print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

# Restore best model
print('Loading {}th epoch'.format(best_epoch))
model.load_state_dict(torch.load('{}.pkl'.format(best_epoch)))

# Testing
compute_test()

## 5.2 https://github.com/psh150204/GAT *** Can be referred for understanding
#### PyTorch implementation of the paper "Graph Attention Networks" (ICLR 2018)

#### Layer

In [6]:
import numpy as np 
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import os 
import time 
import random 
import argparse
import glob
import matplotlib.pyplot as plt
import torch.optim as optim 
from torch.autograd import Variable
import scipy.sparse as sp 

In [7]:
class GraphConvolutionLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super(GraphConvolutionLayer,self).__init__()
        self.W = nn.Parameter(torch.zeros(in_features, out_features, dtype = torch.float32))
        nn.init.xavier_uniform_(self.W) # initialize as described in Glorot & Bengio (2010)
    
    def forward(self, input, adj):
        # input (= X) : a tensor with size [N, F]
        # adj (= A_hat) : a tensor with size [N, N]

        return torch.mm(adj, torch.mm(input, self.W))

class Attention(nn.Module):
    # single head attention
    def __init__(self, in_features, out_features, alpha):
        super(Attention, self).__init__()
        self.alpha = alpha

        self.W = nn.Linear(in_features, out_features, bias = False)
        self.a_T = nn.Linear(2 * out_features, 1, bias = False)

        nn.init.xavier_uniform_(self.W.weight)
        nn.init.xavier_uniform_(self.a_T.weight)

    def forward(self, h, adj):
        # h : a tensor with size [N, F] where N be a number of nodes and F be a number of features
        N = h.size(0)
        Wh = self.W(h) # h -> Wh : [N, F] -> [N, F']
        
        # H1 : [N, N, F'], H2 : [N, N, F'], attn_input = [N, N, 2F']

        # H1 = [[h1 h1 ... h1]   |  H2 = [[h1 h2 ... hN]   |   attn_input = [[h1||h1 h1||h2 ... h1||hN]
        #       [h2 h2 ... h2]   |        [h1 h2 ... hN]   |                 [h2||h1 h2||h2 ... h2||hN]
        #            ...         |             ...         |                         ...
        #       [hN hN ... hN]]  |        [h1 h2 ... hN]]  |                 [hN||h1 hN||h2 ... hN||hN]]
        
        H1 = Wh.unsqueeze(1).repeat(1,N,1)
        H2 = Wh.unsqueeze(0).repeat(N,1,1)
        
        attn_input = torch.cat([H1, H2], dim = -1)

        e = F.leaky_relu(self.a_T(attn_input).squeeze(-1), negative_slope = self.alpha) # [N, N]
        
        attn_mask = -1e18*torch.ones_like(e)
        masked_e = torch.where(adj > 0, e, attn_mask)
        attn_scores = F.softmax(masked_e, dim = -1) # [N, N]

        h_prime = torch.mm(attn_scores, Wh) # [N, F']

        return F.elu(h_prime) # [N, F']

class GraphAttentionLayer(nn.Module):
    # multi head attention
    def __init__(self, in_features, out_features, num_heads, alpha, concat=True):
        super(GraphAttentionLayer, self).__init__()
        
        self.concat = concat
        self.attentions = nn.ModuleList([Attention(in_features, out_features, alpha) for _ in range(num_heads)])
        
    def forward(self, input, adj):
        # input (= X) : a tensor with size [N, F]

        if self.concat :
            # concatenate
            outputs = []
            for attention in self.attentions:
                outputs.append(attention(input, adj))
            
            return torch.cat(outputs, dim = -1) # [N, KF']

        else :
            # average
            output = None
            for attention in self.attentions:
                if output == None:
                    output = attention(input, adj)
                else:
                    output += attention(input, adj)
            
            return output/len(self.attentions) # [N, F']

#### Model

In [8]:
import torch
import torch.nn as nn 
import torch.nn.functional as F 


class GCN(nn.Module):
    def __init__(self, F, H, C, dropout):
        super(GCN, self).__init__()
        self.layer1 = GraphConvolutionLayer(F, H)
        self.layer2 = GraphConvolutionLayer(H, C)
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, x, adj):
        # X : a tensor with size [N, F]
        
        x = self.dropout(F.relu(self.layer1(x, adj))) # [N, H]
        return self.layer2(x, adj) # [N, C]
    
class GAT(nn.Module):
    def __init__(self, F, H, C, dropout, alpha, K):
        super(GAT, self).__init__()
        self.layer1 = GraphAttentionLayer(F, H, K, alpha)
        self.layer2 = GraphAttentionLayer(K * H, C, 1, alpha, concat = False)
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, x, adj):
        # x : a tensor with size [N, F]

        x = self.dropout(F.relu(self.layer1(x, adj))) # [N, KH]
        return self.layer2(x, adj) # [N, C]

#### Load_data

In [9]:
def load_data(dataset="cora"):
    
    print("loading {} dataset ... ". format(dataset))

    path="./data/"+dataset+"/" 

    if dataset == 'cora':
        idx_features_labels = np.genfromtxt("{}{}.content".format(path,dataset), dtype=np.dtype(str))
        features = sp.csr_matrix(idx_features_labels[:,1:-1], dtype=np.float32)
        labels = encode_onehot(idx_features_labels[:,-1])

        idx = np.array(idx_features_labels[:,0],dtype=np.int32)
        idx_map = {j: i for i,j in enumerate(idx)}
        edges_unordered = np.genfromtxt("{}{}.cites".format(path,dataset), dtype=np.int32)
        edges = np.array(list(map(idx_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
        adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:,0], edges[:,1])), shape=(labels.shape[0], labels.shape[0]), dtype=np.float32)

        
        
    elif dataset == 'citeseer':
        idx_features_labels = np.genfromtxt("{}{}.content".format(path,dataset), dtype=np.dtype(str))
        features = sp.csr_matrix(idx_features_labels[:,1:-1], dtype=np.float32)
        labels = encode_onehot(idx_features_labels[:,-1])

        idx = np.array(idx_features_labels[:,0],dtype=np.dtype(str))
        idx_map = {j: i for i,j in enumerate(idx)}
        
        edges_unordered = np.genfromtxt("{}{}.cites".format(path,dataset), dtype=np.dtype(str))
        edges = np.array(list(map(idx_map.get, edges_unordered.flatten())), dtype=np.dtype(str)).reshape(edges_unordered.shape)
        
        adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:,0], edges[:,1])), shape=(labels.shape[0], labels.shape[0]), dtype=np.float32)

    adj = adj + adj.T.multiply(adj.T>adj) - adj.multiply(adj.T>adj)
    features = normalize_features(features)
    adj = normalize_adj(adj+sp.eye(adj.shape[0]))

    idx_train = range(140)
    idx_val = range(200,500)
    idx_test = range(500,1500)

    adj = torch.FloatTensor(np.array(adj.todense()))
    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(np.where(labels)[1])

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    return adj, features, labels, idx_train, idx_val, idx_test 

def accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()

    return correct / len(labels)

def normalize_adj(mx): # A_hat = DAD
    rowsum = np.array(mx.sum(1))
    r_inv_sqrt = np.power(rowsum, -0.5).flatten()
    r_inv_sqrt[np.isinf(r_inv_sqrt)] = 0.
    r_mat_inv_sqrt = sp.diags(r_inv_sqrt)
    mx_to =  mx.dot(r_mat_inv_sqrt).transpose().dot(r_mat_inv_sqrt)
    return mx_to

def normalize_features(mx):
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx_to =  r_mat_inv.dot(mx) 
    return mx_to 

def encode_onehot(labels):
    classes = set(labels)
    classes_dict = {c: np.identity(len(classes))[i,:] for i, c in enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32)
    return labels_onehot

def mkdir_p(mypath):
    '''Creates a directory. equivalent to using mkdir -p on the command line'''

    from errno import EEXIST
    from os import makedirs,path

    try:
        makedirs(mypath)
    except OSError as exc: # Python >2.5
        if exc.errno == EEXIST and path.isdir(mypath):
            pass
        else: raise

In [10]:
def main(args):
    # meta settings
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    device = torch.device('cuda' if(torch.cuda.is_available()) else 'cpu')

    # load the data
    adj, features, labels, idx_train, idx_val, idx_test = load_data(args.dataset)
    features = features.to(device)
    labels = labels.to(device)

    # parameter intialization
    N = features.size(0) # num_of_nodes
    F = features.size(1) # num_of_features
    H = args.hidden # hidden nodes
    C = labels.max().item() + 1 # num_classes
    
    # for validation
    epochs_since_improvement = 0
    best_loss = 10.

    # init training object
    if args.model == 'GCN':
        network = GCN(F, H, C, args.dropout).to(device)

        # pre-processing
        A_tilde = adj + torch.eye(N)
        D_tilde_inv_sqrt = torch.diag(torch.sqrt(torch.sum(A_tilde, dim = 1)) ** -1)
        adj = torch.mm(D_tilde_inv_sqrt, torch.mm(A_tilde, D_tilde_inv_sqrt)).to(device) # A_hat
        
    else:
        network = GAT(F, H, C, args.dropout, args.alpha, args.n_heads).to(device)
        adj = adj.to(device)
    
    optimizer = optim.Adam(network.parameters(), lr = args.lr, weight_decay = args.weight_decay)
    criterion = nn.CrossEntropyLoss()

    train_losses, train_accs, val_losses, val_accs = [], [], [], []

    # Train
    for epoch in range(args.epochs):
        t = time.time()
        network.train()

        preds = network(features, adj) # [N, F]
        train_loss = criterion(preds[idx_train], labels[idx_train])
        train_acc = accuracy(preds[idx_train], labels[idx_train])

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # validation
        with torch.no_grad():
            network.eval()
            preds_val = network(features, adj)
            val_loss = criterion(preds_val[idx_val], labels[idx_val])
            val_acc = accuracy(preds_val[idx_val], labels[idx_val])

            # early stopping
            if val_loss < best_loss :
                best_loss = val_loss
                epochs_since_improvement = 0
            else:
                epochs_since_improvement += 1

        train_losses.append(train_loss.item())
        train_accs.append(train_acc.item())
        val_losses.append(val_loss.item())
        val_accs.append(val_acc.item())

        print('[%d/%d] train loss : %.4f | train acc %.2f%% | val loss %.4f | val acc %.2f%% | time %.3fs'
                    %(epoch+1, args.epochs, train_loss.item(), train_acc.item() * 100, val_loss.item(), val_acc.item() * 100, time.time() - t))

        if epochs_since_improvement > args.patience - 1 :
            print("There's no improvements during %d epochs and so stop the training."%(args.patience))
            break
    
    # Test
    with torch.no_grad():
        network.eval()
        preds = network(features, adj)
        test_acc = accuracy(preds[idx_test], labels[idx_test])
        print('Test Accuracy : %.2f'%(test_acc * 100))



if __name__  == "__main__":
    
    # Training settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--no-cuda', action='store_true', default=False, help='Disables CUDA training.')
    parser.add_argument('--fastmode', action='store_true', default=False, help='Validate during training pass.')
    parser.add_argument('--sparse', action='store_true', default=False, help='GAT with sparse version or not.')
    parser.add_argument('--seed', type=int, default=72, help='Random seed.')
    parser.add_argument('--epochs', type=int, default=10, help='Number of epochs to train.')
    parser.add_argument('--save_every', type=int, default=10, help='Save every n epochs')
    parser.add_argument('--lr', type=float, default=0.005, help='Initial learning rate.')
    parser.add_argument('--weight_decay', type=float, default=5e-4, help='Weight decay (L2 loss on parameters).')
    parser.add_argument('--hidden', type=int, default=64, help='Number of hidden units.')
    parser.add_argument('--n_heads', type=int, default=8, help='Number of head attentions.')
    parser.add_argument('--dropout', type=float, default=0.5, help='Dropout rate (1 - keep probability).')
    parser.add_argument('--alpha', type=float, default=0.2, help='Alpha for the leaky_relu.')
    parser.add_argument('--patience', type=int, default=10, help='patience')
    parser.add_argument('--dataset', type=str, default='cora', choices=['cora','citeseer'], help='Dataset to train.')
    parser.add_argument('--model', type=str, default='GAT', choices=['GCN','GAT'], help='Model to train.')

    args = parser.parse_args([])

    args.cuda = False # not args.no_cuda and torch.cuda.is_available()
    print(args.cuda)

    main(args)

False
loading cora dataset ... 


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.75 GiB (GPU 0; 4.00 GiB total capacity; 1.84 GiB already allocated; 815.20 MiB free; 1.84 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [11]:
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

if args.cuda:
    torch.cuda.manual_seed(args.seed)

device = torch.device('cuda' if(torch.cuda.is_available()) else 'cpu')

# load the data
adj, features, labels, idx_train, idx_val, idx_test = load_data(args.dataset)
features = features.to(device)
labels = labels.to(device)

# parameter intialization
N = features.size(0) # num_of_nodes
F = features.size(1) # num_of_features
H = args.hidden # hidden nodes
C = labels.max().item() + 1 # num_classes

# for validation
epochs_since_improvement = 0
best_loss = 10.

network = GAT(F, H, C, args.dropout, args.alpha, args.n_heads).to(device)
adj = adj.to(device)
    
optimizer = optim.Adam(network.parameters(), lr = args.lr, weight_decay = args.weight_decay)
criterion = nn.CrossEntropyLoss()

train_losses, train_accs, val_losses, val_accs = [], [], [], []



loading cora dataset ... 


In [14]:
features.size(), adj.size()

(torch.Size([2708, 1433]), torch.Size([2708, 2708]))

In [None]:
    # Train
for epoch in range(args.epochs):
    t = time.time()
    network.train()

        preds = network(features, adj) # [N, F]
        train_loss = criterion(preds[idx_train], labels[idx_train])
        train_acc = accuracy(preds[idx_train], labels[idx_train])

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # validation
        with torch.no_grad():
            network.eval()
            preds_val = network(features, adj)
            val_loss = criterion(preds_val[idx_val], labels[idx_val])
            val_acc = accuracy(preds_val[idx_val], labels[idx_val])

            # early stopping
            if val_loss < best_loss :
                best_loss = val_loss
                epochs_since_improvement = 0
            else:
                epochs_since_improvement += 1

        train_losses.append(train_loss.item())
        train_accs.append(train_acc.item())
        val_losses.append(val_loss.item())
        val_accs.append(val_acc.item())

        print('[%d/%d] train loss : %.4f | train acc %.2f%% | val loss %.4f | val acc %.2f%% | time %.3fs'
                    %(epoch+1, args.epochs, train_loss.item(), train_acc.item() * 100, val_loss.item(), val_acc.item() * 100, time.time() - t))

        if epochs_since_improvement > args.patience - 1 :
            print("There's no improvements during %d epochs and so stop the training."%(args.patience))
            break
    
    # Test
    with torch.no_grad():
        network.eval()
        preds = network(features, adj)
        test_acc = accuracy(preds[idx_test], labels[idx_test])
        print('Test Accuracy : %.2f'%(test_acc * 100))



if __name__  == "__main__":
    
    # Training settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--no-cuda', action='store_true', default=False, help='Disables CUDA training.')
    parser.add_argument('--fastmode', action='store_true', default=False, help='Validate during training pass.')
    parser.add_argument('--sparse', action='store_true', default=False, help='GAT with sparse version or not.')
    parser.add_argument('--seed', type=int, default=72, help='Random seed.')
    parser.add_argument('--epochs', type=int, default=10, help='Number of epochs to train.')
    parser.add_argument('--save_every', type=int, default=10, help='Save every n epochs')
    parser.add_argument('--lr', type=float, default=0.005, help='Initial learning rate.')
    parser.add_argument('--weight_decay', type=float, default=5e-4, help='Weight decay (L2 loss on parameters).')
    parser.add_argument('--hidden', type=int, default=64, help='Number of hidden units.')
    parser.add_argument('--n_heads', type=int, default=8, help='Number of head attentions.')
    parser.add_argument('--dropout', type=float, default=0.5, help='Dropout rate (1 - keep probability).')
    parser.add_argument('--alpha', type=float, default=0.2, help='Alpha for the leaky_relu.')
    parser.add_argument('--patience', type=int, default=10, help='patience')
    parser.add_argument('--dataset', type=str, default='cora', choices=['cora','citeseer'], help='Dataset to train.')
    parser.add_argument('--model', type=str, default='GAT', choices=['GCN','GAT'], help='Model to train.')

    args = parser.parse_args([])

    args.cuda = False # not args.no_cuda and torch.cuda.is_available()
    print(args.cuda)

    main(args)

## 5.3 https://github.com/marblet/gat-pytorch
#### This is the pytorch inplementation of Graph Attention Networks.https://arxiv.org/abs/1710.10903

#### Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import networkx as nx
import numpy as np
import pickle as pkl
import scipy.sparse as sp
import sys
import torch

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class GAT(nn.Module):
    def __init__(self, data, nhid=8, nhead=8, nhead_out=1, alpha=0.2, dropout=0.6):
        super(GAT, self).__init__()
        nfeat, nclass = data.num_features, data.num_classes
        
        self.attentions = [GATConv(nfeat, nhid, dropout=dropout, alpha=alpha) for _ in range(nhead)]
        self.out_atts = [GATConv(nhid * nhead, nclass, dropout=dropout, alpha=alpha) for _ in range(nhead_out)]
        
        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)
            
        for i, attention in enumerate(self.out_atts):
            self.add_module('out_att{}'.format(i), attention)
            
        self.reset_parameters()

    def reset_parameters(self):
        for att in self.attentions:
            att.reset_parameters()
        for att in self.out_atts:
            att.reset_parameters()

    def forward(self, data):
        x, edge_list = data.features, data.edge_list
        x = torch.cat([att(x, edge_list) for att in self.attentions], dim=1)
        x = F.elu(x)
        x = torch.sum(torch.stack([att(x, edge_list) for att in self.out_atts]), dim=0) / len(self.out_atts)
        return F.log_softmax(x, dim=1)


class GATConv(nn.Module):
    def __init__(self, in_features, out_features, dropout, alpha, bias=True):
        super(GATConv, self).__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha

        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        self.a = nn.Parameter(torch.zeros(size=(2*out_features, 1)))
        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.weight.data, gain=1.414)
        if self.bias is not None:
            self.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

    def forward(self, x, edge_list):
        x = F.dropout(x, self.dropout, training=self.training)
        h = torch.matmul(x, self.weight)

        source, target = edge_list
        a_input = torch.cat([h[source], h[target]], dim=1)
        e = F.leaky_relu(torch.matmul(a_input, self.a), negative_slope=self.alpha)

        N = h.size(0)
        attention = -1e20*torch.ones([N, N], device=device, requires_grad=True)
        attention[source, target] = e[:, 0]
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h = F.dropout(h, self.dropout, training=self.training)
        h_prime = torch.matmul(attention, h)
        if self.bias is not None:
            h_prime = h_prime + self.bias

        return h_prime
    
    


class SPGAT(nn.Module):
    def __init__(self, data, nhid=8, nhead=8, nhead_out=1, alpha=0.2, dropout=0.6):
        super(SPGAT, self).__init__()
        nfeat, nclass = data.num_features, data.num_classes
        self.attentions = [SPGATConv(nfeat, nhid, dropout=dropout, alpha=alpha) for _ in range(nhead)]
        self.out_atts = [SPGATConv(nhid * nhead, nclass, dropout=dropout, alpha=alpha) for _ in range(nhead_out)]
        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)
        for i, attention in enumerate(self.out_atts):
            self.add_module('out_att{}'.format(i), attention)
        self.reset_parameters()

    def reset_parameters(self):
        for att in self.attentions:
            att.reset_parameters()
        for att in self.out_atts:
            att.reset_parameters()

    def forward(self, data):
        x, edge_list = data.features, data.edge_list
        x = torch.cat([att(x, edge_list) for att in self.attentions], dim=1)
        x = F.elu(x)
        x = torch.sum(torch.stack([att(x, edge_list) for att in self.out_atts]), dim=0) / len(self.out_atts)
        return F.log_softmax(x, dim=1)


def sp_softmax(indices, values, N):
    source, _ = indices
    v_max = values.max()
    exp_v = torch.exp(values - v_max)
    exp_sum = torch.zeros(N, 1, device=device)
    exp_sum.scatter_add_(0, source.unsqueeze(1), exp_v)
    exp_sum += 1e-10
    softmax_v = exp_v / exp_sum[source]
    return softmax_v


def sp_matmul(indices, values, mat):
    source, target = indices
    out = torch.zeros_like(mat)
    out.scatter_add_(0, source.expand(mat.size(1), -1).t(), values * mat[target])
    return out


class SPGATConv(GATConv):
    def __init__(self, in_features, out_features, dropout, alpha, bias=True):
        super(SPGATConv, self).__init__(in_features, out_features, dropout, alpha, bias)
        self.reset_parameters()

    def reset_parameters(self):
        super().reset_parameters()

    def forward(self, x, edge_list):
        x = F.dropout(x, self.dropout, training=self.training)
        h = torch.matmul(x, self.weight)

        source, target = edge_list
        a_input = torch.cat([h[source], h[target]], dim=1)
        e = F.leaky_relu(torch.matmul(a_input, self.a), negative_slope=self.alpha)

        attention = sp_softmax(edge_list, e, h.size(0))
        attention = F.dropout(attention, self.dropout, training=self.training)
        h = F.dropout(h, self.dropout, training=self.training)
        h_prime = sp_matmul(edge_list, attention, h)
        if self.bias is not None:
            h_prime = h_prime + self.bias

        return h_prime

#### Data

In [None]:
class Data(object):
    def __init__(self, adj, edge_list, features, labels, train_mask, val_mask, test_mask):
        self.adj = adj
        self.edge_list = edge_list
        self.features = features
        self.labels = labels
        self.train_mask = train_mask
        self.val_mask = val_mask
        self.test_mask = test_mask
        self.num_features = features.size(1)
        self.num_classes = int(torch.max(labels)) + 1

    def to(self, device):
        self.adj = self.adj.to(device)
        self.edge_list = self.edge_list.to(device)
        self.features = self.features.to(device)
        self.labels = self.labels.to(device)
        self.train_mask = self.train_mask.to(device)
        self.val_mask = self.val_mask.to(device)
        self.test_mask = self.test_mask.to(device)


def load_data(dataset_str, norm_feat=True):
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("data/planetoid/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            if sys.version_info > (3, 0):
                out = pkl.load(f, encoding='latin1')
            else:
                out = objects.append(pkl.load(f))

            if name == 'graph':
                objects.append(out)
            else:
                out = out.todense() if hasattr(out, 'todense') else out
                objects.append(torch.Tensor(out))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx = parse_index_file("data/planetoid/ind.{}.test.index".format(dataset_str))
    train_idx = torch.arange(y.size(0), dtype=torch.long)
    val_idx = torch.arange(y.size(0), y.size(0) + 500, dtype=torch.long)
    sorted_test_idx = np.sort(test_idx)

    if dataset_str == 'citeseer':
        len_test_idx = max(test_idx) - min(test_idx) + 1
        tx_ext = torch.zeros(len_test_idx, tx.size(1))
        tx_ext[sorted_test_idx - min(test_idx), :] = tx
        ty_ext = torch.zeros(len_test_idx, ty.size(1))
        ty_ext[sorted_test_idx - min(test_idx), :] = ty

        tx, ty = tx_ext, ty_ext

    features = torch.cat([allx, tx], dim=0)
    features[test_idx] = features[sorted_test_idx]
    
    if norm_feat:
        features = preprocess_features(features)

    labels = torch.cat([ally, ty], dim=0).max(dim=1)[1]
    labels[test_idx] = labels[sorted_test_idx]

    edge_list = adj_list_from_dict(graph)
    edge_list = add_self_loops(edge_list, features.size(0))
    adj = normalize_adj(edge_list)

    train_mask = index_to_mask(train_idx, labels.shape[0])
    val_mask = index_to_mask(val_idx, labels.shape[0])
    test_mask = index_to_mask(test_idx, labels.shape[0])

    data = Data(adj, edge_list, features, labels, train_mask, val_mask, test_mask)

    return data


def adj_list_from_dict(graph):
    G = nx.from_dict_of_lists(graph)
    coo_adj = nx.to_scipy_sparse_matrix(G).tocoo()
    indices = torch.from_numpy(np.vstack((coo_adj.row, coo_adj.col)).astype(np.int64))
    return indices


def index_to_mask(index, size):
    mask = torch.zeros((size, ), dtype=torch.bool)
    mask[index] = 1
    return mask


def parse_index_file(filename):
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index


def add_self_loops(edge_list, size):
    i = torch.arange(size, dtype=torch.int64).view(1, -1)
    self_loops = torch.cat((i, i), dim=0)
    edge_list = torch.cat((edge_list, self_loops), dim=1)
    return edge_list


def get_degree(edge_list):
    row, col = edge_list
    deg = torch.bincount(row)
    return deg


def normalize_adj(edge_list):
    deg = get_degree(edge_list)
    row, col = edge_list
    deg_inv_sqrt = torch.pow(deg.to(torch.float), -0.5)
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0.0
    weight = torch.ones(edge_list.size(1))
    v = deg_inv_sqrt[row] * weight * deg_inv_sqrt[col]
    norm_adj = torch.sparse.FloatTensor(edge_list, v)
    return norm_adj


def preprocess_features(features):
    rowsum = features.sum(dim=1, keepdim=True)
    rowsum[rowsum == 0] = 1
    features = features / rowsum
    return 

#### Train

In [None]:
import torch
import torch.nn.functional as F
from torch.optim import Adam
from copy import deepcopy
from numpy import mean, std
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class EarlyStopping:
    def __init__(self, patience, verbose, use_loss, use_acc, save_model):
        assert use_loss or use_acc, 'use loss or (and) acc'
        self.patience = patience
        self.use_loss = use_loss
        self.use_acc = use_acc
        self.save_model = save_model
        self.verbose = verbose
        self.counter = 0
        self.best_val_loss = float('inf')
        self.best_val_acc = 0
        self.state_dict = None

    def check(self, evals, model, epoch):
        if self.use_loss and self.use_acc:
            # For GAT, based on https://github.com/PetarV-/GAT/blob/master/execute_cora.py
            if evals['val_loss'] <= self.best_val_loss or evals['val_acc'] >= self.best_val_acc:
                if evals['val_loss'] <= self.best_val_loss and evals['val_acc'] >= self.best_val_acc:
                    if self.save_model:
                        self.state_dict = deepcopy(model.state_dict())
                self.best_val_loss = min(self.best_val_loss, evals['val_loss'])
                self.best_val_acc = max(self.best_val_acc, evals['val_acc'])
                self.counter = 0
            else:
                self.counter += 1
        elif self.use_loss:
            if evals['val_loss'] < self.best_val_loss:
                self.best_val_loss = evals['val_loss']
                self.counter = 0
                if self.save_model:
                    self.state_dict = deepcopy(model.state_dict())
            else:
                self.counter += 1
        elif self.use_acc:
            if evals['val_acc'] > self.best_val_acc:
                self.best_val_acc = evals['val_acc']
                self.counter = 0
                if self.save_model:
                    self.state_dict = deepcopy(model.state_dict())
            else:
                self.counter += 1
        stop = False
        if self.counter >= self.patience:
            stop = True
            if self.verbose:
                print("Stop training, epoch:", epoch)
            if self.save_model:
                model.load_state_dict(self.state_dict)
        return stop


def train(model, optimizer, data):
    model.train()
    optimizer.zero_grad()
    output = model(data)
    loss = F.nll_loss(output[data.train_mask], data.labels[data.train_mask])
    loss.backward()
    optimizer.step()


def evaluate(model, data):
    model.eval()

    with torch.no_grad():
        output = model(data)

    outputs = {}
    for key in ['train', 'val', 'test']:
        if key == 'train':
            mask = data.train_mask
        elif key == 'val':
            mask = data.val_mask
        else:
            mask = data.test_mask
        loss = F.nll_loss(output[mask], data.labels[mask]).item()
        pred = output[mask].max(dim=1)[1]
        acc = pred.eq(data.labels[mask]).sum().item() / mask.sum().item()

        outputs['{}_loss'.format(key)] = loss
        outputs['{}_acc'.format(key)] = acc

    return outputs


def run(data, model, lr, weight_decay, epochs=100000, niter=100, early_stopping=True, patience=100,
        use_loss=True, use_acc=True, save_model=True, verbose=False):
    # for GPU
    data.to(device)

    val_acc_list = []
    test_acc_list = []

    for _ in tqdm(range(niter)):
        optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        model.to(device).reset_parameters()
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        # for early stopping
        if early_stopping:
            stop_checker = EarlyStopping(patience, verbose, use_loss, use_acc, save_model)

        for epoch in range(1, epochs + 1):
            train(model, optimizer, data)
            evals = evaluate(model, data)

            if verbose:
                print('epoch: {: 4d}'.format(epoch),
                      'train loss: {:.5f}'.format(evals['train_loss']),
                      'train acc: {:.5f}'.format(evals['train_acc']),
                      'val loss: {:.5f}'.format(evals['val_loss']),
                      'val acc: {:.5f}'.format(evals['val_acc']))

            if early_stopping:
                if stop_checker.check(evals, model, epoch):
                    break

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        evals = evaluate(model, data)
        if verbose:
            for met, val in evals.items():
                print(met, val)

        val_acc_list.append(evals['val_acc'])
        test_acc_list.append(evals['test_acc'])

    print("mean", mean(test_acc_list))
    print("std", std(test_acc_list))
    return {
        'val_acc': mean(val_acc_list),
        'test_acc': mean(test_acc_list),
        'test_acc_std': std(test_acc_list)
    }

#### main

## 6. https://github.com/QData/LaMP
#### arxiv.org/pdf/1904.08049.pdf 
(NEURAL MESSAGE PASSING FOR MULTI-LABEL CLASSIFICATION)