# Download the Cora data

In [None]:
# Fetch the Cora archive and unpack it; this creates a ./cora/ directory
# holding cora.content (node features + labels) and cora.cites (edges).
! wget https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz
! tar -zxvf cora.tgz

--2023-04-11 01:31:57--  https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz
Resolving linqs-data.soe.ucsc.edu (linqs-data.soe.ucsc.edu)... 128.114.47.74
Connecting to linqs-data.soe.ucsc.edu (linqs-data.soe.ucsc.edu)|128.114.47.74|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 168052 (164K) [application/x-gzip]
Saving to: ‘cora.tgz’


2023-04-11 01:31:58 (1004 KB/s) - ‘cora.tgz’ saved [168052/168052]

cora/
cora/README
cora/cora.cites
cora/cora.content


# import modules and set random seed

In [None]:
import numpy as np
import scipy.sparse as sp
import torch
import pandas as pd
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time

# One seed shared by every random number generator used in this notebook.
seed = 0

# Seed NumPy and PyTorch (CPU plus every CUDA device).
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Loading and preprocessing the data

In [None]:
def encode_onehot(labels):
    """One-hot encode a sequence of class labels.

    Args:
        labels: sequence (list or 1-D array) of hashable, mutually sortable
            class labels, one per sample.

    Returns:
        ``np.ndarray`` of dtype ``int32`` and shape ``[len(labels), n_classes]``
        where row ``k`` has a single 1 in the column of ``labels[k]``'s class.
    """
    # The classes must be sorted before encoding to enable static class encoding.
    # In other words, make sure the first class always maps to index 0.
    classes = sorted(set(labels))
    class_index = {c: i for i, c in enumerate(classes)}

    # Build the identity matrix once (not once per class) and pick rows by index.
    identity = np.identity(len(classes), dtype=np.int32)
    row_ids = np.fromiter((class_index[label] for label in labels), dtype=np.intp)
    return identity[row_ids]


def load_data(path="/content/cora/", dataset="cora", training_samples=140):
    """Load a citation-network dataset (cora only for now) and split it randomly.

    Reads ``<path>/<dataset>.content`` (one row per node: id, feature columns,
    class label) and ``<path>/<dataset>.cites`` (one row per edge: two node ids).

    Args:
        path: directory containing the dataset files (a trailing slash is optional).
        dataset: file-name stem of the ``.content`` / ``.cites`` files.
        training_samples: number of randomly chosen nodes used for training;
            all remaining nodes form the validation set.

    Returns:
        Tuple ``(adj, features, labels, idx_train, idx_val)``:
        * ``adj``: dense float tensor ``[n_nodes, n_nodes]``, the symmetrised
          adjacency with self-loops after ``normalize_adj``.
        * ``features``: dense float tensor ``[n_nodes, n_features]``, row-normalised.
        * ``labels``: long tensor ``[n_nodes]`` of class indices.
        * ``idx_train`` / ``idx_val``: long tensors of node indices.
    """
    # Local import keeps this cell self-contained; os is only needed for path joining.
    import os

    print('Loading {} dataset...'.format(dataset))

    # Join with os.path.join so `path` works with or without a trailing separator.
    content_file = os.path.join(path, "{}.content".format(dataset))
    cites_file = os.path.join(path, "{}.cites".format(dataset))

    idx_features_labels = np.genfromtxt(content_file, dtype=np.dtype(str))
    # Column 0 is the node id, the last column the label, everything between is features.
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
    labels = encode_onehot(idx_features_labels[:, -1])

    # build graph: map the raw node ids onto contiguous row indices 0..n_nodes-1
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt(cites_file, dtype=np.int32)
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)

    # build symmetric adjacency matrix: wherever adj.T > adj take the adj.T
    # entry, otherwise keep adj (i.e. the element-wise maximum of adj and adj.T)
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    features = normalize(features)
    # Add self-loops before normalising the adjacency.
    adj = adj + sp.eye(adj.shape[0])
    adj = normalize_adj(adj)

    # Random permutation of the node indices (drawn from torch's global RNG)
    idx_rand = torch.randperm(len(labels))
    # Nodes for training
    idx_train = idx_rand[:training_samples]
    # Nodes for validation
    idx_val = idx_rand[training_samples:]

    adj = torch.FloatTensor(np.array(adj.todense()))
    features = torch.FloatTensor(np.array(features.todense()))
    # Convert the one-hot rows back to class indices.
    labels = torch.LongTensor(np.where(labels)[1])

    # idx_train / idx_val are already int64 tensors from randperm, so no
    # further conversion is needed.
    return adj, features, labels, idx_train, idx_val

def normalize_adj(mx):
    """Symmetric normalization of a sparse matrix by its row sums.

    With D = diag(row sums of ``mx``) this returns (mx · D^-1/2)^T · D^-1/2,
    which equals D^-1/2 · mx · D^-1/2 when ``mx`` is symmetric. Rows whose
    sum is zero get a scale factor of 0 instead of infinity.
    """
    degree = np.array(mx.sum(1))
    inv_sqrt_degree = np.power(degree, -0.5).flatten()
    # A zero row sum produces inf; zero it out so the row/column stays empty.
    inv_sqrt_degree[np.isinf(inv_sqrt_degree)] = 0.
    scaling = sp.diags(inv_sqrt_degree)
    right_scaled = mx @ scaling
    return right_scaled.T @ scaling

def normalize(mx):
    """Row-normalize sparse matrix

    Every row is divided by its sum so that non-empty rows sum to 1; rows
    whose sum is zero are left as all zeros.

    Args:
        mx: scipy sparse matrix (integer or floating point).

    Returns:
        Sparse matrix of the same shape with rescaled rows.
    """
    rowsum = np.array(mx.sum(1))
    # np.power refuses negative integer exponents on integer arrays, so
    # integer-valued matrices (e.g. raw counts) are promoted to float first.
    # Floating-point inputs keep their original precision.
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(np.float64)
    # Empty rows yield 1/0 = inf; silence the warning and reset them to 0 below.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx


def accuracy(output, labels):
    """Fraction of rows in `output` whose arg-max column equals the label.

    `output` holds one row of class scores per sample and `labels` the
    matching class indices; the result is a 0-dim double tensor.
    """
    _, predicted = output.max(1)
    predicted = predicted.type_as(labels)
    n_correct = predicted.eq(labels).double().sum()
    return n_correct / len(labels)

## check the data

In [None]:
# Load once with the defaults (files expected under /content/cora/, 140 training nodes)
# so the returned tensors can be inspected in the cells below.
adj, features, labels, idx_train, idx_val = load_data()

Loading cora dataset...


In [None]:
# Dense normalized adjacency matrix (self-loops included) and its shape.
print(adj)
print(adj.shape)

tensor([[0.1667, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.2000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.2000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.2500]])
torch.Size([2708, 2708])


In [None]:
# Row-normalized node feature matrix and its shape.
print(features)
print(features.shape)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([2708, 1433])


In [None]:
# Class index per node, the distinct classes present, and the node count.
print(labels)
print(labels.unique())
print(len(labels))

tensor([2, 5, 4,  ..., 1, 0, 2])
tensor([0, 1, 2, 3, 4, 5, 6])
2708


In [None]:
# Sizes of the random training / validation split.
print(len(idx_train))
print(len(idx_val))

140
2568


# Vanilla GCN for node classification








## Define Graph Convolution layer (Your Task)

This module takes $\mathbf{h} = \{ \overrightarrow{h_1}, \overrightarrow{h_2}, \dots, \overrightarrow{h_N} \}$ where $\overrightarrow{h_i} \in \mathbb{R}^F$ as input and outputs $\mathbf{h'} = \{ \overrightarrow{h'_1}, \overrightarrow{h'_2}, \dots, \overrightarrow{h'_N} \}$, where $\overrightarrow{h'_i} \in \mathbb{R}^{F'}$.
1.   perform initial transformation: $\mathbf{s} = \mathbf{W} \times \mathbf{h} ^{(l)}$
2.   multiply $\mathbf{s}$ by normalized adjacency matrix: $\mathbf{h'} = \mathbf{A} \times \mathbf{s}$

In [None]:
class GraphConvolution(nn.Module):
    """
    A Graph Convolution Layer (GCN).

    Applies a linear map to every node's features and then mixes the results
    across nodes with the adjacency matrix: ``h' = adj @ linear(h)``.
    """

    def __init__(self, in_features, out_features, bias=True):
        """
        * `in_features`, $F$, is the number of input features per node
        * `out_features`, $F'$, is the number of output features per node
        * `bias`, whether to include the bias term in the linear layer. Default=True
        """
        super().__init__()
        # Weight W mapping the input features (dim F) to the output features (dim F').
        self.linear = nn.Linear(in_features, out_features, bias=bias)

    def forward(self, input, adj):
        """
        * `input`, node features, one row per node
        * `adj`, adjacency matrix used to aggregate over neighbouring nodes

        Returns the aggregated node features, one row per node.
        """
        # Step 1: per-node linear transformation s = W h.
        transformed = self.linear(input)
        # Step 2: aggregate over neighbours, h' = A s.
        return torch.spmm(adj, transformed)


## Define GCN (Your Task)

You will implement a two-layer GCN with a ReLU activation function and dropout after the first graph-convolution layer.

In [None]:
class GCN(nn.Module):
    '''
    A two-layer GCN: graph convolution -> ReLU -> dropout -> graph convolution.
    '''
    def __init__(self, nfeat, n_hidden, n_classes, dropout, bias=True):
        """
        * `nfeat`, is the number of input features per node of the first layer
        * `n_hidden`, number of hidden units
        * `n_classes`, total number of classes for classification
        * `dropout`, the dropout ratio
        * `bias`, whether to include the bias term in the linear layer. Default=True
        """
        super().__init__()
        # Two graph-convolution layers: features -> hidden -> class logits.
        self.gcn1 = GraphConvolution(nfeat, n_hidden, bias)
        self.gcn2 = GraphConvolution(n_hidden, n_classes, bias)
        # Regularisation and non-linearity applied between the two layers.
        self.dropout = nn.Dropout(p=dropout)
        self.relu = nn.ReLU()

    def forward(self, x, adj):
        """
        * `x`, node features, one row per node
        * `adj`, adjacency matrix shared by both layers

        Returns the raw class logits (no activation on the last layer).
        """
        hidden = self.dropout(self.relu(self.gcn1(x, adj)))
        return self.gcn2(hidden, adj)
        ###############################################################

## define loss function

In [None]:
# Cross-entropy on raw logits and integer class labels; used by train() and test().
criterion = nn.CrossEntropyLoss()

## training loop

In [None]:
# Hyper-parameters for the vanilla GCN run.
args = {"training_samples": 140,  # nodes used for training; the rest are validation
        "epochs": 100,  # number of full-batch optimisation steps
        "lr": 0.01,  # Adam learning rate
        "weight_decay": 5e-4,  # weight decay passed to Adam
        "hidden": 16,  # hidden units of the first graph-convolution layer
        "dropout": 0.5,  # dropout probability
        "bias": True,  # bias flag intended for the linear layers
        }


In [None]:
def train(epoch):
    """Run one full-batch training step and print train/validation metrics.

    Uses the notebook-level globals `model`, `optimizer`, `criterion`,
    `features`, `adj`, `labels`, `idx_train` and `idx_val`.

    Args:
        epoch: zero-based epoch index (printed as ``epoch + 1``).
    """
    t = time.time()

    # --- optimisation step on the training nodes (dropout active) ---
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    loss_train = criterion(output[idx_train], labels[idx_train])
    acc_train = accuracy(output[idx_train], labels[idx_train])
    loss_train.backward()
    optimizer.step()

    # --- validation pass (dropout disabled) ---
    # No gradients are needed here; skipping autograd avoids keeping the
    # activations of a second full-graph forward pass in memory.
    model.eval()
    with torch.no_grad():
        output = model(features, adj)
        loss_val = criterion(output[idx_val], labels[idx_val])
        acc_val = accuracy(output[idx_val], labels[idx_val])

    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'acc_train: {:.4f}'.format(acc_train.item()),
          'loss_val: {:.4f}'.format(loss_val.item()),
          'acc_val: {:.4f}'.format(acc_val.item()),
          'time: {:.4f}s'.format(time.time() - t))


def test():
    """Evaluate the current global `model` and print loss and accuracy.

    Uses the notebook-level globals `model`, `criterion`, `features`, `adj`,
    `labels` and `idx_val`. Note that the evaluation is done on `idx_val`,
    i.e. the same nodes that train() reports as validation metrics.
    """
    model.eval()
    # Pure inference: disable autograd so no computation graph is built.
    with torch.no_grad():
        output = model(features, adj)
        loss_test = criterion(output[idx_val], labels[idx_val])
        acc_test = accuracy(output[idx_val], labels[idx_val])
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test.item()))




In [None]:
# Build the GCN from the hyper-parameters in `args`; the feature and class
# counts come from the tensors loaded in the data-check section above.
model = GCN(nfeat=features.shape[1],
            n_hidden=args["hidden"],
            n_classes=labels.max().item() + 1,
            dropout=args["dropout"],
            bias=args["bias"]).to(device)  # forward the bias flag so args["bias"] takes effect
optimizer = optim.Adam(model.parameters(),
                       lr=args["lr"], weight_decay=args["weight_decay"])


# Reload the data with the configured number of training nodes (this draws a
# new random split) and move every tensor to the same device as the model.
adj, features, labels, idx_train, idx_val = load_data(training_samples=args["training_samples"])
adj, features, labels, idx_train, idx_val = adj.to(device), features.to(device), labels.to(device), idx_train.to(device), idx_val.to(device)

Loading cora dataset...


## training Vanilla GCN

In [None]:
# Train model
# Run the configured number of full-batch epochs; train() prints the
# per-epoch training and validation metrics.
t_total = time.time()
for epoch in range(args["epochs"]):
    train(epoch)
print("Optimization Finished!")
print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

# evaluating
test()

Epoch: 0001 loss_train: 1.9387 acc_train: 0.1143 loss_val: 1.9272 acc_val: 0.1597 time: 2.8248s
Epoch: 0002 loss_train: 1.9319 acc_train: 0.1143 loss_val: 1.9223 acc_val: 0.1597 time: 0.0033s
Epoch: 0003 loss_train: 1.9245 acc_train: 0.1143 loss_val: 1.9165 acc_val: 0.1663 time: 0.0033s
Epoch: 0004 loss_train: 1.9154 acc_train: 0.1786 loss_val: 1.9103 acc_val: 0.2134 time: 0.0032s
Epoch: 0005 loss_train: 1.9070 acc_train: 0.2000 loss_val: 1.9037 acc_val: 0.1340 time: 0.0033s
Epoch: 0006 loss_train: 1.8966 acc_train: 0.2071 loss_val: 1.8968 acc_val: 0.1464 time: 0.0033s
Epoch: 0007 loss_train: 1.8865 acc_train: 0.2429 loss_val: 1.8893 acc_val: 0.3470 time: 0.0033s
Epoch: 0008 loss_train: 1.8768 acc_train: 0.4000 loss_val: 1.8815 acc_val: 0.3520 time: 0.0033s
Epoch: 0009 loss_train: 1.8699 acc_train: 0.3857 loss_val: 1.8735 acc_val: 0.3104 time: 0.0033s
Epoch: 0010 loss_train: 1.8577 acc_train: 0.3786 loss_val: 1.8654 acc_val: 0.3018 time: 0.0047s
Epoch: 0011 loss_train: 1.8397 acc_train

# Graph Attention Networks

## Graph attention layer (Your task)
A GAT is made up of multiple such layers. In this section, you will implement a single graph attention layer. Similar to the `GraphConvolution()`, this `GraphAttentionLayer()` module takes $\mathbf{h} = \{ \overrightarrow{h_1}, \overrightarrow{h_2}, \dots, \overrightarrow{h_N} \}$ where $\overrightarrow{h_i} \in \mathbb{R}^F$ as input and outputs $\mathbf{h'} = \{ \overrightarrow{h'_1}, \overrightarrow{h'_2}, \dots, \overrightarrow{h'_N} \}$, where $\overrightarrow{h'_i} \in \mathbb{R}^{F'}$. However, instead of weighing each neighbouring node based on the adjacency matrix, we will use self-attention to learn the relative importance of each neighbouring node. Recall that in HW4 you were asked to write out the equation for single-headed attention; here we will implement multi-headed attention, which involves the following steps: 


### The initial transformation
In the GCN above, you completed a similar transformation. But here, we need to define a weight matrix and perform this transformation for each head: $\overrightarrow{s^k_i} = \mathbf{W}^k \overrightarrow{h_i}$. We will perform a single linear transformation and then split it up for each head later. Note the input $\overrightarrow{h}$ has shape `[n_nodes, in_features]` and $\overrightarrow{s}$ has shape `[n_nodes, n_heads * n_hidden]`. Remember to reshape $\overrightarrow{s}$ to shape `[n_nodes, n_heads, n_hidden]` for later use. Note: set `bias=False` for this linear transformation. 

### attention score
We calculate these for each head $k$. For simplicity of notation, we omit $k$ in the following equations. The attention scores are defined as follows: 
$$e_{ij} = a(\mathbf{W} \overrightarrow{h_i}, \mathbf{W} \overrightarrow{h_j}) =a(\overrightarrow{s_i}, \overrightarrow{s_j})$$, 
where $e_{ij}$ is the attention score (importance) of node $j$ to node $i$.
We will have to calculate this for each head. $a$ is the attention mechanism, that calculates the attention score. The paper concatenates $\overrightarrow{s_i}$, $\overrightarrow{s_j}$ and does a linear transformation with a weight vector $\mathbf{a} \in \mathbb{R}^{2 F'}$ followed by a $\text{LeakyReLU}$. $$e_{ij} = \text{LeakyReLU} \Big(
\mathbf{a}^\top \Big[ \overrightarrow{s_i} \Vert \overrightarrow{s_j}  \Big] \Big)$$

#### How to vectorize this? Some hints: 
1. `tensor.repeat()` gives you $\{\overrightarrow{s_1}, \overrightarrow{s_2}, \dots, \overrightarrow{s_N}, \overrightarrow{s_1}, \overrightarrow{s_2}, \dots, \overrightarrow{s_N}, ...\}$.

2. `tensor.repeat_interleave()` gives you
$\{\overrightarrow{s_1}, \overrightarrow{s_1}, \dots, \overrightarrow{s_1}, \overrightarrow{s_2}, \overrightarrow{s_2}, \dots, \overrightarrow{s_2}, ...\}$.

3. concatenate to get $\Big[\overrightarrow{s_i} \Vert \overrightarrow{s_j} \Big]$ for all pairs of $i, j$. Reshape $\overrightarrow{s_i} \Vert \overrightarrow{s_j}$ to shape `[n_nodes, n_nodes, n_heads, 2 * n_hidden]`

4. apply the attention layer and non-linear activation function to get $e_{ij} = \text{LeakyReLU} \Big( \mathbf{a}^\top \Big[ \overrightarrow{s_i} \Vert \overrightarrow{s_j}  \Big] \Big)$, where $\mathbf{a}^\top$ is a single linear transformation that maps from dimension `n_hidden * 2` to `1`. Note: set the `bias=False` for this linear transformation. $\mathbf{e}$ is of shape `[n_nodes, n_nodes, n_heads, 1]`. Remove the last dimension `1` using `squeeze()`. 


#### Perform softmax 
First, we need to mask $e_{ij}$ based on adjacency matrix. We only need to sum over the neighbouring nodes for the attention calculation. Set the elements in $e_{ij}$ to $- \infty$ if there is no edge from $i$ to $j$ for the softmax calculation. We need to do this for all heads and the adjacency matrix is the same for each head. Use `tensor.masked_fill()` to mask $e_{ij}$ based on adjacency matrix for all heads. Hint: reshape the adjacency matrix to `[n_nodes, n_nodes, 1]` using `unsqueeze()`. 
Now we are ready to normalize attention scores (or coefficients) $$\alpha_{ij} = \text{softmax}_j(e_{ij}) =  \frac{\exp(e_{ij})}{\sum_{k \in \mathcal{N}_i} \exp(e_{ik})}$$

#### Apply dropout
Apply the dropout layer. (this step is easy)

#### Calculate final output for each head
$$\overrightarrow{h'^k_i} = \sum_{j \in \mathcal{N}_i} \alpha^k_{ij} \overrightarrow{s^k_j}$$


#### Concat or Mean
Finally we concatenate the transformed features: $\overrightarrow{h'_i} = \Bigg\Vert_{k=1}^{K} \overrightarrow{h'^k_i}$. In the code, we only need to reshape the tensor to shape `[n_nodes, n_heads * n_hidden]`. Note that if it is the final layer, then it doesn't make sense to do concatenation anymore. Instead, we average over the `n_heads` dimension: $\overrightarrow{h'_i} = \frac{1}{K} \sum_{k=1}^{K} \overrightarrow{h'^k_i}$. 

In [None]:
class GraphAttentionLayer(nn.Module):
    """Multi-head graph attention layer operating on a dense adjacency matrix.

    For every head, node features are linearly transformed (s = W h), pairwise
    attention scores e_ij = LeakyReLU(a^T [s_i || s_j]) are computed, masked to
    existing edges, normalised with a softmax over j, passed through dropout
    and used to form a weighted sum of the neighbours' transformed features.
    The heads are then concatenated or averaged.
    """

    def __init__(self, in_features: int, out_features: int, n_heads: int,
                 is_concat: bool = True,
                 dropout: float = 0.6,
                 alpha: float = 0.2):
        """
        in_features: F, the number of input features per node
        out_features: F', the number of output features per node
        n_heads: K, the number of attention heads
        is_concat: whether the multi-head results should be concatenated or averaged
        dropout: the dropout probability
        alpha: the negative slope for leaky relu activation
        """
        super(GraphAttentionLayer, self).__init__()

        self.is_concat = is_concat
        self.n_heads = n_heads

        if is_concat:
            # Concatenated heads share the output width equally.
            assert out_features % n_heads == 0, \
                "out_features must be divisible by n_heads when is_concat=True"
            self.n_hidden = out_features // n_heads
        else:
            # Averaged heads each produce the full output width.
            self.n_hidden = out_features

        # One linear layer produces the features of all heads at once (no
        # per-head loop). Its width must be n_heads * n_hidden, which equals
        # out_features only when concatenating; sizing it as out_features
        # would break the reshape in forward() for averaged multi-head layers.
        self.W = nn.Linear(in_features, self.n_heads * self.n_hidden, bias=False)
        # Attention vector a, mapping [s_i || s_j] (2 * n_hidden) to a scalar score.
        self.attention = nn.Linear(self.n_hidden * 2, 1, bias=False)
        # LeakyReLU with negative_slope=alpha.
        self.activation = nn.LeakyReLU(alpha)
        # Scores are laid out as [i, j, head]; normalise over the neighbours j.
        self.softmax = nn.Softmax(dim=1)
        self.dropout_layer = nn.Dropout(dropout)

    def forward(self, h: torch.Tensor, adj_mat: torch.Tensor):
        """
        h: node features of shape [n_nodes, in_features]
        adj_mat: adjacency matrix of shape [n_nodes, n_nodes]; an entry equal
            to 0 means "no edge from i to j". Every node needs at least one
            non-zero entry in its row (e.g. a self-loop), otherwise its
            softmax runs over -inf only and yields NaN.

        Returns [n_nodes, n_heads * n_hidden] when concatenating, otherwise
        [n_nodes, n_hidden].
        """
        # Number of nodes
        n_nodes = h.shape[0]

        # (1) s = W h, split per head: [n_nodes, n_heads, n_hidden]
        s = self.W(h).view(n_nodes, self.n_heads, self.n_hidden)

        # (2) build [s_i || s_j] for all pairs (i, j):
        #     repeat_interleave -> s_1, s_1, ..., s_2, s_2, ...   (the "i" part)
        #     repeat            -> s_1, s_2, ..., s_1, s_2, ...   (the "j" part)
        s_i = s.repeat_interleave(n_nodes, dim=0)
        s_j = s.repeat(n_nodes, 1, 1)
        s_ij = torch.cat([s_i, s_j], dim=-1)
        s_ij = s_ij.view(n_nodes, n_nodes, self.n_heads, 2 * self.n_hidden)

        # (3)-(5) attention scores e: [n_nodes, n_nodes, n_heads]
        e = self.activation(self.attention(s_ij)).squeeze(dim=-1)

        # (6) keep only existing edges; the same mask is broadcast over all heads
        edge_mask = adj_mat.unsqueeze(dim=-1)
        e = e.masked_fill(edge_mask == 0, float('-inf'))

        # (7)-(8) normalise over neighbours j, then apply dropout to the coefficients
        a = self.softmax(e)
        a = self.dropout_layer(a)

        # Summation over neighbours: h'_i = sum_j a_ij * s_j, per head
        h_prime = torch.einsum('ijh,jhf->ihf', a, s)  # [n_nodes, n_heads, n_hidden]

        # Concatenate the heads
        if self.is_concat:
            return h_prime.reshape(n_nodes, self.n_heads * self.n_hidden)
        # Take the mean of the heads (for the last layer)
        return h_prime.mean(dim=1)






## Define GAT network
This is very similar to how we defined the GCN. Following the paper, we use two attention layers and the ELU() activation function. 

In [None]:
class GAT(nn.Module):
    """Two-layer graph attention network.

    dropout -> multi-head attention (concatenated) -> ELU -> dropout ->
    single-head attention producing the class logits.
    """

    def __init__(self, nfeat: int, n_hidden: int, n_classes: int, n_heads: int, dropout: float, alpha: float):
        """
        nfeat: the number of features per node
        n_hidden: the number of features in the first graph attention layer
        n_classes: the number of classes
        n_heads: the number of heads in the first graph attention layer
        dropout: the dropout probability
        alpha: the negative input slope for leaky ReLU of the attention layer
        """
        super().__init__()

        # Settings shared by both attention layers.
        shared = dict(dropout=dropout, alpha=alpha)

        # First graph attention layer where we concatenate the heads
        self.gc1 = GraphAttentionLayer(nfeat, n_hidden, n_heads, is_concat=True, **shared)
        # Output layer: a single head, averaged rather than concatenated
        self.gc2 = GraphAttentionLayer(n_hidden, n_classes, 1, is_concat=False, **shared)
        self.activation = nn.ELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, adj_mat: torch.Tensor):
        """
        x: the features vectors
        adj_mat: the adjacency matrix
        """
        hidden = self.gc1(self.dropout(x), adj_mat)
        hidden = self.dropout(self.activation(hidden))
        return self.gc2(hidden, adj_mat)

## training GAT

In [None]:
# Hyper-parameters for the GAT run (replaces the GCN `args` defined earlier).
args = {"training_samples": 140,  # nodes used for training; the rest are validation
        "epochs": 100,  # number of full-batch optimisation steps
        "lr": 0.01,  # Adam learning rate
        "weight_decay": 5e-4,  # weight decay passed to Adam
        "hidden": 16,  # total width of the first attention layer (split across the heads)
        "dropout": 0.5,  # dropout probability (features and attention coefficients)
        "bias": True,  # not consumed by GAT(...); its linear layers are created with bias=False
        "alpha": 0.2,  # negative slope of the LeakyReLU in the attention score
        "n_heads": 8  # attention heads in the first layer
        }

In [None]:
# Rebind the global `model` / `optimizer` to a GAT so that train() and test(),
# which read these globals, now operate on the attention network.
model = GAT(nfeat=features.shape[1],
            n_hidden=args["hidden"],
            n_classes=labels.max().item() + 1,
            dropout=args["dropout"],
            alpha=args["alpha"],
            n_heads=args["n_heads"]).to(device)
optimizer = optim.Adam(model.parameters(),
                       lr=args["lr"], weight_decay=args["weight_decay"])
# Reuses the data tensors (and the split) loaded for the GCN run; this only
# makes sure they live on `device`.
adj, features, labels, idx_train, idx_val = adj.to(device), features.to(device), labels.to(device), idx_train.to(device), idx_val.to(device)

In [None]:
# Train model
# Same loop as for the GCN; train() and test() pick up the GAT through the
# global `model` / `optimizer`.
t_total = time.time()
for epoch in range(args["epochs"]):
    train(epoch)
print("Optimization Finished!")
print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

# Testing
test()

Epoch: 0001 loss_train: 1.9460 acc_train: 0.1857 loss_val: 1.9427 acc_val: 0.3563 time: 0.2649s
Epoch: 0002 loss_train: 1.9426 acc_train: 0.3429 loss_val: 1.9395 acc_val: 0.4062 time: 0.2266s
Epoch: 0003 loss_train: 1.9359 acc_train: 0.4929 loss_val: 1.9361 acc_val: 0.4023 time: 0.2243s
Epoch: 0004 loss_train: 1.9322 acc_train: 0.4714 loss_val: 1.9322 acc_val: 0.3988 time: 0.2239s
Epoch: 0005 loss_train: 1.9273 acc_train: 0.4643 loss_val: 1.9280 acc_val: 0.3941 time: 0.2241s
Epoch: 0006 loss_train: 1.9193 acc_train: 0.4429 loss_val: 1.9235 acc_val: 0.3937 time: 0.2271s
Epoch: 0007 loss_train: 1.9184 acc_train: 0.5000 loss_val: 1.9187 acc_val: 0.3882 time: 0.2246s
Epoch: 0008 loss_train: 1.9067 acc_train: 0.4571 loss_val: 1.9134 acc_val: 0.3820 time: 0.2248s
Epoch: 0009 loss_train: 1.8983 acc_train: 0.4214 loss_val: 1.9079 acc_val: 0.3769 time: 0.2241s
Epoch: 0010 loss_train: 1.8903 acc_train: 0.4143 loss_val: 1.9020 acc_val: 0.3758 time: 0.2247s
Epoch: 0011 loss_train: 1.8841 acc_train

# Question: (Your task)
Compare the evaluation results for Vanilla GCN and GAT. Comment on the discrepancy in their performance (if any) and briefly explain why you think it's the case (in 1-2 sentences). 