In [1]:
from __future__ import absolute_import, print_function, division, unicode_literals

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import math


from tensorflow.python.ops import control_flow_util
# Set the ENABLE_CONTROL_FLOW_V2 flag on the control_flow_util module imported above.
control_flow_util.ENABLE_CONTROL_FLOW_V2 = True

# Report the installed TensorFlow version and whether eager execution is active.
print(tf.__version__)
print(tf.executing_eagerly())

2.2.0
True


In [2]:
# ---- Experiment configuration ------------------------------------------------
# Capitalised names are the user-facing knobs; the lower-case aliases below are
# the names the rest of the notebook actually reads.
Dataset = 'cora'
Sparse = False            # True selects sp_attn_head and a sparse adjacency tensor
Batch_Size = 1
Epochs = 100000
Patience = 100            # epochs without validation improvement before early stop
Learning_Rate = 0.005
Weight_Decay = 0.0005     # coefficient applied to the summed L2 penalty
ffd_drop = 0.6            # dropout rate passed to the attention heads as in_drop
attn_drop = 0.6           # dropout rate passed to the attention heads as coef_drop
Residual = False

dataset = Dataset

# training params
batch_size = Batch_Size
nb_epochs = Epochs
patience = Patience
lr = Learning_Rate
l2_coef = Weight_Decay
residual = Residual


hid_units = [8] # numbers of hidden units per each attention head in each layer
n_heads = [8, 1] # additional entry for the output layer

nonlinearity = tf.nn.elu
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate = lr)

In [9]:
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys

def adj_to_bias(adj, sizes, nhood=1):
    """Convert a batch of adjacency matrices into additive attention biases.

    Prepare adjacency matrix by expanding up to a given neighbourhood.
    This will insert loops on every node.
    Finally, the matrix is converted to bias vectors.

    Args:
        adj: array-like indexed as [graph, nodes, nodes].
        sizes: per-graph node counts; only the leading ``sizes[g] x sizes[g]``
            corner of graph ``g`` is clamped to {0, 1}.
        nhood: number of times ``(adj + I)`` is multiplied in, i.e. the
            neighbourhood radius.

    Returns:
        ``-1e9 * (1 - reach)`` with the same shape as ``adj``: 0 where the
        clamped reachability entry is 1 and -1e9 where it is 0.
    """
    nb_graphs = adj.shape[0]
    identity = np.eye(adj.shape[1])
    mt = np.empty(adj.shape)
    for g in range(nb_graphs):
        mt[g] = identity
        # Loop-invariant: adjacency with self-loops for this graph.
        one_hop = adj[g] + identity
        for _ in range(nhood):
            mt[g] = np.matmul(mt[g], one_hop)
        # Clamp positive path counts to 1 inside the valid corner only.
        # `corner` is a view into `mt`, so the masked assignment writes
        # through; this replaces an O(N^2) Python-level double loop.
        n = sizes[g]
        corner = mt[g, :n, :n]
        corner[corner > 0.0] = 1.0
    return -1e9 * (1.0 - mt)



###############################################
# This section of code adapted from tkipf/gcn #
###############################################

def parse_index_file(filename):
    """Parse index file.

    Reads ``filename`` as text and converts every line (after stripping
    surrounding whitespace) to an ``int``.

    Returns:
        list of int, in file order.

    Raises:
        ValueError: if a line is not a valid integer (including blank lines).
    """
    # The context manager closes the handle; the original left it open.
    with open(filename) as index_file:
        return [int(line.strip()) for line in index_file]

def sample_mask(idx, l):
    """Create mask.

    Args:
        idx: indices (anything NumPy accepts as an index, e.g. list or range)
            to switch on.
        l: length of the mask.

    Returns:
        1-D boolean array of length ``l`` that is True exactly at ``idx``.
    """
    # Builtin `bool` replaces the `np.bool` alias, which newer NumPy removed.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask

def load_data(dataset_str): # {'pubmed', 'citeseer', 'cora'}
    """Load a citation dataset from the ``data/`` directory.

    Reads the seven pickled objects ``data/ind.<dataset_str>.{x,y,tx,ty,allx,ally,graph}``
    plus the text file ``data/ind.<dataset_str>.test.index``, and assembles the
    graph, features, labels and train/val/test splits.

    Args:
        dataset_str: dataset name used to build the file paths.

    Returns:
        Tuple ``(adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask)``:
        ``adj`` comes from ``nx.adjacency_matrix``, ``features`` is a LIL sparse
        matrix, each ``y_*`` has the shape of the full label matrix and is zero
        outside its mask, and each ``*_mask`` is produced by ``sample_mask``.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use this with
    trusted dataset files.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']  # the test.index file is read separately below
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            # Python 3 needs an explicit encoding to unpickle these files.
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))                   
      
    
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    # print(f'This is graph {graph},{len(graph)}')
    
    # Test node ids in file order, and the same ids sorted ascending.
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    
    test_idx_range = np.sort(test_idx_reorder) # author's note for cora: [1708,1709,...,2707], 1000 entries
       

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    # Stack the allx rows and the test rows into one feature matrix.
    features = sp.vstack((allx, tx)).tolil() 
    
    
    # Move the test rows from sorted order into the order given by the index file.
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    # Same stacking and reordering for the labels.
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    # Splits: the first len(y) rows train, the next 500 validate, the indexed rows test.
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    # Per-split label matrices: full-sized, zero outside the split's mask.
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    print(f'adj.shape: {adj.shape}')
    print(f'features.shape: {features.shape}')


    
    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask

def load_random_data(size):
    """Build a random stand-in dataset with ``size`` nodes.

    The adjacency and feature matrices are random sparse matrices, labels are
    one-hot over 7 classes, the first ``int(size/2)`` nodes form the training
    split and the remaining nodes form both the validation and the test split.

    Returns:
        ``(adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask)``
        where each ``y_*`` is the label matrix zeroed outside its mask.
    """
    # The three random draws happen in this order: adjacency, features, labels.
    adj = sp.random(size, size, density=0.002) # density similar to cora
    features = sp.random(size, 1000, density=0.015)
    class_ids = np.random.randint(7, size=(size))

    # One-hot encode the class ids (N x 7).
    labels = np.zeros((size, 7))
    labels[np.arange(size), class_ids] = 1

    half = int(size/2)
    node_ids = np.arange(size)
    train_mask = node_ids < half
    val_mask = node_ids >= half
    test_mask = node_ids >= half

    def _labels_where(mask):
        # Copy of `labels` with every row outside `mask` left at zero.
        selected = np.zeros(labels.shape)
        selected[mask, :] = labels[mask, :]
        return selected

    y_train = _labels_where(train_mask)
    y_val = _labels_where(val_mask)
    y_test = _labels_where(test_mask)

    # sparse NxN, sparse NxF, norm NxC, ..., norm Nx1, ...
    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask

def sparse_to_tuple(sparse_mx):
    """Convert sparse matrix to tuple representation.

    A single matrix becomes ``(coords, values, shape)`` where ``coords`` is an
    (nnz, 2) array of (row, col) pairs. A list of matrices is converted
    element by element *in place* and the same list object is returned.
    """
    def _as_triplet(matrix):
        # Work in COO form so row/col/data are directly available.
        coo = matrix if sp.isspmatrix_coo(matrix) else matrix.tocoo()
        index_pairs = np.vstack((coo.row, coo.col)).transpose()
        return index_pairs, coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return _as_triplet(sparse_mx)

    for position, matrix in enumerate(sparse_mx):
        sparse_mx[position] = _as_triplet(matrix)
    return sparse_mx

def standardize_data(f, train_mask):
    """Standardize feature columns using training-row statistics.

    Columns whose standard deviation over the training rows is zero are
    dropped; each remaining column has the training mean subtracted and is
    divided by the training standard deviation.

    Args:
        f: sparse feature matrix (anything exposing ``todense()``).
        train_mask: per-row mask, interpreted as boolean, selecting the
            training rows.

    Returns:
        The dense, standardized matrix as returned by ``todense()`` arithmetic.
        No tuple conversion is performed, despite what the old docstring said.
    """
    dense = f.todense()
    mask = np.asarray(train_mask, dtype=bool)
    train_rows = dense[mask, :]
    mu = train_rows.mean(axis=0)
    sigma = train_rows.std(axis=0)
    # ravel() keeps this 1-D even when only one feature column exists;
    # np.squeeze would collapse that case to a 0-d index.
    keep = np.asarray(sigma > 0).ravel()
    # Per-column statistics are independent, so slicing them is equivalent to
    # recomputing them on the filtered matrix.
    return (dense[:, keep] - mu[:, keep]) / sigma[:, keep]

def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation.

    Every row is divided by its sum; rows that sum to zero are left as zeros.

    Args:
        features: scipy sparse feature matrix (N x F).

    Returns:
        ``(dense, tuple_repr)``: the normalized matrix as a dense matrix, and
        the same matrix passed through ``sparse_to_tuple``.
    """
    rowsum = np.array(features.sum(1))
    # Integer inputs cannot be raised to a negative integer power; promote
    # them. Floating inputs keep their dtype.
    if not np.issubdtype(rowsum.dtype, np.floating):
        rowsum = rowsum.astype(np.float64)
    # Empty rows yield 1/0 = inf, which is zeroed on the next line, so the
    # divide warning is expected and silenced.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    return features.todense(), sparse_to_tuple(features)

def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix.

    With ``d`` the row sums of ``adj`` and ``S = diag(d ** -0.5)`` (infinite
    entries replaced by 0), returns ``(adj @ S).T @ S`` in COO format.
    """
    coo = sp.coo_matrix(adj)
    degrees = np.array(coo.sum(1))
    inv_sqrt_deg = np.power(degrees, -0.5).flatten()
    # Zero-degree rows produce inf; treat their scaling factor as 0.
    inv_sqrt_deg[np.isinf(inv_sqrt_deg)] = 0.
    scaling = sp.diags(inv_sqrt_deg)
    column_scaled = coo.dot(scaling)
    return column_scaled.transpose().dot(scaling).tocoo()


def preprocess_adj(adj):
    """Add self-loops, normalize with ``normalize_adj`` and return the tuple representation."""
    with_self_loops = adj + sp.eye(adj.shape[0])
    normalized = normalize_adj(with_self_loops)
    return sparse_to_tuple(normalized)

def preprocess_adj_bias(adj):
    """Build the sparse adjacency tensor used by the sparse attention path.

    Adds self-loops to ``adj``, sets every positive entry to 1.0, converts the
    result to float32 COO form and wraps it in a ``tf.SparseTensor`` with the
    same shape as ``adj``.
    """
    num_nodes = adj.shape[0]
    adj = adj + sp.eye(num_nodes)  # self-loop; this creates a new matrix, so the caller's is not modified
    adj[adj > 0.0] = 1.0  # binarize: any positive weight becomes 1.0
    if not sp.isspmatrix_coo(adj):
        adj = adj.tocoo()
    adj = adj.astype(np.float32)
    # Indices are deliberately stacked as (col, row). The original author's note
    # says that using (adj.row, adj.col) here was a mistake.
    # NOTE(review): confirm the resulting index order satisfies the ops consuming this tensor.
    indices = np.vstack((adj.col, adj.row)).transpose()    
    
    return tf.SparseTensor(indices=indices, values=adj.data, dense_shape=adj.shape)



In [10]:
import tensorflow as tf
import numpy as np

class attn_head(tf.keras.layers.Layer):
    """Single dense graph-attention head.

    Args:
        hidden_dim: number of output features per node.
        nb_nodes: accepted for signature parity with ``sp_attn_head``; unused here.
        in_drop: dropout rate applied to the inputs and to the projected features.
        coef_drop: dropout rate applied to the attention coefficients.
        activation: callable applied to the final result.
        residual: whether to add a skip connection from the (dropped-out) input.
    """
    def __init__(self,hidden_dim, nb_nodes = None,in_drop=0.0, coef_drop=0.0,activation = tf.nn.elu,residual = False):
        super().__init__()
        self.activation = activation
        self.residual = residual

        self.in_dropout = tf.keras.layers.Dropout(in_drop)
        self.coef_dropout = tf.keras.layers.Dropout(coef_drop)
        # Kernel-size-1 convolutions act as per-node linear layers.
        self.conv_no_bias = tf.keras.layers.Conv1D(hidden_dim,1,use_bias=False)
        self.conv_f1 = tf.keras.layers.Conv1D(1,1)
        self.conv_f2 = tf.keras.layers.Conv1D(1,1)

        # Projects the input to hidden_dim when a residual needs matching widths.
        self.conv_residual = tf.keras.layers.Conv1D(hidden_dim,1)
        self.bias_zero = tf.Variable(tf.zeros(hidden_dim))

    def __call__(self,seq,bias_mat,training):
        """Apply the attention head.

        Args:
            seq: input node features; the comments below assume
                (num_graph, num_nodes, features), as in the original notes.
            bias_mat: tensor added to the attention scores before the softmax.
            training: forwarded to the dropout layers.

        Returns:
            ``activation`` of the aggregated features,
            shape (num_graph, num_nodes, hidden_dim).
        """
        # Dropout on the incoming node features.
        seq = self.in_dropout(seq, training = training)
        # Shared linear projection (the "Wh" term): hidden_dim 1x1 convolutions.
        # projected.shape: (num_graph, num_nodes, hidden_dim)
        projected = self.conv_no_bias(seq)
        # Two single-output 1x1 convolutions, each giving one scalar score per node.
        # shape: (num_graph, num_nodes, 1)
        score_self = self.conv_f1(projected)
        score_neigh = self.conv_f2(projected)

        # Broadcast (num_graph, num_nodes, 1) + (num_graph, 1, num_nodes) to get
        # a score e_ij for every node pair: (num_graph, num_nodes, num_nodes).
        pairwise = score_self + tf.transpose(score_neigh, [0,2,1])
        # Attention coefficients alpha_ij: softmax over leaky-ReLU scores plus bias.
        attention = tf.nn.softmax(tf.nn.leaky_relu(pairwise) + bias_mat)
        attention = self.coef_dropout(attention, training = training)
        projected = self.in_dropout(projected, training = training)
        # alpha_ij x Wh -> (num_graph, num_nodes, hidden_dim)
        aggregated = tf.cast(tf.matmul(attention, projected), dtype=tf.float32)
        # Add the learned bias vector.
        ret = aggregated + self.bias_zero

        if not self.residual:
            return self.activation(ret)

        # Residual connection: project the input only when the widths differ.
        if seq.shape[-1] != ret.shape[-1]:
            skip = self.conv_residual(seq)
        else:
            skip = seq
        # h' = activation(alpha x Wh + bias + skip)
        return self.activation(ret + skip)
    
    
class sp_attn_head(tf.keras.layers.Layer):
    """Single graph-attention head operating on a sparse adjacency tensor.

    Args:
        hidden_dim: number of output features per node.
        nb_nodes: number of nodes in the (single) graph; used for reshapes.
        in_drop: dropout rate applied to the inputs and to the projected features.
        coef_drop: dropout rate applied to the attention coefficient values.
        activation: callable applied to the final result.
        residual: whether to add a skip connection from the (dropped-out) input.
    """
    def __init__(self,hidden_dim, nb_nodes,in_drop=0.0, coef_drop=0.0,activation = tf.nn.elu,residual = False):
        super(sp_attn_head,self).__init__()
        self.hidden_dim = hidden_dim
        self.nb_nodes = nb_nodes
        self.activation = activation
        self.residual = residual

        self.in_dropout = tf.keras.layers.Dropout(in_drop)
        self.coef_dropout = tf.keras.layers.Dropout(coef_drop)

        # Kernel-size-1 convolutions act as per-node linear layers.
        self.conv_no_bias = tf.keras.layers.Conv1D(hidden_dim,1,use_bias=False)
        self.conv_f1 = tf.keras.layers.Conv1D(1,1)
        self.conv_f2 = tf.keras.layers.Conv1D(1,1)

        self.conv_residual = tf.keras.layers.Conv1D(hidden_dim,1)
        self.bias_zero = tf.Variable(tf.zeros(hidden_dim))

    def __call__(self,seq,bias_mat,training):
        """Apply the sparse attention head.

        Args:
            seq: input node features; the reshapes below require a single
                graph, i.e. a leading axis of size 1.
            bias_mat: sparse adjacency tensor (despite the name, it is used as
                a multiplicative mask, not an additive bias).
            training: forwarded to the dropout layers.

        Returns:
            ``activation`` of the aggregated features, shape (1, nb_nodes, hidden_dim).
        """
        adj_mat = bias_mat
        seq = self.in_dropout(seq,training = training)
        seq_fts = self.conv_no_bias(seq)
        f_1 = self.conv_f1(seq_fts)
        f_2 = self.conv_f2(seq_fts)

        # Keep the per-node scores only where the adjacency has an entry.
        f_1 = tf.reshape(f_1, (self.nb_nodes, 1))
        f_1 = adj_mat*f_1
        f_2 = tf.reshape(f_2, (self.nb_nodes, 1))
        f_2 = adj_mat * tf.transpose(f_2, [1,0])
        logits = tf.compat.v1.sparse_add(f_1,f_2)

        # Leaky-ReLU on the stored values, then a softmax over each row's entries.
        lrelu = tf.SparseTensor(indices=logits.indices,
                        values=tf.nn.leaky_relu(logits.values),
                        dense_shape=logits.dense_shape)
        coefs = tf.compat.v2.sparse.softmax(lrelu)

        if training != False:
            coefs = tf.SparseTensor(indices=coefs.indices,
                                    values=self.coef_dropout(coefs.values,training = training),
                                    dense_shape=coefs.dense_shape)
            seq_fts = self.in_dropout(seq_fts,training = training)

        # Use the layer's own node count; the original read a notebook-level
        # global `nb_nodes` here, which only worked by accident of cell order.
        coefs = tf.compat.v2.sparse.reshape(coefs, [self.nb_nodes, self.nb_nodes])

        # Drop only the leading graph axis; an axis-less squeeze would also
        # remove the feature axis when hidden_dim == 1.
        seq_fts = tf.squeeze(seq_fts, axis=0)
        vals = tf.sparse.sparse_dense_matmul(coefs, seq_fts)
        vals = tf.expand_dims(vals, axis=0)
        vals.set_shape([1, self.nb_nodes, self.hidden_dim])

        ret = vals + self.bias_zero
        if self.residual:
            # Project the input only when the widths differ.
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + self.conv_residual(seq)
            else:
                ret = ret + seq
        return self.activation(ret)
        

In [11]:
def choose_attn_head(Sparse):
    """Return the attention-head class: ``sp_attn_head`` if ``Sparse`` is truthy, else ``attn_head``."""
    return sp_attn_head if Sparse else attn_head

class inference(tf.keras.layers.Layer):
    """Stack of multi-head attention layers producing per-node class logits.

    Args:
        n_heads: number of heads per layer; the last entry is for the output layer.
        hid_units: hidden units per head for each hidden layer.
        nb_classes: output width of every output-layer head.
        nb_nodes: node count forwarded to the attention heads.
        Sparse: selects the head class via ``choose_attn_head``.
        ffd_drop: passed to the heads as ``in_drop``.
        attn_drop: passed to the heads as ``coef_drop``.
        activation: activation for the hidden-layer heads (output heads use identity).
        residual: passed through to every head.
    """
    def __init__(self,n_heads,hid_units,nb_classes, nb_nodes,Sparse,ffd_drop=0.0, attn_drop=0.0,activation = tf.nn.elu,residual = False):
        super(inference,self).__init__()
        attned_head = choose_attn_head(Sparse)
        self.attns = []
        self.sec_attns = []
        self.final_attns = []
        self.final_sum = n_heads[-1]
        # First layer: n_heads[0] attention heads.
        for i in range(n_heads[0]):
            self.attns.append(attned_head(hidden_dim = hid_units[0], nb_nodes = nb_nodes,
                                            in_drop = ffd_drop, coef_drop = attn_drop,
                                            activation = activation,
                                            residual = residual))

        # Additional hidden layers, one group of heads per extra hid_units entry.
        # With hid_units = [8] this loop does not run.
        for i in range(1, len(hid_units)):
            sec_attns = []
            for j in range(n_heads[i]):
                sec_attns.append(attned_head(hidden_dim = hid_units[i], nb_nodes = nb_nodes,
                                             in_drop = ffd_drop, coef_drop = attn_drop,
                                             activation = activation,
                                             residual = residual))
            # Register each layer's heads exactly once. The original appended
            # inside the inner loop (once per head) and referenced an undefined
            # `h_1`, so any multi-layer configuration failed.
            self.sec_attns.append(sec_attns)

        # Output layer: n_heads[-1] heads emitting nb_classes values, no activation.
        for i in range(n_heads[-1]):
            self.final_attns.append(attned_head(hidden_dim = nb_classes, nb_nodes = nb_nodes,
                                                in_drop = ffd_drop, coef_drop = attn_drop,
                                                activation = lambda x: x,
                                                residual = residual))

    def __call__(self,inputs,bias_mat,training):
        """Run all layers and return the head-averaged output logits.

        Returns:
            logits with shape (num_graph, num_nodes, nb_classes).
        """
        first_attn = []
        out = []
        # First layer: run every head and concatenate along the feature axis.
        for indiv_attn in self.attns:
            first_attn.append(indiv_attn(seq = inputs, bias_mat = bias_mat,training = training))
        # h_1.shape: (num_graph, num_nodes, hidden_dim*n_heads[0])
        h_1 = tf.concat(first_attn,axis = -1)
        # Extra hidden layers, applied in order.
        for sec_attns in self.sec_attns:
            next_attn = []
            for indiv_attns in sec_attns:
                # Call the head of *this* layer (the original reused the stale
                # `indiv_attn` left over from the first-layer loop).
                next_attn.append(indiv_attns(seq = h_1,bias_mat = bias_mat,training = training))
            # The original concatenated an undefined `next_attns`.
            h_1 = tf.concat(next_attn,axis = -1)
        # Output layer heads.
        for indiv_attn in self.final_attns:
            out.append(indiv_attn(seq=h_1,bias_mat = bias_mat,training = training))
        # Average the output heads.
        # logits.shape: (num_graph, num_nodes, nb_classes)
        logits = tf.add_n(out)/self.final_sum
        return logits

In [12]:
class GAT(tf.keras.Model):
    """Graph attention model: wraps ``inference`` and computes masked loss and accuracy.

    Args:
        hid_units: hidden units per attention head for each hidden layer.
        n_heads: attention heads per layer (last entry is the output layer).
        nb_classes: number of classes (7 in the Cora run shown in this notebook).
        nb_nodes: number of nodes (2708 in the Cora run shown in this notebook).
        Sparse: whether to use the sparse attention head.
        ffd_drop: dropout rate forwarded to ``inference``.
        attn_drop: attention dropout rate forwarded to ``inference``.
        activation: activation function for the hidden layers.
        residual: whether to use residual connections.
        l2_coef: optional L2 penalty coefficient. When None (the default) the
            module-level ``l2_coef`` is looked up at call time, as before.
    """
    def __init__(self, hid_units,n_heads, nb_classes, nb_nodes,Sparse,ffd_drop = 0.0,attn_drop = 0.0,activation = tf.nn.elu,residual=False,l2_coef=None):
        super(GAT,self).__init__()
        self.hid_units = hid_units         #[8]
        self.n_heads = n_heads             #[8,1]
        self.nb_classes = nb_classes
        self.nb_nodes = nb_nodes
        self.activation = activation
        self.residual = residual
        self.l2_coef = l2_coef

        self.inferencing = inference(n_heads,hid_units,nb_classes,nb_nodes,Sparse = Sparse,ffd_drop = ffd_drop,attn_drop = attn_drop, activation = activation,residual = residual)



    def masked_softmax_cross_entropy(self,logits, labels, mask):
        """Mean softmax cross-entropy, re-weighted so only masked entries contribute."""
        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
        mask = tf.cast(mask, dtype=tf.float32)
        # Dividing by the mask mean turns the overall mean below into a mean
        # over the masked entries.
        mask /= tf.reduce_mean(mask)
        loss *= mask
        return tf.reduce_mean(loss)

    def masked_accuracy(self,logits, labels, mask):
        """Fraction of masked entries whose argmax prediction matches the label argmax."""
        correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
        accuracy_all = tf.cast(correct_prediction, tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        mask /= tf.reduce_mean(mask)
        accuracy_all *= mask
        return tf.reduce_mean(accuracy_all)


    def __call__(self,inputs,training,bias_mat,lbl_in,msk_in):
        """Forward pass.

        Returns:
            ``(logits, accuracy, loss)`` where ``loss`` is the masked
            cross-entropy plus the L2 penalty.
        """
        # logits.shape: (num_graph, num_nodes, nb_classes)
        logits = self.inferencing(inputs = inputs, bias_mat = bias_mat,training = training)

        # Flatten the graph and node axes so every row is one node.
        log_resh = tf.reshape(logits, [-1, self.nb_classes])
        lab_resh = tf.reshape(lbl_in, [-1, self.nb_classes])
        msk_resh = tf.reshape(msk_in, [-1])

        loss = self.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)

        # Fall back to the notebook-level coefficient when none was supplied.
        weight_decay = self.l2_coef if self.l2_coef is not None else l2_coef
        # NOTE(review): `v.name` is compared for equality with bare names.
        # TensorFlow variable names usually carry a scope prefix and a ':0'
        # suffix, so this filter likely excludes nothing. Left unchanged to
        # keep the training results comparable; confirm the intent.
        lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in self.trainable_variables if v.name not
                                                     in ['bias', 'gamma', 'b', 'g', 'beta']]) * weight_decay

        loss = loss+lossL2
        accuracy = self.masked_accuracy(log_resh, lab_resh, msk_resh)

        return logits,accuracy,loss

In [13]:
def train(model,inputs,bias_mat,lbl_in,msk_in,training):
    """Run one optimisation step and return ``(logits, accuracy, loss)``.

    The forward pass always runs with ``training=True``; the ``training``
    argument is accepted but not consulted. Gradients are applied with the
    notebook-level ``optimizer``.
    """
    forward_kwargs = dict(inputs = inputs,
                          training = True,
                          bias_mat = bias_mat,
                          lbl_in = lbl_in,
                          msk_in = msk_in)
    with tf.GradientTape() as tape:
        logits, accuracy, loss = model(**forward_kwargs)

    params = model.trainable_variables
    grads = tape.gradient(loss, params)
    optimizer.apply_gradients(zip(grads, params))

    return logits, accuracy, loss

def evaluate(model,inputs,bias_mat,lbl_in,msk_in,training):
    """Run a forward pass in inference mode and return ``(logits, accuracy, loss)``.

    The model is always called with ``training=False``; the ``training``
    argument is accepted but not consulted.
    """
    outputs = model(inputs = inputs,
                    bias_mat = bias_mat,
                    lbl_in = lbl_in,
                    msk_in = msk_in,
                    training = False)
    logits, accuracy, loss = outputs
    return logits, accuracy, loss

In [14]:

import time
import numpy as np
import tensorflow as tf

#from models import GAT
#from utils import process

# Echo the run configuration defined in the config cell.
print('Dataset: ' + dataset)
print('----- Opt. hyperparams -----')
print('lr: ' + str(lr))
print('l2_coef: ' + str(l2_coef))
print('----- Archi. hyperparams -----')
print('nb. layers: ' + str(len(hid_units)))
print('nb. units per layer: ' + str(hid_units))
print('nb. attention heads: ' + str(n_heads))
print('residual: ' + str(residual))
print('nonlinearity: ' + str(nonlinearity))


# Load the graph, features, labels and split masks from disk.
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(dataset)

# Row-normalize the features; `features` becomes dense, `spars` is the tuple form (not used below).
features, spars = preprocess_features(features)

nb_nodes = features.shape[0]
ft_size = features.shape[1]
nb_classes = y_train.shape[1]


# Add a leading axis of size 1 so every array is indexed [graph, node, ...].
features = features[np.newaxis]
y_train = y_train[np.newaxis]
y_val = y_val[np.newaxis]
y_test = y_test[np.newaxis]
train_mask = train_mask[np.newaxis]
val_mask = val_mask[np.newaxis]
test_mask = test_mask[np.newaxis]


print(f'These are the parameters')
print(f'batch_size: {batch_size}')
print(f'nb_nodes: {nb_nodes}')
print(f'ft_size: {ft_size}')
print(f'nb_classes: {nb_classes}')


if Sparse:
    # Sparse path: adjacency with self-loops as a tf.SparseTensor.
    biases = preprocess_adj_bias(adj)
else:
    # Dense path: additive attention bias, indexed [graph, node, node].
    # toarray() yields a plain ndarray; the previous todense()[np.newaxis]
    # produced a 3-D np.matrix, which NumPy does not properly support.
    adj = adj.toarray()[np.newaxis]
    biases = adj_to_bias(adj, [nb_nodes], nhood=1)


# Build the model from the configured activation and residual settings, so the
# hyperparameters printed above are the ones actually used (previously
# tf.nn.elu and residual=False were hard-coded here).
model = GAT(hid_units,n_heads, nb_classes, nb_nodes,Sparse,ffd_drop = ffd_drop,attn_drop = attn_drop,activation = nonlinearity,residual=residual)
print('model: ' + str('SpGAT' if Sparse else 'GAT'))


# Early-stopping trackers: best validation loss / accuracy seen so far and the
# number of consecutive epochs without improvement.
vlss_mn = np.inf
vacc_mx = 0.0
curr_step = 0

# Per-epoch running sums; the training loop resets them after every epoch.
train_loss_avg = 0
train_acc_avg = 0
val_loss_avg = 0
val_acc_avg = 0

model_number = 0

# Main loop: one training pass and one validation pass per epoch, followed by
# the early-stopping check. Batches are slices along the leading (graph) axis.
for epoch in range(nb_epochs):
    ###Training Segment###
    tr_step = 0
    tr_size = features.shape[0]
    while tr_step * batch_size < tr_size:
        
        # The sparse adjacency tensor is shared by all batches; the dense bias is sliced per batch.
        if Sparse:
            bbias = biases
        else:
            bbias = biases[tr_step*batch_size:(tr_step+1)*batch_size]
            
        _, acc_tr,loss_value_tr = train(model,
                                        inputs=     features[tr_step*batch_size:(tr_step+1)*batch_size],
                                        bias_mat=     bbias,
                                        lbl_in =     y_train[tr_step*batch_size:(tr_step+1)*batch_size],
                                        msk_in =train_mask[tr_step*batch_size:(tr_step+1)*batch_size],
                                        training=True)
        # Running sums; divided by the step count when reported.
        train_loss_avg += loss_value_tr
        train_acc_avg += acc_tr
        tr_step += 1
        
    ###Validation Segment###
    vl_step = 0
    vl_size = features.shape[0]
    while vl_step * batch_size < vl_size:
        
        if Sparse:
            bbias = biases
        else:
            bbias = biases[vl_step*batch_size:(vl_step+1)*batch_size]
        
        _, acc_vl,loss_value_vl = evaluate(model,
                                            inputs=     features[vl_step*batch_size:(vl_step+1)*batch_size],
                                            bias_mat=     bbias,
                                            lbl_in =     y_val[vl_step*batch_size:(vl_step+1)*batch_size],
                                            msk_in =val_mask[vl_step*batch_size:(vl_step+1)*batch_size],
                                            training=False)
        val_loss_avg += loss_value_vl
        val_acc_avg += acc_vl
        vl_step += 1
        
    print('Training: loss = %.5f, acc = %.5f | Val: loss = %.5f, acc = %.5f' %
                                        (train_loss_avg/tr_step, train_acc_avg/tr_step,
                                        val_loss_avg/vl_step, val_acc_avg/vl_step))
    

    ###Early Stopping Segment###
    
    # An epoch counts as an improvement if validation accuracy ties/beats the best
    # so far OR validation loss ties/beats the best so far.
    if val_acc_avg/vl_step >= vacc_mx or val_loss_avg/vl_step <= vlss_mn:
            # Weights are snapshotted only when BOTH metrics are at least as good as their bests.
            if val_acc_avg/vl_step >= vacc_mx and val_loss_avg/vl_step <= vlss_mn:
                    vacc_early_model = val_acc_avg/vl_step
                    vlss_early_model = val_loss_avg/vl_step            
                    working_weights = model.get_weights()
            vacc_mx = np.max((val_acc_avg/vl_step, vacc_mx))
            vlss_mn = np.min((val_loss_avg/vl_step, vlss_mn))
            curr_step = 0
    else:
            curr_step += 1
            # After `patience` consecutive non-improving epochs, restore the snapshot and stop.
            # NOTE(review): if the loop instead runs out of epochs, the snapshot is never
            # restored and the final-epoch weights are used for testing.
            if curr_step == patience:
                    print('Early stop! Min loss: ', vlss_mn, ', Max accuracy: ', vacc_mx)
                    print('Early stop model validation loss: ', vlss_early_model, ', accuracy: ', vacc_early_model)
                    model.set_weights(working_weights)
                    break

    # Reset the running sums for the next epoch.
    train_loss_avg = 0
    train_acc_avg = 0
    val_loss_avg = 0
    val_acc_avg = 0

###Testing Segment### Outside of the epochs

# Evaluate the model (as left by the training loop) on the test split, batch by
# batch along the leading axis, and report the mean loss and accuracy.
ts_step = 0
ts_size = features.shape[0]
ts_loss = 0.0
ts_acc = 0.0
while ts_step * batch_size < ts_size:
    
    # The sparse adjacency tensor is shared by all batches; the dense bias is sliced per batch.
    if Sparse:
            bbias = biases
    else:
            bbias = biases[ts_step*batch_size:(ts_step+1)*batch_size]
    
    _, acc_ts,loss_value_ts = evaluate(model,
                                        inputs=     features[ts_step*batch_size:(ts_step+1)*batch_size],
                                        bias_mat=     bbias,
                                        lbl_in =     y_test[ts_step*batch_size:(ts_step+1)*batch_size],
                                        msk_in =test_mask[ts_step*batch_size:(ts_step+1)*batch_size],
                                        training=False)
    ts_loss += loss_value_ts
    ts_acc += acc_ts
    ts_step += 1

print('Test loss:', ts_loss/ts_step, '; Test accuracy:', ts_acc/ts_step)


Dataset: cora
----- Opt. hyperparams -----
lr: 0.005
l2_coef: 0.0005
----- Archi. hyperparams -----
nb. layers: 1
nb. units per layer: [8]
nb. attention heads: [8, 1]
residual: False
nonlinearity: <function elu at 0x000001B39F31CDC8>
adj.shape: (2708, 2708)
features.shape: (2708, 1433)
These are the parameters
batch_size: 1
nb_nodes: 2708
ft_size: 1433
nb_classes: 7
model: GAT
Training: loss = 1.99408, acc = 0.12857 | Val: loss = 1.98312, acc = 0.15800
Training: loss = 1.98571, acc = 0.15714 | Val: loss = 1.97586, acc = 0.24000
Training: loss = 1.97477, acc = 0.15714 | Val: loss = 1.97048, acc = 0.21600
Training: loss = 1.96900, acc = 0.14286 | Val: loss = 1.96593, acc = 0.23600
Training: loss = 1.96194, acc = 0.20714 | Val: loss = 1.96145, acc = 0.33600
Training: loss = 1.95802, acc = 0.18571 | Val: loss = 1.95750, acc = 0.43800
Training: loss = 1.96417, acc = 0.17143 | Val: loss = 1.95373, acc = 0.53200
Training: loss = 1.95319, acc = 0.24286 | Val: loss = 1.94983, acc = 0.62400
Trai

Training: loss = 1.64145, acc = 0.52143 | Val: loss = 1.63290, acc = 0.80000
Training: loss = 1.61525, acc = 0.48571 | Val: loss = 1.62791, acc = 0.80400
Training: loss = 1.57089, acc = 0.47857 | Val: loss = 1.62324, acc = 0.80200
Training: loss = 1.53016, acc = 0.51429 | Val: loss = 1.61708, acc = 0.80400
Training: loss = 1.72220, acc = 0.45000 | Val: loss = 1.61090, acc = 0.80400
Training: loss = 1.56444, acc = 0.55714 | Val: loss = 1.60627, acc = 0.80600
Training: loss = 1.53542, acc = 0.50714 | Val: loss = 1.60092, acc = 0.80400
Training: loss = 1.60961, acc = 0.45000 | Val: loss = 1.59675, acc = 0.80600
Training: loss = 1.56773, acc = 0.50000 | Val: loss = 1.59332, acc = 0.80200
Training: loss = 1.57236, acc = 0.50714 | Val: loss = 1.59031, acc = 0.80600
Training: loss = 1.60898, acc = 0.54286 | Val: loss = 1.58744, acc = 0.80400
Training: loss = 1.53714, acc = 0.52857 | Val: loss = 1.58458, acc = 0.80400
Training: loss = 1.50840, acc = 0.51429 | Val: loss = 1.58271, acc = 0.80200

Training: loss = 1.45882, acc = 0.55714 | Val: loss = 1.38440, acc = 0.81000
Training: loss = 1.32593, acc = 0.57143 | Val: loss = 1.38321, acc = 0.80600
Training: loss = 1.43962, acc = 0.51429 | Val: loss = 1.38376, acc = 0.80600
Training: loss = 1.51672, acc = 0.53571 | Val: loss = 1.38550, acc = 0.80200
Training: loss = 1.40054, acc = 0.57857 | Val: loss = 1.38748, acc = 0.80400
Training: loss = 1.51710, acc = 0.49286 | Val: loss = 1.38719, acc = 0.80200
Training: loss = 1.45004, acc = 0.55000 | Val: loss = 1.38632, acc = 0.79800
Training: loss = 1.46456, acc = 0.53571 | Val: loss = 1.38441, acc = 0.79800
Training: loss = 1.46434, acc = 0.54286 | Val: loss = 1.37991, acc = 0.80400
Training: loss = 1.38389, acc = 0.59286 | Val: loss = 1.37676, acc = 0.80600
Training: loss = 1.54054, acc = 0.52143 | Val: loss = 1.37523, acc = 0.80000
Training: loss = 1.46665, acc = 0.52857 | Val: loss = 1.37317, acc = 0.79800
Training: loss = 1.42501, acc = 0.57143 | Val: loss = 1.36910, acc = 0.80000

Training: loss = 1.41184, acc = 0.57857 | Val: loss = 1.33667, acc = 0.79400
Training: loss = 1.41444, acc = 0.58571 | Val: loss = 1.33487, acc = 0.79200
Training: loss = 1.44967, acc = 0.50714 | Val: loss = 1.33267, acc = 0.78800
Training: loss = 1.39094, acc = 0.50000 | Val: loss = 1.32986, acc = 0.78400
Training: loss = 1.45070, acc = 0.50714 | Val: loss = 1.32638, acc = 0.79200
Training: loss = 1.54495, acc = 0.51429 | Val: loss = 1.32547, acc = 0.79200
Training: loss = 1.42511, acc = 0.59286 | Val: loss = 1.32401, acc = 0.79000
Training: loss = 1.49901, acc = 0.50000 | Val: loss = 1.32212, acc = 0.79400
Training: loss = 1.40820, acc = 0.55000 | Val: loss = 1.32002, acc = 0.79600
Training: loss = 1.46951, acc = 0.55714 | Val: loss = 1.31844, acc = 0.79600
Training: loss = 1.54517, acc = 0.52143 | Val: loss = 1.31793, acc = 0.79600
Training: loss = 1.44818, acc = 0.57143 | Val: loss = 1.31755, acc = 0.79800
Training: loss = 1.41179, acc = 0.57143 | Val: loss = 1.31565, acc = 0.80200

Training: loss = 1.45751, acc = 0.51429 | Val: loss = 1.28547, acc = 0.79600
Training: loss = 1.41425, acc = 0.58571 | Val: loss = 1.28595, acc = 0.80400
Training: loss = 1.49366, acc = 0.55000 | Val: loss = 1.28618, acc = 0.80400
Training: loss = 1.43489, acc = 0.55000 | Val: loss = 1.28727, acc = 0.80200
Training: loss = 1.39225, acc = 0.55000 | Val: loss = 1.28871, acc = 0.80000
Training: loss = 1.48114, acc = 0.52857 | Val: loss = 1.29026, acc = 0.79600
Training: loss = 1.33985, acc = 0.59286 | Val: loss = 1.29186, acc = 0.79800
Training: loss = 1.41084, acc = 0.56429 | Val: loss = 1.29592, acc = 0.79800
Training: loss = 1.41240, acc = 0.57857 | Val: loss = 1.30089, acc = 0.79400
Training: loss = 1.34941, acc = 0.60000 | Val: loss = 1.30682, acc = 0.79400
Training: loss = 1.43013, acc = 0.58571 | Val: loss = 1.31327, acc = 0.79600
Training: loss = 1.47640, acc = 0.52857 | Val: loss = 1.31962, acc = 0.79400
Training: loss = 1.47604, acc = 0.57143 | Val: loss = 1.32606, acc = 0.79400

Training: loss = 1.37054, acc = 0.55714 | Val: loss = 1.25286, acc = 0.80600
Training: loss = 1.34573, acc = 0.57857 | Val: loss = 1.25447, acc = 0.80800
Training: loss = 1.44390, acc = 0.54286 | Val: loss = 1.25428, acc = 0.80400
Training: loss = 1.42971, acc = 0.60000 | Val: loss = 1.25423, acc = 0.80200
Training: loss = 1.38067, acc = 0.55000 | Val: loss = 1.25530, acc = 0.80600
Training: loss = 1.37832, acc = 0.55714 | Val: loss = 1.25643, acc = 0.80800
Training: loss = 1.30024, acc = 0.60714 | Val: loss = 1.25814, acc = 0.80800
Training: loss = 1.49375, acc = 0.53571 | Val: loss = 1.25917, acc = 0.80800
Training: loss = 1.49031, acc = 0.52857 | Val: loss = 1.26032, acc = 0.80800
Training: loss = 1.33994, acc = 0.57143 | Val: loss = 1.26318, acc = 0.80000
Training: loss = 1.51041, acc = 0.47857 | Val: loss = 1.26786, acc = 0.79800
Training: loss = 1.39380, acc = 0.57143 | Val: loss = 1.27159, acc = 0.79400
Training: loss = 1.51471, acc = 0.52857 | Val: loss = 1.27285, acc = 0.79600