In [1]:
import pandas as pd

In [2]:
raw_data = pd.read_csv('/tmp/cora/cora.content',sep = '\t',header = None)

In [3]:
raw_data_cites = pd.read_csv('/tmp/cora/cora.cites',sep = '\t',header = None)
raw_data_cites[:10]

Unnamed: 0,0,1
0,35,1033
1,35,103482
2,35,103515
3,35,1050679
4,35,1103960
5,35,1103985
6,35,1109199
7,35,1112911
8,35,1113438
9,35,1113831


In [4]:
import numpy as np
import scipy.sparse as sp
import torch

In [5]:
def encode_onehot(labels):
    """
    Turn the label into a onehot vector
    """
    classes = set(labels)
    classes_dict = {c: np.identity(len(classes))[i, :] for i, c in
                    enumerate(classes)}
    print(classes_dict)
    labels_onehot = np.array(list(map(classes_dict.get, labels)),
                             dtype=np.int32)
    return labels_onehot

In [6]:
def normalize(mx):
    """
    Normalize sparse matrix by row
    """
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx

In [7]:
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """
    Convert a sparse matrix from scipy format to torch format
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

In [8]:
def load_data(path="/tmp/cora/", dataset="cora"):
    
    """Load citation network dataset (cora only for now)"""
    print('Loading {} dataset...'.format(dataset))
    
    ## content data is converted to numpy vector
    idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset),
                                        dtype=np.dtype(str))
    
    # Take the bag-of-words vector of each paper as the feature vector of each article and store it in a sparse matrix format
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
    
    # Take the type of each paper as a label and convert it into a one hot vector
    labels = encode_onehot(idx_features_labels[:, -1])

    # # Take out the id of each paper
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(idx)}
    
    # cites data is converted to numpy vector
    edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset),
                                    dtype=np.int32)
    
    # Map the id in the cites data to the interval [0, 2708]
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)
    
    # Store the citation relationship between papers in a sparse matrix format
    # 1, 0 matrix
    adj_v = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)

    # build symmetric adjacency matrix
    adj_v = adj_v + adj_v.T.multiply(adj_v.T > adj_v) - adj_v.multiply(adj_v.T > adj_v)
    
    # Normalize the characteristics of the article
    features = normalize(features)
    adj = normalize(adj_v + sp.eye(adj_v.shape[0]))
    
    # Produce the final vector
    idx_train = range(500)
    idx_val = range(500, 1500)

    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(np.where(labels)[1])
    adj = sparse_mx_to_torch_sparse_tensor(adj)

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)

    return adj_v, adj, features, labels, idx_train, idx_val

In [9]:
adj_v, adj, features, labels, idx_train, idx_val = load_data()
features.shape, type(features), labels.shape, type(labels), adj.shape, type(adj)

Loading cora dataset...
{'Genetic_Algorithms': array([1., 0., 0., 0., 0., 0., 0.]), 'Case_Based': array([0., 1., 0., 0., 0., 0., 0.]), 'Neural_Networks': array([0., 0., 1., 0., 0., 0., 0.]), 'Probabilistic_Methods': array([0., 0., 0., 1., 0., 0., 0.]), 'Rule_Learning': array([0., 0., 0., 0., 1., 0., 0.]), 'Reinforcement_Learning': array([0., 0., 0., 0., 0., 1., 0.]), 'Theory': array([0., 0., 0., 0., 0., 0., 1.])}


(torch.Size([2708, 1433]),
 torch.Tensor,
 torch.Size([2708]),
 torch.Tensor,
 torch.Size([2708, 2708]),
 torch.Tensor)

In [10]:
import networkx as nx
import pickle as pkl

t = 'fairwalk'
embs = pkl.load(open('/tmp/embs.pkl', 'rb'))
rgraphs = pkl.load(open("/tmp/rgraphs.pkl", 'rb'))
labels = torch.LongTensor(pkl.load(open("/tmp/group_ids.pkl", 'rb')))
adj = nx.to_scipy_sparse_matrix(rgraphs[t])
adj = sparse_mx_to_torch_sparse_tensor(normalize(adj + sp.eye(adj.shape[0])))
features = torch.FloatTensor(embs[t])
idx_train, idx_val = torch.LongTensor(range(80)), torch.LongTensor(range(80, 105))
features.shape, type(features), labels.shape, type(labels), adj.shape, type(adj)

(torch.Size([105, 128]),
 torch.Tensor,
 torch.Size([105]),
 torch.Tensor,
 torch.Size([105, 105]),
 torch.Tensor)

In [11]:
import math
import torch

from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module


In [12]:
class GraphConvolution(Module):
    """
     A simple implementation of graph convolution, please refer to the paper https://arxiv.org/abs/1609.02907
     ...
     Attributes
     ----------
     in_features: int
         The size of the image convolution input feature vector, namely $|H^{(l)}|$
     out_features: int
         The size of the image convolution output vector, namely $|H^{(l+1)}|$
     bias: bool
         Whether to use the offset vector, the default is True, that is, the default is to use the offset vector
     weight: Parameter
         Trainable parameters in graph convolution,
        
     Methods
     -------
     __init__(self, in_features, out_features, bias=True)
         The constructor of the graph convolution, defines the size of the input feature, the size of the output vector, whether to use offset, parameters
     reset_parameters(self)
         Initialize the parameters in the graph convolution
     forward(self, input, adj)
         Forward propagation function, input is the feature input, and adj is the transformed adjacency matrix $N(A)=D^{-1}\tilde{A}$. Completing the calculation logic of forward propagation, $N(A) H^{(l)} W^{(l)}$
     __repr__(self)
         Refactored class name expression
     """

    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        
        # create Weight and Bias trainable parameters
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        # standard weight to be uniform
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input, adj):
        # H * W
        support = torch.mm(input, self.weight)
        
        # N(A) * H * W # Addition aggregation by multiplying
        output = torch.spmm(adj, support)
        
        if self.bias is not None:
            # N(A) * H * W + b
            return output + self.bias
        else:
            return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'

In [13]:
import torch.nn as nn
import torch.nn.functional as F

In [14]:
class GCN(nn.Module):
    '''
     Multiple graph convolutional neural network model
     ...
     Attributes
     ----------
     n_feat: int
         The size of the input feature vector of the graph network
     n_hid: int
         The size of the hidden layer, that is, the size of the output vector of the first layer of the convolutional layer
     n_class: int
         Number of classifier categories
     dropout: float
         dropout rate
        
     Methods
     -------
     __init__(self, n_feat, n_hid, n_class, dropout)
         Two-layer graph convolutional neural network constructor, defining the dimension of the input feature, the dimension of the hidden layer, the number of classifier categories, and the dropout rate
     forward(self, x, adj)
         Forward propagation function, x is the input feature of the graph network, adj is the adjacency matrix that has been transformed $N(A)$
     '''
    def __init__(self, n_feat,n_hids, n_class, dropout):
        super(GCN, self).__init__()
        
        # Define the layers of graph convolutional layer
        
        layers_units = [n_feat] +  n_hids
        
        self.graph_layers = nn.ModuleList([GraphConvolution(layers_units[idx],layers_units[idx+1]) for idx in range(len(layers_units)-1)])
        
        self.output_layer = GraphConvolution(layers_units[-1],n_class)

        self.dropout = dropout

    def forward(self, x, adj):
        
        for graph_layer in self.graph_layers:
            x = F.relu(graph_layer(x, adj))
            # dropout
            x = F.dropout(x, self.dropout, training=self.training)
            
        # The output of the final convolutional layer is mapped to the output category dimension
        x = self.output_layer(x, adj)
        
        # Calculate log softmax
        # https://discuss.pytorch.org/t/logsoftmax-vs-softmax/21386/20
        return F.log_softmax(x, dim=1)

In [15]:
def accuracy(output, labels):
    """
    Accuracy calculation method
    """
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

In [16]:
import time
import argparse
import numpy as np

import torch
import torch.nn.functional as F
import torch.optim as optim


# Training hyperparameter configuration
class Args:
    no_cuda = False # Whether to use cuda/gpu
    seed = 42 # Set random seed
    epochs = 200 # number of iterations
    lr = 0.01 # learning rate
    weight_decay = 5e-4 # Learning rate decay
    n_hid = 16 # hidden layer dimension
    dropout = 0.5 # dropout rate

In [17]:
args = Args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

In [18]:
args.cuda

True

In [19]:
# random seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

In [20]:
input_features = features.shape[1]
model = GCN(n_feat=input_features,
            n_hids=[128,32],
            n_class=labels.max().item() + 1,
            dropout=args.dropout)

In [21]:
# Construct an Adam optimizer,
# The parameters to be optimized are the trainable parameters in the GCN model
# Learning rate is set to args.lr
# Learning rate decay is args.weight_decay
optimizer = optim.Adam(model.parameters(),
                       lr=args.lr, 
                       weight_decay=args.weight_decay)

In [22]:

def accuracy(output, labels):
    """
    Accuracy calculation method
    """
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

In [23]:
# Model training function, epoch is the number of iterations
def train(epoch):
    # Record the start time of the epoch iteration
    t = time.time()
    # Mark the GCN model is in train mode
    model.train()
    # In each epoch, you need to clear the previously calculated gradient
    optimizer.zero_grad()
    
    """
    Input the graph network input feature and the transformed adjacency matrix adj into the graph convolutional neural network GCN model, and the output is obtained through forward propagation, 
    which is the predicted probability of the classification category
    """
    
    output = model(features, adj)
    
    
    """
    Find the corresponding output probability and label according to the data index of the training set, 
    and then calculate the loss and accuracy
    """
    
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    acc_train = accuracy(output[idx_train], labels[idx_train])
    
    # # Error back propagation
    loss_train.backward()
    
    # The optimizer starts to optimize the trainable parameters in GCN
    optimizer.step()

    
    """Use the validation set data to verify the epoch training results. 
    The verification process needs to close the train mode and open the eval model"""
    
    model.eval()
    
    # # Same forward propagation
    output = model(features, adj)
    """
    Find the corresponding output probability and label according to the data index of the validation set,
    and then calculate the loss and accuracy
    """
    
    loss_val = F.nll_loss(output[idx_val], labels[idx_val])
    acc_val = accuracy(output[idx_val], labels[idx_val])
    
    # Print all the results and the time required
    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'acc_train: {:.4f}'.format(acc_train.item()),
          'loss_val: {:.4f}'.format(loss_val.item()),
          'acc_val: {:.4f}'.format(acc_val.item()),
          'time: {:.4f}s'.format(time.time() - t))

In [24]:
# Record the start time of model training
t_start = time.time()
# Start iterative training of the GCN model, the number of iterations is set to args.epochs
for epoch in range(args.epochs):
    train(epoch)
    
print("Model training is complete!")
print("Total model training time: {:.4f}s".format(time.time() - t_start))

Epoch: 0001 loss_train: 0.9695 acc_train: 0.6125 loss_val: 0.8638 acc_val: 0.8800 time: 0.0032s
Epoch: 0002 loss_train: 0.7781 acc_train: 0.7625 loss_val: 0.5636 acc_val: 0.9200 time: 0.0018s
Epoch: 0003 loss_train: 0.6431 acc_train: 0.8250 loss_val: 0.3377 acc_val: 0.9200 time: 0.0017s
Epoch: 0004 loss_train: 0.4924 acc_train: 0.8250 loss_val: 0.2369 acc_val: 0.9200 time: 0.0016s
Epoch: 0005 loss_train: 0.4457 acc_train: 0.8375 loss_val: 0.2031 acc_val: 0.9200 time: 0.0016s
Epoch: 0006 loss_train: 0.3984 acc_train: 0.8375 loss_val: 0.1757 acc_val: 0.9200 time: 0.0016s
Epoch: 0007 loss_train: 0.4595 acc_train: 0.8375 loss_val: 0.1661 acc_val: 0.9200 time: 0.0017s
Epoch: 0008 loss_train: 0.3790 acc_train: 0.8125 loss_val: 0.1641 acc_val: 0.8800 time: 0.0016s
Epoch: 0009 loss_train: 0.3260 acc_train: 0.8375 loss_val: 0.1636 acc_val: 0.9200 time: 0.0016s
Epoch: 0010 loss_train: 0.3244 acc_train: 0.8125 loss_val: 0.1728 acc_val: 0.9200 time: 0.0016s
Epoch: 0011 loss_train: 0.3453 acc_train

Epoch: 0124 loss_train: 0.1804 acc_train: 0.9250 loss_val: 0.2209 acc_val: 0.8800 time: 0.0017s
Epoch: 0125 loss_train: 0.1734 acc_train: 0.9250 loss_val: 0.2040 acc_val: 0.9200 time: 0.0013s
Epoch: 0126 loss_train: 0.1634 acc_train: 0.9125 loss_val: 0.1983 acc_val: 0.9200 time: 0.0017s
Epoch: 0127 loss_train: 0.1741 acc_train: 0.9000 loss_val: 0.1991 acc_val: 0.9200 time: 0.0015s
Epoch: 0128 loss_train: 0.1636 acc_train: 0.9125 loss_val: 0.1999 acc_val: 0.9200 time: 0.0014s
Epoch: 0129 loss_train: 0.1635 acc_train: 0.9125 loss_val: 0.2012 acc_val: 0.9200 time: 0.0014s
Epoch: 0130 loss_train: 0.1605 acc_train: 0.9125 loss_val: 0.2030 acc_val: 0.9200 time: 0.0016s
Epoch: 0131 loss_train: 0.1669 acc_train: 0.9000 loss_val: 0.2059 acc_val: 0.9200 time: 0.0013s
Epoch: 0132 loss_train: 0.1693 acc_train: 0.9125 loss_val: 0.2096 acc_val: 0.9200 time: 0.0014s
Epoch: 0133 loss_train: 0.1752 acc_train: 0.9000 loss_val: 0.2063 acc_val: 0.9200 time: 0.0014s
Epoch: 0134 loss_train: 0.1723 acc_train

In [25]:
# Model test function
def test():
    # First mark the model as eval mode
    model.eval()
    """
    # Input the graph network input feature and the transformed adjacency matrix adj into the two-layer graph convolutional neural network GCN model, 
    and the output is obtained through forward propagation, which is the predicted probability of the classification category
    """
    output = model(features, adj)
    """
    Find the corresponding output probability and label according to the data index of the test set, 
    and then calculate the loss and accuracy
    """
    loss_test = F.nll_loss(output[idx_val], labels[idx_val])
    acc_test = accuracy(output[idx_val], labels[idx_val])
    
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test.item()))
    
test()

Test set results: loss= 0.2270 accuracy= 0.8800


In [26]:
features.shape

torch.Size([105, 128])

In [27]:
adj

tensor(indices=tensor([[  0,   1,   2,  ...,  98, 103, 104],
                       [  0,   0,   0,  ..., 104, 104, 104]]),
       values=tensor([0.0909, 0.1176, 0.1818, 0.2857, 0.1538, 0.1818, 0.1818,
                      0.0588, 0.1818, 0.2222, 0.1538, 0.1818, 0.0952, 0.6667,
                      0.2857, 0.1818, 0.1176, 0.0909, 0.2857, 0.1538, 0.1818,
                      0.1176, 0.1111, 0.0690, 0.0952, 0.0571, 0.1818, 0.1818,
                      0.1429, 0.1538, 0.1818, 0.1176, 0.1818, 0.2857, 0.0769,
                      0.1818, 0.2857, 0.1818, 0.1176, 0.1818, 0.1538, 0.0909,
                      0.2857, 1.0000, 0.0526, 0.0465, 0.1818, 0.0408, 0.0513,
                      0.0426, 0.0513, 0.0444, 0.0408, 0.0952, 1.0000, 0.0303,
                      0.0690, 0.0952, 0.0513, 0.0465, 0.0741, 0.0465, 0.0408,
                      0.0513, 0.0465, 0.0606, 0.0426, 0.0488, 0.0513, 0.0444,
                      0.0408, 0.0571, 0.3333, 0.0444, 0.0769, 0.0465, 0.0465,
                  

In [28]:
model(features, adj).shape

torch.Size([105, 3])

In [29]:
labels.shape

torch.Size([105])

In [30]:
input_features

128