In [None]:
from collections import namedtuple

import mxnet as mx
from mxnet import gluon
import numpy as np

import dgl
import dgl.function as fn
from dgl.data import load_data

# Load a small dataset

cora is a small graph of 2708 nodes and 10556 edges.

In [None]:
# `load_data` is handed a small record whose only field, `dataset`, names the
# dataset to load.
DataArgs = namedtuple('DataArgs', 'dataset')
args = DataArgs('cora')
data = load_data(args)

# Print a short summary of what was loaded: graph size, number of label
# classes, and the sum of each split mask (the sample count, assuming the
# masks hold 0/1 entries).
# The header line used to end with a stray "'" that was printed verbatim.
print("""----Data statistics------
      #Nodes %d
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (data.graph.number_of_nodes(),
           data.graph.number_of_edges(),
           data.num_labels,
           data.train_mask.sum(),
           data.val_mask.sum(),
           data.test_mask.sum()))

# Load data into DGL

In [None]:
def _mask_to_nid(mask):
    """Return the indices of the non-zero entries of `mask` as an int64 NDArray."""
    return mx.nd.array(np.nonzero(mask)[0]).astype(np.int64)


# Wrap the loaded graph in a read-only DGLGraph and attach the node features
# and labels as node data.
g = dgl.DGLGraph(data.graph, readonly=True)
g.ndata['features'] = mx.nd.array(data.features)
g.ndata['labels'] = mx.nd.array(data.labels)

# Split masks converted to MXNet arrays.
train_mask, val_mask, test_mask = (
    mx.nd.array(mask)
    for mask in (data.train_mask, data.val_mask, data.test_mask)
)

# Node ids selected by the train / test masks.
train_nid = _mask_to_nid(data.train_mask)
test_nid = _mask_to_nid(data.test_mask)

# Compute node embeddings with a 2-layer GCN in mini-batches

To enable mini-batch training, we need to compute the node embeddings on a subset of nodes. For a 2-layer GCN, we first compute the node embeddings of the neighbors in the first layer of the GCN model.

$$h_i^{(1)} = \sigma(\Sigma_{j \in N(i)} h_j^{(0)}), \forall i \in N(v)$$

Then we compute the node embeddings of the second layer, which are the final embeddings of the nodes in this model.

$$h_v^{(2)} = \sigma(\Sigma_{i \in N(v)} h_i^{(1)})$$

The data and computation dependency is illustrated in the figure below:
![title](https://s3.us-east-2.amazonaws.com/dgl.ai/amlc_tutorial/Dependency.png)

We could use the DGL API from the previous session to implement the mini-batch training.

In [None]:
class NodeUpdate(gluon.Block):
    """Node-apply function: a dense layer followed by an optional activation.

    Parameters
    ----------
    in_feats
        Input size of the dense layer (passed as `in_units`).
    out_feats
        Output size of the dense layer.
    activation
        Optional callable applied to the dense output; skipped when falsy.
    """

    def __init__(self, in_feats, out_feats, activation=None):
        super(NodeUpdate, self).__init__()
        self.dense = gluon.nn.Dense(out_feats, in_units=in_feats)
        self.activation = activation

    def forward(self, node):
        """Transform the 'h' field of `node.data` and return it as {'h': ...}."""
        projected = self.dense(node.data['h'])
        if not self.activation:
            return {'h': projected}
        return {'h': self.activation(projected)}

In [None]:
# Feature sizes: the input width is read from the feature matrix, the hidden
# width is picked by hand.
in_feats = data.features.shape[1]
n_hidden = 64

# The nodes whose embeddings we want to compute.
v = [0, 1, 2]

# One node-update function per GCN layer.
nfunc_layer1 = NodeUpdate(in_feats, n_hidden, mx.nd.relu)
nfunc_layer2 = NodeUpdate(n_hidden, n_hidden, mx.nd.relu)
for node_func in (nfunc_layer1, nfunc_layer2):
    node_func.initialize()

# Layer 1: collect the distinct source nodes of the in-edges of `v` and
# compute their embeddings from the raw features.
in_edge_sources = g.in_edges(v)[0]
neighbors = np.unique(in_edge_sources.asnumpy())
g.pull(neighbors,
       fn.copy_src('features', 'msg'),
       fn.sum('msg', 'h'),
       nfunc_layer1)

# Layer 2: compute the embeddings of the batch nodes from the layer-1 output.
g.pull(v,
       fn.copy_src('h', 'msg'),
       fn.sum('msg', 'h'),
       nfunc_layer2)

The computation above is very expensive because computing the embedding of a single node involves many neighbor nodes. One solution is to use neighbor sampling to prune some of these dependencies.

![title](https://s3.us-east-2.amazonaws.com/dgl.ai/amlc_tutorial/neighbor_sampling.png)

Neighbor sampling requires pruning every neighborhood separately. We could implement neighbor sampling in Python as follows. However, the code below is tedious to write and is also very slow when run on a large graph.

In [None]:
# If we want to compute the embeddings of the nodes 0, 1, 2
nodes = [0, 1, 2]
# Maximum number of in-neighbors kept per node.
num_neighs = 2


def _sample_in_edges(graph, node, max_neighs):
    """Return up to `max_neighs` distinct in-edges of `node` as (src, dst)."""
    src, dst = graph.in_edges(node)
    # Sample without replacement so that the same edge is never picked twice
    # (a repeated edge would presumably be sent, and summed, twice), and never
    # ask for more edges than the node actually has.
    idx = np.random.choice(np.arange(len(src)),
                           size=min(max_neighs, len(src)),
                           replace=False)
    return src[idx], dst[idx]


# Sample neighbors in the neighborhood of each node separately.
for n in nodes:
    # Randomly select neighbors of the target node.
    src2, dst2 = _sample_in_edges(g, n, num_neighs)

    # Sample neighbors of neighbors and compute the first-layer embedding of
    # each sampled neighbor from the raw features.
    for neighbor in src2:
        src1, dst1 = _sample_in_edges(g, neighbor, num_neighs)
        g.send_and_recv((src1, dst1), fn.copy_src('features', 'msg'), fn.sum('msg', 'h'), nfunc_layer1)

    # Second layer: aggregate the sampled neighbors' embeddings into the
    # target node.
    g.send_and_recv((src2, dst2), fn.copy_src('h', 'msg'), fn.sum('msg', 'h'), nfunc_layer2)

# Neighbor sampling and NodeFlow

DGL provides a set of sampling algorithms. These sampling algorithms create NodeFlows for mini-batch training on a graph. The code below creates a neighbor sampler that produces NodeFlows for a 2-layer GCN. Each NodeFlow (mini-batch) has 3 target nodes, and the sampler picks at most `num_neighs` neighbors (2, as set in the previous cell) in each neighborhood.

In [None]:
# Neighbor sampler over `g`: batches of 3 seed nodes, `num_neighs` sampled
# in-neighbors per node, 2 hops.
neighbor_sampler = dgl.contrib.sampling.NeighborSampler(
    g, 3, num_neighs,
    neighbor_type='in',
    num_hops=2)
sampler = iter(neighbor_sampler)

# Take the first NodeFlow the sampler yields.
nf = next(sampler)

### Have a look at the data inside the NodeFlow.

The NodeFlow should have 3 layers and 2 blocks.

![title](https://s3.us-east-2.amazonaws.com/dgl.ai/amlc_tutorial/NodeFlow.png)

In [None]:
# Number of layers in the sampled NodeFlow (displayed as the cell output).
nf.num_layers

In [None]:
# Number of blocks in the sampled NodeFlow (displayed as the cell output).
nf.num_blocks

Nodes in NodeFlow can be identified by two sets of Ids: global node Id (the node Ids in the original graph) and local node Id (used inside this NodeFlow). Local node Ids inside a NodeFlow are labelled starting from 0.

In [None]:
# Ids of the nodes in layer 0 as used inside the NodeFlow ...
local_ids = nf.layer_nid(0)
print(local_ids.asnumpy())
# ... and the ids of the same layer in the parent graph.
parent_ids = nf.layer_parent_nid(0)
print(parent_ids.asnumpy())

The NodeFlow should contain 3 target nodes to compute node embeddings.

In [None]:
# Parent-graph ids of the nodes in the last layer of the NodeFlow, i.e. the
# target nodes of this mini-batch.
nodes = nf.layer_parent_nid(-1).asnumpy()
print("The target nodes in the batch: ", nodes)

We can double-check that the nodes and edges in the NodeFlow are sampled from the parent graph. One way to check this is to verify that the sampled neighbors of the target nodes also appear among the in-neighbors of those target nodes in the parent graph.

In [None]:
# Parent-graph ids of the nodes in layer 1 of the NodeFlow.
layer1_nodes = nf.layer_parent_nid(1).asnumpy()

# Source nodes of all in-edges of the target nodes in the full graph.
in_edge_sources = g.in_edges(nodes)[0]
full_graph_neighbors = in_edge_sources.asnumpy()

print("The nodes in the second layer: ", layer1_nodes)
sorted_neighbors = np.sort(full_graph_neighbors)
print("The in-neighbors of the nodes: ", sorted_neighbors)

### Get data from the parent graph

When a NodeFlow is created from the neighbor sampler, it doesn't contain node data or edge data. We need to explicitly copy data from the parent graph.

In [None]:
# Before any data has been copied from the parent graph, looking up 'labels'
# on a NodeFlow layer is expected to raise KeyError.
try:
    layer0_labels = nf.layers[0].data['labels']
except KeyError:
    print("labels don't exist in node data of the NodeFlow")
else:
    print(layer0_labels)

In [None]:
# Copy the data from the parent graph into the NodeFlow.
nf.copy_from_parent()

# Labels of the last layer as stored in the NodeFlow ...
copied_labels = nf.layers[-1].data['labels']
print(copied_labels.asnumpy())

# ... and the labels of the same nodes read directly from the parent graph.
target_parent_ids = nf.layer_parent_nid(-1)
parent_labels = g.nodes[target_parent_ids].data['labels']
print(parent_labels.asnumpy())

### Trigger the computation on NodeFlow

We can trigger the computation on a NodeFlow with `block_compute`, which performs the computation on one block. To compute the node embeddings of the target nodes, we start the computation at block 0 and propagate to the last layer.

In [None]:
# One node-update function per block: the first maps the input features to
# the hidden size, the remaining ones map hidden to hidden.
nfunc_layers = [NodeUpdate(in_feats, n_hidden, mx.nd.relu)]
nfunc_layers += [NodeUpdate(n_hidden, n_hidden, mx.nd.relu)
                 for _ in range(nf.num_blocks - 1)]
for layer in nfunc_layers:
    layer.initialize()

# Seed the first layer with the input features, then run the blocks in order.
nf.layers[0].data['h'] = nf.layers[0].data['features']
for i, layer in zip(range(nf.num_blocks), nfunc_layers):
    nf.block_compute(i,
                     fn.copy_src('h', 'msg'),
                     fn.sum('msg', 'h'),
                     layer)

# Train GCN with neighbor sampling
In an $L$-layer graph convolution network (GCN), given a graph $G=(V, E)$, represented as an adjacency matrix $A$, with node features $H^{(0)} = X \in \mathbb{R}^{|V| \times d}$, the hidden feature of a node $v$ in $(l+1)$-th layer $h_v^{(l+1)}$ depends on the features of all its neighbors in the previous layer $h_u^{(l)}$:
$$
z_v^{(l+1)} = \sum_{u \in \mathcal{N}(v)} \tilde{A}_{uv} h_u^{(l)} \qquad h_v^{(l+1)} = \sigma ( z_v^{(l+1)} W^{(l)})
$$
where $\mathcal{N}(v)$ is the neighborhood of $v$, $\tilde{A}$ could be any normalized version of $A$ such as $D^{-1} A$ in Kipf et al., $\sigma(\cdot)$ is an activation function, and $W^{(l)}$ is a trainable parameter of the $l$-th layer.

Instead of using all the $L$-hop neighbors of a node $v$, [Hamilton et al.](https://arxiv.org/abs/1706.02216) propose *neighbor sampling*, which randomly samples a few neighbors $\hat{\mathcal{N}}^{(l)}(v)$ to estimate the aggregation $z_v^{(l+1)}$ of its total neighbors $\mathcal{N}(v)$ in $l$-th GCN layer, by an unbiased estimator $\hat{z}_v^{(l+1)}$
$$
\hat{z}_v^{(l+1)} = \frac{\vert \mathcal{N}(v) \vert }{\vert \hat{\mathcal{N}}^{(l)}(v) \vert} \sum_{u \in \hat{\mathcal{N}}^{(l)}(v)} \tilde{A}_{uv} \hat{h}_u^{(l)} \qquad
\hat{h}_v^{(l+1)} = \sigma ( \hat{z}_v^{(l+1)} W^{(l)} )
$$
Let $D^{(l)}$ be the number of neighbors to be sampled for each node at the $l$-th layer,
then the receptive field size of each node can be controlled under $\prod_{l=0}^{L-1} D^{(l)}$ by *neighbor sampling*.

Here we define the GCN model. It applies one `NodeUpdate` node UDF (a fully-connected layer) per block of the NodeFlow:

In [None]:
class GCNSampling(gluon.Block):
    """GCN evaluated on a sampled NodeFlow, with one `NodeUpdate` per block.

    Parameters
    ----------
    in_feats
        Size of the input node features.
    n_hidden
        Output size of every block except the last.
    n_classes
        Output size of the last block.
    n_blocks
        Total number of blocks (input + hidden + output).
    activation
        Activation used by every block except the output block.
    dropout
        Dropout probability applied to each block's input; falsy disables it.
    **kwargs
        Forwarded to `gluon.Block` (e.g. `prefix`).
    """

    def __init__(self,
                 in_feats,
                 n_hidden,
                 n_classes,
                 n_blocks,
                 activation,
                 dropout,
                 **kwargs):
        super(GCNSampling, self).__init__(**kwargs)
        self.dropout = dropout
        with self.name_scope():
            self.blocks = gluon.nn.Sequential()
            # First block: input features -> hidden.
            self.blocks.add(NodeUpdate(in_feats, n_hidden, activation))
            # Middle blocks: hidden -> hidden (none when n_blocks <= 2).
            for _ in range(n_blocks - 2):
                self.blocks.add(NodeUpdate(n_hidden, n_hidden, activation))
            # Last block: hidden -> class scores, no activation.
            self.blocks.add(NodeUpdate(n_hidden, n_classes))

    def forward(self, nf):
        """Run all blocks over the NodeFlow `nf` and return the last layer's 'h'.

        `nf` must already carry 'features' on its first layer. The 'h' field
        is removed from the last layer's data before being returned.
        """
        def mean_reduce(node):
            # Average the incoming messages over the mailbox axis.
            return {'h': node.mailbox['m'].mean(axis=1)}

        nf.layers[0].data['h'] = nf.layers[0].data['features']
        for idx, update in enumerate(self.blocks):
            # Optionally apply dropout to this block's input before it is sent.
            feats = nf.layers[idx].data['h']
            if self.dropout:
                feats = mx.nd.Dropout(feats, p=self.dropout)
            nf.layers[idx].data['h'] = feats
            # Block `idx` sends 'h' of layer `idx` along its edges, averages
            # the messages at the receiving nodes, and applies `update` to
            # the result.
            nf.block_compute(idx,
                             fn.copy_src(src='h', out='m'),
                             mean_reduce,
                             update)
        return nf.layers[-1].data.pop('h')

The code below puts everything together to train a 2-layer GCN in mini-batches with neighbor sampling.

In [None]:
# --- Hyperparameters ---
dropout = 0.2        # dropout probability
batch_size = 100     # number of seed nodes per mini-batch
num_neighbors = 4    # number of neighbors to sample
num_epochs = 30      # number of passes over the training nodes

# Number of target classes for node classification.
n_classes = data.num_labels
# Number of GCN layers.
L = 2

# --- Model and loss ---
model = GCNSampling(in_feats=in_feats,
                    n_hidden=n_hidden,
                    n_classes=n_classes,
                    n_blocks=L,
                    activation=mx.nd.relu,
                    dropout=dropout,
                    prefix='GCN')
model.initialize()
loss_fcn = gluon.loss.SoftmaxCELoss()

# --- Optimizer: Adam without weight decay ---
optimizer_params = {'learning_rate': 0.03, 'wd': 0}
trainer = gluon.Trainer(model.collect_params(), 'adam', optimizer_params)

# Labels of all nodes, indexed by parent-graph node id.
labels = g.ndata['labels']

In [None]:
for epoch in range(num_epochs):
    # A fresh, shuffled sampler over the training nodes for every epoch.
    epoch_sampler = dgl.contrib.sampling.NeighborSampler(
        g, batch_size, num_neighbors,
        neighbor_type='in',
        shuffle=True,
        num_hops=L,
        seed_nodes=train_nid)
    for nf in epoch_sampler:
        # A NodeFlow coming out of the sampler carries only topology, so the
        # data (e.g. input node features) has to be copied from the parent
        # graph explicitly.
        nf.copy_from_parent()

        # Labels of this batch's target nodes (the last layer of the NodeFlow).
        batch_nids = nf.layer_parent_nid(-1).astype('int64')
        batch_labels = labels[batch_nids]

        with mx.autograd.record():
            # Forward pass and cross-entropy loss averaged over the targets.
            pred = model(nf)
            per_node_loss = loss_fcn(pred, batch_labels)
            loss = per_node_loss.sum() / len(batch_nids)

        # Backward pass.
        loss.backward()
        # The loss is already averaged above, hence batch_size=1 here.
        trainer.step(batch_size=1)
        print("Epoch[{}]: loss {}".format(epoch, loss.asscalar()))