In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

### Base class: messange passing

In [2]:
import sys
import inspect
#  provides several useful functions to help get information about live objects such as modules, classes, methods,
#     functions, tracebacks, frame objects, and code objects.

import torch
from torch_geometric.utils import scatter_

special_args = [
    'edge_index', 'edge_index_i', 'edge_index_j', 'size', 'size_i', 'size_j'
]
__size_error_msg__ = ('All tensors which should get mapped to the same source '
                      'or target nodes must be of same size in dimension 0.')

is_python2 = sys.version_info[0] < 3
getargspec = inspect.getargspec if is_python2 else inspect.getfullargspec

class MessagePassing(torch.nn.Module):
    r"""Base class for creating message passing layers

    .. math::
        \mathbf{x}_i^{\prime} = \gamma_{\mathbf{\Theta}} \left( \mathbf{x}_i,
        \square_{j \in \mathcal{N}(i)} \, \phi_{\mathbf{\Theta}}
        \left(\mathbf{x}_i, \mathbf{x}_j,\mathbf{e}_{i,j}\right) \right),

    where :math:`\square` denotes a differentiable, permutation invariant
    function, *e.g.*, sum, mean or max, and :math:`\gamma_{\mathbf{\Theta}}`
    and :math:`\phi_{\mathbf{\Theta}}` denote differentiable functions such as
    MLPs.
    See `here <https://pytorch-geometric.readthedocs.io/en/latest/notes/
    create_gnn.html>`__ for the accompanying tutorial.

    Args:
        aggr (string, optional): The aggregation scheme to use
            (:obj:`"add"`, :obj:`"mean"` or :obj:`"max"`).
            (default: :obj:`"add"`)
        flow (string, optional): The flow direction of message passing
            (:obj:`"source_to_target"` or :obj:`"target_to_source"`).
            (default: :obj:`"source_to_target"`)
    """

    def __init__(self, aggr='add', flow='source_to_target'):
        super(MessagePassing, self).__init__()

        self.aggr = aggr
        assert self.aggr in ['add', 'mean', 'max']

        self.flow = flow
        assert self.flow in ['source_to_target', 'target_to_source']
        
        # the self.message is just hte member function
#         Get the names and default values of a Python function’s parameters. A named tuple is returned:
# FullArgSpec(args, varargs, varkw, defaults, kwonlyargs, kwonlydefaults, annotations)
        self.__message_args__ = getargspec(self.message)[0][1:]  # get the arguments of the self.message except: x_j
        self.__special_args__ = [(i, arg)
                                 for i, arg in enumerate(self.__message_args__)
                                 if arg in special_args]
        # !!! exclude the special arguments
        self.__message_args__ = [
            arg for arg in self.__message_args__ if arg not in special_args
        ]
        self.__update_args__ = getargspec(self.update)[0][2:]    # we skip the edge_index and size args in propogate

    def propagate(self, edge_index, size=None, **kwargs):
        r"""The initial call to start propagating messages.

        Args:
            edge_index (Tensor): The indices of a general (sparse) assignment
                matrix with shape :obj:`[N, M]` (can be directed or
                undirected).
            size (list or tuple, optional): The size :obj:`[N, M]` of the
                assignment matrix. If set to :obj:`None`, the size is tried to
                get automatically inferred. (default: :obj:`None`)
            **kwargs: Any additional data which is needed to construct messages
                and to update node embeddings.
        """
        # some candiates for the kwargs: may be test_mask
        dim = 0
        size = [None, None] if size is None else list(size)
        assert len(size) == 2

        i, j = (0, 1) if self.flow == 'target_to_source' else (1, 0)
        ij = {"_i": i, "_j": j}
        
        # collect the message arguments
        message_args = []
        for arg in self.__message_args__:
            if arg[-2:] in ij.keys():   # check the suffix
                tmp = kwargs.get(arg[:-2], None)
                if tmp is None:  # pragma: no cover
                    message_args.append(tmp)
                else:
                    idx = ij[arg[-2:]]
                    if isinstance(tmp, tuple) or isinstance(tmp, list):
                        assert len(tmp) == 2
                        if tmp[1 - idx] is not None:
                            if size[1 - idx] is None:
                                size[1 - idx] = tmp[1 - idx].size(dim)
                            if size[1 - idx] != tmp[1 - idx].size(dim):
                                raise ValueError(__size_error_msg__)
                        tmp = tmp[idx]

                    if tmp is None:
                        message_args.append(tmp)
                    else:
                        if size[idx] is None:
                            size[idx] = tmp.size(dim)
                        if size[idx] != tmp.size(dim):
                            raise ValueError(__size_error_msg__)

                        tmp = torch.index_select(tmp, dim, edge_index[idx])
                        message_args.append(tmp)
            else:
                message_args.append(kwargs.get(arg, None))
        # either one of the size dimension is None, we make it the (N, N)
        size[0] = size[1] if size[0] is None else size[0]
        size[1] = size[0] if size[1] is None else size[1]

        kwargs['edge_index'] = edge_index
        kwargs['size'] = size

        for (idx, arg) in self.__special_args__:
            if arg[-2:] in ij.keys():
                message_args.insert(idx, kwargs[arg[:-2]][ij[arg[-2:]]])
            else:
                message_args.insert(idx, kwargs[arg])

        update_args = [kwargs[arg] for arg in self.__update_args__]

        out = self.message(*message_args)
        # aggreates all values from the src
        '''
         scatter_(name, src, index, dim_size=None)[source]

            Aggregates all values from the src tensor at the indices specified in the index tensor along the first dimension. If multiple 
            indices reference the same location, their contributions are aggregated according to name (either "add", "mean" or "max").
                Parameters:	

                name (string) – The aggregation to use ("add", "mean", "max").
                src (Tensor) – The source tensor.
                index (LongTensor) – The indices of elements to scatter.
                dim_size (int, optional) – Automatically create output tensor with size dim_size in the first dimension. 
                If set to None, a minimal sized output tensor is returned. (default: None)
        '''
        
        '''
        from source to the target: i == 1, therefore, the output tensor is in batches of column size
         scatter_(name, src, index, dim_size=None)[source]
            Aggregates all values from the src tensor at the indices specified in the index tensor along the first dimension. 
            If multiple indices reference the same location, their contributions are aggregated according to name (either "add", "mean" or "max").
        here we take the dest of the edge as the index in the scatter
         out = op(src, index, 0, None, dim_size, fill_value)
        the default dim will be 0
        '''
        out = scatter_(self.aggr, out, edge_index[i], dim_size=size[i])
        
        out = self.update(out, *update_args)
        # at last return the updated arguments
        return out

#     when the derivative class which inherits from this base class can override this function by providing more arguments
    def message(self, x_j):  # pragma: no cover
        r"""Constructs messages in analogy to :math:`\phi_{\mathbf{\Theta}}`
        for each edge in :math:`(i,j) \in \mathcal{E}`.
        Can take any argument which was initially passed to :meth:`propagate`.
        In addition, features can be lifted to the source node :math:`i` and
        target node :math:`j` by appending :obj:`_i` or :obj:`_j` to the
        variable name, *.e.g.* :obj:`x_i` and :obj:`x_j`."""

        return x_j

    def update(self, aggr_out):  # pragma: no cover
        r"""Updates node embeddings in analogy to
        :math:`\gamma_{\mathbf{\Theta}}` for each node
        :math:`i \in \mathcal{V}`.
        Takes in the output of aggregation as first argument and any argument
        which was initially passed to :meth:`propagate`."""

        return aggr_out

In [28]:
import inspect

def message(a=3, b=4):
    return a+b

print(inspect.getfullargspec(message))
print(inspect.getargspec(message)[0][2:])
a = [2, 3]
# b = *a


FullArgSpec(args=['a', 'b'], varargs=None, varkw=None, defaults=(3, 4), kwonlyargs=[], kwonlydefaults=None, annotations={})
[]


  import sys


### Use messagepassing to contruct the GCNConv (informal of GCNConv definition)

In [None]:
import torch
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class GCNConv(MessagePassing):
    def __init__(self, in_channels, out_channels):
        # this calling of the super function is just old fasioned
        # in python3: we can use the super().__init__(args)
        super(GCNConv, self).__init__(aggr='add')  # "Add" aggregation.
        # weight is gienerated by  self.weight = Parameter(torch.Tensor(out_features, in_features))
        self.lin = torch.nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index):
        # x has shape [N, in_channels], which is the matrix
        # edge_index has shape [2, E]

        # Step 1: Add self-loops to the adjacency matrix.
        # add extra columns of node inces and igore the edge_weights as _
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
        
        # Step 2: Linearly transform node feature matrix.
        # this should be where we apply the weight
        x = self.lin(x)

        # Step 3-5: Start propagating messages.
        return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x)

    # msg here are all the special messages
    def message(self, x_j, edge_index, size):
        # x_j has shape [E, out_channels], still a tensor, this E should contain the self loops

        # Step 3: Normalize node features.
        # row is the tensor of the source nodes while col is the dest nodes in an edge
        row, col = edge_index
        # size(0) should be the number of row in the assignment matrix
        # since row is the source node, the degree calculates the out degree of each node
        deg = degree(row, size[0], dtype=x_j.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]
        # view the columns as one columns. (E by 1)
        # this is not GEMM, this is the element wise, each feature values is normalized by the square root of both out-degree and in-degree
        return norm.view(-1, 1) * x_j

    def update(self, aggr_out):
        # aggr_out has shape [N, out_channels]

        # Step 5: Return new node embeddings.
        return aggr_out

In [35]:
import torch
# Test the random initialization of the Tensor
a = torch.Tensor(2, 3)
print(a)
a = torch.Tensor(2, 3)
print(a)
# for a 2-D tensor, we can assign them by the first dimension
b = torch.Tensor([[3, 4], [5, 6]])
m, n = b
print(b, b.shape, '\n', m, m.shape, n, n.shape)
print(norm.view(-1, 1) )
trial_x = torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8], [8, 7], [6, 5], [4, 3], [2, 1]], dtype=torch.float)
print(norm.view(-1, 1) * trial_x)

tensor([[8.5305e+02, 1.8515e+28, 3.0866e+29],
        [1.1675e-10, 8.9683e-44, 0.0000e+00]])
tensor([[-6.9565e+33,  3.0880e-41, -3.4440e+35],
        [ 3.0880e-41,  7.1426e+05,  6.2706e+22]])
tensor([[3., 4.],
        [5., 6.]]) torch.Size([2, 2]) 
 tensor([3., 4.]) torch.Size([2]) tensor([5., 6.]) torch.Size([2])
tensor([[16.],
        [16.],
        [16.],
        [16.],
        [ 4.],
        [ 4.],
        [ 4.],
        [ 4.]])
tensor([[ 16.,  32.],
        [ 48.,  64.],
        [ 80.,  96.],
        [112., 128.],
        [ 32.,  28.],
        [ 24.,  20.],
        [ 16.,  12.],
        [  8.,   4.]])


In [32]:
from torch_geometric.utils import degree
a = torch.tensor([2, 1, 3, 2, 1, 5, 3, 4])
b = torch.tensor([1, 2, 2, 3, 5, 1, 4, 3])
deg = degree(a, 6, torch.float)
deg_square = deg.pow(2)
print( deg, type(deg), deg.shape, deg_square)

print(deg_square[a], deg_square[a].shape, '\n', deg_square[b], deg_square[b].shape)
norm = deg_square[a] * deg_square[b]
print(norm, norm.shape)

tensor([0., 2., 2., 2., 1., 1.]) <class 'torch.Tensor'> torch.Size([6]) tensor([0., 4., 4., 4., 1., 1.])
tensor([4., 4., 4., 4., 4., 1., 4., 1.]) torch.Size([8]) 
 tensor([4., 4., 4., 4., 1., 4., 1., 4.]) torch.Size([8])
tensor([16., 16., 16., 16.,  4.,  4.,  4.,  4.]) torch.Size([8])


#### torch_geometric.utils :  add_self_loops, degree

In [None]:
def add_self_loops(edge_index, edge_weight=None, fill_value=1, num_nodes=None):
    r"""Adds a self-loop :math:`(i,i) \in \mathcal{E}` to every node
    :math:`i \in \mathcal{V}` in the graph given by :attr:`edge_index`.
    In case the graph is weighted, self-loops will be added with edge weights
    denoted by :obj:`fill_value`.

    Args:
        edge_index (LongTensor): The edge indices.
        edge_weight (Tensor, optional): One-dimensional edge weights.
            (default: :obj:`None`)
        fill_value (int, optional): If :obj:`edge_weight` is not :obj:`None`,
            will add self-loops with edge weights of :obj:`fill_value` to the
            graph. (default: :obj:`1`)
        num_nodes (int, optional): The number of nodes, *i.e.*
            :obj:`max_val + 1` of :attr:`edge_index`. (default: :obj:`None`)

    :rtype: (:class:`LongTensor`, :class:`Tensor`)
    """
    num_nodes = maybe_num_nodes(edge_index, num_nodes)
    
    '''
    Returns a 1-D tensor of size (end-start)/step with values from the interval [start, end) taken with common difference step beginning from start.
    '''
    loop_index = torch.arange(0, num_nodes, dtype=torch.long, device=edge_index.device)
    
    '''
    unsqueeze() :
        Returns a new tensor with a dimension of size one inserted at the specified position.
        The returned tensor shares the same underlying data with this tensor.
        
    tensor.repeat(shape):
    repeat the tnesor with shape (m, n) matrix
    Here, we have two rows of the same node ([[i], [i]])
    '''
    loop_index = loop_index.unsqueeze(0).repeat(2, 1)

    if edge_weight is not None:
        # torch.numel(input):  return the total number of elements in the input tnesor
        assert edge_weight.numel() == edge_index.size(1)   # check whether the number of of weight and # of edges match
#         new_full(size, fill_value, dtype=None, device=None, requires_grad=False) → Tensor
#         Returns a Tensor of size size filled with fill_value. By default, the returned Tensor has the same torch.dtype and torch.device as this tensor.
        loop_weight = edge_weight.new_full((num_nodes, ), fill_value)
        # cancatenate tensors in the 0 (row) dimension
        edge_weight = torch.cat([edge_weight, loop_weight], dim=0)

    edge_index = torch.cat([edge_index, loop_index], dim=1)
    # at last add the num_nodes columns to the end of edge_index
    return edge_index, edge_weight

In [13]:
a = torch.tensor([2, 3, 4])
print(a.shape, a.unsqueeze(0), a.unsqueeze(0).shape)
b = torch.tensor([[2, 3, 4], [5, 6, 7]])
print(b.shape,'\n', b.unsqueeze(0).shape,'\n', b.unsqueeze(1).shape)

aa = a.unsqueeze(0)
print(a.repeat(2, 1),'\n', aa.repeat(2,1))
c =  aa.repeat(2,1)
# self loop to modify the edge_index
print(torch.cat([b, c], dim=1))

torch.Size([3]) tensor([[2, 3, 4]]) torch.Size([1, 3])
torch.Size([2, 3]) 
 torch.Size([1, 2, 3]) 
 torch.Size([2, 1, 3])
tensor([[2, 3, 4],
        [2, 3, 4]]) 
 tensor([[2, 3, 4],
        [2, 3, 4]])
tensor([[2, 3, 4, 2, 3, 4],
        [5, 6, 7, 2, 3, 4]])


### Formal definition of GCNConv layers

In [14]:
import torch
from torch.nn import Parameter
from torch_scatter import scatter_add
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import add_remaining_self_loops

# from ..inits import glorot, zeros  # This two imports are replaced by the source functions:
import math
def glorot(tensor):
    if tensor is not None:
        stdv = math.sqrt(6.0 / (tensor.size(-2) + tensor.size(-1)))
        '''
         uniform_(from=0, to=1) → Tensor
            Fills self tensor with numbers sampled from the continuous uniform distribution:
            P(x)=1/(to - from)
        '''
        # very similar to how we initialize the weight of the neural networks
        tensor.data.uniform_(-stdv, stdv)
        
def zeros(tensor):
    if tensor is not None:
        tensor.data.fill_(0)

class GCNConv(MessagePassing):
    
    def __init__(self, in_channels, out_channels, improved=False, cached=False,
                 bias=True, **kwargs):
        super(GCNConv, self).__init__(aggr='add', **kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.improved = improved  # Can be A+2I for improvements
        self.cached = cached   
        
        '''
        torch.nn.Parameter
            A kind of Tensor that is to be considered a module parameter.
            Parameters are Tensor subclasses, that have a very special property when used with Module s - when they’re assigned as Module attributes
            they are automatically added to the list of its parameters, and will appear e.g. in parameters() iterator.
        '''
        self.weight = Parameter(torch.Tensor(in_channels, out_channels))

        if bias:
            # random bias for the output
            self.bias = Parameter(torch.Tensor(out_channels))  # this Parameter() func will implicitly call the register function                
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        glorot(self.weight)   # at first self.weight is a random generated tensor, here we use a specific continous uniform distribution for weights
        zeros(self.bias)      
        self.cached_result = None
        self.cached_num_edges = None

    # use the python decorator 
    @staticmethod
    def norm(edge_index, num_nodes, edge_weight=None, improved=False, dtype=None):
        if edge_weight is None:
#             1-D tensor, eah weight is for each edge
            edge_weight = torch.ones((edge_index.size(1), ), dtype=dtype, device=edge_index.device)
        
        # fill value will only define the missing self-loop-edges weights
        fill_value = 1 if not improved else 2  # diagonal elements
        '''
        Adds remaining self-loop (i,i)∈E to every node i∈V in the graph given by edge_index. In case the graph is weighted and already contains a few self-loops, 
        only non-existent self-loops will be added with edge weights denoted by fill_value.
        '''
        edge_index, edge_weight = add_remaining_self_loops(edge_index, edge_weight, fill_value, num_nodes)
    
        row, col = edge_index   # row stores the source nodes while col stores the destination nodes
        deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        # return the added self-loops edges and the normlaized edge weights
        return edge_index, deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col]

    def forward(self, x, edge_index, edge_weight=None):
        """"""
        # these weight is for the node features
        x = torch.matmul(x, self.weight)   # self.weight shape: (in_channels, out_channels)
        # after this, x already became E by out_channels matrix
        if self.cached and self.cached_result is not None:
            if edge_index.size(1) != self.cached_num_edges:
                raise RuntimeError(
                    'Cached {} number of edges, but found {}. Please '
                    'disable the caching behavior of this layer by removing '
                    'the `cached=True` argument in its constructor.'.format(
                        self.cached_num_edges, edge_index.size(1)))

        if not self.cached or self.cached_result is None:
            self.cached_num_edges = edge_index.size(1)
            edge_index, norm = self.norm(edge_index, x.size(0), edge_weight, self.improved, x.dtype)
            # store the temp results as the tuple
            self.cached_result = edge_index, norm

        edge_index, norm = self.cached_result
        # function calling
        # inside the propagate function, the final used message_args as the list for out:
        '''
     1) first out returns the message_args list:
        a) for kwargs in propagate function:
        1. add  x_j:
             tmp = x not None  (x is a tensor)
             idx = j which is 0
             
             inferred: size[0] = x.size(0)
             tmp = torch.index_select(tmp, 0, edge_index[0])
                The finla tmp will be E by in_channel matrix, E is the number of edges including the self_loops
             message_args.append(tmp)
             
        2. message_args.append(kwargs.get(norm) )
        3. add kwargs['edge_index'] = edge_index; kwargs['size'] = size
        
        b) out = self.message(*message_args):
        Normalize all the features, return a E by out_channel matrix
        '''
        # inside: torch.index_select(input, dim, index, out=None) → Tensor
#         Returns a new tensor which indexes the input tensor along dimension dim using the entries in index which is a LongTensor.
#         The returned tensor has the same number of dimensions as the original tensor (input). 
#         The dimth dimension has the same size as the length of index; other dimensions have the same size as in the original tensor.
        
        '''
    2) second out use the scatter function to call the aggregation
        
        self.aggr = add ; i = 1
        out = scatter_(self.aggr, out, edge_index[i], dim_size=size[i])
            Still E by out_channels, redistribute the weighted feature values of each node to the target nodes
            
        '''
    '''
    3) out = self.update(out, *update_args)
        
        out = self.update(out, *update_args)
            may add bias in the self.update function
    
    '''
    # aggreates all values from the src
        '''
         scatter_(name, src, index, dim_size=None)[source]

            Aggregates all values from the src tensor at the indices specified in the index tensor along the first dimension. If multiple 
            indices reference the same location, their contributions are aggregated according to name (either "add", "mean" or "max").
                Parameters:	

                name (string) – The aggregation to use ("add", "mean", "max").
                src (Tensor) – The source tensor.
                index (LongTensor) – The indices of elements to scatter.
                dim_size (int, optional) – Automatically create output tensor with size dim_size in the first dimension. 
                If set to None, a minimal sized output tensor is returned. (default: None)
        '''
        
        '''
        from source to the target: i == 1, therefore, the output tensor is in batches of column size
         scatter_(name, src, index, dim_size=None)[source]
            Aggregates all values from the src tensor at the indices specified in the index tensor along the first dimension. 
            If multiple indices reference the same location, their contributions are aggregated according to name (either "add", "mean" or "max").
        here we take the dest of the edge as the index in the scatter
         out = op(src, index, 0, None, dim_size, fill_value)
        the default dim will be 0
        '''
    
        # progate three msg: edge_index (including self-loops), weighted node-feature matrix, normlized weights of edges
        return self.propagate(edge_index, x=x, norm=norm)
    
    # according to the base class: messagepassing
    ''' x_j and norm are both the message_args, they will be append back in the same order in the propagate func '''
    def message(self, x_j, norm):
        # x_j is of the shape: (E, out_channel), E is the number of the edges with self-loops
        return norm.view(-1, 1) * x_j    # E by 1 (single column) multiply (E by out_channel), element-wise normalization

    def update(self, aggr_out):
        if self.bias is not None:
            aggr_out = aggr_out + self.bias   # bet this also the element-wise tensor op
        return aggr_out

    def __repr__(self):
        return '{}({}, {})'.format(self.__class__.__name__, self.in_channels,
                                   self.out_channels)

In [28]:
a = torch.tensor([[2, 3, 4], [4, 5, 6]], dtype=torch.float)
print(a.size(), a.size(-2), a.size(-1))
# uniform distriubtion for a tensor
# a = a.cuda()
a.data.uniform_(0., 10.)
print(a)


torch.Size([2, 3]) 2 3
tensor([[3.2241, 2.3083, 9.0218],
        [9.8002, 8.3985, 2.1255]])


In [37]:
a = torch.Tensor(5)
print(a)

tensor([-6.7384e+34,  3.0880e-41, -9.0104e+35,  3.0880e-41,  8.9683e-44])


<font color=orange|green>

**What is a static method?**

Static methods, much like class methods, are methods that are bound to a class rather than its object.

They do not require a class instance creation. So, are not dependent on the state of the object.

The difference between a static method and a class method is:

    Static method knows nothing about the class and just deals with the parameters.
    Class method works with the class since its parameter is always the class itself.

They can be called both by the class and its object.

Class.staticmethodFunc()
or even
Class().staticmethodFunc()

## Costumized GCNConv based on Customized messagepassing

In [25]:
import sys
import inspect

import torch
from torch_geometric.utils import scatter_

special_args = ['edge_index', 'edge_index_i', 'edge_index_j', 'size', 'size_i', 'size_j']

__size_error_msg__ = ('All tensors which should get mapped to the same source '
                      'or target nodes must be of same size in dimension 0.')

is_python2 = sys.version_info[0] < 3
getargspec = inspect.getargspec if is_python2 else inspect.getfullargspec


class custom_MessagePassing(torch.nn.Module):

    def __init__(self, aggr='add', flow='source_to_target'):
        super(custom_MessagePassing, self).__init__()

        self.aggr = aggr
        assert self.aggr in ['add', 'mean', 'max']

        self.flow = flow
        assert self.flow in ['source_to_target', 'target_to_source']

        self.__message_args__ = getargspec(self.message)[0][1:]
        self.__special_args__ = [(i, arg)
                                 for i, arg in enumerate(self.__message_args__)
                                 if arg in special_args]
        self.__message_args__ = [
            arg for arg in self.__message_args__ if arg not in special_args
        ]
        self.__update_args__ = getargspec(self.update)[0][2:]

    def propagate(self, edge_index, size=None, **kwargs):
        print('='*200)
        print('Start output the info from the propagation function from messagepassing class (inherited by GCNConv):')
        dim = 0
        size = [None, None] if size is None else list(size)
        assert len(size) == 2

        i, j = (0, 1) if self.flow == 'target_to_source' else (1, 0)
        ij = {"_i": i, "_j": j}

        message_args = []
        for arg in self.__message_args__:
            if arg[-2:] in ij.keys():
                tmp = kwargs.get(arg[:-2], None)
                if tmp is None:  # pragma: no cover
                    message_args.append(tmp)
                else:
                    idx = ij[arg[-2:]]
                    if isinstance(tmp, tuple) or isinstance(tmp, list):
                        assert len(tmp) == 2
                        if tmp[1 - idx] is not None:
                            if size[1 - idx] is None:
                                size[1 - idx] = tmp[1 - idx].size(dim)
                            if size[1 - idx] != tmp[1 - idx].size(dim):
                                raise ValueError(__size_error_msg__)
                        tmp = tmp[idx]

                    if tmp is None:
                        message_args.append(tmp)
                    else:
                        if size[idx] is None:
                            size[idx] = tmp.size(dim)
                        if size[idx] != tmp.size(dim):
                            raise ValueError(__size_error_msg__)

                        tmp = torch.index_select(tmp, dim, edge_index[idx])
                        message_args.append(tmp)
            else:
                message_args.append(kwargs.get(arg, None))

        size[0] = size[1] if size[0] is None else size[0]
        size[1] = size[0] if size[1] is None else size[1]

        kwargs['edge_index'] = edge_index
        kwargs['size'] = size

        for (idx, arg) in self.__special_args__:
            if arg[-2:] in ij.keys():
                message_args.insert(idx, kwargs[arg[:-2]][ij[arg[-2:]]])
            else:
                message_args.insert(idx, kwargs[arg])

        update_args = [kwargs[arg] for arg in self.__update_args__]
        print('update_args: ', update_args)
        print('kwargs include the args: ', kwargs.keys(), '\n')
        print('message_args during propagation for each convolution step contains: ')
        # assume all the elements inside the message are tensor
        for idx, val in enumerate(message_args):
            print('Number ', idx, ' val type: ', type(message_args[idx]), ' val shape: ', message_args[idx].shape)
        
        
        print('\n call the message function inside the GCNConv (normalize the feature according to in- or out- degree of each nodes): ')
        out = self.message(*message_args)
        print('type and shape of the embedding after normalization based on in-dgree and out-degree of each node: ', type(out), out.shape, '\n')
        
        print('\n Step-4: call the scatter_ function (aggregates the feature values of source nodes into target nodes): ')
        out = scatter_(self.aggr, out, edge_index[i], dim_size=size[i])
        print('type and shape of embedding after scattering: ', type(out), out.shape, '\n')
        
        print('call the update function (may add the bias for GCNConv, default all zeros): ')
        out = self.update(out, *update_args)
        print('type and shape of embedding after udpating (may add bias): ', type(out), out.shape, '\n')
        
        print('\n Step-5: return the new node embeddings: number_of_nodes by out_channels tensor. ')
        print('End of the Info from the propagation function in messagepassing class ')
        print('='*200)
        return out

    def message(self, x_j):  # pragma: no cover
        return x_j

    def update(self, aggr_out):  # pragma: no cover
        return aggr_out


In [26]:
from torch.nn import Parameter
from torch_scatter import scatter_add
from torch_geometric.utils import add_remaining_self_loops
import math

def glorot(tensor):
    if tensor is not None:
        stdv = math.sqrt(6.0 / (tensor.size(-2) + tensor.size(-1)))
        tensor.data.uniform_(-stdv, stdv)
        
def zeros(tensor):
    if tensor is not None:
        tensor.data.fill_(0)

class custom_GCNConv(custom_MessagePassing):
    '''
        Here the __init__ will only be called once, when it is first created. Then each time we use the instance such as conv1,
        we actually are calling the forward function. 
        Therefore, we are keep updating the self.weight and the self.bias , which are the two model parameters for each GCNConv layer
    '''
    def __init__(self, in_channels, out_channels, improved=False, cached=False,
                 bias=True, **kwargs):
        super().__init__(aggr='add', **kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.improved = improved
        self.cached = cached

        self.weight = Parameter(torch.Tensor(in_channels, out_channels))

        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        glorot(self.weight)
        zeros(self.bias)
        self.cached_result = None
        self.cached_num_edges = None


    @staticmethod
    def norm(edge_index, num_nodes, edge_weight=None, improved=False, dtype=None):
        print('Inside the norm function: ')
        
        if edge_weight is None:
            edge_weight = torch.ones((edge_index.size(1), ), dtype=dtype, device=edge_index.device)
        print('type and shape of edge_weights (default to be all 1): ', type(edge_weight), edge_weight.shape)
        
        fill_value = 1 if not improved else 2
        
        print('\n Step-2: Add remaining self loops to the edge_index:')
        edge_index, edge_weight = add_remaining_self_loops(
            edge_index, edge_weight, fill_value, num_nodes)
        print('type and shape of updated edge_index', type(edge_index), edge_index.shape)
        print('type and shape of updated edge weights', type(edge_weight), edge_weight.shape, '\n')
        
        row, col = edge_index
        deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        print('type and shape of out-degree of each node:', type(deg[row]), deg[row].shape)
        print('type and shape of in-degree of each node:', type(deg[col]), deg[col].shape, '\n')
        
        return edge_index, deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col]

    def forward(self, x, edge_index, edge_weight=None):
        """"""
        print('start calling the forward of the GCNConv: ')
        print('type and shape of the node feature matrix: ', type(x), x.shape)
        print('type and shape of the edge_index matrix in COO format: ', type(x), x.shape)
        x = torch.matmul(x, self.weight)
        
        print('\n Step-1: Initialize the weights (continuous uniform distribution) and use it to linearly transform feature matrix: ')
        print('type and shape of the node feature matrix after linear transformation', type(x), x.shape, '\n')
        
        if self.cached and self.cached_result is not None:
            if edge_index.size(1) != self.cached_num_edges:
                raise RuntimeError(
                    'Cached {} number of edges, but found {}. Please '
                    'disable the caching behavior of this layer by removing '
                    'the `cached=True` argument in its constructor.'.format(
                        self.cached_num_edges, edge_index.size(1)))
        
        print('%' * 200)
        print('Call the norm function inside forward (add self loops (i, i) edges to the edge_index and compute nomalization constants): ')
        if not self.cached or self.cached_result is None:
            self.cached_num_edges = edge_index.size(1)
            edge_index, norm = self.norm(edge_index, x.size(0), edge_weight,
                                         self.improved, x.dtype)
            self.cached_result = edge_index, norm

        edge_index, norm = self.cached_result
        print('End of calling the norm function')
        print('%' * 200)
        print()
        
        print('Start calling the propagationg function inside the GCNConv forward func: ')
        return self.propagate(edge_index, x=x, norm=norm)

    def message(self, x_j, norm):
        print('\n Step-3: normalize the node feature, normalization constants are sqrt(in-dgree)*sqrt(out-dgree) of each node (inside the message function)')
        return norm.view(-1, 1) * x_j

    def update(self, aggr_out):
        if self.bias is not None:
            aggr_out = aggr_out + self.bias
        return aggr_out

    def __repr__(self):
        return '{}({}, {})'.format(self.__class__.__name__, self.in_channels,
                                   self.out_channels)

### Prepare the dataset for input

In [6]:
'''Allow additional attributes of the instance to be set by defining: __setitem__ function '''
from torch_geometric.datasets import Planetoid
dataset = Planetoid(root='~/tmp/Planetoid/PubMed', name='PubMed')
# print(len(dataset), dataset.num_classes, dataset.num_node_features)
data = dataset[0]
print('Info (attributes) of a single data instance')
print(data, '\n number of nodes: ', data.num_nodes, '\n number of edges: ', data.num_edges, \
      '\n number of features per ndoe: ', data.num_node_features, '\n number of edge features: ', data.num_edge_features, \
      '\n all the attributes of data: ', data.keys)
# print(data.train_mask.shape)   # 1-D attributes


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
data = data.to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# calling one single GCNConv layer
conv1 = custom_GCNConv(dataset.num_node_features, 2).cuda()

Info (attributes) of a single data instance
Data(edge_index=[2, 88648], test_mask=[19717], train_mask=[19717], val_mask=[19717], x=[19717, 500], y=[19717]) 
 number of nodes:  19717 
 number of edges:  88648 
 number of features per ndoe:  500 
 number of edge features:  0 
 all the attributes of data:  ['x', 'edge_index', 'y', 'train_mask', 'val_mask', 'test_mask']


### Use the customized GCNConv layer

In [7]:
x, edge_index = data.x.cuda(), data.edge_index.cuda()
# each row inside the x is the feature vector of a single node inside the graph
print('before the GCN layer, node feature matrix type and shape: ', type(x), x.shape)
print('before the GCN layer, edge_index matrix type and shape: ', type(edge_index), edge_index.shape)
print('Start calling the GCNConv \n')
print('*' * 200)
x = conv1(x, edge_index)
print('*' * 200)
print('End of calling the GCNConv \n')

print('after the GCN layer, node feature matrix type and shape: ', type(x), x.shape)

before the GCN layer, node feature matrix type and shape:  <class 'torch.Tensor'> torch.Size([19717, 500])
before the GCN layer, edge_index matrix type and shape:  <class 'torch.Tensor'> torch.Size([2, 88648])
Start calling the GCNConv 

********************************************************************************************************************************************************************************************************
start calling the forward of the GCNConv: 
type and shape of the node feature matrix:  <class 'torch.Tensor'> torch.Size([19717, 500])
type and shape of the edge_index matrix in COO format:  <class 'torch.Tensor'> torch.Size([19717, 500])

 Step-1: Initialize the weights (continuous uniform distribution) and use it to linearly transform feature matrix: 
type and shape of the node feature matrix after linear transformation <class 'torch.Tensor'> torch.Size([19717, 2]) 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [92]:
# to check that in the COO format of the current undirected graph, each edge has been recorded twice
ref = {}

x, y = data.edge_index
print(x, y)
x, y = list(x.data.cpu().numpy()), list(y.data.cpu().numpy())
res = list(zip(x, y))
print(len(res))
print(len(set(res)))
for a, b in zip(x, y):
    if (b, a) in ref:
        ref[(b, a)] += 1
    else:
        ref[(a, b)] = 1
res1 = [key for key, val in ref.items() if val == 1]
res2 = [key for key, val in ref.items() if val == 2]
print(len(res1), len(res2))

tensor([    0,     0,     0,  ..., 19714, 19715, 19716], device='cuda:0') tensor([ 1378,  1544,  6092,  ..., 12278,  4284, 16030], device='cuda:0')
88648
88648
0 44324


### Analyse the models constructed by GCNConvs

In [30]:

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 16)
        self.conv2 = GCNConv(16, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        print('start calling the forward of the GCNConv, before any layers: ')
        print('type and shape of the node feature matrix: ', type(x), x.shape)
        print('type and shape of the edge_index matrix in COO format: ', type(x), x.shape)
        
        x = self.conv1(x, edge_index)
        print('After the first GCNConv layer, type and shape of the node feature matrix: ', type(x), x.shape)
        
#         # here we introduce the non-linearity
#         x = F.relu(x)
#         print('After the first relu, type and shape of the node feature matrix: ', type(x), x.shape)
        
#         x = F.dropout(x, training=self.training)
#         print('After the first drop out layer, type and shape of the node feature matrix: ', type(x), x.shape)
        
#         x = self.conv2(x, edge_index)
#         print('After the second GCNConv layer, type and shape of the node feature matrix: ', type(x), x.shape)
                
        return F.log_softmax(x, dim=1)
    

In [31]:
'''Allow additional attributes of the instance to be set by defining: __setitem__ function '''
from torch_geometric.datasets import Planetoid
dataset = Planetoid(root='~/tmp/Planetoid/PubMed', name='PubMed')
# print(len(dataset), dataset.num_classes, dataset.num_node_features)
data = dataset[0]
print('Info (attributes) of a single data instance')
print(data, '\n number of nodes: ', data.num_nodes, '\n number of edges: ', data.num_edges, \
      '\n number of features per ndoe: ', data.num_node_features, '\n number of edge features: ', data.num_edge_features, \
      '\n all the attributes of data: ', data.keys)
# print(data.train_mask.shape)   # 1-D attributes


Info (attributes) of a single data instance
Data(edge_index=[2, 88648], test_mask=[19717], train_mask=[19717], val_mask=[19717], x=[19717, 500], y=[19717]) 
 number of nodes:  19717 
 number of edges:  88648 
 number of features per ndoe:  500 
 number of edge features:  0 
 all the attributes of data:  ['x', 'edge_index', 'y', 'train_mask', 'val_mask', 'test_mask']


In [32]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
data = dataset[0].to(device)
# Parameter is a generator associated with a model instance (of torch.nn.Module)
# here is the weight and bias for each GCNConv layers
print(model.parameters(), type(model.parameters()))
for param in model.parameters():
    print(type(param.data), param.size())
# specify which parameters need to be updated
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)


model.train()
print('*'*100, '\nstart the train:\n')
for epoch in range(1):
    optimizer.zero_grad()
    out = model(data)
    print('\nafter the GCN based model forward, classification result matrix of probability type and shape: ', type(out), out.shape)
    print('compare the predict and label: ')
    print('predict shape: ',out[data.train_mask].shape, 'label shape: ', data.y[data.train_mask].shape)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    print(loss)
    print('after the GCN based model forward, loss type and shape: ', type(loss), loss.shape, loss.item())
    loss.backward()
    optimizer.step()

    
model.eval()
### this prediction will be the second call of the model instance
# _, pred = model(data).max(dim=1)
# correct = float (pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
# acc = correct / data.test_mask.sum().item()

<generator object Module.parameters at 0x7fc89e8d8b10> <class 'generator'>
<class 'torch.Tensor'> torch.Size([500, 16])
<class 'torch.Tensor'> torch.Size([16])
<class 'torch.Tensor'> torch.Size([16, 3])
<class 'torch.Tensor'> torch.Size([3])
**************************************************************************************************** 
start the train:

start calling the forward of the GCNConv, before any layers: 
type and shape of the node feature matrix:  <class 'torch.Tensor'> torch.Size([19717, 500])
type and shape of the edge_index matrix in COO format:  <class 'torch.Tensor'> torch.Size([19717, 500])
After the first GCNConv layer, type and shape of the node feature matrix:  <class 'torch.Tensor'> torch.Size([19717, 16])

after the GCN based model forward, classification result matrix of probability type and shape:  <class 'torch.Tensor'> torch.Size([19717, 16])
tensor(2.7716, device='cuda:0', grad_fn=<NllLossBackward>)
after the GCN based model forward, loss type and shape:



Net(
  (conv1): GCNConv(500, 16)
  (conv2): GCNConv(16, 3)
)

**What does the backward() function do:**

loss.backward() computes dloss/dx for every parameter x which has requires_grad=True. These are accumulated into x.grad for every parameter x. In pseudo-code:

x.grad += dloss/dx

optimizer.step updates the value of x using the gradient x.grad. For example, the SGD optimizer performs:

x += -lr * x.grad

https://discuss.pytorch.org/t/what-does-the-backward-function-do/9944
