Test of Mask Graph N-N

In [1]:
import math
import torch
import torch.nn as nn
import numpy as np

In [2]:
class CustomizedLinearFunction(torch.autograd.Function):
    """
    Autograd function of a linear layer whose weights are constrained by 'mask'.

    'mask' has the same shape as 'weight' and is read element-wise:
        0     : the weight is forced to 0 and receives no gradient.
        2     : the weight is forced to 1 and receives no gradient.
        other : the weight and its gradient are multiplied by the mask value
                (so 1 leaves an ordinary trainable weight).
    """

    @staticmethod
    def _masked(values, mask, pinned_value):
        """
        Return 'values * mask' with every entry where mask == 2 replaced by
        'pinned_value'. The result is a new tensor; 'values' is not modified.
        """
        # Cast so that a mask stored in another dtype still combines with values.
        mask = mask.to(dtype=values.dtype)
        return (values * mask).masked_fill(mask == 2, pinned_value)

    # Note that both forward and backward are @staticmethods
    @staticmethod
    # bias, mask is an optional argument
    def forward(ctx, input, weight, bias=None, mask=None):
        """
        Compute input @ weight.T (+ bias) with the mask applied to weight.

        When a mask is given, 'weight' itself is overwritten with its masked
        values, so the caller's tensor reflects the constraint afterwards.
        """
        if mask is not None:
            # One vectorized pass instead of a Python loop over every element.
            # The write is in place on purpose: it keeps the stored weight
            # equal to the weight that is actually used.
            weight.copy_(CustomizedLinearFunction._masked(weight, mask, 1))

        output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        # Saved after masking so that backward sees the effective weight.
        ctx.save_for_backward(input, weight, bias, mask)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradients w.r.t. (input, weight, bias, mask).

        Entries that were not requested are None; the mask never gets a
        gradient.
        """
        input, weight, bias, mask = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = grad_mask = None

        # The needs_input_grad checks only avoid unnecessary work.
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
            if mask is not None:
                # No gradient where mask == 0 (removed) or mask == 2 (fixed).
                grad_weight = CustomizedLinearFunction._masked(
                    grad_weight, mask, 0)
        if ctx.needs_input_grad[2]:
            # sum(0) already has the bias shape (output_features,). A further
            # squeeze would turn a one-element bias gradient into a 0-dim
            # tensor that no longer matches the bias.
            grad_bias = grad_output.sum(0)

        return grad_input, grad_weight, grad_bias, grad_mask



In [3]:
class CustomizedLinear(nn.Module):
    def __init__(self, mask, bias=True):
        """
        Extended torch.nn module whose connections are defined by a mask.

        Arguments
        ------------------
        mask [torch.Tensor, or anything torch.tensor() accepts]:
            2-D, the shape is (n_input_feature, n_output_feature).
            The elements are 0 or 1, which declare un-connected or
            connected. The value 2 is handled by CustomizedLinearFunction
            as a connection whose weight is fixed to 1.
        bias [bool]:
            flag of bias; any falsy value (e.g. None) disables it.

        Raises
        ------------------
        ValueError: if mask is not 2-dimensional.
        """
        super(CustomizedLinear, self).__init__()

        # Convert first, so that plain nested lists (which have no .shape)
        # are accepted just like tensors.
        if isinstance(mask, torch.Tensor):
            mask = mask.type(torch.float)
        else:
            mask = torch.tensor(mask, dtype=torch.float)
        if mask.dim() != 2:
            raise ValueError(
                'mask must be 2-dimensional (n_input_feature, '
                'n_output_feature), got {} dimension(s)'.format(mask.dim())
            )
        self.input_features, self.output_features = mask.shape

        # Stored transposed, i.e. in the (out, in) layout of the weight.
        # It is kept as a non-trainable parameter so that it follows the
        # module through .cuda()/.to() and shows up in .parameters().
        self.mask = nn.Parameter(mask.t(), requires_grad=False)

        # nn.Parameters require gradients by default.
        self.weight = nn.Parameter(
            torch.empty(self.output_features, self.input_features))

        if bias:
            self.bias = nn.Parameter(torch.empty(self.output_features))
        else:
            # You should always register all possible parameters, but the
            # optional ones can be None if you want.
            self.register_parameter('bias', None)
        self.reset_parameters()

        # Mask the initial weight the same way the forward pass does:
        # scale by the mask, and fix the weight to 1 where the mask is 2.
        self.weight.data = (self.weight.data * self.mask).masked_fill(
            self.mask == 2, 1.0)

    def reset_parameters(self):
        """Draw weight and bias uniformly from [-1/sqrt(n_in), 1/sqrt(n_in)]."""
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input):
        """Apply the masked linear map to 'input'."""
        return CustomizedLinearFunction.apply(
            input, self.weight, self.bias, self.mask)

    def extra_repr(self):
        # (Optional) Extra information shown when the module is printed.
        return 'input_features={}, output_features={}, bias={}'.format(
            self.input_features, self.output_features, self.bias is not None
        )

In [4]:
# NOTE: __name__ is compared with 'check grad', so this cell's body is
# skipped when the file is run as a script or imported as a module.
if __name__ == 'check grad':
    from torch.autograd import gradcheck

    # gradcheck compares the analytical gradients of a function, evaluated
    # at the given tuple of inputs, with numerical approximations and
    # returns True when they all agree within the tolerances.
    customlinear = CustomizedLinearFunction.apply

    # Arguments in the order (input, weight, bias, mask); double precision
    # is used for the two tensors, bias and mask are left out.
    input = (
        torch.randn(20, 20, dtype=torch.double, requires_grad=True),
        torch.randn(30, 20, dtype=torch.double, requires_grad=True),
        None,
        None,
    )
    test = gradcheck(customlinear, input, eps=1e-6, atol=1e-4)
    print(test)

In [5]:
# Mask matrix that customizes the linear layer: one row per input unit,
# one column per hidden unit.
mask = torch.tensor([
    [1, 2, 1],
    [0, 1, 0],
    [1, 2, 1],
    [1, 0, 1],
])

# Layer sizes of the [INPUT, HIDDEN (masked/customized linear), OUTPUT]
# architecture; the first two are read directly off the mask.
Dim_INPUT, Dim_HIDDEN = mask.shape
Dim_OUTPUT = 1

In [6]:
# Draw one random (input x, target y) pair to act as the training set.
batch = 1
x = torch.randn(batch, Dim_INPUT)
y = torch.randn(batch, Dim_OUTPUT)

# Chain the layers into a model: the masked layer takes its dimensions from
# the mask, an ordinary linear layer follows; neither layer has a bias.
model = nn.Sequential(
    CustomizedLinear(mask, bias=False),
    nn.Linear(Dim_HIDDEN, Dim_OUTPUT, bias=False),
)

In [132]:
# Run a few training steps and print the masked weight and its gradient
# after each update.
print('=== mask matrix ===')
print(mask)
print('===================')
learning_rate = 0.1
for t in range(3):
    # Forward pass followed by the mean absolute error.
    y_pred = model(x)
    loss = (y_pred - y).abs().mean()

    # Drop the gradients of the previous step, then backpropagate.
    model.zero_grad()
    loss.backward()

    # Plain gradient-descent update, done outside of autograd.
    with torch.no_grad():
        for param in model.parameters():
            # The mask is stored as a parameter too, but it is not trainable.
            if not param.requires_grad:
                continue
            param -= learning_rate * param.grad
            # Report only the parameter with as many elements as the mask,
            # i.e. the weight of the masked layer.
            if param.grad.numel() != mask.numel():
                continue
            print(f'--- epoch={t}, loss={loss.item()} ---')
            print('↓↓↓masked weight↓↓↓')
            print(param.t())
            print('↓↓↓masked grad of weight↓↓↓')
            print(param.grad.t())

=== mask matrix ===
tensor([[1, 2, 1],
        [0, 1, 0],
        [1, 2, 1],
        [1, 0, 1]])
--- epoch=0, loss=1.7676911354064941 ---
↓↓↓masked weight↓↓↓
tensor([[-0.1822,  1.0000, -0.0035],
        [ 0.0000, -0.1386,  0.0000],
        [-0.3157,  1.0000,  0.3445],
        [ 0.0891, -0.0000,  0.1516]], requires_grad=True)
↓↓↓masked grad of weight↓↓↓
tensor([[ 0.0485,  0.0000, -0.2757],
        [ 0.0000, -0.3177, -0.0000],
        [-0.1935,  0.0000,  1.1005],
        [-0.0396,  0.0000,  0.2250]])
--- epoch=1, loss=1.0690386295318604 ---
↓↓↓masked weight↓↓↓
tensor([[-0.1828,  1.0000,  0.0165],
        [ 0.0000, -0.1244,  0.0000],
        [-0.3130,  1.0000,  0.2648],
        [ 0.0897, -0.0000,  0.1353]], requires_grad=True)
↓↓↓masked grad of weight↓↓↓
tensor([[ 0.0067,  0.0000, -0.1997],
        [ 0.0000, -0.1423,  0.0000],
        [-0.0269,  0.0000,  0.7970],
        [-0.0055,  0.0000,  0.1629]])
--- epoch=2, loss=0.5059181451797485 ---
↓↓↓masked weight↓↓↓
tensor([[-0.1797,  1.0000,  