In [1]:
%matplotlib inline


PyTorch: Defining New autograd Functions
----------------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.

Here we define our own custom autograd Function that computes ReLU, and use it
in place of a built-in operation in the forward pass.



In [2]:
import torch


class MyReLU(torch.autograd.Function):
    """
    ReLU expressed as a custom autograd Function.

    A custom Function subclasses torch.autograd.Function and provides two
    static methods, forward and backward, both of which work directly on
    Tensors. It is invoked through MyReLU.apply(...).
    """

    @staticmethod
    def forward(ctx, input):
        """
        Clamp every element of the input Tensor from below at 0 and return the
        result as a new Tensor.

        ctx is the context object shared with backward. The input Tensor is
        stashed on it with ctx.save_for_backward so that backward can tell
        which elements were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss with respect to the output (grad_output)
        into the gradient of the loss with respect to the input.

        The incoming gradient is passed through wherever the saved input is
        not negative and replaced by 0 wherever the saved input is negative.
        """
        (saved_input,) = ctx.saved_tensors
        was_negative = saved_input < 0
        # masked_fill is out of place: grad_output itself is left untouched
        # and a new Tensor is returned.
        return grad_output.masked_fill(was_negative, 0)


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights. requires_grad=True asks autograd to track
# operations on them so that loss.backward() fills in w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method. We alias this as
# 'relu'. The alias does not change between iterations, so it is bound once
# here rather than on every pass through the training loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss (sum of squared errors over the whole batch).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent. The updates run under no_grad so
    # that the in-place modifications of w1 and w2 are not tracked by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights; backward()
        # accumulates into .grad, so stale values would leak into the next step.
        w1.grad.zero_()
        w2.grad.zero_()

0 40358812.0
1 40655856.0
2 43313424.0
3 39624056.0
4 27641106.0
5 14660755.0
6 6785062.5
7 3388318.0
8 2053703.375
9 1471801.625
10 1158314.0
11 952621.3125
12 800510.9375
13 680644.875
14 583274.75
15 503085.40625
16 436339.40625
17 380286.3125
18 332874.34375
19 292812.8125
20 258510.609375
21 229057.15625
22 203636.78125
23 181606.015625
24 162415.203125
25 145655.703125
26 130962.0234375
27 118076.171875
28 106719.984375
29 96683.5546875
30 87789.9765625
31 79862.96875
32 72784.9453125
33 66447.9921875
34 60762.40234375
35 55654.28515625
36 51051.1015625
37 46897.9609375
38 43140.5546875
39 39734.59765625
40 36643.93359375
41 33834.15625
42 31275.453125
43 28940.798828125
44 26808.609375
45 24857.330078125
46 23069.310546875
47 21430.6328125
48 19926.876953125
49 18543.87890625
50 17270.4765625
51 16097.51953125
52 15016.189453125
53 14017.427734375
54 13094.0283203125
55 12239.4990234375
56 11447.8515625
57 10714.0205078125
58 10033.037109375
59 9400.361328125
60 8812.40234375
61