In [1]:
import torch

# Use 32-bit floats for every tensor created in this notebook.
dtype = torch.float
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still
# runs end to end on machines without a usable CUDA device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# initialization
# N is the number of input rows (batch size), D_in the input dimension,
# H the hidden dimension and D_out the output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# create random tensors to hold input and outputs
# leaving requires_grad at its default (False) indicates that we do not need
# to compute gradients with respect to these Tensors during the backward pass
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# create random tensors for weights
# setting requires_grad = True indicates that we want to compute gradients
# with respect to these Tensors during the backward pass
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [None]:
# step size for plain gradient descent; the name must match the one used in
# the weight update below
learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors
    # these are exactly the same operations we used to compute the forward pass
    # using Tensors, but we do not need to keep references to intermediate
    # values since we are not implementing the backward pass by hand
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    # compute and print loss using operations on Tensors
    # loss is a 0-dimensional (scalar) Tensor because .sum() reduces over
    # every element
    # loss.item() gets the Python scalar value held in the loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())
    
    # use autograd to compute the backward pass. This call will compute the 
    # gradient of loss with respect to all Tensors with requires_grad=True
    # After this call w1.grad and w2.grad will be Tensors holding the gradient 
    # of the loss with respect to w1 and w2 respectively
    loss.backward()
    
    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad = True, but we don't need to track
    # this in autograd
    
    # An alternative way is to operate on weight.data and weight.grad.data
    # Recall that tensor.data gives a tensor that shares the storage with 
    # tensor, but doesn't track history
    # Can also use torch.optim.SGD to achieve this
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        
        # manually zero the gradients after updating weights; otherwise the
        # next backward() call would accumulate into the old gradients
        w1.grad.zero_()
        w2.grad.zero_()

Each primitive autograd operator is really two functions that operate on Tensors.<br>The **forward** function computes output Tensors from input Tensors.<br>The **backward** function receives the gradient of the output Tensors with respect to some scalar value, and computes the gradient of the input Tensors with respect to that same scalar value.

In [6]:
import torch

class MyReLU(torch.autograd.Function):
    """Custom ReLU written as an autograd Function.

    Subclassing torch.autograd.Function and providing static ``forward`` and
    ``backward`` methods that operate on Tensors lets autograd call our own
    code for both passes. Use it through ``MyReLU.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` with every negative entry replaced by zero.

        ``ctx`` is the context object shared with ``backward``; the input is
        stashed on it because the backward pass needs to know where the
        input was negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the input.

        ``grad_output`` is the gradient of the loss with respect to the
        output of ``forward``. It is passed through unchanged wherever the
        saved input was not negative and set to zero everywhere else.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill returns a new Tensor, so grad_output itself is never
        # modified.
        return grad_output.masked_fill(saved_input < 0, 0)

In [7]:
dtype = torch.float
# Prefer the first CUDA GPU, but fall back to the CPU so this cell still runs
# on machines without a usable CUDA device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is the number of input rows (batch size), D_in the input dimension,
# H the hidden dimension and D_out the output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# create random Tensors to hold input and outputs
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# create random Tensors for weights; requires_grad=True asks autograd to
# compute gradients with respect to them during the backward pass
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use Function.apply method. We alias this as 'relu'.
# The alias does not change between iterations, so it is created once here
# instead of inside the loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss (sum of squared errors)
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent; no_grad() keeps the in-place
    # updates out of the autograd graph
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 27505700.0
1 21825602.0
2 21244482.0
3 22264314.0
4 22612990.0
5 20552628.0
6 16185956.0
7 11005992.0
8 6748862.5
9 3937228.0
10 2327863.0
11 1451868.0
12 977546.125
13 710063.0
14 549152.375
15 444335.625
16 370681.375
17 315479.3125
18 272100.34375
19 236833.9375
20 207559.484375
21 182873.28125
22 161863.28125
23 143809.671875
24 128206.203125
25 114647.734375
26 102814.0234375
27 92442.8359375
28 83321.6640625
29 75284.03125
30 68169.890625
31 61880.79296875
32 56282.70703125
33 51284.9140625
34 46813.0546875
35 42801.6015625
36 39195.734375
37 35946.73828125
38 33012.0625
39 30357.421875
40 27955.615234375
41 25776.55078125
42 23795.3515625
43 21991.185546875
44 20345.64453125
45 18842.6953125
46 17468.822265625
47 16210.923828125
48 15057.0166015625
49 13997.69921875
50 13024.3359375
51 12128.6376953125
52 11303.6611328125
53 10542.896484375
54 9840.5078125
55 9193.0791015625
56 8595.7548828125
57 8042.796875
58 7530.498046875
59 7055.5615234375
60 6614.49365234375
61 6204.8906

440 0.002232269151136279
441 0.002174162771552801
442 0.002116383984684944
443 0.002060707425698638
444 0.0020055449567735195
445 0.0019547506235539913
446 0.0019014577846974134
447 0.0018518504220992327
448 0.0018041272414848208
449 0.0017574866069480777
450 0.0017128109466284513
451 0.0016697099199518561
452 0.0016252878122031689
453 0.0015859502600505948
454 0.0015450258506461978
455 0.00150502217002213
456 0.0014686014037579298
457 0.001433422788977623
458 0.00139753264375031
459 0.0013621656689792871
460 0.0013291677460074425
461 0.0012957497965544462
462 0.0012626450043171644
463 0.0012348862364888191
464 0.0012039887951686978
465 0.001175607554614544
466 0.0011470505269244313
467 0.0011189209762960672
468 0.0010934759629890323
469 0.0010664132423698902
470 0.0010416917502880096
471 0.0010180867975577712
472 0.00099430070258677
473 0.0009708348079584539
474 0.0009476985433138907
475 0.0009266933193430305
476 0.0009064298938028514
477 0.0008854640764184296
478 0.000866246293298900