In [1]:
%matplotlib inline


PyTorch: Tensors and autograd
-------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.


A PyTorch Tensor represents a node in a computational graph. If ``x`` is a
Tensor that has ``x.requires_grad=True`` then ``x.grad`` is another Tensor
holding the gradient of some scalar value with respect to ``x``.



In [4]:
import torch

dtype = torch.float

# Run on the first CUDA GPU when one is available and fall back to the CPU
# otherwise, so the cell also works on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False), so autograd does not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# Step size used by the manual gradient-descent update.
learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer, written with plain
    # Tensor operations. Autograd records the graph for us, so the named
    # intermediate below is only for readability; nothing has to be kept
    # around for a hand-written backward pass.
    hidden_relu = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden_relu, w2)

    # Sum-of-squares loss. Reducing over every element yields a
    # zero-dimensional Tensor; loss.item() extracts it as a Python number.
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # Backward pass via autograd: this fills w1.grad and w2.grad with the
    # gradient of the loss with respect to w1 and w2 (the Tensors created
    # with requires_grad=True).
    loss.backward()

    # Plain gradient-descent step. The update runs under torch.no_grad()
    # because the weights have requires_grad=True and the update itself must
    # not be tracked by autograd.
    #
    # Alternatives: operate on weight.data / weight.grad.data (tensor.data
    # shares storage with the tensor but does not track history), or use
    # torch.optim.SGD.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Reset the accumulated gradient so the next backward() call
            # starts from zero.
            weight.grad.zero_()

0 27392828.0
1 21725772.0
2 19557388.0
3 17959060.0
4 15690210.0
5 12682205.0
6 9418000.0
7 6563860.5
8 4401853.0
9 2931817.0
10 1984665.0
11 1388778.875
12 1011713.8125
13 767774.5
14 604510.75
15 490833.625
16 408272.6875
17 345919.28125
18 297254.0625
19 258162.859375
20 226051.953125
21 199197.84375
22 176461.390625
23 157005.359375
24 140213.359375
25 125641.7109375
26 112906.1171875
27 101723.6484375
28 91882.171875
29 83175.171875
30 75443.6640625
31 68562.875
32 62422.5390625
33 56926.625
34 52005.12109375
35 47589.1015625
36 43608.86328125
37 40014.1171875
38 36763.0390625
39 33817.91796875
40 31143.642578125
41 28712.490234375
42 26499.80078125
43 24482.0390625
44 22639.513671875
45 20955.240234375
46 19414.390625
47 18003.05859375
48 16708.875
49 15519.9599609375
50 14427.005859375
51 13421.013671875
52 12494.1669921875
53 11639.5537109375
54 10850.87109375
55 10122.328125
56 9448.8212890625
57 8825.6640625
58 8249.470703125
59 7715.84228515625
60 7220.79638671875
61 6761.30

494 0.00030520028667524457
495 0.00029992428608238697
496 0.0002939214464277029
497 0.0002890592732001096
498 0.00028342934092506766
499 0.0002776745823211968
