In [1]:
%matplotlib inline


PyTorch: Tensors and autograd
-------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.


A PyTorch Tensor represents a node in a computational graph. If ``x`` is a
Tensor that has ``x.requires_grad=True`` then ``x.grad`` is another Tensor
holding the gradient of some scalar value (typically the loss) with respect
to ``x``.



In [2]:
import torch

dtype = torch.float
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook runs unchanged on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False), so autograd does not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors. These are
    # the same operations a hand-written forward pass would use, but we do not
    # need to keep references to intermediate values since autograd, not us,
    # implements the backward pass.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # loss is a 0-dimensional (scalar) Tensor;
    # loss.item() returns the scalar value held in the loss as a Python number.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # update in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights: backward()
        # accumulates into .grad, so stale values would otherwise be added to
        # the next iteration's gradients.
        w1.grad.zero_()
        w2.grad.zero_()

0 26903142.0
1 24462994.0
2 25169538.0
3 25643084.0
4 23336704.0
5 18188986.0
6 12035918.0
7 7135078.5
8 4066694.5
9 2410369.5
10 1553893.875
11 1102023.0
12 845475.25
13 684835.375
14 573889.625
15 490916.53125
16 425342.9375
17 371677.09375
18 326852.0
19 288872.09375
20 256421.75
21 228492.765625
22 204294.984375
23 183249.0
24 164851.4375
25 148711.640625
26 134536.1875
27 122031.4453125
28 110947.578125
29 101097.3828125
30 92312.5703125
31 84464.4921875
32 77425.7265625
33 71101.8984375
34 65407.02734375
35 60269.34375
36 55622.8203125
37 51412.96484375
38 47590.890625
39 44113.97265625
40 40945.73828125
41 38054.47265625
42 35411.65625
43 32991.95703125
44 30773.126953125
45 28735.4609375
46 26863.09375
47 25138.55078125
48 23549.38671875
49 22081.453125
50 20724.32421875
51 19468.75
52 18305.708984375
53 17227.09765625
54 16225.7373046875
55 15294.640625
56 14429.982421875
57 13624.3017578125
58 12872.443359375
59 12170.671875
60 11515.3671875
61 10902.3076171875
62 10328.66992