In [1]:
%matplotlib inline


PyTorch: Tensors and autograd
-------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.


A PyTorch Tensor represents a node in a computational graph. If ``x`` is a
Tensor that has ``x.requires_grad=True`` then ``x.grad`` is another Tensor
holding the gradient of some scalar value with respect to ``x``.



In [2]:
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Seed the global random number generator so that the random data, the initial
# weights, and therefore the printed losses are identical on every
# "Restart Kernel -> Run All".
SEED = 0
torch.manual_seed(SEED)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False), so autograd does not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors:
    # matmul -> ReLU (clamp at zero) -> matmul. We do not need to keep
    # references to intermediate values since autograd records the graph and
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # loss is a 0-dimensional (scalar) Tensor holding the sum of squared errors;
    # loss.item() returns the value it holds as a plain Python number.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights; backward()
        # accumulates into .grad, so stale values would otherwise be added to
        # the next iteration's gradients.
        w1.grad.zero_()
        w2.grad.zero_()

0 32929920.0
1 29555502.0
2 28037760.0
3 24574410.0
4 18701992.0
5 12247577.0
6 7297312.0
7 4245776.0
8 2586848.0
9 1710297.75
10 1231201.25
11 948238.75
12 764829.0
13 635288.625
14 537635.8125
15 460786.0
16 398556.59375
17 347227.1875
18 304243.0
19 267892.75
20 236952.90625
21 210457.578125
22 187643.6875
23 167881.171875
24 150660.015625
25 135593.65625
26 122363.71875
27 110703.6015625
28 100395.3828125
29 91259.2734375
30 83126.421875
31 75858.4453125
32 69347.3046875
33 63499.76171875
34 58234.328125
35 53481.3125
36 49183.30078125
37 45289.39453125
38 41752.86328125
39 38536.50390625
40 35607.09375
41 32934.68359375
42 30492.271484375
43 28256.955078125
44 26209.03515625
45 24330.794921875
46 22605.44140625
47 21018.984375
48 19560.634765625
49 18216.75
50 16977.72265625
51 15833.4853515625
52 14776.3134765625
53 13796.2587890625
54 12888.72265625
55 12048.033203125
56 11268.39453125
57 10545.177734375
58 9873.97265625
59 9250.3466796875
60 8670.408203125
61 8130.6083984375
62