In [1]:
%matplotlib inline


PyTorch: Tensors and autograd
-------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.


A PyTorch Tensor represents a node in a computational graph. If ``x`` is a
Tensor that has ``x.requires_grad=True``, then after a backward pass ``x.grad``
is another Tensor holding the gradient of some scalar value (here, the loss)
with respect to ``x``.



In [2]:
import torch

# Floating-point type used for every tensor created in this notebook.
dtype = torch.float
# Run on the first CUDA GPU when one is available and fall back to the CPU
# otherwise, so the notebook executes unmodified on machines without a GPU.
# To force a particular device, replace the expression with
# torch.device("cpu") or torch.device("cuda:0").
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's random number generators so that the random data and initial
# weights below (and therefore the printed losses) are the same on every
# Restart & Run All. The value itself is arbitrary.
torch.manual_seed(0)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default of False, which indicates that we do
# not need to compute gradients with respect to these Tensors during the
# backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: hidden activations are ReLU(x @ w1), predictions are
    # hidden @ w2. Autograd records these operations, so nothing has to be
    # cached by hand for the backward pass.
    h_relu = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors. The result is a 0-dimensional Tensor;
    # loss.item() extracts the Python scalar it holds.
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # Backward pass via autograd: fills in .grad for every Tensor that has
    # requires_grad=True and contributed to the loss, i.e. w1.grad and
    # w2.grad hold d(loss)/d(w1) and d(loss)/d(w2) after this call.
    loss.backward()

    # Plain gradient-descent step. The update runs under torch.no_grad()
    # because the weights have requires_grad=True and the in-place update
    # itself must not be tracked by autograd. (Operating on weight.data and
    # weight.grad.data, or using torch.optim.SGD, are alternatives.)
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Gradients accumulate across backward() calls, so reset them
            # once the step has been applied.
            weight.grad.zero_()

0 36648040.0
1 31270364.0
2 28291568.0
3 23690084.0
4 17462892.0
5 11329606.0
6 6811175.0
7 4049316.5
8 2517866.0
9 1685369.75
10 1216834.5
11 934351.6875
12 749346.75
13 618704.4375
14 520777.1875
15 443974.4375
16 382033.09375
17 331037.875
18 288493.25
19 252631.140625
20 222124.3125
21 196008.609375
22 173514.0625
23 154025.078125
24 137121.15625
25 122399.734375
26 109516.6171875
27 98204.6875
28 88241.0859375
29 79438.9453125
30 71650.9453125
31 64740.890625
32 58590.296875
33 53107.3203125
34 48203.07421875
35 43810.359375
36 39867.0859375
37 36321.234375
38 33129.45703125
39 30251.53515625
40 27651.212890625
41 25299.625
42 23170.48828125
43 21239.447265625
44 19486.140625
45 17892.6875
46 16442.896484375
47 15129.53125
48 13931.080078125
49 12836.708984375
50 11836.71875
51 10921.63671875
52 10084.16796875
53 9316.7373046875
54 8612.8466796875
55 7966.734375
56 7373.3291015625
57 6828.0546875
58 6326.4384765625
59 5864.708984375
60 5439.453125
61 5047.50390625
62 4686.08251953

392 0.00020756993035320193
393 0.00020294445857871324
394 0.00019733811495825648
395 0.0001927552802953869
396 0.00018786126747727394
397 0.00018302342505194247
398 0.0001785367203410715
399 0.00017467838188167661
400 0.00017058514640666544
401 0.0001660620328038931
402 0.0001617136731510982
403 0.0001578264927957207
404 0.00015442079165950418
405 0.00015049260400701314
406 0.00014696312427986413
407 0.00014397682389244437
408 0.00014077976811677217
409 0.0001375796418869868
410 0.00013483833754435182
411 0.00013198294618632644
412 0.00012876381515525281
413 0.0001258822885574773
414 0.0001230129855684936
415 0.00012038624845445156
416 0.00011794274905696511
417 0.0001152399490820244
418 0.00011270192771917209
419 0.00011036627984140068
420 0.00010818462033057585
421 0.00010550846491241828
422 0.00010388552618678659
423 0.00010178507363889366
424 9.97145107248798e-05
425 9.770008182385936e-05
426 9.56034054979682e-05
427 9.372118074679747e-05
428 9.227659757016227e-05
429 9.04084372450