In [None]:
%matplotlib inline


PyTorch: Tensors and autograd
-------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.


A PyTorch Tensor represents a node in a computational graph. If ``x`` is a
Tensor that has ``x.requires_grad=True`` then, after a backward pass, ``x.grad``
is another Tensor holding the gradient of some scalar value with respect to ``x``.



In [2]:
import torch

# Seed PyTorch's random number generator before any tensor is drawn so that a
# fresh-kernel "Restart & Run All" reproduces the same inputs, targets and
# initial weights (and therefore the same sequence of printed losses).
SEED = 0
torch.manual_seed(SEED)

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold the input and the target output.
# requires_grad is left at its default (False), so autograd does not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for the two weight matrices.
# requires_grad=True asks autograd to compute gradients with respect to these
# Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
num_steps = 500
# Report the loss only every `log_every` steps; printing all 500 iterations
# floods the cell output without adding information.
log_every = 100

for t in range(num_steps):
    # Forward pass: hidden layer (x @ w1), ReLU via clamp(min=0), then the
    # output layer (@ w2). Intermediate values are not kept in named variables
    # because autograd records the graph and the backward pass is not written
    # by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum of squared errors. The result is a 0-dimensional Tensor;
    # loss.item() returns the value it holds as a plain Python number.
    loss = (y_pred - y).pow(2).sum()
    if t % log_every == log_every - 1:
        print(t, loss.item())

    # Use autograd to compute the backward pass. After this call w1.grad and
    # w2.grad hold the gradient of the loss with respect to w1 and w2.
    loss.backward()

    # Manual gradient-descent step. The in-place updates are wrapped in
    # torch.no_grad() because the weights have requires_grad=True and the
    # update itself must not be tracked by autograd.
    # torch.optim.SGD can be used to achieve the same thing.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() accumulates into .grad, so reset both gradients before
        # the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

tensor([[ 0.5239, -0.7421,  1.0093,  ..., -0.3349, -0.9140, -0.3707],
        [-0.0129,  0.3322, -0.5722,  ..., -0.4037,  1.1060,  0.4266],
        [ 1.0878, -0.0209, -0.9979,  ...,  1.4156,  1.0337,  1.0965],
        ...,
        [-0.6757,  0.0124,  0.4122,  ..., -0.3400,  0.4119, -1.6669],
        [ 0.6572, -0.2575, -0.8540,  ...,  1.6093, -0.6456,  0.8244],
        [-0.2951,  0.7106,  1.5956,  ...,  0.6517, -0.7071, -0.0829]],
       requires_grad=True)
tensor([[ 0.5260, -0.7453,  1.0118,  ..., -0.3384, -0.9079, -0.3702],
        [-0.0223,  0.3295, -0.5829,  ..., -0.4110,  1.1157,  0.4276],
        [ 1.0817, -0.0251, -1.0070,  ...,  1.4207,  1.0400,  1.0988],
        ...,
        [-0.6761,  0.0066,  0.4105,  ..., -0.3422,  0.4152, -1.6677],
        [ 0.6691, -0.2479, -0.8437,  ...,  1.6082, -0.6542,  0.8253],
        [-0.2833,  0.7102,  1.5918,  ...,  0.6431, -0.7097, -0.0832]],
       requires_grad=True)
tensor([[ 0.5262, -0.7450,  1.0097,  ..., -0.3409, -0.9008, -0.3712],
        

tensor([[ 0.5399, -0.7617,  1.0097,  ..., -0.3964, -0.8686, -0.3775],
        [-0.0092,  0.3586, -0.5962,  ..., -0.4539,  1.1599,  0.4385],
        [ 1.1115, -0.0596, -1.0277,  ...,  1.4439,  1.0037,  1.1141],
        ...,
        [-0.6507, -0.0508,  0.3920,  ..., -0.3842,  0.3792, -1.6585],
        [ 0.6675, -0.1919, -0.8066,  ...,  1.5900, -0.6472,  0.8426],
        [-0.2631,  0.7031,  1.5821,  ...,  0.5869, -0.7583, -0.0772]],
       requires_grad=True)
tensor([[ 0.5399, -0.7617,  1.0097,  ..., -0.3964, -0.8686, -0.3775],
        [-0.0092,  0.3586, -0.5962,  ..., -0.4539,  1.1599,  0.4385],
        [ 1.1115, -0.0596, -1.0277,  ...,  1.4439,  1.0037,  1.1141],
        ...,
        [-0.6507, -0.0508,  0.3920,  ..., -0.3842,  0.3792, -1.6585],
        [ 0.6675, -0.1919, -0.8066,  ...,  1.5900, -0.6472,  0.8426],
        [-0.2631,  0.7031,  1.5821,  ...,  0.5869, -0.7583, -0.0772]],
       requires_grad=True)
tensor([[ 0.5399, -0.7617,  1.0097,  ..., -0.3964, -0.8686, -0.3775],
        

tensor([[ 0.5397, -0.7611,  1.0108,  ..., -0.3969, -0.8691, -0.3782],
        [-0.0090,  0.3592, -0.5960,  ..., -0.4539,  1.1600,  0.4382],
        [ 1.1113, -0.0597, -1.0268,  ...,  1.4434,  1.0031,  1.1136],
        ...,
        [-0.6504, -0.0512,  0.3912,  ..., -0.3852,  0.3800, -1.6585],
        [ 0.6672, -0.1916, -0.8057,  ...,  1.5891, -0.6478,  0.8426],
        [-0.2632,  0.7031,  1.5826,  ...,  0.5863, -0.7587, -0.0768]],
       requires_grad=True)
tensor([[ 0.5397, -0.7611,  1.0108,  ..., -0.3969, -0.8691, -0.3782],
        [-0.0090,  0.3592, -0.5960,  ..., -0.4539,  1.1600,  0.4382],
        [ 1.1113, -0.0597, -1.0268,  ...,  1.4434,  1.0031,  1.1136],
        ...,
        [-0.6504, -0.0512,  0.3912,  ..., -0.3852,  0.3801, -1.6585],
        [ 0.6672, -0.1916, -0.8057,  ...,  1.5891, -0.6478,  0.8426],
        [-0.2632,  0.7031,  1.5826,  ...,  0.5863, -0.7587, -0.0768]],
       requires_grad=True)
tensor([[ 0.5397, -0.7611,  1.0108,  ..., -0.3969, -0.8691, -0.3782],
        

tensor([[ 0.5396, -0.7609,  1.0109,  ..., -0.3970, -0.8693, -0.3784],
        [-0.0090,  0.3594, -0.5960,  ..., -0.4539,  1.1600,  0.4382],
        [ 1.1112, -0.0597, -1.0266,  ...,  1.4433,  1.0030,  1.1135],
        ...,
        [-0.6504, -0.0512,  0.3911,  ..., -0.3853,  0.3802, -1.6586],
        [ 0.6671, -0.1915, -0.8056,  ...,  1.5888, -0.6480,  0.8426],
        [-0.2633,  0.7031,  1.5827,  ...,  0.5863, -0.7588, -0.0767]],
       requires_grad=True)
tensor([[ 0.5396, -0.7609,  1.0109,  ..., -0.3970, -0.8693, -0.3784],
        [-0.0090,  0.3594, -0.5960,  ..., -0.4539,  1.1600,  0.4382],
        [ 1.1112, -0.0597, -1.0266,  ...,  1.4433,  1.0030,  1.1135],
        ...,
        [-0.6504, -0.0512,  0.3911,  ..., -0.3853,  0.3802, -1.6586],
        [ 0.6671, -0.1915, -0.8056,  ...,  1.5888, -0.6480,  0.8426],
        [-0.2633,  0.7031,  1.5827,  ...,  0.5863, -0.7588, -0.0767]],
       requires_grad=True)
tensor([[ 0.5396, -0.7609,  1.0109,  ..., -0.3970, -0.8693, -0.3784],
        

tensor([[ 0.5395, -0.7609,  1.0110,  ..., -0.3970, -0.8693, -0.3784],
        [-0.0090,  0.3594, -0.5960,  ..., -0.4539,  1.1600,  0.4381],
        [ 1.1112, -0.0597, -1.0266,  ...,  1.4432,  1.0030,  1.1135],
        ...,
        [-0.6504, -0.0512,  0.3911,  ..., -0.3854,  0.3802, -1.6586],
        [ 0.6671, -0.1915, -0.8055,  ...,  1.5887, -0.6480,  0.8426],
        [-0.2633,  0.7031,  1.5828,  ...,  0.5863, -0.7588, -0.0767]],
       requires_grad=True)
tensor([[ 0.5395, -0.7609,  1.0110,  ..., -0.3970, -0.8693, -0.3784],
        [-0.0090,  0.3594, -0.5960,  ..., -0.4539,  1.1600,  0.4381],
        [ 1.1112, -0.0597, -1.0266,  ...,  1.4432,  1.0030,  1.1135],
        ...,
        [-0.6504, -0.0512,  0.3911,  ..., -0.3854,  0.3802, -1.6586],
        [ 0.6671, -0.1915, -0.8055,  ...,  1.5887, -0.6480,  0.8426],
        [-0.2633,  0.7031,  1.5828,  ...,  0.5863, -0.7588, -0.0767]],
       requires_grad=True)
tensor([[ 0.5395, -0.7609,  1.0110,  ..., -0.3970, -0.8693, -0.3784],
        