In [1]:
import torch

## Calculate Gradient

In [3]:
# Create a tensor of 3 random values; requires_grad=True asks autograd to
# record the operations applied to it so gradients can be computed later.
x = torch.randn(3, requires_grad=True)
print(x) # Tensor with 3 random values

tensor([1.0704, 0.6912, 1.1842], requires_grad=True)


With the requires_grad=True argument, PyTorch will create a computational graph for us.

In [4]:
# Because x requires grad, this addition is recorded as a node in the computational graph.
y = x + 2

The computation above creates the computational graph shown below.

<img src="pic/comp_graph.png" width=150>

Here the operation is the "+" operator, the inputs are "X" and "2", and the output is "Y". With this graph and backpropagation, we can calculate the gradients and use them to optimize the tensor.

In [5]:
print(y) # Note the grad_fn=<AddBackward0>: it records that y was produced by an addition

tensor([3.0704, 2.6912, 3.1842], grad_fn=<AddBackward0>)


In [6]:
# Continue to build up the computational graph:
# z = 2 * y^2, computed element-wise (the last operation is a multiplication,
# which is why the output shows grad_fn=<MulBackward0>).
z = y*y*2
print(z)

tensor([18.8543, 14.4850, 20.2784], grad_fn=<MulBackward0>)


In [7]:
# Continue to build up the computational graph.
# Averaging the elements reduces z to a single value, which is what the
# argument-free z.backward() call in the next cell operates on.
z = z.mean() # Convert to a scalar.
print(z)

tensor(17.8726, grad_fn=<MeanBackward0>)


In [8]:
# Calculate the gradient of z with respect to x
# dz/dx
# Since z = mean(2 * (x + 2)^2) over 3 elements, dz/dx_i = 4 * (x_i + 2) / 3.
# The result is stored in x.grad.
z.backward()

In [9]:
# Print the gradient of z with respect to each element of x
print(x.grad)

tensor([4.0938, 3.5883, 4.2456])


What if you don't specify the **requires_grad=True** argument?

In [10]:
# Same computation as before, but this time x does not track gradients.
x = torch.randn(3, requires_grad=False)
y = x + 2
z = y * y * 2
z = z.mean()

# backward() fails because nothing in this computation requires grad, so z has
# no grad_fn. The error is the point of this cell, so catch and print it
# instead of letting the traceback abort a "Restart Kernel -> Run All".
try:
    z.backward()
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

`z.backward()` raises a RuntimeError here because `z` does not require grad and therefore **doesn't have a grad_fn**. When gradients are tracked, `backward()` computes a so-called **vector-Jacobian product** in the background to get the gradients.

<img src="pic/vjproduct.png" width=400>

The result is the final gradient that we are interested in.

## Prevent PyTorch from Tracking the Gradient Function

In [13]:
# Option 1
# x.requires_grad_(False)
# The trailing underscore marks an in-place operation: x itself is modified.

x = torch.randn(3, requires_grad=True)
print(x)

# Turn gradient tracking off on the same tensor; the second print shows the
# same values without requires_grad=True.
x.requires_grad_(False)
print(x)

tensor([2.6796, 0.8546, 0.4711], requires_grad=True)
tensor([2.6796, 0.8546, 0.4711])


In [14]:
# Option 2
# x.detach()
# Returns a new tensor with the same values that does not require grad;
# x itself keeps requires_grad=True.

x = torch.randn(3, requires_grad=True)
print(x)

y = x.detach()
print(y)


tensor([-1.3099,  1.4419,  0.2297], requires_grad=True)
tensor([-1.3099,  1.4419,  0.2297])


In [15]:
# Option 3
# with torch.no_grad():
# Operations inside the block are not recorded, even though x requires grad.

x = torch.randn(3, requires_grad=True)
print(x)

with torch.no_grad():
    y = x + 2
    print(y)  # printed without a grad_fn: the addition was not tracked

tensor([ 0.5318, -0.9862,  0.4070], requires_grad=True)
tensor([2.5318, 1.0138, 2.4070])


## Dummy Training Example

In [17]:
# One "training" pass over four weights that all start at 1.
weights = torch.ones(4, requires_grad=True)
for epoch in range(1):
    # Stand-in for a forward pass: scale each weight by 3 and reduce to a scalar.
    model_output = torch.sum(weights * 3)
    # d(model_output)/d(weights) is 3 for every element.
    model_output.backward()
    print(weights.grad)

tensor([3., 3., 3., 3.])


In [20]:
# Three passes without resetting the gradient in between.
weights = torch.ones(4, requires_grad=True)
for epoch in range(3):
    model_output = torch.sum(weights * 3)
    # backward() adds the new gradient to weights.grad instead of replacing it,
    # so the printed values grow each epoch (3, 6, 9) and are clearly incorrect:
    # the true gradient is 3 every time.
    model_output.backward()
    print(weights.grad)


tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])


In [21]:
# Three passes, this time clearing the gradient after each one.
weights = torch.ones(4, requires_grad=True)
for epoch in range(3):
    model_output = torch.sum(weights * 3)
    model_output.backward()
    print(weights.grad)

    # Before entering the next iteration, we must empty the gradient here;
    # otherwise the next backward() would add on top of the old values.
    # Now the gradient is correct again.
    weights.grad.zero_()
    

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
