In [None]:
# AutoDiff with torch.autograd
# One of the main methods of training is the back propagation algorithm.
# In this algorithm, the weights are updated based on the gradient of the loss function
# wrt the given parameters

# EXTRA NOTE on forward/backdiff
# Autodiff can be viewed as "just" the chain rule, but with some specific optimizations
# unique to computers. 
# Due to the chain rule, the gradient of a function is the product of its partial 
# derivatives. 
# The difference between forward and backwards differentiation is simply the order
# these sub gradients are multiplied

# Let f: R^n -> R^m, i.e. n is the input dimension, and m is the output dimension
# Back diff is usually faster when m << n, which is most applications
# Forward diff is usually faster when m >> n, which is rare in practice
# In rare cases, the architecture of the network could dictate that the min(m, n) is 
# actually in the middle of the network, in which case some hybrid of 
# forward and backward diff could be used

# sources:
# https://math.stackexchange.com/questions/2195377/reverse-mode-differentiation-vs-forward-mode-differentiation-where-are-the-be
# https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/slides/lec10.pdf


In [None]:
# Computational Graphs
# autodiff works by decomposing the function into a computational graph
# Each node in the graph represents a function, and the edges represent the
# input/output relationship between the functions
# The graph is then traversed in reverse order to compute the gradients
# The gradients are then used to update the weights in the network

# Example computational graph
import torch

# This is essentially a linear regression DGP, with cross entropy loss
# actually this DGP looks very wrong, but for demonstration purposes it suffices 
# input
x = torch.ones(5)
# output
y = torch.zeros(3)
# set requires_grad here to track the gradients
# OR you can manually change its attributes later using x.requires_grad_(True) method
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
# signal from the inputs
z = torch.matmul(x, w) + b
# cross entropy loss
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

# These outputs are not human readable
print(f"Gradient function for z: {z.grad_fn}")
print(f"Gradient function for loss: {loss.grad_fn}")

Gradient function for z: <AddBackward0 object at 0x769fcc3ca4d0>
Gradient function for loss: <BinaryCrossEntropyWithLogitsBackward0 object at 0x769fcc3c9a80>


In [None]:
# Actually computing the gradients
# call loss.backward
loss.backward()
# view the values of the gradients
print(w.grad)
print(b.grad)

# Important Note
# we can only call loss.backward() once, because it accumulates the gradients
# if we need to do several backward calls on the same graph, 
# we need to pass retain_graph=True tot he backward call, because
# backward() by default clears the gradients after the backward pass for computational purposes

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [7]:
# Disable Gradiant Tracking
# tracking gradients is computationally expensive
# we can disable tracking by using the torch.no_grad() context manager
# this is useful when we are only interested in the forward pass (what they call prediction)

z = torch.matmul(x, w) + b
print(z.requires_grad)

# to disable tracking
with torch.no_grad():
    z = torch.matmul(x, w) + b
print(z.requires_grad)

# Alternatively, we can also use the detach method
z = torch.matmul(x, w) + b
z_det = z.detach()
print(z_det.requires_grad)

True
False
False


In [None]:
# Extra: Tensor Gradients
# Usually, the loss function is a scalar, and the above is straightforward enough
# However, there are cases where the output function is some arbitrary tensor
# In this case, PyTorch allows you to compute the so-called Jacobian Product
# The Jacobian Product is the product of the Jacobian matrix of the output function
# and the vector in question, and not the actual gradient
# v' J, where J is the Jacobian matrix of the output function
# v is the vector in question, should be the same size as the original input

inp = torch.eye(5, requires_grad=True)
# out = some arbitrary tensor output
# adds 1 to inp, and raises it to the power of 2, and transposes it
out = (inp + 1).pow(2).t()

# First call
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"First call\n{inp.grad}")

# Second call
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"Second call\n{inp.grad}")

# Third call, but this time we first zero out the gradients
inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"Third call, after zeroing out gradients\n{inp.grad}")


First call
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.],
        [2., 2., 2., 2., 4.]])
Second call
tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.],
        [4., 4., 4., 4., 8.]])
Third call, after zeroing out gradients
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.],
        [2., 2., 2., 2., 4.]])
