In [1]:
# Often called Autograd.

# Calculating Derivatives is vital in optimization algorithms to train NN.
# Deep Learning frameworks ease this tedious task using AUTOMATIC DIFFERENTIATION.

# As we pass data through each fxn, the framework builds a computational graph that tracks 
# how each value depends on others.

# To calculate derivatives, automatic differentiation works backwards through this graph
# while applying the chain rule.

# This algorithm for chain rule is called BACKPROPOGATION.


In [3]:
import torch

In [5]:
# Assuming we want to differentiate the fxn
# y = 2X dot product with X
# wrt column vector x.

# Assign x an initial value.

x = torch.arange(4.0)
x

tensor([0., 1., 2., 3.])

In [7]:
# We need to store X.
# Avoid allocating new memory because we may run out of memory in a Deep Neural Network.
# Thus we save the Gradient.

x.requires_grad_(True) # Alt is torch.arange(4.0, requires_grad = True)
x.grad # None by default

In [8]:
# Assign our function to Y.

y = 2 * torch.dot(x,x)
y

tensor(28., grad_fn=<MulBackward0>)

In [9]:
# We can now take gradient of Y
# By calling its backward method.
# We can also access the gradient via X.grad


y.backward()
x.grad


tensor([ 0.,  4.,  8., 12.])

In [10]:
# We know the gradient of 2x^2 is 4x. 
# We now verify it.

x.grad == 4*x

tensor([True, True, True, True])

In [12]:
# Now we calculate another function of X & take grad.
# Pytorch doesn;t reset the gradient buffer automatically.
# We have to do it.
# Pytorch adds a new gradient to the already stored gradient.

y.backward()
z = x.grad
print('Z : ', z)

y.backward()
z = x.grad
print('Z : ', z)

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [13]:
x.grad.zero_() # Reset it

y = x.sum()
y.backward()
x.grad

tensor([1., 1., 1., 1.])

# Backward Propogration For non Scalar Variables 

In [14]:
# For Non Scalar, we need to tell Pytorch how to convert it to scalar.
# This means we need to give it some vector to compute the scalar product.

x.grad.zero_()
y = x * x
y.backward(gradient = torch.ones(len(y))) # Alt - y.sum().backward()
x.grad

tensor([0., 2., 4., 6.])

# Detaching COmputation

In [15]:
# Used when we made some intermediate calculations but don't want it to show up during
# backward flow.

x.grad.zero_()
y = x*x
u = y.detach()
z = u*x

z.sum().backward()

x.grad ==u

tensor([True, True, True, True])

In [16]:
# This procedure detaches y's ancestors from the graph leading to z,
# the computed graph leading to y persists and thus
# we can calculate the gradients of y wrt x.

x.grad.zero_()
y.sum().backward()
x.grad == 2 * x

tensor([True, True, True, True])

# Gradients and Python Flow Control

In [17]:
# Previously the input and output had a well defined function (z = x*x*x).
# Now we make our answer depend on auxiliary variables.
# We can still get the gradients via automatic differentiation.

def f(a):
    b = a*2
    while b.norm() < 1000:
        b = b * 2
        
    if b.sum() > 0:
        c = b

    else:
        c = 100*b

    return c

In [18]:
# Defining a.

a = torch.randn(size = (), requires_grad = True)
print(a)

d = f(a)
d.backward()

tensor(-0.1943, requires_grad=True)


In [20]:
a.grad == d/a

tensor(False)

In [21]:
a.grad

tensor(819200.)

In [22]:
d/a

tensor(819200.0625, grad_fn=<DivBackward0>)