## Automatic differentiation

In [1]:
#Author-Vishal Burman

In [2]:
from mxnet import autograd, nd

In [3]:
#Let's take a toy example:
# y = 2 * x^T x, i.e. twice the sum of the squared elements of x

#So the gradient of y with respect to x should be 4*x

In [4]:
# Create the vector [0, 1, 2, 3]; it is the variable differentiated in the cells below.
x=nd.arange(4)

In [5]:
# Display the array together with its Python type.
x, type(x)

(
 [0. 1. 2. 3.]
 <NDArray 4 @cpu(0)>, mxnet.ndarray.ndarray.NDArray)

In [6]:
#The gradient can be stored on the fly by calling the attach_grad method

In [7]:
# Attach a gradient buffer to x; the gradient is later read back through x.grad.
x.attach_grad()

In [8]:
#MXNet builds the computation graph on the fly, but only for operations executed inside an autograd.record() scope

In [9]:
# Operations executed inside the record() scope are tracked for differentiation.
with autograd.record():
    # Element-wise: y_i = 2 * x_i^2.
    y=2*(x*x)


In [10]:
#Now we can calculate the gradients of the input by calling the backward() function...

In [11]:
# Backpropagate from y; the resulting gradient is stored in x.grad.
y.backward()

In [12]:
#Now gradient of the function should be equal to 4x...lets verify this

In [13]:
# Difference between the computed gradient and the analytic one (4*x); all zeros means they agree.
x.grad-4*x


[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>

In [14]:
#If x is used in another recorded computation, the contents of x.grad are overwritten by the next backward() call

In [15]:
# Record a second, unrelated computation on the same x: its norm.
with autograd.record():
    y=x.norm()
# This backward pass replaces the 4*x gradient previously held in x.grad.
y.backward()
# Display the new gradient (the output below matches x / ||x||).
x.grad


[0.         0.26726124 0.5345225  0.80178374]
<NDArray 4 @cpu(0)>

## Backward for Non-Scalar variable

In [16]:
#When y is not a scalar the gradients could be a high order tensor

In [17]:
#We generally compute gradients for loss functions...whose values are often scalars

In [18]:
#By default, MXNet sums the elements of y into a new scalar variable...

In [19]:
#It then finds the analytical gradient of variable with respect to x

In [20]:
# y is a vector here, not a scalar.
with autograd.record():
    y=x*x
# backward() on a non-scalar y: the gradient ends up in x.grad.
y.backward()

# Repeat the computation on an independent copy of x, this time summing
# explicitly before calling backward().
u=x.copy()
u.attach_grad()
with autograd.record():
    v=(u*u).sum()
v.backward()
# Both approaches give the same gradient, so the difference is all zeros.
x.grad-u.grad


[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>

In [21]:
#The above difference is 0, which verifies the property described above

## Detach Computations

In [22]:
#u = y.detach() returns a new variable u with the same values as y, but forgets how those values were computed

In [23]:
#Namely u is treated as a constant

In [24]:
#The following backward computes d(u*x)/dx = u, with u = x*x held constant, instead of d(x^3)/dx = 3*x^2

In [25]:
with autograd.record():
    y=x*x
    # u carries the values of y but is cut off from the graph, so it acts as a constant.
    u=y.detach()
    z=u*x
z.backward()
# d(u*x)/dx with u constant is u, so this difference is all zeros.
x.grad-u
    


[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>

In [26]:
#Since y is still recorded we can call y.backward() to get dy/dx=2x

In [27]:
# y = x*x was recorded before the detach, so it can still be differentiated on its own.
y.backward()
# dy/dx = 2*x, so this difference is all zeros.
x.grad-2*x


[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>

## Attach gradients to Internal Variables

In [28]:
#Calling attach_grad() on a variable implicitly detaches it (x=x.detach()).
#If the variable was computed from other variables, that part of the computation is not used by backward()

In [29]:
y=nd.ones(4)*2
y.attach_grad()
with autograd.record():
    u=x*y
    # Attaching a gradient to the intermediate u cuts it off from x*y for backward().
    u.attach_grad()
    z=u+x
z.backward()
# Only the direct terms of z = u + x contribute: x.grad and u.grad are ones,
# while y.grad stays zero because the x*y step is not traversed.
x.grad, u.grad, y.grad

(
 [1. 1. 1. 1.]
 <NDArray 4 @cpu(0)>, 
 [1. 1. 1. 1.]
 <NDArray 4 @cpu(0)>, 
 [0. 0. 0. 0.]
 <NDArray 4 @cpu(0)>)

## Head gradients

In [30]:
#Detaching lets us break the computation into several parts

In [31]:
#u=f(x) & z=g(u) dz/dx=(dz/du) * (du/dx)

In [32]:
# Split z = g(u), u = f(x, y) into two stages so the chain rule
# dz/dx = (dz/du) * (du/dx) can be applied by hand.
y=nd.ones(4)*2
y.attach_grad()
with autograd.record():
    u=x*y
    v=u.detach()#u still keeps the computation graph
    # v needs its own gradient buffer: without attach_grad() v.grad stays None,
    # so there would be no dz/dv to pass on as a head gradient afterwards.
    v.attach_grad()
    z=v+x
z.backward()
# Only the second stage (z = v + x) has been differentiated so far: x.grad holds
# the contribution of the direct "+ x" term and y.grad is still zero.
x.grad, y.grad

(
 [1. 1. 1. 1.]
 <NDArray 4 @cpu(0)>, 
 [0. 0. 0. 0.]
 <NDArray 4 @cpu(0)>)

In [33]:
# Backpropagate through the first stage (u = x*y), passing v.grad as the head gradient.
u.backward(v.grad)
# x.grad and y.grad are overwritten by this pass (du/dx = y, du/dy = x, scaled by the head gradient).
x.grad, y.grad

(
 [2. 2. 2. 2.]
 <NDArray 4 @cpu(0)>, 
 [0. 1. 2. 3.]
 <NDArray 4 @cpu(0)>)

## Computing the gradient of Python Control Flow

In [34]:
def f(a):
    """Scale `a` using data-dependent Python control flow.

    `a` is doubled once and then doubled again for as long as the norm of the
    running value is below 1000. The result is returned as-is when the sum of
    its elements is positive, and multiplied by 100 otherwise.

    An input whose norm is zero never leaves the loop, because doubling it
    cannot raise the norm.
    """
    scaled=a*2
    # Number of iterations depends on the magnitude of the input.
    while scaled.norm().asscalar()<1000:
        scaled=scaled*2
    is_positive=scaled.sum().asscalar()>0
    return scaled if is_positive else 100*scaled

In [35]:
# Draw a single random input from a normal distribution.
# NOTE(review): no seed is set in the visible cells, so the value differs between runs.
a=nd.random.normal(shape=1)

In [39]:
a.attach_grad()
# The while/if inside f execute while recording, so the recorded operations
# follow the branch actually taken for this particular a.
with autograd.record():
    d=f(a)
d.backward()

In [40]:
# f only multiplies a by constants (d = k*a for some k), so the gradient should equal d/a.
print(a.grad==(d/a))


[1.]
<NDArray 1 @cpu(0)>


## Training Mode and Prediction mode

In [None]:
#Inside an autograd.record() scope MXNet records the operations so it can calculate gradients, and it switches to training mode

In [41]:
# Outside a record() scope.
print(autograd.is_training())
with autograd.record():
    # Inside a record() scope.
    print(autograd.is_training())

False
True
