# In [None]:
import tensorflow as tf

# Toy example: differentiate a scalar quadratic loss w.r.t. a single variable.
param = tf.Variable([1.0])
with tf.GradientTape() as tape:
    # Everything computed from `param` inside this context is recorded on the tape.
    residual = param - 3
    loss = residual ** 2
# Analytically d(loss)/d(param) = 2 * (param - 3), i.e. -4 at param = 1.
grad = tape.gradient(target=loss, sources=param)
print("Gradient before step:", grad)
# Element-wise product whose output `y` is a vector rather than a scalar.
w = tf.Variable([1.0, 2.0])
x = tf.Variable([4.0, 6.0])
with tf.GradientTape() as tape:
    y = w * x
# For a non-scalar target, `tape.gradient` differentiates the sum of the
# target's elements, so the result has the shape of `w` and equals `x` here.
# Bind and print the result: a bare expression that is not the last statement
# of a notebook cell (or that runs in a plain script) is evaluated and then
# silently discarded.
dy_dw = tape.gradient(y, w)
print("dy/dw:", dy_dw)



# In practice an input batch holds more than one example; use `num_example = 2` here.
num_example = 2
tf.random.set_seed(22)  # seed the global generator so the sampled values repeat
W = tf.Variable(tf.random.normal(shape=(2, 10)))
X = tf.constant(tf.random.normal(shape=(10, num_example)))
print('W:', W.numpy())
print('X:', X.numpy())

# Only the matmul has to be recorded; printing `z` does not need the tape.
with tf.GradientTape() as tape:
    z = tf.matmul(W, X)
print('z:', z.numpy())

# `z` is a (2, num_example) matrix, so this is the gradient of the sum of its
# entries with respect to `W`.
dz_dW = tape.gradient(target=z, sources=W)
dz_dW

# The gradients w.r.t. `W` that we get are summed over the two examples.
# For example, since the per-example gradient w.r.t. w1 (`W[:, 0]`) equals x1 (the matching entry of `X[0]`),
# here we get 0.2138209 (= -0.12328745 + 0.33710834).
# During the backward pass of a neural network, however,
# we usually average the gradients across training examples (because of the learning rate, averaging is not strictly necessary).
# Some common misunderstandings are discussed [here](https://datascience.stackexchange.com/questions/33489/why-averaging-the-gradient-works-in-gradient-descent).
# Note that if the function output is a vector, `tape.gradient` computes the gradient of the sum of the vector's elements.
# See [`jacobian`](https://www.tensorflow.org/api_docs/python/tf/GradientTape#jacobian) for differentiating each element of the output vector separately.
