In [1]:
import tensorflow as tf

In [2]:
a = tf.fill([4], 0.25)
b = tf.fill([4], 2.)

In [3]:
# [0.25, 0.25, 0.25, 0.25] * [2., 2., 2., 2.] = [0.5, 0.5, 0.5, 0.5]
a*b

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.5, 0.5, 0.5, 0.5], dtype=float32)>

In [4]:
w = tf.constant(1.)
x = tf.constant(2.)

In [5]:
y = x*w

In [6]:
# 只有将复合运算包到下面才能自动求导数
with tf.GradientTape() as tape:
    # 对非tf.Variable类的数据需要加入监控才能够训练
    tape.watch([w])
    y2 = x*w
# 默认求一次导数后会释放显存计算的资源,默认只能求一次
# grad1 = tape.gradient(y, [w])
grad2 = tape.gradient(y2, [w])

In [7]:
grad2

[<tf.Tensor: shape=(), dtype=float32, numpy=2.0>]

In [8]:
with tf.GradientTape(persistent=True) as tape:
    tape.watch([w])
    y2 = x*w
# 将persistent参数设置为True可进行多次求导
grad1 = tape.gradient(y, [w])
grad2 = tape.gradient(y2, [w])

In [9]:
grad1,grad2

([None], [<tf.Tensor: shape=(), dtype=float32, numpy=2.0>])

In [10]:
x = tf.Variable(x)
w = tf.Variable(w)

In [11]:
b = tf.Variable(0.)

In [12]:
# 二阶求导
with tf.GradientTape() as t1:
    with tf.GradientTape() as t2:
        y = x * w + b
    dy_dw, dy_db = t2.gradient(y,[w, b])
d2y_dw2 = t1.gradient(dy_dw, w)

In [13]:
print(dy_dw,dy_db,d2y_dw2)

tf.Tensor(2.0, shape=(), dtype=float32) tf.Tensor(1.0, shape=(), dtype=float32) None


In [14]:
x,w,b

(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=2.0>,
 <tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.0>,
 <tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0>)

In [15]:
# MSE Gradient
x = tf.random.normal([2, 4])
w = tf.random.normal([4 ,3])
b = tf.zeros([3])
y = tf.constant([2, 0])

In [16]:
with tf.GradientTape() as tape:
    tape.watch([w, b])
    prob = tf.nn.softmax(x@w+b, axis=1)
    loss = tf.reduce_mean(tf.losses.MSE(tf.one_hot(y, depth=3), prob))
grads = tape.gradient(loss, [w,b])

In [17]:
grads[0]

<tf.Tensor: shape=(4, 3), dtype=float32, numpy=
array([[ 0.06164198,  0.00467886, -0.06632083],
       [ 0.02359621, -0.03324214,  0.00964593],
       [ 0.05432683,  0.05197052, -0.10629735],
       [ 0.13250771, -0.03988266, -0.09262505]], dtype=float32)>

In [18]:
grads[1]

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([-0.08361635,  0.03119816,  0.05241819], dtype=float32)>

In [19]:
# cross_entropy gradient
x = tf.random.normal([2, 4])
w = tf.random.normal([4, 3])
b = tf.zeros([3])
y = tf.constant([2, 0])

In [20]:
with tf.GradientTape() as tape:
    tape.watch([w, b])
    logits = x @ w + b
    # from_logits=True设置soft_max + cross_entropy,并且进行了数值稳定处理 
    loss = tf.reduce_mean(tf.losses.categorical_crossentropy(tf.one_hot(y, depth=3), logits, from_logits=True))
grads = tape.gradient(loss, [w, b])

In [21]:
grads[0], grads[1]

(<tf.Tensor: shape=(4, 3), dtype=float32, numpy=
 array([[-0.26351568,  0.27256033, -0.00904462],
        [-0.04631647,  0.181802  , -0.13548553],
        [ 0.02050748,  0.0602363 , -0.08074378],
        [ 0.19000828, -0.03584233, -0.15416594]], dtype=float32)>,
 <tf.Tensor: shape=(3,), dtype=float32, numpy=array([-0.189046  ,  0.33562157, -0.14657557], dtype=float32)>)

# 链式法则

In [24]:
x = tf.constant(1.)
w1 = tf.constant(2.)
b1 = tf.constant(1.)
w2 = tf.constant(2.)
b2 = tf.constant(1.)

In [25]:
with tf.GradientTape(persistent=True) as tape:
    tape.watch([w1, b1, w2, b2])
    y1 = x * w1 + b1
    y2 = y1 * w2 + b2
dy2_dy1 = tape.gradient(y2, [y1])[0]
dy1_dw1 = tape.gradient(y1, [w1])[0]
dy2_dw1 = tape.gradient(y2, [w1])[0]

In [27]:
print(dy2_dy1*dy1_dw1,"\n",dy2_dw1)

tf.Tensor(2.0, shape=(), dtype=float32) 
 tf.Tensor(2.0, shape=(), dtype=float32)
