In [1]:
import tensorflow as tf

In [2]:
# Scalar input and weight used by the first GradientTape demo.
x = tf.constant(2.0)
w = tf.constant(1.0)
# Evaluated eagerly here, i.e. outside of any GradientTape context.
y = w * x

In [3]:
# w is a constant tensor, so it is watched explicitly before the product
# is recorded on the tape.
with tf.GradientTape() as tape:
    tape.watch(w)
    y2 = w * x

In [4]:
# y was computed before the tape started recording (only y2 was recorded),
# so the tape has no path from y to w: expect [None] here, not a gradient.
# The tape is not persistent, so this gradient() call also releases it.
grad1 = tape.gradient(y, [w])

In [5]:
# gradient() can only be called once on a (non-persistent) tape.
# Use `with tf.GradientTape(persistent=True) as tape` to call it repeatedly.

with tf.GradientTape() as tape:
    tape.watch([w])
    y2 = x*w

In [6]:
# y2 = x*w was recorded on the tape, so this yields dy2/dw, which equals x (2.0).
grad2 = tape.gradient(y2, [w])
print(grad2)

[<tf.Tensor: id=7, shape=(), dtype=float32, numpy=2.0>]


## MSE Gradient

In [7]:
# Random input matrix of shape (2, 4).
x = tf.random.normal(shape=(2, 4))

In [8]:
# Random weight matrix of shape (4, 3), so that x @ w has shape (2, 3).
w = tf.random.normal(shape=(4, 3))

In [12]:
# Zero bias vector with one entry per output column.
b = tf.zeros(shape=(3,))
print(b)

tf.Tensor([0. 0. 0.], shape=(3,), dtype=float32)


In [11]:
# Integer class indices, one per row of x; they are one-hot encoded
# (depth=3) when the loss is computed.
y = tf.constant([2, 0])
print(y)

tf.Tensor([2 0], shape=(2,), dtype=int32)


In [13]:
# Forward pass under the tape: softmax over the linear layer, then the mean
# squared error against the one-hot encoded labels.
with tf.GradientTape() as tape:
    tape.watch([w, b])
    prob = tf.nn.softmax(tf.matmul(x, w) + b, axis=1)
    y_onehot = tf.one_hot(y, depth=3)
    per_sample_mse = tf.losses.MSE(y_onehot, prob)
    loss = tf.reduce_mean(per_sample_mse)

In [14]:
# Gradients of the MSE loss; the result list follows the [w, b] source order.
grads = tape.gradient(target=loss, sources=[w, b])

In [15]:
# d(loss)/dw: first entry of grads, same shape as w, i.e. (4, 3).
grads[0]

<tf.Tensor: id=90, shape=(4, 3), dtype=float32, numpy=
array([[-0.09146545,  0.06671529,  0.02475017],
       [-0.07156439,  0.05997042,  0.01159397],
       [-0.07628677,  0.09001668, -0.01372991],
       [-0.06432106,  0.09093941, -0.02661836]], dtype=float32)>

In [16]:
# d(loss)/db: second entry of grads, same shape as b, i.e. (3,).
grads[1]

<tf.Tensor: id=89, shape=(3,), dtype=float32, numpy=array([-0.0838195 ,  0.10676907, -0.02294957], dtype=float32)>

## Cross-entropy Gradient

In [17]:
# Fresh random input of shape (2, 4) for the cross-entropy example.
x = tf.random.normal(shape=(2, 4))

In [18]:
# Fresh random weights of shape (4, 3).
w = tf.random.normal(shape=(4, 3))

In [19]:
# Zero bias vector of length 3.
b = tf.zeros(shape=(3,))

In [20]:
# Integer class indices, one per row of x; one-hot encoded (depth=3) when
# the cross-entropy loss is computed.
y = tf.constant([2, 0])

In [22]:
# Forward pass under the tape. The raw linear outputs are passed straight to
# the loss with from_logits=True, so no explicit softmax is applied here.
with tf.GradientTape() as tape:
    tape.watch([w, b])
    logits = tf.matmul(x, w) + b
    y_onehot = tf.one_hot(y, depth=3)
    per_sample_ce = tf.losses.categorical_crossentropy(y_onehot, logits, from_logits=True)
    loss = tf.reduce_mean(per_sample_ce)

In [23]:
# Gradients of the cross-entropy loss, returned in [w, b] order.
grads = tape.gradient(target=loss, sources=[w, b])

In [24]:
# d(loss)/dw for the cross-entropy loss; shape (4, 3), matching w.
grads[0]

<tf.Tensor: id=180, shape=(4, 3), dtype=float32, numpy=
array([[ 0.3878701 ,  0.20546626, -0.5933364 ],
       [ 0.15277956,  0.11635216, -0.26913172],
       [-0.08306631, -0.32926387,  0.41233018],
       [-0.35213044, -0.40440443,  0.7565349 ]], dtype=float32)>

In [25]:
# d(loss)/db for the cross-entropy loss; shape (3,), matching b.
grads[1]

<tf.Tensor: id=179, shape=(3,), dtype=float32, numpy=array([ 0.02585843,  0.283982  , -0.30984044], dtype=float32)>

## 单输出感知机梯度

In [26]:
# Single random input row of shape (1, 3).
x = tf.random.normal(shape=(1, 3))

In [27]:
# All-ones weight column of shape (3, 1): one output unit.
w = tf.ones(shape=(3, 1))

In [28]:
# Bias for the single output unit, initialised to one.
b = tf.ones(shape=(1,))

In [29]:
# Target value for the single input row; compared with the sigmoid output
# through MSE in the loss cell.
y = tf.constant([1])

In [30]:
# Single-output perceptron: sigmoid(x @ w + b) scored against y with MSE.
# NOTE: despite its name, `logits` holds the sigmoid activation, not the raw
# pre-activation values.
with tf.GradientTape() as tape:
    tape.watch([w, b])
    logits = tf.sigmoid(tf.matmul(x, w) + b)
    squared_error = tf.losses.MSE(y, logits)
    loss = tf.reduce_mean(squared_error)

In [31]:
# Gradients of the perceptron loss, returned in [w, b] order.
grads = tape.gradient(target=loss, sources=[w, b])

In [32]:
# d(loss)/dw for the single-output perceptron; shape (3, 1), matching w.
grads[0]

<tf.Tensor: id=247, shape=(3, 1), dtype=float32, numpy=
array([[-0.00942246],
       [ 0.00144909],
       [-0.00694714]], dtype=float32)>

In [33]:
# d(loss)/db for the single-output perceptron; shape (1,), matching b.
grads[1]

<tf.Tensor: id=246, shape=(1,), dtype=float32, numpy=array([-0.0095673], dtype=float32)>

## 多输出感知机梯度

In [35]:
# Random input of shape (2, 4) for the multi-output perceptron.
x = tf.random.normal(shape=(2, 4))

In [36]:
# All-ones weight matrix of shape (4, 3): three output units.
w = tf.ones(shape=(4, 3))

In [37]:
# Zero bias, one entry per output unit.
b = tf.zeros(shape=(3,))

In [38]:
# Integer class indices, one per row of x; one-hot encoded (depth=3) when
# the loss is computed.
y = tf.constant([2, 0])

In [39]:
# Multi-output perceptron: softmax over x @ w + b, scored with the mean
# squared error against the one-hot labels.
with tf.GradientTape() as tape:
    tape.watch([w, b])
    prob = tf.nn.softmax(tf.matmul(x, w) + b, axis=1)
    y_onehot = tf.one_hot(y, depth=3)
    per_sample_mse = tf.losses.MSE(y_onehot, prob)
    loss = tf.reduce_mean(per_sample_mse)

In [40]:
# Gradients of the multi-output loss, returned in [w, b] order.
grads = tape.gradient(target=loss, sources=[w, b])

In [41]:
# d(loss)/dw for the multi-output perceptron; shape (4, 3), matching w.
grads[0]

<tf.Tensor: id=327, shape=(4, 3), dtype=float32, numpy=
array([[-0.03777831,  0.0453375 , -0.00755919],
       [-0.09684772, -0.00252806,  0.09937578],
       [-0.08477277,  0.04696488,  0.03780789],
       [ 0.0258387 ,  0.01839288, -0.04423157]], dtype=float32)>

In [42]:
# d(loss)/db for the multi-output perceptron; shape (3,), matching b.
grads[1]

<tf.Tensor: id=326, shape=(3,), dtype=float32, numpy=array([-0.03703704,  0.07407407, -0.03703704], dtype=float32)>

## 链式法则

In [43]:
# ppt

## 反向传播算法

In [44]:
# ppt