In [15]:
import tensorflow as tf
tf.keras.backend.set_floatx('float32')

# 不包含model的简单版本

In [28]:
def process(x):
    return 10 * x

def lossfunction(x):
    return x - 5

x = tf.Variable([[1]], dtype=tf.float32)
with tf.GradientTape(watch_accessed_variables=False) as tape:
    # watch的监控对象必须是tf.Variable类型
    tape.watch(x)
    # loss = y - 5 = 10 * x - 5，梯度应该是10
    y = process(x)
    loss = lossfunction(x)
gradients = tape.gradient(loss, x)
print ('gradients = ', gradients)
optimizer = tf.keras.optimizers.SGD()
print ('x before update = ', x)
optimizer.apply_gradients([(gradients, x)])
# 默认lr = 0.01，因此x = x - lr * gradients = 1 - 0.01 * 1 = 0.99
print ('x after update = ', x)

gradients =  tf.Tensor([[1.]], shape=(1, 1), dtype=float32)
x before update =  <tf.Variable 'Variable:0' shape=(1, 1) dtype=float32, numpy=array([[1.]], dtype=float32)>
x after update =  <tf.Variable 'Variable:0' shape=(1, 1) dtype=float32, numpy=array([[0.99]], dtype=float32)>


# 使用keras提供的loss function

In [34]:
def process(x):
    return x

x = tf.Variable([[10]], dtype=tf.float32)
with tf.GradientTape(watch_accessed_variables=False) as tape:
    # watch的监控对象必须是tf.Variable类型
    tape.watch(x)
    y = process(x)
    loss = tf.keras.losses.mse(y,tf.constant(5, tf.float32))
gradients = tape.gradient(loss, x)
print ('gradients = ', gradients)
optimizer = tf.keras.optimizers.SGD()
print ('x before update = ', x)
optimizer.apply_gradients([(gradients, x)])
print ('x after update = ', x)

gradients =  tf.Tensor([[10.]], shape=(1, 1), dtype=float32)
x before update =  <tf.Variable 'Variable:0' shape=(1, 1) dtype=float32, numpy=array([[10.]], dtype=float32)>
x after update =  <tf.Variable 'Variable:0' shape=(1, 1) dtype=float32, numpy=array([[9.9]], dtype=float32)>


# 包含frozed模型，更新x

In [30]:
class ExampleRandomNormal(tf.keras.initializers.Initializer):

    def __init__(self, weights):
        self.weights = weights

    def __call__(self, shape, dtype=None):
        return self.weights

    def get_config(self):  # To support serialization
        return {'weights': self.weights}

# y = 10 * x
layer = tf.keras.layers.Dense(1, kernel_initializer=ExampleRandomNormal([[10]]), input_shape=[None, 1])
model = tf.keras.Sequential([layer])

x = tf.Variable([[1]], dtype=tf.float32)
with tf.GradientTape(watch_accessed_variables=False) as tape:
    # watch的监控对象必须是tf.Variable类型
    tape.watch(x)
    # y = 10 * x，梯度应该是10
    y = model(x)
    loss = tf.keras.losses.mse(y,tf.constant(5, tf.float32))
gradients = tape.gradient(loss, x)
print ('gradients = ', gradients)
optimizer = tf.keras.optimizers.SGD()
print ('x before update = ', x)
optimizer.apply_gradients([(gradients, x)])
print ('x after update = ', x)

gradients =  tf.Tensor([[100.]], shape=(1, 1), dtype=float32)
x before update =  <tf.Variable 'Variable:0' shape=(1, 1) dtype=float32, numpy=array([[1.]], dtype=float32)>
x after update =  <tf.Variable 'Variable:0' shape=(1, 1) dtype=float32, numpy=array([[0.]], dtype=float32)>


# 更新模型参数

In [43]:
class ExampleRandomNormal(tf.keras.initializers.Initializer):

    def __init__(self, weights):
        self.weights = weights

    def __call__(self, shape, dtype=None):
        return self.weights

    def get_config(self):  # To support serialization
        return {'weights': self.weights}

# y = 10 * x
layer = tf.keras.layers.Dense(1, kernel_initializer=ExampleRandomNormal([[10]]), input_shape=[None, 1])
model = tf.keras.Sequential([layer])

x = tf.Variable([[1]], dtype=tf.float32)
with tf.GradientTape(watch_accessed_variables=True) as tape:
    y = model(x)
    loss = y
gradients = tape.gradient(loss, model.trainable_variables)
print ('gradients = ', gradients)
print ('model parameters before update = ', model.weights)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
print ('model parameters after update = ', model.weights)

gradients =  [<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[1.]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>]
model parameters before update =  [<tf.Variable 'dense_17/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[10.]], dtype=float32)>, <tf.Variable 'dense_17/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]
model parameters after update =  [<tf.Variable 'dense_17/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[9.99]], dtype=float32)>, <tf.Variable 'dense_17/bias:0' shape=(1,) dtype=float32, numpy=array([-0.01], dtype=float32)>]


# 更新多个模型的参数

In [46]:
class ExampleRandomNormal(tf.keras.initializers.Initializer):

    def __init__(self, weights):
        self.weights = weights

    def __call__(self, shape, dtype=None):
        return self.weights

    def get_config(self):  # To support serialization
        return {'weights': self.weights}

# y = 10 * x
layer1 = tf.keras.layers.Dense(1, kernel_initializer=ExampleRandomNormal([[10]]), input_shape=[None, 1])
model1 = tf.keras.Sequential([layer1])
layer2 = tf.keras.layers.Dense(1, kernel_initializer=ExampleRandomNormal([[5]]), input_shape=[None, 1])
model2 = tf.keras.Sequential([layer2])

def process(x):
    x = model1(x)
    x = model2(x)
    return x

x = tf.Variable([[1]], dtype=tf.float32)
# persistent=True！！！
with tf.GradientTape(watch_accessed_variables=True, persistent=True) as tape:
    y = process(x)
    loss = y
# 计算第一个模型的梯度
gradients = tape.gradient(loss, model1.trainable_variables)
print ('gradients = ', gradients)
print ('model1 parameters before update = ', model1.weights)
print ('model2 parameters before update = ', model2.weights)
# 更新第一个模型的参数
optimizer.apply_gradients(zip(gradients, model1.trainable_variables))
print ('model parameters after update = ', model1.weights)
print ('model parameters after update = ', model2.weights)

# 计算第二个模型的梯度
gradients = tape.gradient(loss, model2.trainable_variables)
print ('gradients = ', gradients)
print ('model1 parameters before update = ', model1.weights)
print ('model2 parameters before update = ', model2.weights)
# 更新第一个模型的参数
optimizer.apply_gradients(zip(gradients, model1.trainable_variables))
print ('model parameters after update = ', model1.weights)
print ('model parameters after update = ', model2.weights)

gradients =  [<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[5.]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([5.], dtype=float32)>]
model1 parameters before update =  [<tf.Variable 'dense_20/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[10.]], dtype=float32)>, <tf.Variable 'dense_20/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]
model2 parameters before update =  [<tf.Variable 'dense_21/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[5.]], dtype=float32)>, <tf.Variable 'dense_21/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]
model parameters after update =  [<tf.Variable 'dense_20/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[9.95]], dtype=float32)>, <tf.Variable 'dense_20/bias:0' shape=(1,) dtype=float32, numpy=array([-0.05], dtype=float32)>]
model parameters after update =  [<tf.Variable 'dense_21/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[5.]], dtype=float32)>, <tf.Variable 'dense_

# 重要！！！

process函数和loss函数中必须全部使用tf函数，不能使用相同作用的np函数，否则无法计算梯度。  
普通的运算符可以直接使用，不必写成tf函数的形式。  