# tf 自定义求导

In [1]:
# 导入
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
    print(module.__name__,module.__version__)

2.0.0
sys.version_info(major=3, minor=6, micro=10, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.18.1
pandas 0.25.3
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


### 求导回顾--近似求导

In [2]:
# 一元函数求导
def f(x):
    return 3. * x ** 2 + 2. * x - 1

def approximate_derivative(f, x, eps=1e-3):
    return (f(x + eps) - f(x - eps)) / (2. * eps)

print(approximate_derivative(f, 1.))

7.999999999999119


In [3]:
# 多元函数求导
def g(x1, x2):
    return (x1 + 5) * (x2 ** 2)

def approximate_gradient(g, x1, x2, eps=1e-3):
    dg_x1 = approximate_derivative(lambda x: g(x, x2), x1, eps)
    dg_x2 = approximate_derivative(lambda x: g(x1, x), x2, eps)
    return dg_x1,dg_x2

print(approximate_gradient(g, 2., 3.))

(8.999999999993236, 41.999999999994486)


### 在TensorFlow中自定义导数--tf.GradientTape

In [4]:
# tape只能使用一次
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)
with tf.GradientTape() as tape:
    z = g(x1, x2)
    
dz_x1 = tape.gradient(z, x1)
print(dz_x1)

try:
    dz_x2 = tape.gradient(z, x2)
except RuntimeError as ex:
    print(ex)

tf.Tensor(9.0, shape=(), dtype=float32)
GradientTape.gradient can only be called once on non-persistent tapes.


In [5]:
# 解决上述问题：将tape保存，设置persistent=True，用完后自己进行删除
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)
with tf.GradientTape(persistent=True) as tape:
    z = g(x1, x2)
    
dz_x1 = tape.gradient(z, x1)
dz_x2 = tape.gradient(z, x2)
print(dz_x1, dz_x2)

del tape

tf.Tensor(9.0, shape=(), dtype=float32) tf.Tensor(42.0, shape=(), dtype=float32)


In [6]:
# 一次性求多个梯度
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)
with tf.GradientTape() as tape:
    z = g(x1, x2)
    
dz_x1x2 = tape.gradient(z, [x1, x2])
print(dz_x1x2)

[<tf.Tensor: id=89, shape=(), dtype=float32, numpy=9.0>, <tf.Tensor: id=95, shape=(), dtype=float32, numpy=42.0>]


In [7]:
# 对常量求导
x1 = tf.constant(2.0)
x2 = tf.constant(3.0)
with tf.GradientTape() as tape:
    z = g(x1, x2)
    
dz_x1x2 = tape.gradient(z, [x1, x2])
print(dz_x1x2)

[None, None]


In [8]:
# 解决上述问题：tape设置关注常量
x1 = tf.constant(2.0)
x2 = tf.constant(3.0)
with tf.GradientTape() as tape:
    tape.watch(x1)
    tape.watch(x2)
    z = g(x1, x2)
    
dz_x1x2 = tape.gradient(z, [x1, x2])
print(dz_x1x2)

[<tf.Tensor: id=111, shape=(), dtype=float32, numpy=9.0>, <tf.Tensor: id=117, shape=(), dtype=float32, numpy=42.0>]


In [9]:
# 两个目标函数对一个变量求导
x = tf.Variable(5.0)
with tf.GradientTape() as tape:
    z1 = 3 * x
    z2 = x ** 2
    
tape.gradient([z1, z2], x)

<tf.Tensor: id=140, shape=(), dtype=float32, numpy=13.0>

In [10]:
# 求二阶导数
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)
with tf.GradientTape(persistent=True) as outer_tape:
    with tf.GradientTape(persistent=True) as inner_tape:
        z = g(x1, x2)
    inner_grads = inner_tape.gradient(z, [x1, x2])
outer_grads = [outer_tape.gradient(inner_grad, [x1, x2]) 
               for inner_grad in inner_grads]
print(outer_grads)

del inner_tape
del outer_tape

[[None, <tf.Tensor: id=177, shape=(), dtype=float32, numpy=6.0>], [<tf.Tensor: id=188, shape=(), dtype=float32, numpy=6.0>, <tf.Tensor: id=186, shape=(), dtype=float32, numpy=14.0>]]


### 模拟实现梯度下降算法

In [11]:
# 简单的梯度下降模拟
learning_rate = 0.1
x = tf.Variable(0.0)

for _ in range(100):
    with tf.GradientTape() as tape:
        z = f(x)
    dz_dx = tape.gradient(z, x)
    x.assign_sub(learning_rate * dz_dx)
    
print(x)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.3333333>


### GradientTape和optimizer结合使用

In [12]:
learning_rate = 0.1
x = tf.Variable(0.0)
optimizer = keras.optimizers.SGD(learning_rate=learning_rate)

for _ in range(100):
    with tf.GradientTape() as tape:
        z = f(x)
    dz_dx = tape.gradient(z, x)
    optimizer.apply_gradients([(dz_dx, x)])
    
print(x)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.3333333>


### 利用手工求导方式解决回归问题

In [13]:
# 导入数据集 房价预测
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [14]:
# 切分数据集
from sklearn.model_selection import train_test_split

x_train_all,x_test,y_train_all,y_test = train_test_split(
    housing.data,housing.target,random_state=7)
x_train,x_valid,y_train,y_valid = train_test_split(
    x_train_all,y_train_all,random_state=11)

print(x_train.shape,y_train.shape)
print(x_valid.shape,y_valid.shape)
print(x_test.shape,y_test.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [15]:
# 对数据进行归一化
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

metric 使用

In [16]:
# metric 使用
# mse均方差的使用
metric = keras.metrics.MeanSquaredError()
print(metric([5.], [2.]))
print(metric([0.], [1.]))
# metric会自动累加
print(metric.result())

# 如果不想累加，就进行清空
metric.reset_states()
print(metric.result())
metric([1.], [3.])
print(metric.result())

tf.Tensor(9.0, shape=(), dtype=float32)
tf.Tensor(5.0, shape=(), dtype=float32)
tf.Tensor(5.0, shape=(), dtype=float32)
tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(4.0, shape=(), dtype=float32)


手动的进行求导，并且在训练集上训练

model.fit的步骤
1. batch遍历训练集，得到metric
   - 这部分要进行自动求导
2. 一个epoch结束后，在验证集上进行验证，得到metric

In [17]:
# 准备工作，定义一些用到的变量
epochs = 100
batch_size = 32
steps_per_epoch = len(x_train_scaled) // batch_size
optimizer = keras.optimizers.Adam()
metric = keras.metrics.MeanSquaredError()

# 取数据
def random_batch(x, y, batch_size=32):
    idx = np.random.randint(0, len(x), size=batch_size)
    return x[idx], y[idx]

# 构建模型
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu',input_shape=x_train.shape[1:]),
    keras.layers.Dense(1)
])

In [18]:
# 模型训练，fit函数实现
for epoch in range(epochs):
    metric.reset_states() # 每轮清空metric
    # 1.遍历训练集，得到 metric
    for step in range(steps_per_epoch):
        x_batch, y_batch = random_batch(x_train_scaled, y_train, batch_size) # 手动取数据
        # 打开tape
        with tf.GradientTape() as tape:
            y_pred = model(x_batch) # 得到预测值
            loss = tf.reduce_mean(
                keras.losses.mean_squared_error(y_batch, y_pred)) # 得到loss值
            metric(y_batch, y_pred) # 累积计算metric
        grads = tape.gradient(loss, model.variables) # 手动求梯度
        grads_and_vars = zip(grads, model.variables) # 将求得的梯度和模型变量绑定
        optimizer.apply_gradients(grads_and_vars) # apply到optimizer上
    print('\rEpoch:', epoch, ' train mse:', metric.result().numpy(), end='')
    # 2.一个epoch结束后，在验证集上进行验证
    y_valid_pred = model(x_valid_scaled)
    valid_loss = tf.reduce_mean(
        keras.losses.mean_squared_error(y_valid_pred, y_valid)) # 得到验证集loss值
    print('\t valid mse:', valid_loss.numpy())



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch:  0  train mse:  2.4945946 0  train mse:  3.9841251 0  train mse:  3.5711834 0  train mse:  4.3275003 0  train mse:  3.364888 0  train mse:  3.2155404 0  train mse:  3.0330994 0  train mse:  2.81172 0  train mse:  2.707118 0  train mse:  2.7559512 0  train mse:  2.6659937 0  train mse:  2.58327412.5138779 0  train mse:  2.5741804 0  train mse:  2.5352063 0  train mse:  2.4917145	 valid mse:  1.7801257581802976
Epoch:  1  train mse:  1.6175961  train mse:  2.821956 1  train mse:  2.585198 1  train mse:  2.19634  train mse:  2.1405828 1  train mse:  2.0106177 1  train mse:  1.9385847 1  train mse:  1.8852016 1  train mse:  1.84603741.8070872 1  train mse:  1.7687284 1  train mse:  1.733