# **5 tf中的求导**

In [1]:
# **tf中的求导**
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn 
import pandas as pd
import os, gc, sys, time
import tensorflow as tf
from tensorflow import keras

import warnings
warnings.filterwarnings('ignore')

## **5.1 导数定义**

In [2]:
def f(x):
    return 3. * x ** 2 + 2 * x - 1

def approximae_derivative(f, x, eta=1e-3):
    return (f(x + eta) - f(x - eta)) / (2 * eta)

In [3]:
approximae_derivative(f, 1.)

7.999999999999119

## **5.2 tf.GradientTape**

### **5.2.1 GradientTape的基本使用**

In [4]:
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)
with tf.GradientTape() as tape:
    # 打开一个梯度记录的tape
    z = x1 ** 2 + x2 ** 3
dz_x1 = tape.gradient(z, x1) # 参数为y和待求导的参数, tape只能使用一次
print(dz_x1)

try:
    dz_x2 = tape.gradient(z, x2)
except RuntimeError as ex:
    print(ex)

tf.Tensor(4.0, shape=(), dtype=float32)
GradientTape.gradient can only be called once on non-persistent tapes.


> tape只能使用一次

In [5]:
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)
with tf.GradientTape(persistent=True) as tape:
    # 打开一个梯度记录的tape
    z = x1 ** 2 + x2 ** 3
dz_x1 = tape.gradient(z, x1) # 参数为y和待求导的参数, tape只能使用一次
print(dz_x1)

try:
    dz_x2 = tape.gradient(z, x2)
    print(dz_x2)
except RuntimeError as ex:
    print(ex)
del tape

tf.Tensor(4.0, shape=(), dtype=float32)
tf.Tensor(27.0, shape=(), dtype=float32)


> 设置persist属性即可多次使用，但是需要显式删除

In [6]:
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)
with tf.GradientTape() as tape:
    # 打开一个梯度记录的tape
    z = x1 ** 2 + x2 ** 3
dz = tape.gradient(z, [x1, x2]) # 参数为y和待求导的参数, tape只能使用一次
print(dz)
del tape

[<tf.Tensor: id=102, shape=(), dtype=float32, numpy=4.0>, <tf.Tensor: id=107, shape=(), dtype=float32, numpy=27.0>]


> tape默认不会记录constant的梯度，需要指定watch

In [7]:
x1 = tf.constant(2.0)
x2 = tf.constant(3.0)
with tf.GradientTape() as tape:
    # 打开一个梯度记录的tape
    tape.watch(x1)
    tape.watch(x2)
    z = x1 ** 2 + x2 ** 3
dz = tape.gradient(z, [x1, x2]) # 参数为y和待求导的参数, tape只能使用一次
print(dz)

[<tf.Tensor: id=120, shape=(), dtype=float32, numpy=4.0>, <tf.Tensor: id=125, shape=(), dtype=float32, numpy=27.0>]


### **5.2.2 多函数求导**

In [8]:
x = tf.Variable(5.0)
with tf.GradientTape() as tape:
    z1 = 3 * x
    z2 = x ** 2
tape.gradient([z1, z2], x)

<tf.Tensor: id=148, shape=(), dtype=float32, numpy=13.0>

> 实际上是两个结果对x的导数之和

### **5.2.3 多阶导数求导**

In [9]:
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)

with tf.GradientTape(persistent=True) as outer_tape:
    with tf.GradientTape(persistent=True) as inner_tape:
        z = x1 ** 2 + x2 + 1
    inner_grads = inner_tape.gradient(z, [x1, x2])
    
outer_grads = [outer_tape.gradient(inner_grad, [x1, x2])
               for inner_grad in inner_grads]
print(outer_grads)
del inner_tape
del outer_tape

[[<tf.Tensor: id=183, shape=(), dtype=float32, numpy=2.0>, None], [None, None]]


以上四个分别是x1的二阶导，x1x2的二阶导，x2x1的二阶导，x2的二阶导

## **5.3 利用GradientTape实现梯度下降**

In [10]:
lerning_rate = 0.1
def f(x):
    return x**2 - x*2 + 1

x = tf.Variable(0.0)
for _ in range(100):
    with tf.GradientTape() as tape:
        z = f(x)
    grad = tape.gradient(z, x)
    x.assign_sub(lerning_rate * grad) # Varible不能使用=更新
print(x)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.9999999>


## **5.4 optimizer和GradientTape**

In [11]:
learning_rate = 0.1
def f(x):
    return x**2 - x*2 + 1

x = tf.Variable(0.0)
optimizer = keras.optimizers.SGD(lr = learning_rate)
for _ in range(100):
    with tf.GradientTape() as tape:
        z = f(x)
    grad = tape.gradient(z, x)
    optimizer.apply_gradients([(grad, x)]) # 使用apply_gradients来更新[(梯度，参数),.....]
  
print(x)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.9999999>


## **5.5 tf.keras和tf.GradientTape()结合使用**

### **5.5.1 数据引入与模型搭建**

In [12]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
from sklearn.model_selection import train_test_split
X_train_all, X_test, y_train_all, y_test = train_test_split(housing.data, housing.target, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, random_state=1)
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_val_scaled = std_scaler.transform(X_val)
X_test_scaled = std_scaler.transform(X_test)

In [21]:
model = keras.models.Sequential()
model.add(keras.layers.Dense(128, activation='relu', input_shape=X_train.shape[1:]))
model.add(keras.layers.Dense(1, activation='relu'))

### **5.5.2 fit的修改**

fit的工作
- batch 遍历数据集, 求metric
    - 自动求导， 更新参数
- epoch结束 在验证集上验证, 求metric

In [17]:
# metric使用
metric = keras.metrics.MeanSquaredError()
print(metric([5.0], [6]))
print(metric([4.0], [6]))

tf.Tensor(1.0, shape=(), dtype=float32)
tf.Tensor(2.5, shape=(), dtype=float32)


> metirc具有记录功能，使用`reset_states()`清空记录

In [19]:
epochs = 100
batch_size = 32
steps = len(X_train) // batch_size
optimizer = keras.optimizers.SGD()
metric = keras.metrics.MeanSquaredError()

In [20]:
# data的准备
def random_batch(x, y, batch_size=32):
    idx = np.random.randint(0, len(x), size=batch_size)
    return x[idx], y[idx]

In [28]:
for epoch in range(epochs):
    metric.reset_states() # 重置以下epoch
    for step in range(steps):
        x_batch, y_batch = random_batch(X_train_scaled, y_train, batch_size=32)
        with tf.GradientTape() as tape:
            y_hat = model(x_batch)
            loss = tf.reduce_mean(keras.losses.mean_squared_error(y_hat, y_batch))
            metric(y_batch, y_hat)
        grads = tape.gradient(loss, model.variables)
        grads_and_vars = zip(grads, model.variables)
        optimizer.apply_gradients(grads_and_vars)
        print('\rEPOCH:', epoch, 'train_mse', metric.result().numpy(), end='')
    y_valid_hat = model(X_val_scaled)
    valid_loss = tf.reduce_mean(keras.losses.mean_squared_error(y_val, y_valid_hat))
    print('valid mse', valid_loss.numpy())

EPOCH: 0 train_mse 1.3648723valid mse 1.3992026
EPOCH: 1 train_mse 1.291308valid mse 1.3198565
EPOCH: 2 train_mse 1.2525109valid mse 1.3112245
EPOCH: 3 train_mse 1.3224132valid mse 1.3091447
EPOCH: 4 train_mse 1.3051001valid mse 1.311061664 1.2810376
EPOCH: 5 train_mse 1.2616057valid mse 1.307519895 train_mse 1.2879657 5 train_mse 1.27133041.2586941.2596067 5 train_mse 1.2632151
EPOCH: 6 train_mse 1.2920135valid mse 1.30683421.2950865
EPOCH: 7 train_mse 1.2984773valid mse 1.3154217
EPOCH: 8 train_mse 1.2959733valid mse 1.3588241train_mse 1.3375747 1.283094
EPOCH: 9 train_mse 1.3050661valid mse 1.3071806
EPOCH: 10 train_mse 1.2543155valid mse 1.3069462
EPOCH: 11 train_mse 1.3023864valid mse 1.30376456768train_mse 1.3181422 1.3067721train_mse 1.3083869train_mse 1.3131604
EPOCH: 12 train_mse 1.2607656valid mse 1.30387998708
EPOCH: 13 train_mse 1.2952853valid mse 1.31050938train_mse 1.2910746
EPOCH: 14 train_mse 1.288251valid mse 1.307414
EPOCH: 15 train_mse 1.2626013valid mse 1.3039262
EP

本部分实际上就是讲解了keras内部到底是怎么去进行求导的，上面的代码其实和pytorch的过程有点类似，先计算模型输出，然后进行反向求导，最后进行参数的更新