# 搭建 Bi-GRU（双向 GRU）网络实现 MNIST 分类

在上个例子中，我们已经理解了在 TensorFlow 中如何来实现多层的 LSTM 和 GRU。现在我们来实现一下更加常用的 Bi-GRU。

In [1]:
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings; installed before TensorFlow is imported.

import tensorflow as tf

# Configure the session so that GPU memory is allowed to grow on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

import numpy as np

# Load the MNIST data through TensorFlow's tutorial helper, with one-hot labels.
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('../data/MNIST_data', one_hot=True) 

# Print the label array shapes to see how many samples each split contains.
print(mnist.test.labels.shape)
print(mnist.train.labels.shape)

Instructions for updating:
Use the retry module or similar alternatives.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting ../data/MNIST_data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting ../data/MNIST_data/train-labels-idx1-ubyte.gz
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting ../data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting ../data/MNIST_data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
(10000, 10)
(55000, 10)


**一、首先设置好模型用到的各个超参数**

In [2]:
lr = 1e-3            # Learning rate handed to the Adam optimizer.
input_size = 28      # Features per time step: one image row of 28 pixels is fed at each step.
timestep_size = 28   # Sequence length: 28 rows are fed before each prediction.
hidden_size = 256    # Number of hidden units in each GRU cell.
layer_num = 2        # Number of stacked GRU layers (per direction).
class_num = 10       # Number of output classes; a regression target would use 1 instead.
cell_type = "block_gru"   # Either "gru" or "block_gru".

X_input = tf.placeholder(tf.float32, [None, 784])
y_input = tf.placeholder(tf.float32, [None, class_num])
# Training and testing may use different batch sizes, so the size is supplied via a placeholder.
# NOTE(review): this placeholder is fed by the training loop but no graph op visible in this
# file consumes it — confirm whether it is still needed.
batch_size = tf.placeholder(tf.int32, [])  # The dtype must be tf.int32, e.g. batch_size = 128.
keep_prob = tf.placeholder(tf.float32, [])

**二、开始搭建 GRU 模型，和 LSTM 模型基本一致**

In [3]:
# Restore each flat 784-value sample to a 28 x 28 image.
# The following steps are the key parts of building the RNN / GRU model.

# Step 1: the RNN input has shape (batch_size, timestep_size, input_size).
# The declared hyperparameters are used instead of repeating the literal 28s so that
# the reshape always agrees with them.
X = tf.reshape(X_input, [-1, timestep_size, input_size])

# Step 2: build the GRU cells.
def gru_cell(cell_type, num_nodes, keep_prob):
    """Create a single GRU cell wrapped with output dropout.

    Args:
        cell_type: "gru" selects ``tf.contrib.rnn.GRUCell``; "block_gru" selects
            ``tf.contrib.rnn.GRUBlockCellV2``.
        num_nodes: Number of units passed to the cell constructor.
        keep_prob: Output keep probability handed to ``DropoutWrapper``.

    Returns:
        The chosen cell wrapped in ``tf.contrib.rnn.DropoutWrapper``.

    Raises:
        ValueError: If ``cell_type`` is not one of the supported names.
    """
    # Validate explicitly: a parenthesised ``assert (cond, msg)`` tests a non-empty
    # tuple, which is always true, so it can never reject a bad cell type (and
    # asserts are stripped under ``python -O`` anyway).
    if cell_type not in ("gru", "block_gru"):
        raise ValueError("Wrong cell type: {!r}".format(cell_type))
    if cell_type == "gru":
        cell = tf.contrib.rnn.GRUCell(num_nodes)
    else:
        cell = tf.contrib.rnn.GRUBlockCellV2(num_nodes)
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

# Build one list of dropout-wrapped GRU cells per direction: the forward list is
# created completely first, then the backward list, each with `layer_num` cells.
cells_fw, cells_bw = (
    [gru_cell(cell_type, hidden_size, keep_prob) for _ in range(layer_num)]
    for _direction in range(2)
)

# Only the per-time-step outputs are used; the two final-state results are discarded.
outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
    cells_fw=cells_fw,
    cells_bw=cells_bw,
    inputs=X,
    dtype=tf.float32,
)

print(outputs)  # shape=(?, 28, 512)
# Alternative used in the earlier examples: keep only the last time step.
# h_state = outputs[:, -1, :]
# print(h_state)

# Here the outputs are averaged over the time axis (axis 1) and that mean is used
# as the feature vector instead of the last time step.
h_state = tf.reduce_mean(outputs, axis=1, keepdims=False)
print(h_state)



Tensor("stack_bidirectional_rnn/cell_1/concat:0", shape=(?, 28, 512), dtype=float32)
Tensor("Mean:0", shape=(?, 512), dtype=float32)


**三、最后设置 loss function 和优化器，展开训练并完成测试**

In [4]:
import time

# Output layer: project the pooled Bi-GRU features (width hidden_size * 2, matching
# the 512-wide concatenated output of the bidirectional stack) onto the class scores.
W = tf.Variable(tf.truncated_normal([hidden_size*2, class_num], stddev=0.1), dtype=tf.float32)
bias = tf.Variable(tf.constant(0.1, shape=[class_num]), dtype=tf.float32)
logits = tf.matmul(h_state, W) + bias
y_pre = tf.nn.softmax(logits)


# Loss and evaluation metric.
# The log-probabilities come from tf.nn.log_softmax on the logits rather than from
# tf.log(y_pre): once a softmax probability underflows to 0, tf.log returns -inf and
# 0 * -inf turns the loss into NaN. The reduction is unchanged (mean over every
# element of the [batch, class_num] product), so the loss keeps the same scale.
cross_entropy = -tf.reduce_mean(y_input * tf.nn.log_softmax(logits))
train_op = tf.train.AdamOptimizer(lr).minimize(cross_entropy)

# Fraction of samples whose arg-max prediction equals the arg-max of the one-hot label.
correct_prediction = tf.equal(tf.argmax(y_pre, 1), tf.argmax(y_input, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))


# Initialise every variable, then train for 5000 steps and report every 500 steps.
sess.run(tf.global_variables_initializer())

_batch_size = 100   # Samples per batch, for both training and evaluation.
N = 100             # Number of test batches averaged in each report.

time0 = time.time()
for step in range(1, 5001):
    # One optimisation step on a training batch, with dropout keep probability 0.5.
    train_x, train_y = mnist.train.next_batch(batch_size=_batch_size)
    train_feed = {X_input: train_x, y_input: train_y, keep_prob: 0.5, batch_size: _batch_size}
    cost, acc, _ = sess.run([cross_entropy, accuracy, train_op], feed_dict=train_feed)

    if step % 500 != 0:
        continue

    # Evaluate over N test batches with dropout disabled (keep_prob = 1.0).
    test_cost = 0.0
    test_acc = 0.0
    for _ in range(N):
        test_x, test_y = mnist.test.next_batch(batch_size=_batch_size)
        eval_feed = {X_input: test_x, y_input: test_y, keep_prob: 1.0, batch_size: _batch_size}
        batch_cost, batch_acc = sess.run([cross_entropy, accuracy], feed_dict=eval_feed)
        test_cost += batch_cost
        test_acc += batch_acc

    # The elapsed time covers the 500 training steps plus this evaluation pass.
    print("step {}, train cost={:.6f}, acc={:.6f}; test cost={:.6f}, acc={:.6f}; pass {}s".format(step, cost, acc, test_cost/N, test_acc/N, time.time() - time0))
    time0 = time.time()

step 500, train cost=0.018590, acc=0.960000; test cost=0.010676, acc=0.964100; pass 48.847047090530396s
step 1000, train cost=0.003681, acc=0.980000; test cost=0.007433, acc=0.976600; pass 47.62926483154297s
step 1500, train cost=0.009871, acc=0.960000; test cost=0.006339, acc=0.980500; pass 46.96517825126648s
step 2000, train cost=0.013135, acc=0.960000; test cost=0.004675, acc=0.985000; pass 46.62891340255737s
step 2500, train cost=0.000233, acc=1.000000; test cost=0.005693, acc=0.982500; pass 46.65591073036194s
step 3000, train cost=0.001926, acc=0.990000; test cost=0.005315, acc=0.984700; pass 46.85902285575867s
step 3500, train cost=0.010246, acc=0.980000; test cost=0.003928, acc=0.988100; pass 47.245213747024536s
step 4000, train cost=0.003431, acc=0.990000; test cost=0.004989, acc=0.985800; pass 46.78474521636963s
step 4500, train cost=0.000430, acc=1.000000; test cost=0.003681, acc=0.989100; pass 46.685664653778076s
step 5000, train cost=0.000663, acc=1.000000; test cost=0.0033

一般来说，这种序列分类的任务中，Bi-GRU 的结果都会比单向的要好一些，只是这个 MNIST 分类的任务比较简单，没能体现出明显的优势。但是和上一个实验比较，你就会发现，Bi-GRU 的速度明显比单向的慢，而且慢很多。

下面统计了不同模型在 MNIST 上的速度：每迭代 500 个 training batch（batch_size=100）和 100 个 testing batch 所花费的时间（单位：秒）。因为准确率都差不多，就不写了。每个模型都是在没有其他任务的情况下跑的，所以时间还是比较有参考意义的。

|Model|1-layer|2-layer|
|:----:|:---:|:---:|
|LSTM|6.3|13.2|
|GRU|4.8|10.2|
|Bi-GRU|24.0|46.7|
|CNN|-|2.4|

从上面的结果可以看到：
- CNN 的速度相对 RNN 来说要快不少
- GRU 要比 LSTM 稍微快一些
- Bi-GRU 比 GRU 要慢好多好多

为什么一层的 Bi-GRU 都能慢这么多，这个还真没搞清楚。