In [2]:
import numpy as np
import tensorflow as tf
from keras.datasets import imdb
from tensorflow.contrib.rnn import GRUCell
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
from tqdm import tqdm

Using TensorFlow backend.


In [20]:
# Load the dataset.
# NUM_WORDS and INDEX_FROM are forwarded to imdb.load_data as `num_words` and `index_from`.
# presumably `num_words` caps the word ids and `index_from` offsets real words past the
# reserved ids — confirm against the Keras imdb.load_data documentation.
NUM_WORDS = 10000
INDEX_FROM = 3
# NOTE(review): the path below is an absolute path on a local Windows drive, so this cell
# only runs unchanged on a machine that has that exact file layout.
(X_train, y_train), (X_test, y_test) = imdb.load_data(path="D:/kesai/textmining/data/imdb.npz",
                                                      num_words=NUM_WORDS, index_from=INDEX_FROM)

## 序列化预处理

In [21]:
def get_vocabulary_size(X):
    """Return the vocabulary size implied by a collection of token-id sequences.

    The size is the largest token id found in any sequence plus one, so that
    every id in ``X`` is a valid row index into a table of that many rows.

    Args:
        X: iterable of sized sequences of integer token ids. Empty sequences are
            skipped instead of aborting the whole computation.

    Returns:
        The largest token id in ``X`` plus one.

    Raises:
        ValueError: if ``X`` contains no tokens at all (no sequences, or only
            empty ones).
    """
    # A generator avoids materialising an intermediate list; `default` lets us
    # report the "no tokens" case with a clear message.
    largest = max((max(seq) for seq in X if len(seq) > 0), default=None)
    if largest is None:
        raise ValueError("cannot infer vocabulary size: X contains no tokens")
    return largest + 1

# Vocabulary size is derived from the training split only.
# NOTE: the variable is spelled `vocabulay_size` (sic); later cells reference this exact
# spelling, so renaming it requires updating them as well.
vocabulay_size = get_vocabulary_size(X_train)
print(vocabulay_size)

10000


In [22]:
def fit_in_vocabulary(X, voc_size):
    """Drop every token id that is not strictly below ``voc_size``.

    Args:
        X: iterable of sequences of token ids.
        voc_size: exclusive upper bound for the ids that are kept.

    Returns:
        A new list with one list per input sequence, holding the surviving ids
        in their original order. Sequences may come back empty.
    """
    filtered = []
    for sequence in X:
        kept = []
        for token in sequence:
            if token < voc_size:
                kept.append(token)
        filtered.append(kept)
    return filtered

# Remove test-set token ids that are >= the vocabulary size computed from the training
# split. X_test is rebound to the filtered lists; X_train is not passed through this step.
X_test = fit_in_vocabulary(X_test, vocabulay_size)

In [23]:
SEQUENCE_LENGTH = 250

def zero_pad(X, seq_len):
    """Truncate or right-pad every sequence with zeros to exactly ``seq_len`` items.

    At most ``seq_len - 1`` leading tokens of each sequence are kept, so every
    output row ends with at least one ``0``.

    Each sequence is converted to a list before slicing. With plain lists this
    matches the previous behaviour; it also makes tuples and ndarray rows work,
    where ``row + [0]`` would otherwise raise or be a broadcast addition that
    returns rows of the wrong width. As a result, padding an already padded
    array returns the same values.

    Args:
        X: iterable of token-id sequences (lists, tuples or 1-D arrays).
        seq_len: target row length; must be at least 1.

    Returns:
        ``np.ndarray`` with one row of length ``seq_len`` per input sequence.

    Raises:
        ValueError: if ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be >= 1, got {}".format(seq_len))

    padded = []
    for seq in X:
        tokens = list(seq)
        # Always append at least one zero, even when the sequence gets truncated.
        n_pad = max(seq_len - len(tokens), 1)
        padded.append(tokens[:seq_len - 1] + [0] * n_pad)
    return np.array(padded)

# Pad/truncate both splits to SEQUENCE_LENGTH. X_train and X_test are rebound to the
# padded arrays, so the unpadded sequences are no longer reachable under these names.
X_train = zero_pad(X_train, SEQUENCE_LENGTH)
X_test = zero_pad(X_test, SEQUENCE_LENGTH)
print(X_train.shape)

(25000, 250)


## 构建模型

In [27]:
# Graph inputs, grouped under the "Inputs" name scope.
with tf.name_scope("Inputs"):
    # Batch of token ids; the second dimension is fixed to SEQUENCE_LENGTH.
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name="batch_ph")
    # One float label per example; used as `labels` of the sigmoid cross-entropy loss.
    target_ph = tf.placeholder(tf.float32, [None], name="target_ph")
    # One length per example; passed as `sequence_length` to the bidirectional RNN.
    seq_len_ph = tf.placeholder(tf.int32, [None], name="seq_len_ph")
    # Dropout keep probability. NOTE: the Python name is `keeb_prob_ph` (sic) while the
    # graph name is "keep_prob_ph"; other cells use the Python spelling as written here.
    keeb_prob_ph = tf.placeholder(tf.float32, name="keep_prob_ph")

In [28]:
# Embedding layer
EMBEDDING_DIM = 100

with tf.name_scope("Embedding_layer"):
    # Trainable [vocabulay_size, EMBEDDING_DIM] table, initialised from a uniform
    # distribution between -1.0 and 1.0 (no pre-trained vectors are loaded here).
    embedding_var = tf.Variable(tf.random_uniform([vocabulay_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
    tf.summary.histogram("embedding", embedding_var)
    # Look up one embedding row per token id in the input batch.
    batch_embedded = tf.nn.embedding_lookup(embedding_var, batch_ph)

Instructions for updating:
Colocations handled automatically by placer.


In [29]:
# (Bi-)RNN layer
HIDDEN_SIZE = 150

# Bidirectional dynamic RNN with one GRUCell(HIDDEN_SIZE) per direction, driven by the
# per-example lengths in seq_len_ph. The second return value is discarded.
# presumably rnn_outputs is a (forward, backward) tuple — attention() concatenates its
# input along axis 2 when it receives a tuple; confirm against the TF docs.
rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE), GRUCell(HIDDEN_SIZE),
                        inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)
tf.summary.histogram("RNN_outputs", rnn_outputs)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.


<tf.Tensor 'RNN_outputs:0' shape=() dtype=string>

## attention层

In [38]:
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Collapse a sequence of RNN outputs into one vector via learned attention.

    For every time step a score is computed as ``u . tanh(W x + b)``; the scores
    are turned into weights with a softmax and the output is the weighted sum of
    the inputs over axis 1.

    Creates three new ``tf.Variable`` objects (``w_omega``, ``b_omega``,
    ``u_omega``) on every call.

    Args:
        inputs: a rank-3 tensor, or a tuple of such tensors (e.g. the two
            outputs of a bidirectional RNN), which is concatenated along
            axis 2. The size of axis 2 must be statically known.
        attention_size: width of the intermediate attention projection.
        time_major: if True, ``inputs`` has its first two axes swapped
            relative to the layout used internally and is transposed with
            permutation ``[1, 0, 2]`` first.
        return_alphas: if True, also return the attention weights.

    Returns:
        ``output`` — the inputs summed over axis 1 after weighting — or the
        tuple ``(output, alphas)`` when ``return_alphas`` is True.

    NOTE(review): the softmax runs over every position of the last axis of
    the scores; no sequence-length mask is applied inside this function.
    """
    if isinstance(inputs, tuple):
        # Join the per-direction outputs on the feature axis.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # Use the public tf.transpose; `tf.array_ops` is not part of the
        # documented top-level TensorFlow API.
        inputs = tf.transpose(inputs, [1, 0, 2])

    hidden_size = inputs.shape[2].value

    # Trainable parameters
    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

    with tf.name_scope('v'):
        # Project every time step into the attention space.
        v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)

    # One scalar score per time step, normalised into attention weights.
    vu = tf.tensordot(v, u_omega, axes=1, name="vu")
    alphas = tf.nn.softmax(vu, name="alphas")

    # Weighted sum of the inputs over axis 1.
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

    if return_alphas:
        return output, alphas
    return output

In [39]:
# Width of the attention projection passed to attention().
ATTENTION_SIZE = 50

with tf.name_scope("Attention_layer"):
    # Reduce the Bi-RNN outputs to one vector per example; the attention weights are
    # returned as well so they can be logged as a histogram.
    attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)
    tf.summary.histogram("alphas", alphas)

In [41]:
# Dropout on the attention output; the keep probability is fed through keeb_prob_ph
# (passed positionally as the second argument of tf.nn.dropout).
drop = tf.nn.dropout(attention_output, keeb_prob_ph)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [47]:
# Fully connected layer
# Re-declared in this cell; W below is sized HIDDEN_SIZE * 2, so this must stay equal to
# the value used for the two GRU cells whose outputs feed the attention layer.
HIDDEN_SIZE = 150

with tf.name_scope("Fully_connected_layer"):
    # Single output unit: one logit per example.
    W = tf.Variable(tf.truncated_normal([HIDDEN_SIZE * 2, 1], stddev=0.1))
    b = tf.Variable(tf.constant(0, shape=[1], dtype=tf.float32))
    y_hat = tf.nn.xw_plus_b(drop, W, b)
    # Squeeze only the unit axis. Without an explicit axis a batch of size 1 would
    # also lose its batch dimension and no longer match the [None]-shaped labels.
    y_hat = tf.squeeze(y_hat, axis=1)
    tf.summary.histogram("W", W)

In [52]:
# Loss, optimizer step and accuracy, all created under the "Metrics" name scope.
with tf.name_scope("Metrics"):
    # Mean sigmoid cross-entropy between the logits y_hat and the labels in target_ph.
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=target_ph))
    tf.summary.scalar("loss", loss)
    # NOTE: despite its name, `optimizer` is the op returned by minimize(), i.e. running
    # it performs one Adam update step.
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)
    
    # Accuracy: fraction of examples whose rounded sigmoid(y_hat) equals the label.
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), target_ph), tf.float32))
    tf.summary.scalar("accuracy", accuracy)

In [54]:
# Single op that evaluates every summary registered so far (histograms and scalars).
merged = tf.summary.merge_all()

## 批量生成

In [74]:
def batch_generator(X, y, batch_size):
    """Yield shuffled ``(X_batch, y_batch)`` pairs forever.

    Each pass draws a fresh random permutation (from NumPy's global random
    state), applies it to both arrays, and yields consecutive full batches.
    Samples that do not fill a complete batch at the end of a pass are dropped
    for that pass. The input arrays are never modified.

    Args:
        X: array whose first axis indexes samples.
        y: array of targets aligned with ``X`` along the first axis; must
            support indexing with an integer index array.
        batch_size: number of samples per batch; must satisfy
            ``1 <= batch_size <= X.shape[0]``.

    Yields:
        Tuples ``(X_batch, y_batch)``, each with ``batch_size`` samples.

    Raises:
        ValueError: if ``batch_size`` is outside the valid range. Without this
            check an oversized batch would make the loop spin forever without
            yielding. Being a generator, the check runs on the first ``next()``.
    """
    size = X.shape[0]
    if batch_size <= 0 or batch_size > size:
        raise ValueError(
            "batch_size must be between 1 and the number of samples ({}), got {}".format(
                size, batch_size))

    while True:
        indices = np.arange(size)
        np.random.shuffle(indices)
        # Fancy indexing already returns new arrays, so no explicit copy is needed.
        X_shuffled = X[indices]
        y_shuffled = y[indices]
        # Stop before a trailing partial batch; the next pass reshuffles.
        for start in range(0, size - batch_size + 1, batch_size):
            stop = start + batch_size
            yield X_shuffled[start:stop], y_shuffled[start:stop]

In [75]:
# Number of examples per batch for both training and evaluation.
BATCH_SIZE = 256

# Endless generators over the padded splits; the training cell pulls a fixed number of
# batches per epoch from each with next().
train_batch_generator = batch_generator(X_train, y_train, BATCH_SIZE)
test_batch_generator = batch_generator(X_test, y_test, BATCH_SIZE)

## 开始训练

In [58]:
# Separate summary writers for the training and test curves, both attached to the graph
# that owns the `accuracy` tensor.
train_writer = tf.summary.FileWriter("./logdir/train", accuracy.graph)
test_writer = tf.summary.FileWriter("./logdir/test", accuracy.graph)

# Session configuration with GPU option allow_growth=True.
session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))

# Created with default arguments; used by the training cell to save the model.
saver = tf.train.Saver()

In [76]:
NUM_EPOCHS = 3  # Model easily overfits without pre-trained words embeddings, that's why train for a few epochs
DELTA = 0.5  # Weight of the newest batch in the running (exponentially smoothed) training loss
MODEL_PATH = './model'  # Prefix handed to saver.save() after training
KEEP_PROB = 0.8  # Dropout keep probability fed during training (1.0 is fed during evaluation)

if __name__ == "__main__":
    with tf.Session(config=session_conf) as sess:
        sess.run(tf.global_variables_initializer())
        print("Start learning...")
        for epoch in range(NUM_EPOCHS):
            loss_train = 0
            loss_test = 0
            accuracy_train = 0
            accuracy_test = 0

            print("epoch: {}\t".format(epoch+1), end="")

            # Training
            num_batches = X_train.shape[0] // BATCH_SIZE
            # The loop variable is `batch_idx`, not `b`, so it does not overwrite the
            # notebook-global bias variable `b` of the fully connected layer.
            for batch_idx in tqdm(range(num_batches)):
                x_batch, y_batch = next(train_batch_generator)
                # Length = position of the first 0 plus one; relies on every row
                # containing at least one 0.
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch])
                loss_tr, acc, _, summary = sess.run([loss, accuracy, optimizer, merged],
                                                    feed_dict={batch_ph: x_batch,
                                                               target_ph: y_batch,
                                                               seq_len_ph: seq_len,
                                                               keeb_prob_ph: KEEP_PROB})
                accuracy_train += acc
                # Exponentially smoothed loss rather than a plain epoch average.
                loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
                train_writer.add_summary(summary, batch_idx + num_batches * epoch)
            accuracy_train /= num_batches

            # Testing
            num_batches = X_test.shape[0] // BATCH_SIZE
            for batch_idx in tqdm(range(num_batches)):
                x_batch, y_batch = next(test_batch_generator)
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch])
                # No optimizer op here and keep probability 1.0: evaluation only.
                loss_test_batch, acc, summary = sess.run([loss, accuracy, merged],
                                                         feed_dict={batch_ph: x_batch,
                                                                    target_ph: y_batch,
                                                                    seq_len_ph: seq_len,
                                                                    keeb_prob_ph: 1.0})
                accuracy_test += acc
                loss_test += loss_test_batch
                test_writer.add_summary(summary, batch_idx + num_batches * epoch)
            accuracy_test /= num_batches
            loss_test /= num_batches

            # The fourth value is the validation accuracy, so it is labelled val_acc
            # (it was previously printed under a second "val_loss" label).
            print("loss:{:.3f}, val_loss:{:.3f}, acc:{:.3f}, val_acc:{:.3f}".format(
                loss_train, loss_test, accuracy_train, accuracy_test))
        train_writer.close()
        test_writer.close()
        saver.save(sess, MODEL_PATH)
        print("Run 'tensorboard --logdir ./ --host=127.0.0.1' to checkout tensorboard logs.")

Start learning...
epoch: 0	

100%|██████████████████████████████████████████| 97/97 [11:25<00:00,  7.06s/it]
100%|██████████████████████████████████████████| 97/97 [04:22<00:00,  2.70s/it]


loss:0.432, val_loss:0.424, acc:0.713,val_loss:0.804
epoch: 1	

100%|██████████████████████████████████████████| 97/97 [08:51<00:00,  5.48s/it]
100%|██████████████████████████████████████████| 97/97 [04:10<00:00,  2.59s/it]


loss:0.320, val_loss:0.351, acc:0.846,val_loss:0.845
epoch: 2	

100%|██████████████████████████████████████████| 97/97 [09:34<00:00,  5.93s/it]
100%|██████████████████████████████████████████| 97/97 [04:13<00:00,  2.62s/it]


loss:0.231, val_loss:0.328, acc:0.891,val_loss:0.859
Run 'tensorboard --logdir=./logdir' to checkout tensorboard logs.
