In [1]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os, sys, time
import tensorflow as tf
from tensorflow import keras

在构建模型之前，一般需要先完成如下三个操作:
- 词典构建 (`id -> word`, `word -> id`)
- 数据变换 (`sentence -> id`)
- 训练集和测试集的构建 (`abcd -> bcd<eos>`)

## **3.1 数据集读取**

In [2]:
# Location of the poem corpus (relative to the notebook's working directory).
input_filepath = './data/poem/tang_poems_7.txt'

# Read the corpus, skipping every line that contains the '□' glyph.
# The kept lines are joined unchanged, so line terminators are preserved.
with open(input_filepath, mode='r', encoding='utf-8') as poem_file:
    text = ''.join(line for line in poem_file if '□' not in line)

# Corpus size in characters, followed by a preview of the first 85 characters.
print(len(text), text[:85], sep='\n')

1243210
采玉采玉须水碧，琢作步摇徒好色。
傍水野禽通体白，饤盘山菓半边红。
宜秋下邑摧凶丑，当锋入阵宋中丞。
吾为子起歌都护，酒阑插剑肝胆露。
南北东西九千里，除兄与弟更无人。



## **3.2 词表操作**
### **3.2.1 生成词表**

In [3]:
# Vocabulary: every distinct character of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()

# Vocabulary size, then a sample of entries 10..99.
print(len(vocab), vocab[10:100], sep='\n')

6359
['䝙', '䯄', '䲡', '䴔', '䴖', '䴙', '一', '丁', '七', '万', '丈', '三', '上', '下', '不', '与', '丐', '丑', '专', '且', '丕', '世', '丘', '丙', '业', '丛', '东', '丝', '丞', '两', '严', '丧', '个', '丫', '中', '丰', '丱', '串', '临', '丸', '丹', '为', '主', '丽', '举', '乃', '久', '么', '义', '之', '乌', '乍', '乎', '乏', '乐', '乔', '乖', '乘', '乙', '九', '乞', '也', '习', '乡', '书', '买', '乱', '乳', '乾', '了', '予', '争', '事', '二', '于', '亏', '云', '互', '五', '井', '亘', '亚', '些', '亡', '亢', '交', '亥', '亦', '产', '亨']


In [4]:
# Character -> integer id; the id is the character's position in `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx['一'])

16


In [5]:
# Display the first ten entries of the sorted vocabulary.
vocab[:10]

['\n', '。', '㔉', '㧑', '㶉', '䃅', '䌷', '䍀', '䗖', '䜩']

In [6]:
# Inverse lookup (integer id -> character) stored as a NumPy array, so that an
# array of ids can be decoded with a single indexing operation.
idx2_char = np.array(vocab)

### **3.2.2 将数据集转换为数字**

In [7]:
# Encode the whole corpus as an array of character ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

# First 17 ids next to the 17 characters they encode.
print(text_as_int[:17], text[:17], sep='\n')

[5497 3177 5497 3177 5857 2597 3565 6270 3222  188 2537 1960 1524 1009
 4334    1    0]
采玉采玉须水碧，琢作步摇徒好色。



In [8]:
def split_input_target(id_text):
    """Split one encoded window into an (input, target) pair.

    The input is ``id_text[0:7]`` and the target is ``id_text[8:-2]``. For a
    17-item window (7 characters, a separator, 7 characters and two trailing
    items) this yields the first and the second 7-character line, with the
    separator at index 7 and the last two items dropped.

    Args:
        id_text: Any sliceable sequence (list, string, array, tensor).

    Returns:
        Tuple ``(first_line, second_line)`` of slices of ``id_text``.
    """
    first_line = slice(0, 7)
    second_line = slice(8, -2)
    return id_text[first_line], id_text[second_line]

seq_length = 16 # 7 + 1 + 7 + 1

# Stream of single character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Group the stream into windows of seq_length + 1 ids; the extra id makes room
# for the trailing '\n', so that each window holds one full line of the corpus.
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Sanity check: the first two ids with their characters ...
for ch_id in char_dataset.take(2):
    print(ch_id, idx2_char[ch_id.numpy()])

# ... and the first two windows decoded back to text.
for seq_id in seq_dataset.take(2):
    print(seq_id, ''.join(idx2_char[seq_id.numpy()]), sep='\n')

tf.Tensor(5497, shape=(), dtype=int32) 采
tf.Tensor(3177, shape=(), dtype=int32) 玉
tf.Tensor(
[5497 3177 5497 3177 5857 2597 3565 6270 3222  188 2537 1960 1524 1009
 4334    1    0], shape=(17,), dtype=int32)
采玉采玉须水碧，琢作步摇徒好色。

tf.Tensor(
[ 288 2597 5501 3640 5342  182 3395 6270 5916 3431 1247 4479  523 5286
 3929    1    0], shape=(17,), dtype=int32)
傍水野禽通体白，饤盘山菓半边红。



In [9]:
# Turn every window of seq_dataset into an (input, target) pair.
# NOTE(review): this rebinds seq_dataset, so running the cell a second time
# would apply split_input_target to already-split pairs. Run the notebook top
# to bottom after a kernel restart.
seq_dataset = seq_dataset.map(split_input_target)

# Decode the first two pairs to check the split visually.
for item_input, item_output in seq_dataset.take(2):
    print(item_input, ''.join(idx2_char[item_input.numpy()]))
    print(item_output, repr(''.join(idx2_char[item_output.numpy()])))

tf.Tensor([5497 3177 5497 3177 5857 2597 3565], shape=(7,), dtype=int32) 采玉采玉须水碧
tf.Tensor([3222  188 2537 1960 1524 1009 4334], shape=(7,), dtype=int32) '琢作步摇徒好色'
tf.Tensor([ 288 2597 5501 3640 5342  182 3395], shape=(7,), dtype=int32) 傍水野禽通体白
tf.Tensor([5916 3431 1247 4479  523 5286 3929], shape=(7,), dtype=int32) '饤盘山菓半边红'


## **3.3 模型构建**

In [10]:
# Number of (input, target) pairs per training batch.
batch_size = 64
# Size of the buffer used by Dataset.shuffle.
buffer_size = 1000
# drop_remainder=True keeps every batch at exactly batch_size examples.
# NOTE(review): seq_dataset is rebound here, so this cell is not idempotent;
# run it only once per kernel session.
seq_dataset = seq_dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [11]:

# Model hyper-parameters.
# Number of distinct characters; used for both the embedding input and the output layer.
vocab_size = len(vocab)
# Width of each character embedding vector.
embedding_dim = 128
# Hidden units per LSTM direction.
rnn_units = 512

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level network: Embedding -> 2 x Bi-LSTM -> Dense.

    Args:
        vocab_size: Number of distinct character ids (embedding input size and
            width of the output layer).
        embedding_dim: Size of each embedding vector.
        rnn_units: Hidden units of each LSTM direction.
        batch_size: Fixed batch size baked into the input shape; the sequence
            length is left unspecified.

    Returns:
        An uncompiled ``keras.models.Sequential`` model whose final Dense layer
        has no activation, i.e. it outputs one logit per vocabulary entry at
        every time step.
    """
    def make_bilstm():
        # One stateful bidirectional LSTM block that returns the full sequence.
        return keras.layers.Bidirectional(
            keras.layers.LSTM(
                units=rnn_units,
                return_sequences=True,
                stateful=True,
                recurrent_initializer='glorot_uniform',
            )
        )

    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(vocab_size, embedding_dim,
                                     batch_input_shape=[batch_size, None]))
    model.add(make_bilstm())
    model.add(make_bilstm())
    model.add(keras.layers.Dense(vocab_size))
    return model

# Training model: its fixed batch size matches the batches of seq_dataset.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 128)           813952    
_________________________________________________________________
bidirectional (Bidirectional (64, None, 1024)          2625536   
_________________________________________________________________
bidirectional_1 (Bidirection (64, None, 1024)          6295552   
_________________________________________________________________
dense (Dense)                (64, None, 6359)          6517975   
Total params: 16,253,015
Trainable params: 16,253,015
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Smoke test: push one batch through the untrained model and print the shape of
# the output. The loop variables are kept on purpose: later cells reuse
# input_example_batch, target_example_batch and example_batch_predictions.
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)

(64, 7, 6359)


In [13]:
# Always picking the highest-probability id is the greedy strategy; it does not
# necessarily produce the sequence with the highest overall probability.
# Here the ids are drawn by random sampling from the logits instead.
sample_indices = tf.random.categorical(logits=example_batch_predictions[0], num_samples=1)
print(sample_indices)
# Drop the trailing num_samples axis so the ids form a 1-D vector.
sample_indices = tf.squeeze(sample_indices, axis=-1)
# Input line, target line and the line sampled from the untrained model.
print(repr(''.join(idx2_char[input_example_batch[0].numpy()])))
print(repr(''.join(idx2_char[target_example_batch[0].numpy()])))
print(''.join(idx2_char[sample_indices.numpy()]))

tf.Tensor(
[[2178]
 [3726]
 [3385]
 [2510]
 [3421]
 [1282]
 [2930]], shape=(7, 1), dtype=int64)
'道路悠悠不知处'
'山高海阔谁辛苦'
曲竖瘴次盈峡澎


In [14]:
def loss(labels, logits):
    """Per-position sparse categorical cross-entropy computed from raw logits.

    Args:
        labels: Integer character ids of the target sequence.
        logits: Unnormalised model outputs with a trailing vocabulary axis.

    Returns:
        The loss tensor returned by
        ``keras.losses.sparse_categorical_crossentropy`` with ``from_logits=True``.
    """
    return keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True
    )
# Train with Adam and the logits-based cross-entropy defined above.
model.compile(optimizer='adam', loss=loss)
# Loss of the untrained model on the example batch. With near-uniform
# predictions the mean is expected to be close to ln(vocab_size).
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape, example_loss.numpy().mean())

(64, 7) 8.75763


In [15]:
# Directory that receives one weights-only checkpoint per epoch.
output_dir = './text_generations_checkpoints'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also works when parent directories are missing.
os.makedirs(output_dir, exist_ok=True)

# '{epoch}' is filled in by ModelCheckpoint, giving check_1, check_2, ...
checkpoint_prefix = os.path.join(output_dir, 'check_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only = True
)

# Train on the batched (input, target) pairs, saving the weights after every epoch.
epochs = 100
history = model.fit(seq_dataset, epochs = epochs, 
                    callbacks=[checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


## **3.4 载入模型与预测**

In [16]:
# Show the path prefix of the most recent checkpoint found in output_dir.
tf.train.latest_checkpoint(output_dir)

'./text_generations_checkpoints\\check_100'

In [17]:
# Inference model: same architecture as the training model, but with batch
# size 1 so that a single seed string can be fed in.
model2 = build_model(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units,
                     batch_size=1)

# Restore the weights of the most recent checkpoint.
model2.load_weights(tf.train.latest_checkpoint(output_dir))

# After loading, fix the input shape: one sample of variable sequence length.
model2.build(tf.TensorShape([1, None]))

序列生成的流程应该是:
- 将a输入模型得到b
- 将ab输入到模型得到c
- 将abc输入到模型得到d
- 直到输出eos

In [18]:
# Print the layer table of the inference model (batch dimension is 1).
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 128)            813952    
_________________________________________________________________
bidirectional_2 (Bidirection (1, None, 1024)           2625536   
_________________________________________________________________
bidirectional_3 (Bidirection (1, None, 1024)           6295552   
_________________________________________________________________
dense_1 (Dense)              (1, None, 6359)           6517975   
Total params: 16,253,015
Trainable params: 16,253,015
Non-trainable params: 0
_________________________________________________________________


In [19]:
def generate_text(model, start_string, num_generate = 17, generate_type = 'greedy'):
    """Generate up to ``num_generate`` characters that continue ``start_string``.

    Each step runs the model, picks one character id from the logits of the
    last time step, appends the matching character to the result and feeds that
    id back in as the next input. Generation stops early once the newline
    character (the line terminator of the corpus) has been produced; that
    newline is included in the returned text.

    Args:
        model: Model that maps a ``[1, seq_len]`` batch of character ids to
            ``[1, seq_len, vocab_size]`` logits, e.g. the result of
            ``build_model(..., batch_size=1)``.
        start_string: Non-empty seed text; every character must be a key of
            ``char2idx``.
        num_generate: Maximum number of characters to generate.
        generate_type: ``'greedy'`` takes the arg-max id at every step;
            ``'sample'`` draws the id with ``tf.random.categorical``.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``generate_type`` is unknown.
        KeyError: If ``start_string`` contains a character missing from ``char2idx``.
    """
    # Validate first, so that bad arguments fail before any model work is done.
    if generate_type not in ('greedy', 'sample'):
        raise ValueError(
            "generate_type must be 'greedy' or 'sample', got {!r}".format(generate_type))
    if not start_string:
        raise ValueError('start_string must contain at least one character')

    input_eval = [char2idx[ch] for ch in start_string]  # 1-D list of ids
    input_eval = tf.expand_dims(input_eval, 0)          # add batch axis -> [1, len]

    # '\n' terminates every line of the corpus and serves as end-of-sequence marker.
    eos_id = char2idx.get('\n')

    text_generated = []
    model.reset_states()
    for _ in range(num_generate):
        # 1. Model inference -> logits of shape [1, input_len, vocab_size].
        predictions = model(input_eval)
        # Keep only the logits of the last time step -> [vocab_size].
        last_logits = tf.squeeze(predictions, 0)[-1]

        # 2. Choose the next character id.
        if generate_type == 'greedy':
            predicted_id = int(tf.argmax(last_logits).numpy())
        else:
            sampled = tf.random.categorical(tf.expand_dims(last_logits, 0), num_samples=1)
            predicted_id = int(sampled[0, 0].numpy())
        text_generated.append(idx2_char[predicted_id])

        # 3. Feed only the new id back in; earlier context is expected to live
        #    in the recurrent state of the stateful LSTM layers (reset above).
        # NOTE(review): the layers are bidirectional and training used
        # line -> line pairs; confirm that this step-by-step scheme matches
        # the training objective.
        input_eval = tf.expand_dims([predicted_id], 0)

        if predicted_id == eos_id:
            break
    return start_string + ''.join(text_generated)

In [30]:
# Generate a continuation for the seed '长安' with the restored inference model.
# NOTE(review): the saved output of this cell appears to predate the current
# definition of generate_text (the execution count is out of sequence);
# restart the kernel and run all cells to refresh it.
new_text = generate_text(model2, '长安')
print(new_text)

长安蟠銮杯尽是是非命，霜出丰顽水华冷
