In [1]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os, sys, time
import tensorflow as tf
from tensorflow import keras

在进行模型建设之前，一般应该完成如下三个操作:
- 词典构建(id - &gt; word, word -&gt; id)
- 数据变换 (sentence -&gt; id)
- 训练集和测试集的构建(abcd -> bcd&lt;eos&gt;)

## **3.1 数据集读取**

In [2]:
input_filepath = './data/poem/tang_poems_7.txt'

with open(input_filepath, 'r', encoding='utf-8') as f:
    text = ''
    for s in f.readlines():
        if '□' not in s:
            text = text + s
print(len(text))
print(text[0:85])

1243210
采玉采玉须水碧，琢作步摇徒好色。
傍水野禽通体白，饤盘山菓半边红。
宜秋下邑摧凶丑，当锋入阵宋中丞。
吾为子起歌都护，酒阑插剑肝胆露。
南北东西九千里，除兄与弟更无人。



## **3.2 词表操作**
### **3.2.1 生成词表**

In [3]:
vocab = sorted(set(text))
print(len(vocab))
print(vocab[10:100])

6359
['䝙', '䯄', '䲡', '䴔', '䴖', '䴙', '一', '丁', '七', '万', '丈', '三', '上', '下', '不', '与', '丐', '丑', '专', '且', '丕', '世', '丘', '丙', '业', '丛', '东', '丝', '丞', '两', '严', '丧', '个', '丫', '中', '丰', '丱', '串', '临', '丸', '丹', '为', '主', '丽', '举', '乃', '久', '么', '义', '之', '乌', '乍', '乎', '乏', '乐', '乔', '乖', '乘', '乙', '九', '乞', '也', '习', '乡', '书', '买', '乱', '乳', '乾', '了', '予', '争', '事', '二', '于', '亏', '云', '互', '五', '井', '亘', '亚', '些', '亡', '亢', '交', '亥', '亦', '产', '亨']


In [4]:
char2idx = {char : idx for idx, char in enumerate(vocab)}
print(char2idx['一'])

16


In [5]:
vocab[:10]

['\n', '。', '㔉', '㧑', '㶉', '䃅', '䌷', '䍀', '䗖', '䜩']

In [6]:
idx2_char = np.array(vocab)

## **3.2.2 将数据集转换为数字**

In [7]:
text_as_int = np.array([char2idx[c] for c in text])
print(text_as_int[0:17])
print(text[0:17])

[5497 3177 5497 3177 5857 2597 3565 6270 3222  188 2537 1960 1524 1009
 4334    1    0]
采玉采玉须水碧，琢作步摇徒好色。



In [8]:
def split_input_target(id_text):
    '''
    abcde -> abcd, bcde, 这里是给定生成，如果需要对联的话，需要id_text[0:7],[7:15]
    '''
    return id_text[0: -1], id_text[1:]

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
seq_length = 16 # 7 + 1 + 7 + 1
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder = True) # 加一是为了产出\n, 这个batch是为了生成一个句子
for ch_id in char_dataset.take(2):
    print(ch_id, idx2_char[ch_id.numpy()])

for seq_id in seq_dataset.take(2):
    print(seq_id)
    print(''.join(idx2_char[seq_id.numpy()]))

tf.Tensor(5497, shape=(), dtype=int32) 采
tf.Tensor(3177, shape=(), dtype=int32) 玉
tf.Tensor(
[5497 3177 5497 3177 5857 2597 3565 6270 3222  188 2537 1960 1524 1009
 4334    1    0], shape=(17,), dtype=int32)
采玉采玉须水碧，琢作步摇徒好色。

tf.Tensor(
[ 288 2597 5501 3640 5342  182 3395 6270 5916 3431 1247 4479  523 5286
 3929    1    0], shape=(17,), dtype=int32)
傍水野禽通体白，饤盘山菓半边红。



In [9]:
#调用seq生成x和y
seq_dataset = seq_dataset.map(split_input_target)

for item_input, item_output in seq_dataset.take(2):
    print(item_input, ''.join(idx2_char[item_input.numpy()]))
    print(item_output, repr(''.join(idx2_char[item_output.numpy()])))

tf.Tensor(
[5497 3177 5497 3177 5857 2597 3565 6270 3222  188 2537 1960 1524 1009
 4334    1], shape=(16,), dtype=int32) 采玉采玉须水碧，琢作步摇徒好色。
tf.Tensor(
[3177 5497 3177 5857 2597 3565 6270 3222  188 2537 1960 1524 1009 4334
    1    0], shape=(16,), dtype=int32) '玉采玉须水碧，琢作步摇徒好色。\n'
tf.Tensor(
[ 288 2597 5501 3640 5342  182 3395 6270 5916 3431 1247 4479  523 5286
 3929    1], shape=(16,), dtype=int32) 傍水野禽通体白，饤盘山菓半边红。
tf.Tensor(
[2597 5501 3640 5342  182 3395 6270 5916 3431 1247 4479  523 5286 3929
    1    0], shape=(16,), dtype=int32) '水野禽通体白，饤盘山菓半边红。\n'


## **3.3 模型构建**

In [10]:
batch_size = 64
buffer_size = 1000
seq_dataset = seq_dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [11]:

vocab_size = len(vocab)
embedding_dim = 128
rnn_units = 512

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        keras.layers.LSTM(units=rnn_units, return_sequences=True),
        keras.layers.LSTM(units=rnn_units, return_sequences=True),
        keras.layers.Dense(vocab_size)
    ])
    return model

model = build_model(vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    rnn_units=rnn_units,
                    batch_size=batch_size)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 128)           813952    
_________________________________________________________________
lstm (LSTM)                  (64, None, 512)           1312768   
_________________________________________________________________
lstm_1 (LSTM)                (64, None, 512)           2099200   
_________________________________________________________________
dense (Dense)                (64, None, 6359)          3262167   
Total params: 7,488,087
Trainable params: 7,488,087
Non-trainable params: 0
_________________________________________________________________


In [12]:
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)

(64, 16, 6359)


In [13]:
# 选取所有的概率最大值的方式被称为贪心算法，这样的方式不一定能够得到整体概率最大值
# 我们使用随机采样的方式来获取
sample_indices = tf.random.categorical(logits=example_batch_predictions[0], num_samples=1)
print(sample_indices)
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(repr(''.join(idx2_char[input_example_batch[0].numpy()])))
print(repr(''.join(idx2_char[target_example_batch[0].numpy()])))
print(''.join(idx2_char[sample_indices.numpy()]))

tf.Tensor(
[[1040]
 [2278]
 [2051]
 [2789]
 [5527]
 [3640]
 [5483]
 [4177]
 [2275]
 [1959]
 [5252]
 [3023]
 [1851]
 [4816]
 [ 303]
 [2583]], shape=(16, 1), dtype=int64)
'南岸春田手自农，往来横截半江风。'
'岸春田手自农，往来横截半江风。\n'
姝枸斝淈钗禽醐肖枳摆轼焙按袭僰毹


In [14]:
def loss(labels, logits):
    return keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
model.compile(optimizer='adam', loss=loss)
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape, example_loss.numpy().mean())

(64, 16) 8.757677


In [17]:
output_dir = './text_generations_checkpoints'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
checkpoint_prefix = os.path.join(output_dir, 'check_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only = True
)
epochs = 300
history = model.fit(seq_dataset, epochs = epochs, 
                    callbacks=[checkpoint_callback])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/3

Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 232/300
Epoch 233/300
Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 

Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 300/300


## **3.4 载入模型与预测**

In [18]:
tf.train.latest_checkpoint(output_dir)

'./text_generations_checkpoints\\check_300'

In [19]:
model2 = build_model(vocab_size, 
                     embedding_dim, 
                     rnn_units,
                     batch_size = 1)
model2.load_weights(tf.train.latest_checkpoint(output_dir))
model2.build(tf.TensorShape([1, None])) # 加载后设置输入1歌样本变长序列

序列生成的流程应该是:
- 将a输入模型得到b
- 将ab输入到模型得到c
- 将abc输入到模型得到d
- 直到输出eos

In [20]:
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 128)            813952    
_________________________________________________________________
lstm_2 (LSTM)                (1, None, 512)            1312768   
_________________________________________________________________
lstm_3 (LSTM)                (1, None, 512)            2099200   
_________________________________________________________________
dense_1 (Dense)              (1, None, 6359)           3262167   
Total params: 7,488,087
Trainable params: 7,488,087
Non-trainable params: 0
_________________________________________________________________


In [21]:
def generate_text(model, start_string, num_generate = 17):
    input_eval = [char2idx[ch] for ch in start_string] # 一维
    input_eval = tf.expand_dims(input_eval, 0)
    
    text_generated = []
    model.reset_states()
    for _ in range(num_generate - len(input_eval)):
        # 1. model inference -> predictions
        # 2. sample -> ch -> text_generated
        # 3. update input_eval
        predictions = model(input_eval) # [N, input_eval_len, vocab_size]
        predictions = tf.squeeze(predictions, 0) # [input_eval_len, vocab_size]
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy() # [input_eval_len, 1]
        text_generated.append(idx2_char[predicted_id])
        input_eval = tf.expand_dims([predicted_id], 0) # 直接用predicted_id替换input_eval
        if predicted_id.sum() == 0:
            break
    return start_string + ''.join(text_generated)

In [26]:
new_text = generate_text(model2, '无边')
print(new_text)

无边无奈何时人情如何年少年少年年方知
