In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import sklearn 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

In [2]:
with open("./shakespeare.txt", mode="r") as f:
    text = f.read()

In [3]:
print(len(text))
print(text[:10])

1115394
First Citi


In [4]:
# 1.generate vocabulary
vocabulary = sorted(set(text))
print(len(vocabulary))
print(vocabulary)

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
# 2.build mapping
char2idx = {value:key for key, value in enumerate(vocabulary)}
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [6]:
# 根据索引找文字
idx2char = np.array(vocabulary)
print(idx2char)

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [7]:
# 3.transfer from char to index
text_as_index = [char2idx[c] for c in text]

print(len(text_as_index))
print(text_as_index[:10])
print(text[:10])

1115394
[18, 47, 56, 57, 58, 1, 15, 47, 58, 47]
First Citi


In [8]:
# from_tensor_slices: 数组 =》 dataset
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_index)

In [9]:
for i in char_dataset.take(2):
    print(i, idx2char[i])

tf.Tensor(18, shape=(), dtype=int32) F
tf.Tensor(47, shape=(), dtype=int32) i


In [10]:
# batch: 分成batch大小的dataset
# batch_size:表示要在单个批次中合并的此数据集的连续元素个数
# drop_remainder：表示在少于batch_size元素的情况下是否应删除最后一批

seq_length = 100
seq_dataset = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

In [11]:
# 1115394 / 101 = 11043
len(seq_dataset)

11043

In [12]:
# 查看数据集元素
for seq in seq_dataset.take(2):
    print(seq)
    # idx => char
    print(repr("".join([idx2char[index] for index in seq])))

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int32)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [13]:
# 4.预测文本 
# data: abcd, return: abc,bcd
def split_input_target(data):
    return data[0:-1], data[1:]

In [14]:
seq_dataset = seq_dataset.map(split_input_target)

# inputdata, outputdata: shape=100
for inputdata, outputdata in seq_dataset.take(2):
    print(inputdata)
    print(outputdata)

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int32)
tf.Tensor(
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1], shape=(100,), dtype=int32)
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 

In [15]:
batch_size = 64
buffer_size = 1000

seq_dataset = seq_dataset.shuffle(buffer_size).batch(
    batch_size, drop_remainder=True)

# 11043/64 = 172
print(len(seq_dataset))

172


### GRU

In [16]:
def build_model(vocab_size, embedding_dim, batch_size, units):
    model = keras.models.Sequential([
        keras.layers.Embedding(
            input_dim = vocab_size, 
            output_dim = embedding_dim, 
            # [batch_size, None]:元素个数确定， 其他未定
            batch_input_shape = [batch_size, None]),
        keras.layers.GRU(
            # units：输出空间的维度
            # stateful：默认 False，如果为 True，则批次中索引 i 处的每个样品的最后状态将用作下一批次中索引 i 样品的初始状态。
            # recurrent_initializer
            # return_sequences：是返回输出序列中的最后一个输出，还是全部序列
            units=units, 
            stateful = True,
            recurrent_initializer = 'glorot_uniform',
            return_sequences=True),
        keras.layers.Dense(vocab_size),
    ])
    
    return model

In [17]:
vocab_size = len(vocabulary)
embedding_dim = 256
units = 1024
batch_size = 64

model = build_model(vocab_size, embedding_dim, batch_size, units)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [18]:
# 对于每个字符，模型会查找嵌入，把嵌入当作输入运行一个时间步，并用密集层生成逻辑回归 （logits），预测下一个字符的对数可能性。
# 预测文本
for input_example_batch, target_example_batch in seq_dataset.take(1):
    print(input_example_batch, input_example_batch.shape)
    print(target_example_batch, target_example_batch.shape)
    # 函数式调用
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)

tf.Tensor(
[[53 59 56 ... 59 57 10]
 [35 46 43 ... 58  1 47]
 [53 40 50 ...  1 42 53]
 ...
 [47 43 57 ...  6  0 42]
 [ 1 44 47 ...  1 44 39]
 [61  1 58 ...  1 39  1]], shape=(64, 100), dtype=int32) (64, 100)
tf.Tensor(
[[59 56  1 ... 57 10  0]
 [46 43 52 ...  1 47 57]
 [40 50 43 ... 42 53  1]
 ...
 [43 57  8 ...  0 42 47]
 [44 47 56 ... 44 39 57]
 [ 1 58 46 ... 39  1 51]], shape=(64, 100), dtype=int32) (64, 100)
(64, 100, 65)


In [19]:
# tf.random.categorical：从一个分类分布中抽取样本
# logits: 形状为 [batch_size, num_classes]的张量
# num_samples：从每一行切片中抽取的独立样本的数量
# [batch_size * num_classes] => [batch_size * num_samples] , [100, 65] => [100, 1]
sample_indices = tf.random.categorical(logits=example_batch_predictions[0], num_samples=1)
print(sample_indices)

# 从input中删除一个纬度
sample_indice = tf.squeeze(input=sample_indices, axis=1)
print(sample_indice)

tf.Tensor(
[[17]
 [59]
 [30]
 [ 6]
 [56]
 [47]
 [20]
 [ 9]
 [48]
 [37]
 [63]
 [10]
 [49]
 [48]
 [10]
 [32]
 [32]
 [38]
 [48]
 [ 9]
 [14]
 [28]
 [56]
 [38]
 [ 1]
 [41]
 [43]
 [31]
 [49]
 [49]
 [30]
 [ 5]
 [ 1]
 [40]
 [37]
 [47]
 [13]
 [18]
 [ 2]
 [27]
 [58]
 [57]
 [ 6]
 [25]
 [43]
 [53]
 [22]
 [36]
 [28]
 [21]
 [ 8]
 [52]
 [25]
 [64]
 [43]
 [ 7]
 [27]
 [34]
 [60]
 [26]
 [14]
 [63]
 [34]
 [50]
 [27]
 [ 1]
 [ 2]
 [49]
 [20]
 [24]
 [53]
 [ 7]
 [41]
 [42]
 [35]
 [ 0]
 [24]
 [46]
 [49]
 [62]
 [34]
 [19]
 [15]
 [31]
 [60]
 [61]
 [41]
 [ 7]
 [35]
 [33]
 [30]
 [ 2]
 [ 7]
 [ 5]
 [31]
 [58]
 [52]
 [42]
 [59]
 [10]], shape=(100, 1), dtype=int64)
tf.Tensor(
[17 59 30  6 56 47 20  9 48 37 63 10 49 48 10 32 32 38 48  9 14 28 56 38
  1 41 43 31 49 49 30  5  1 40 37 47 13 18  2 27 58 57  6 25 43 53 22 36
 28 21  8 52 25 64 43  7 27 34 60 26 14 63 34 50 27  1  2 49 20 24 53  7
 41 42 35  0 24 46 49 62 34 19 15 31 60 61 41  7 35 33 30  2  7  5 31 58
 52 42 59 10], shape=(100,), dtype=int64)


In [20]:
# 对比预测的数据
print("input:", repr("".join([idx2char[i] for i in np.array(input_example_batch[0])])))
print("output:", repr("".join([idx2char[i] for i in np.array(target_example_batch[0])])))
print("predict:", repr("".join([idx2char[i] for i in sample_indice])))

input: 'our shields before your hearts, and fight\nWith hearts more proof than shields. Advance,\nbrave Titus:'
output: 'ur shields before your hearts, and fight\nWith hearts more proof than shields. Advance,\nbrave Titus:\n'
predict: "EuR,riH3jYy:kj:TTZj3BPrZ ceSkkR' bYiAF!Ots,MeoJXPI.nMze-OVvNByVlO !kHLo-cdW\nLhkxVGCSvwc-WUR!-'Stndu:"


In [21]:
# from_logits=True：因为返回的是逻辑回归，所以设置TRUE
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(y_true=labels,
                                                     y_pred=logits,
                                                     from_logits=True,)

model.compile(optimizer="adam", loss=loss)
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())

(64, 100)
4.1721706


In [22]:
# 设置检查点
output_dir = "./out_generation_checkpoints"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
# 保存权重信息
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True)

epochs = 10
history = model.fit(seq_dataset, epochs = epochs,
                    callbacks = [checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# 使用训练好的模型
for input_example_batch, target_example_batch in seq_dataset.take(1):
    print(input_example_batch, input_example_batch.shape)
    print(target_example_batch, target_example_batch.shape)
    # 函数式调用
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)
    
# 对比预测的数据
print("input:", repr("".join([idx2char[i] for i in np.array(input_example_batch[0])])))
print("output:", repr("".join([idx2char[i] for i in np.array(target_example_batch[0])])))
print("predict:", repr("".join([idx2char[i] for i in sample_indice])))

tf.Tensor(
[[53  1 46 ... 41 53 52]
 [52 53 61 ... 57  1 49]
 [43  1 56 ... 21  1 40]
 ...
 [ 2  0  0 ... 41 49  6]
 [43  1 39 ... 53  1 49]
 [ 1 58 46 ... 57  1 51]], shape=(64, 100), dtype=int32) (64, 100)
tf.Tensor(
[[ 1 46 43 ... 53 52 42]
 [53 61  1 ...  1 49 47]
 [ 1 56 53 ...  1 40 39]
 ...
 [ 0  0 34 ... 49  6  1]
 [ 1 39 56 ...  1 49 43]
 [58 46 43 ...  1 51 59]], shape=(64, 100), dtype=int32) (64, 100)
(64, 100, 65)
input: "o help Cominius.\n\nLARTIUS:\nWorthy sir, thou bleed'st;\nThy exercise hath been too violent for\nA secon"
output: " help Cominius.\n\nLARTIUS:\nWorthy sir, thou bleed'st;\nThy exercise hath been too violent for\nA second"
predict: "EuR,riH3jYy:kj:TTZj3BPrZ ceSkkR' bYiAF!Ots,MeoJXPI.nMze-OVvNByVlO !kHLo-cdW\nLhkxVGCSvwc-WUR!-'Stndu:"


In [24]:
tf.train.latest_checkpoint(output_dir)

'./out_generation_checkpoints\\ckpt_10'

In [25]:
# 恢复最新的检查点
# 为保持此次预测步骤简单，将批大小设定为 1。
# 由于 RNN 状态从时间步传递到时间步的方式，模型建立好之后只接受固定的批大小。
# 若要使用不同的 batch_size 来运行模型，我们需要重建模型并从检查点中恢复权重。
model02 = build_model(vocab_size=vocab_size, embedding_dim=embedding_dim, batch_size=1, units=units)
model02.load_weights(tf.train.latest_checkpoint(output_dir))
model02.build(tf.TensorShape([1, None]))
model02.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [26]:
def generate_text(model, start_string):
    input_eval = [char2idx[c] for c in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    
    num_generate = 1000
    text_generate = []
    model.reset_states()
    
    for _ in range(num_generate):
        predictions = model(input_eval)
        
        predictions = tf.squeeze(predictions, 0)
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        
        #sample_indice = tf.random.categorical(logits=predictions[0], num_samples=1)
        #sample_indice = tf.squeeze(input=sample_indices, axis=1)
        text_generate.append(idx2char[predicted_id])
        input_eval = tf.expand_dims([predicted_id], 0)

    return start_string + "".join(text_generate)

In [27]:
print(generate_text(model02, "All:"))

All: answer thies? E thank you, sir, which; though he would hear
The subjesher than ourserves, abtake or sheek;
And, fie, fie, father,.

BAPTISTA:

Cantil. I pray.

FROTH:
Why, there nemes, I know on 't. And how, no!
God full of request, I'll give with thee: Frey? or I'll bear myselves. Was it shall heard thee,
Than that he would brings them virtuous countrigment,
Ortentle, make thee might rough all, I said
against the impatch to met on youth you say, sir.

MIRANDA:
City, wretch: marry, sir, Hastings are,
Nidecyon me, gentle inquest of -indeeterance
In her trust in hability, shr dreams there; you hear,
If you that duintly not in powerful raze with some fire, spural!

WARWICK:
O, sir?
Happosts how? I find you, sir, I would possess to
Richard is it tongues that Lewis ask for 's.

SEBASTIAN:
A valling swenter on Cates.

ISABELLA:
O trephem Helpefore.

GREMIO:
Ay, marry, my leave, I fear me.

MIRANDA:
Lenceare my father; being order:
I'll make thee so shall my youngest house-tells;
Because