In [1]:
import os
import keras_nlp
import keras
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings
import tensorflow as tf

# Turn on memory growth for every physical GPU that TensorFlow reports.
for device in tf.config.experimental.list_physical_devices(device_type="GPU"):
    tf.config.experimental.set_memory_growth(device=device, enable=True)

D:\anaconda\envs\tf-gpu-2.10.0-py-3.10\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
D:\anaconda\envs\tf-gpu-2.10.0-py-3.10\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


Using TensorFlow backend


In [3]:
# --- Data ---
BATCH_SIZE = 64
# Lines whose tf_strings.length is not greater than this are filtered out.
MIN_STRING_LEN = 512
# Fixed length of every token-id sequence (tokenizer, start packer, embedding).
SEQ_LEN = 128
VOCAB_SIZE = 5000

# --- Model ---
EMBED_DIM = 256
FEED_FORWARD_DIM = 128
NUM_HEADS = 3
NUM_LAYERS = 2

# --- Training ---
EPOCHS = 5

# --- Inference ---
# NOTE(review): not referenced by any cell visible in this notebook.
NUM_TOKENS_TO_GENERATE = 80

In [24]:
# Download the SimpleBooks archive and extract it.
keras.utils.get_file(
    origin="https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip",
    extract=True,
)
# Named `data_dir` rather than `dir` so the built-in dir() stays usable in
# every later cell of the kernel session.
# NOTE(review): assumes the archive was extracted under ~/.keras/datasets;
# confirm if the Keras cache location has been customised.
data_dir = os.path.join(
    os.path.expanduser("~/.keras/datasets/simplebooks"), "simplebooks-92-raw"
)

# Training text: keep only lines longer than MIN_STRING_LEN, then batch.
raw_train_ds = (
    tf_data.TextLineDataset(os.path.join(data_dir, "train.txt"))
    .filter(lambda line: tf_strings.length(line) > MIN_STRING_LEN)
    .batch(BATCH_SIZE)
    # shuffle() is applied after batch(), so its elements are whole batches.
    .shuffle(buffer_size=256)
)
# Validation text: same filter and batching, no shuffling.
raw_val_ds = (
    tf_data.TextLineDataset(os.path.join(data_dir, "valid.txt"))
    .filter(lambda line: tf_strings.length(line) > MIN_STRING_LEN)
    .batch(BATCH_SIZE)
)
# Build a WordPiece vocabulary (a list of words) from the training text.
# Its first three entries are '[PAD]', '[UNK]', '[BOS]'.
reserved_tokens = ["[PAD]", "[UNK]", "[BOS]"]
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds,
    vocabulary_size=VOCAB_SIZE,
    lowercase=True,
    reserved_tokens=reserved_tokens,
)
# Using `vocab`, split sentences into word pieces and convert them to ids.
# Each id sequence has length SEQ_LEN: shorter ones are padded with 0,
# longer ones are truncated.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=SEQ_LEN,
    lowercase=True,
)
# Put the id of [BOS] at the start of an id sequence. The packed sequence again
# has length SEQ_LEN: padded with 0 if shorter, truncated if longer.
bos_token_id = tokenizer.token_to_id("[BOS]")
start_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=bos_token_id,
)
def preprocess(inputs):
    """Turn a batch of raw text into a (features, labels) training pair.

    Args:
        inputs: Raw text, passed directly to the module-level `tokenizer`.

    Returns:
        A tuple `(features, labels)`. `labels` is the tokenizer output itself;
        `features` is that same output run through `start_packer`, i.e. with
        the [BOS] id placed in front, so each label is the token that follows
        the feature at the same position.
    """
    token_ids = tokenizer(inputs)
    return start_packer(token_ids), token_ids
# Apply `preprocess` to both splits with autotuned parallelism and prefetching.
train_ds = (
    raw_train_ds
    .map(preprocess, num_parallel_calls=tf_data.AUTOTUNE)
    .prefetch(tf_data.AUTOTUNE)
)
val_ds = (
    raw_val_ds
    .map(preprocess, num_parallel_calls=tf_data.AUTOTUNE)
    .prefetch(tf_data.AUTOTUNE)
)

In [26]:
# Model input: int32 token ids with an unspecified sequence length.
inputs = keras.layers.Input(shape=(None,), dtype="int32")

# Token + position embedding; mask_zero=True is set so id 0 ([PAD]) is masked.
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)

# Stack NUM_LAYERS decoder blocks, each called on the running activations only.
for _ in range(NUM_LAYERS):
    x = keras_nlp.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )(x)

# Project to one score per vocabulary entry; no activation is given here.
outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)
# Both the loss and the metric are configured with from_logits=True.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# mask_token_id=0 is the id of [PAD], the first reserved vocabulary token.
perplexity = keras_nlp.metrics.Perplexity(
    from_logits=True,
    mask_token_id=0,
)
model.compile(
    optimizer="adam",
    loss=loss_fn,
    metrics=[perplexity],
)
# Train for EPOCHS epochs, validating on val_ds.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x282840929e0>

In [90]:
# Tokenize the seed text, then put the [BOS] id in front of it.
prompt_ids = tokenizer(["today is "])
prompt_tokens = start_packer(prompt_ids)
prompt_tokens

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[   2, 4608,  124,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]])>

In [92]:
def next(prompt, cache, index):
    """Step callback handed to the KerasNLP samplers as `next=`.

    NOTE(review): this name shadows Python's built-in next() for the rest of
    the notebook; it is kept because the sampler cells refer to it by name.

    Args:
        prompt: Token ids, fed to the module-level `model` in full.
        cache: Passed through unchanged; this callback does not use it.
        index: Position being sampled. The logits are read at `index - 1`.

    Returns:
        A tuple `(logits, hidden_states, cache)` where `logits` is the model
        output at position `index - 1` for every batch entry, `hidden_states`
        is always None, and `cache` is the argument as received.
    """
    step_logits = model(prompt)[:, index - 1, :]
    return step_logits, None, cache

In [100]:
sampler = keras_nlp.samplers.GreedySampler()
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    # The first index of `prompt` to start sampling at. Usually set to the
    # length of the shortest non-padded sequence in `prompt`.
    index=3,
)
txt = tokenizer.detokenize(output_tokens)
# Greedy search makes sense at first but quickly starts repeating itself.
# This is a common problem in text generation and can be addressed by the
# probabilistic text generation utilities introduced later.
print(f"Greedy search generated text: \n{txt}\n")

Greedy search generated text: 
[b'[BOS] today is the way to the westward , and the sparks of the sea , and the sea - shore of the sea , and the sea - shore of the sea , and the sea coasts of america , and the sea coasts of america , and the sea coasts of america , and the sea coasts of america , and the sea coasts of america , and the sea coasts of america , and the sea coasts of america , and the sea coasts of america , and the sea coasts of america , and the sea coasts of america , and the sea coasts of america , and the']



In [102]:
# With num_beams=1 this is the same as greedy search.
sampler = keras_nlp.samplers.BeamSampler(num_beams=10)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=3,
)
txt = tokenizer.detokenize(output_tokens)
# Like greedy search, beam search quickly starts repeating itself, because it
# is still a deterministic method.
print(f"Beam search generated text: \n{txt}\n")

Beam search generated text: 
[b"[BOS] today is one of the most apprehension of the chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer ' s chatterer '"]



In [106]:
# At every time step, the next token is sampled using the softmax
# probabilities provided by the model.
sampler = keras_nlp.samplers.RandomSampler()
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=3,
)
txt = tokenizer.detokenize(output_tokens)
# No repetition! With random search, however, some nonsensical words may show
# up, because any word in the vocabulary has a chance of being picked by this
# sampling method.
print(f"Random search generated text: \n{txt}\n")

Random search generated text: 
[b"[BOS] today is the air of great husticity of need . haru b rivers made the play all over . this was very strange to hang alone in the heart of the northern power of being the most garagel , and the squirrel panther ' s print of the youthful and power , but a number of chatterer lines of the giants and he kept hidden from the magic whizard and duration . to relyanted mole quite that the insondence upon the brow of the treeping shadow witch ! if he were to go worship ;"]



In [108]:
# As with random search, the next token is drawn from the probability
# distribution provided by the model.
# The only difference is that here the k most likely tokens are selected and
# the probability mass is redistributed over them before sampling.
# That way low-probability tokens are never sampled, so fewer nonsensical
# words appear.
sampler = keras_nlp.samplers.TopKSampler(k=10)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=3,
)
txt = tokenizer.detokenize(output_tokens)
print(f"Top-K search generated text: \n{txt}\n")

Top-K search generated text: 
[b'[BOS] today is one of these , and the other , a little boy who has been left behind and saw his wife coming up at her . he is a switch in the chatterer . the man , who is one of the most powerful scar , and he has never been able to get a chuckle in his hand and a man of great bluff . he has a good shot , too , and he has to make a strong scale . but he has the strength of his body , for he is a man . it has not been for a long time , and he has been in']



In [110]:
# With top-k search the number k is fixed, so the same number of tokens is
# selected for every probability distribution.
# Consider two cases: the probability mass is concentrated on 2 words, or it
# is spread evenly over 10 words. Should k be 2 or 10? Top-k is a poor fit.
# Top-p filters by probability mass instead. With p=0.9, for example: if 90%
# of the probability sits on the top 2 tokens, only those 2 are kept for
# sampling; if 90% is spread over 10 tokens, those 10 are kept.
# Note that this cell uses p=0.5, not 0.9.
sampler = keras_nlp.samplers.TopPSampler(p=0.5)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=3,
)
txt = tokenizer.detokenize(output_tokens)
print(f"Top-P search generated text: \n{txt}\n")

Top-P search generated text: 
[b'[BOS] today is in the same place , and he had no more quarrels with his brother , but the political men were in the woods . the only time he tried to make a new kind of fire , but he was a good man , and he said : " i am a good fellow , " and he was not only the good fellow , but it is so i am not much sorry for him . it was the bad fellow , who was to be more to keep up the tree in his hand . " he is very smart . " but , when he came to the house of his wife , he was a man ,']

