In [1]:
import keras_nlp
import pathlib
import random
import keras
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

D:\anaconda\envs\tf-gpu-2.10.0-py-3.10\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
D:\anaconda\envs\tf-gpu-2.10.0-py-3.10\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


Using TensorFlow backend


In [11]:
BATCH_SIZE = 64
EPOCHS = 100
MAX_SEQUENCE_LENGTH = 40
ENG_VOCAB_SIZE = 15000
SPA_VOCAB_SIZE = 15000
EMBED_DIM = 256
INTERMEDIATE_DIM = 2048
NUM_HEADS = 8

In [3]:
text_file = keras.utils.get_file(fname="spa-eng.zip", origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip", extract=True,) # WindowsPath('C:/Users/13900K/.keras/datasets/spa-eng/spa.txt')
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"
text_pairs = []
with open(text_file, "r", encoding="utf-8") as f:
    lines = f.readlines()
    for line in lines:
        eng, spa = line.strip().split("\t")
        eng = eng.lower()
        spa = spa.lower()
        if eng and spa:
            text_pairs.append((eng, spa))
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples: num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples+num_val_samples:]
for e,s in train_pairs[:3]:
    print(e,s)

are you still thinking about applying to harvard? ¿sigues pensando en postular a harvard?
do you want to eat prawns? ¿quieres comer camarones?
i had no difficulty in finding his house. no tuve problemas para encontrar su casa.


In [4]:
def train_word_piece(text_samples, vocab_size, reserved_tokens):
    word_piece_ds = tf.data.Dataset.from_tensor_slices(text_samples)
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(word_piece_ds.batch(1000).prefetch(2), vocabulary_size=vocab_size, reserved_tokens=reserved_tokens)
    return vocab
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
eng_samples = [text_pair[0] for text_pair in train_pairs]
eng_vocab = train_word_piece(eng_samples, ENG_VOCAB_SIZE, reserved_tokens)
spa_samples = [text_pair[1] for text_pair in train_pairs]
spa_vocab = train_word_piece(spa_samples, SPA_VOCAB_SIZE, reserved_tokens)
eng_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=eng_vocab, lowercase=False)
spa_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=spa_vocab, lowercase=False)

print(eng_vocab[:10])
print(spa_vocab[:10])

eng_input_ex = text_pairs[0][0]
eng_tokens_ex = eng_tokenizer.tokenize(eng_input_ex)
print("English sentence: ", eng_input_ex)
print("Tokens: ", eng_tokens_ex)
print( "Recovered text after detokenizing: ",  eng_tokenizer.detokenize(eng_tokens_ex), )

print()

spa_input_ex = text_pairs[0][1]
spa_tokens_ex = spa_tokenizer.tokenize(spa_input_ex)
print("Spanish sentence: ", spa_input_ex)
print("Tokens: ", spa_tokens_ex)
print(  "Recovered text after detokenizing: ",  spa_tokenizer.detokenize(spa_tokens_ex),)


['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', '%', "'", ',']
['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', '%', '&', "'"]
English sentence:  are you still thinking about applying to harvard?
Tokens:  tf.Tensor([  83   64  211  591  115 2656  148   63 2193   25], shape=(10,), dtype=int32)
Recovered text after detokenizing:  tf.Tensor(b'are you still thinking about applying to harvard ?', shape=(), dtype=string)

Spanish sentence:  ¿sigues pensando en postular a harvard?
Tokens:  tf.Tensor([  62 3872  704   82   45  469  905 2285   30 2710   29], shape=(11,), dtype=int32)
Recovered text after detokenizing:  tf.Tensor(b'\xc2\xbf sigues pensando en postular a harvard ?', shape=(), dtype=string)


In [5]:
# StartEndPacker 会先将 tokens 加入 [START] 和 [END] 两个 token 
# 长度不够的用 [PAD] 补齐长度为 sequence_length ，直接全部返回
# 长度超了就将中间的代表文本的 tokens 截断，但是必须要保留 [START] 和 [END] 两个 token .最终加起来长度是 sequence_length
import numpy as np
inputs = [[5, 6, 7], [8, 9, 10, 11, 12, 13, 14]]
start_end_packer = keras_nlp.layers.StartEndPacker( sequence_length=5, start_value=1, end_value=2,)
outputs = start_end_packer(inputs)
print(np.array(outputs))
start_end_packer = keras_nlp.layers.StartEndPacker( sequence_length=10, start_value=1, end_value=2,)
outputs = start_end_packer(inputs)
print(np.array(outputs))

[[ 1  5  6  7  2]
 [ 1  8  9 10  2]]
[[ 1  5  6  7  2  0  0  0  0  0]
 [ 1  8  9 10 11 12 13 14  2  0]]


In [6]:
def process_batch(eng, spa):
    batch_size = tf.shape(spa)[0]
    eng = eng_tokenizer(eng)
    spa = spa_tokenizer(spa)
    eng_start_end_packer = keras_nlp.layers.StartEndPacker(sequence_length=MAX_SEQUENCE_LENGTH, pad_value=eng_tokenizer.token_to_id("[PAD]"))
    eng = eng_start_end_packer(eng)
    spa_start_end_packer = keras_nlp.layers.StartEndPacker(sequence_length=MAX_SEQUENCE_LENGTH+1, start_value=spa_tokenizer.token_to_id("[START]"), end_value=spa_tokenizer.token_to_id("[END]"), pad_value=spa_tokenizer.token_to_id("[PAD]"))
    spa = spa_start_end_packer(spa)
    return{"encoder_inputs": eng, "decoder_inputs": spa[:, :-1]},spa[:,1:]

def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts)).batch(BATCH_SIZE).map(process_batch, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)
for inputs, targets in train_ds.take(1):
    print(inputs["encoder_inputs"][:3])
    print(inputs["decoder_inputs"][:3])
    print(targets[:3])
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")


tf.Tensor(
[[  79  289  160   26  650   11    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  99    8   44  127  221   80    8  105  132   63   73   70   11    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  67   26  418 1449   86 1106 1507  418 3400   69  956   11    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]], shape=(3, 40), dtype=int32)
tf.Tensor(
[[   2   98  146  311   84  774   15    3    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [   2   76  629  645  383   30  124  111   15    3    0    0    0    0
     0    0    0    0    0    0    0    0    0  

In [12]:
# encoder
encoder_inputs = tf.keras.Input(shape=(None,), name="encoder_inputs")
x = keras_nlp.layers.TokenAndPositionEmbedding(vocabulary_size=ENG_VOCAB_SIZE, sequence_length=MAX_SEQUENCE_LENGTH, embedding_dim=EMBED_DIM)(encoder_inputs)
encoder_output = keras_nlp.layers.TransformerEncoder(intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS)(inputs=x)
encoder = tf.keras.Model(encoder_inputs, encoder_output)

# Decoder
decoder_inputs = tf.keras.Input(shape=(None,), name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")
x = keras_nlp.layers.TokenAndPositionEmbedding(vocabulary_size=SPA_VOCAB_SIZE, sequence_length=MAX_SEQUENCE_LENGTH, embedding_dim=EMBED_DIM)(decoder_inputs)
x = keras_nlp.layers.TransformerDecoder(intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = tf.keras.layers.Dropout(0.5)(x)
decoder_outputs = tf.keras.layers.Dense(SPA_VOCAB_SIZE, activation="softmax")(x)
decoder = tf.keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)
decoder_outputs = decoder([decoder_inputs, encoder_output])

# transformer
transformer = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")
transformer.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=["accuracy"])
transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds, callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


<keras.callbacks.History at 0x19bd616c340>

In [13]:
def decode_sequences(input_sentences):
    batch_size = len(input_sentences)
    encoder_input_tokens = eng_tokenizer(input_sentences)
    encoder_input_tokens = encoder_input_tokens.to_tensor(shape=[None, MAX_SEQUENCE_LENGTH], default_value=eng_tokenizer.token_to_id("[PAD]"))
    start = tf.fill([batch_size, 1], value=spa_tokenizer.token_to_id("[START]"))
    end = spa_tokenizer.token_to_id("[END]")
    done = tf.zeros([batch_size, 1], dtype=tf.bool)
    output_array = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)
    for i in tf.range(MAX_SEQUENCE_LENGTH):
        output = tf.transpose(output_array.stack(), perm=[1, 0, 2])
        predictions = transformer([encoder_input_tokens, tf.squeeze(output, axis=-1)])  # [B,1,V]
        predictions = predictions[:, -1:, :]
        predicted_id = tf.argmax(predictions, axis=-1,output_type=tf.int32)
        done |= predicted_id == end
        predicted_id = tf.where(done, tf.constant(0, dtype=tf.int32), predicted_id)  # [B, 1]
        output_array = output_array.write(i+1, predicted_id)
        if tf.reduce_all(done):
            break
    output = tf.transpose(output_array.stack(), perm=[1, 0, 2])
    output = tf.squeeze(output, axis=-1)  # Remove the last dimension
    texts =  spa_tokenizer.detokenize(output)
    return texts

In [14]:
test_eng_texts = [pair[0] for pair in test_pairs]
input_sentence = random.choices(test_eng_texts, k=2)
translated = decode_sequences(input_sentence)
for i,text in enumerate(input_sentence):
    print("英文：",text.strip())
    print("翻译：",translated.numpy()[i].decode("utf-8").replace("[START]", "").replace("[PAD]", "").replace("[END]", "").strip()) 
    print()

英文： tom worked on a farm last summer.
翻译： tom trabajó en un gran granja .

英文： you shouldn't spend more money than you earn.
翻译： no deberías gastar más dinero como si tienes .



In [15]:
rouge_1 = keras_nlp.metrics.RougeN(order=1)
rouge_2 = keras_nlp.metrics.RougeN(order=2)
for test_pair in test_pairs[:30]:
    input_sentence = test_pair[0]
    reference_sentence = test_pair[1]
    translated_sentence = decode_sequences([input_sentence])
    translated_sentence = translated_sentence.numpy()[0].decode("utf-8")
    translated_sentence = translated_sentence.replace("[PAD]", "").replace("[START]", "").replace("[END]", "").strip()
    rouge_1(reference_sentence, translated_sentence)
    rouge_2(reference_sentence, translated_sentence)

print("ROUGE-1 Score: ", rouge_1.result())
print("ROUGE-2 Score: ", rouge_2.result())

ROUGE-1 Score:  {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.5684036>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.5268915>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.54191023>}
ROUGE-2 Score:  {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.3932744>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.35842922>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.3713763>}


In [None]:
# 7.1G-4.3G

In [16]:
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 token_and_position_embedding_2  (None, None, 256)   3850240     ['encoder_inputs[0][0]']         
  (TokenAndPositionEmbedding)                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder_1 (Transfo  (None, None, 256)   1315072     ['token_and_position_em