In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.layers import Input, Dense, Embedding
from tensorflow.keras.models import Model

# Load the WMT14 German-English corpus. With as_supervised=True every example
# is a 2-tuple ordered by info.supervised_keys; for the 'de-en' config that
# order is documented as ('de', 'en'), i.e. the German sentence comes first.
dataset, info = tfds.load('wmt14_translate/de-en', with_info=True, as_supervised=True)


def _english_first(first, second):
    """Reorder one supervised pair so the English sentence comes first.

    The order is decided from info.supervised_keys instead of being assumed,
    so the pair is only swapped when the dataset really yields German first.
    """
    if info.supervised_keys[0] == 'en':
        return first, second
    return second, first


# The rest of the script unpacks pairs as (en, de); normalise the order once
# here so each tokenizer is trained on, and applied to, the right language.
train_data = dataset['train'].map(_english_first)
val_data = dataset['validation'].map(_english_first)

# Build one subword vocabulary (about 2**13 entries) per language from the
# training split. Each call iterates the whole split eagerly.
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for en, _ in train_data), target_vocab_size=2**13)
tokenizer_de = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (de.numpy() for _, de in train_data), target_vocab_size=2**13)

def encode(en, de):
    """Convert an eager (English, German) string-tensor pair into id lists.

    Each sentence is subword-encoded with its language's tokenizer and framed
    by two extra ids: ``vocab_size`` as the start marker and ``vocab_size + 1``
    as the end marker.

    Returns:
        A tuple ``(en_tokens, de_tokens)`` of Python lists of ints.
    """
    def _frame(tokenizer, text):
        # The two ids just past the tokenizer's own range act as start/end.
        start_id = tokenizer.vocab_size
        end_id = tokenizer.vocab_size + 1
        return [start_id, *tokenizer.encode(text.numpy()), end_id]

    return _frame(tokenizer_en, en), _frame(tokenizer_de, de)

def tf_encode(en, de):
    """Run ``encode`` through ``tf.py_function`` so it can be used in a map.

    The tensors returned by ``tf.py_function`` have no static shape, so each
    one is declared to be a 1-D vector of unknown length before returning.

    Returns:
        A tuple ``(en_ids, de_ids)`` of int64 tensors.
    """
    encoded = tf.py_function(func=encode, inp=[en, de], Tout=[tf.int64, tf.int64])
    for ids in encoded:
        ids.set_shape([None])
    en_ids, de_ids = encoded
    return en_ids, de_ids

def filter_max_length(en, de, max_len=40):
    """Return a boolean tensor: True when both sequences have at most ``max_len`` elements."""
    en_fits = tf.math.less_equal(tf.size(en), max_len)
    de_fits = tf.math.less_equal(tf.size(de), max_len)
    return tf.math.logical_and(en_fits, de_fits)

BATCH_SIZE = 64
BUFFER_SIZE = 20000


def _to_teacher_forcing(en, de):
    """Turn an (en, de) id pair into a (model inputs, target) training example.

    The decoder is fed the German sequence without its last id and is trained
    to predict the same sequence shifted left by one, so position ``t`` of the
    output is scored against German token ``t + 1``. Inputs are keyed by the
    names of the model's Input layers so Keras can route them unambiguously.
    """
    return {'English': en, 'German': de[:-1]}, de[1:]


# Encode, drop over-long pairs, build teacher-forcing examples, then shuffle
# and zero-pad each batch to the longest sequence it contains.
train_dataset = train_data.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)
train_dataset = train_dataset.map(_to_teacher_forcing)
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)


def transformer_model(vocab_size_en, vocab_size_de):
    """Build a small attention-based English-to-German sequence model.

    Despite the name this is not a full Transformer: there is no positional
    encoding and no self-attention. Each decoder position attends over the
    encoded English sentence and predicts the next German token from its own
    embedding plus that attended context.

    Args:
        vocab_size_en: English tokenizer vocabulary size (start/end ids are
            added on top, hence the ``+ 2`` below).
        vocab_size_de: German tokenizer vocabulary size (same ``+ 2``).

    Returns:
        An uncompiled Keras ``Model`` taking ``[English, German]`` id
        sequences and returning, for every German input position, a softmax
        distribution over ``vocab_size_de + 2`` ids.
    """
    input_en = Input(shape=(None,), name='English')
    input_de = Input(shape=(None,), name='German')

    # mask_zero=True marks the zero padding added by padded_batch so that it
    # is ignored by the attention layer and by the loss/metrics.
    embed_en = Embedding(vocab_size_en + 2, 128, mask_zero=True)(input_en)
    embed_de = Embedding(vocab_size_de + 2, 128, mask_zero=True)(input_de)

    # Position-wise encoder over the English embeddings.
    encoded = Dense(256, activation='relu')(embed_en)
    encoded = Dense(128, activation='relu')(encoded)

    # Cross-attention: German positions are the queries, the encoded English
    # sentence supplies the values. The result has the German sequence length,
    # which is what lets the output line up with the German targets.
    context = tf.keras.layers.Attention()([embed_de, encoded])
    x = tf.keras.layers.Concatenate()([embed_de, context])
    x = Dense(128, activation='relu')(x)
    output = Dense(vocab_size_de + 2, activation='softmax')(x)
    return Model(inputs=[input_en, input_de], outputs=output)


model = transformer_model(tokenizer_en.vocab_size, tokenizer_de.vocab_size)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

EPOCHS = 10
model.fit(train_dataset, epochs=EPOCHS)

def translate(sentence, max_length=40):
    """Greedily translate an English sentence into German.

    The decoder input starts as just the German start id; at every step the
    model is run on the English ids plus the German ids produced so far, and
    the most likely id at the last output position is appended. Decoding stops
    at the German end id or after ``max_length`` steps.

    NOTE(review): this assumes the model's output positions correspond to the
    decoder (German) input positions - confirm against the model definition.

    Args:
        sentence: English text to translate.
        max_length: Upper bound on the number of generated ids.

    Returns:
        The decoded German string.
    """
    start_id = tokenizer_de.vocab_size
    end_id = tokenizer_de.vocab_size + 1

    source = [tokenizer_en.vocab_size] + tokenizer_en.encode(sentence) + [tokenizer_en.vocab_size + 1]
    source = tf.expand_dims(source, axis=0)

    decoded = [start_id]
    for _ in range(max_length):
        target_prefix = tf.expand_dims(decoded, axis=0)
        # Direct call instead of predict(): cheaper for a single tiny batch.
        probs = model([source, target_prefix], training=False)
        next_id = int(tf.argmax(probs[0, -1], axis=-1).numpy())
        if next_id == end_id:
            break
        decoded.append(next_id)

    # Keep only real subword ids: drop padding (0) and the start/end markers.
    subword_ids = [i for i in decoded if 0 < i < tokenizer_de.vocab_size]
    return tokenizer_de.decode(subword_ids)

# Smoke-test the trained model on a short English sentence.
sample_translation = translate("Hello, how are you?")
print(sample_translation)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/wmt14_translate/de-en/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]