In [2]:
import numpy as np
import einops
import tensorflow as tf
import tensorflow_text as tf_text
import tensorflow as tf
# Turn on memory growth for every GPU TensorFlow can see; report when there is none.
gpu_list = tf.config.experimental.list_physical_devices('GPU')
if not gpu_list:
    print("Got no gpus")
for gpu in gpu_list:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Print the error and carry on with the remaining devices.
        print(e)

In [4]:
import pathlib

# Download and extract the Spanish-English sentence-pair archive.
# The archive is fetched over HTTPS (same host and path as before) so it is not
# transferred in clear text.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True,
)
# The text file is looked up relative to the directory holding the downloaded archive.
path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'
def load_data(path):
    """Read a tab-separated sentence-pair file and return its two columns.

    Every line is split on tab characters and must produce exactly two fields
    (otherwise unpacking raises ``ValueError``). The first field is collected
    as the target sentence, the second as the context sentence.

    Args:
        path: Object exposing ``read_text(encoding=...)``, e.g. a ``pathlib.Path``.

    Returns:
        A ``(target, context)`` tuple of NumPy arrays with one entry per line.
    """
    rows = [row.split('\t') for row in path.read_text(encoding='utf-8').splitlines()]
    targets, contexts = [], []
    for first_field, second_field in rows:
        targets.append(first_field)
        contexts.append(second_field)
    return np.array(targets), np.array(contexts)
target_raw, context_raw = load_data(path_to_file)
# Sanity check: both columns should hold one entry per sentence pair.
print(len(context_raw) == len(target_raw))
# Show the last pair as "target || context".
print(f"{target_raw[-1]} || {context_raw[-1]}")

True
If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo. || Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.


In [6]:
BUFFER_SIZE = len(context_raw)
BATCH_SIZE = 64
# Fixed seed so the train/validation membership is identical on every run and on
# every re-execution of this cell (the vocabularies are adapted on the training
# split, so a drifting split would silently mix the two sets).
SPLIT_SEED = 42

# Each sentence pair lands in the training split with probability 0.8.
is_train = np.random.default_rng(SPLIT_SEED).uniform(size=(BUFFER_SIZE,)) < 0.8

# Raw (context, target) string datasets, shuffled and batched.
train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[is_train], target_raw[is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[~is_train], target_raw[~is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

# Peek at the first five pairs of one training batch.
for example_context_strings, example_target_strings in train_raw.take(1):
    print(example_context_strings[:5])
    print(example_target_strings[:5])

tf.Tensor(
[b'Tom y Mar\xc3\xada quer\xc3\xadan hablar sobre los viejos tiempos.'
 b'Su falda est\xc3\xa1 totalmente fuera de moda.'
 b'Volver\xc3\xa9 m\xc3\xa1s tarde.'
 b'S\xc3\xa9 que es dif\xc3\xadcil de creer.'
 b'Puedo repetirlo una y otra vez.'], shape=(5,), dtype=string)
tf.Tensor(
[b'Tom and Mary wanted to talk about old times.'
 b'Her skirt is totally out of fashion.' b"I'll come back later."
 b"I know it's difficult to believe." b'I can repeat it again and again.'], shape=(5,), dtype=string)


In [8]:
def tf_lower_and_split_punct(text):
    """Standardize raw text before it is tokenized.

    Steps, in order: NFKD-normalize, lowercase, delete every character that is
    not a space, ``a``-``z`` or one of ``. ? ! , ¿``, put a space on both sides
    of each of those punctuation marks, strip surrounding whitespace, and wrap
    the result in ``[START]`` / ``[END]`` markers.

    Args:
        text: String tensor accepted by ``tf_text.normalize_utf8``.

    Returns:
        The standardized string tensor.
    """
    lowered = tf.strings.lower(tf_text.normalize_utf8(text, 'NFKD'))
    # Keep only spaces, a-z and the selected punctuation.
    filtered = tf.strings.regex_replace(lowered, '[^ a-z.?!,¿]', '')
    # Isolate punctuation so it becomes its own whitespace-separated token.
    spaced = tf.strings.regex_replace(filtered, '[.?!,¿]', r' \0 ')
    trimmed = tf.strings.strip(spaced)
    return tf.strings.join(['[START]', trimmed, '[END]'], separator=' ')

max_vocab_size = 5000

# Vectorizer for the context sentences; its vocabulary is learned from the
# context half of the training pairs only.
context_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size,
    ragged=True,
)
context_text_processor.adapt(train_raw.map(lambda ctx, tgt: ctx))

# Vectorizer for the target sentences, adapted on the target half.
target_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size,
    ragged=True,
)
target_text_processor.adapt(train_raw.map(lambda ctx, tgt: tgt))

In [10]:
def process_text(context, target):
    """Turn a batch of raw string pairs into model inputs and labels.

    The target token sequence is used twice: without its last position as the
    decoder input, and without its first position as the labels, so each label
    is the token that follows the corresponding input token.

    Args:
        context: Batch of context strings.
        target: Batch of target strings.

    Returns:
        ``((context_tokens, target_in), target_out)`` as dense tensors
        (ragged results are converted with ``to_tensor()``).
    """
    target_tokens = target_text_processor(target)
    context_tokens = context_text_processor(context).to_tensor()
    decoder_input = target_tokens[:, :-1].to_tensor()
    decoder_labels = target_tokens[:, 1:].to_tensor()
    return (context_tokens, decoder_input), decoder_labels

# Tokenize both splits, letting tf.data choose the level of parallelism.
train_ds = train_raw.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)

# Show the first ten token ids of the first example in one batch:
# A = context tokens, B = decoder input, C = decoder labels.
for (A, B), C in train_ds.take(1):
    print(A[0, :10])
    print(B[0, :10])
    print(C[0, :10])

tf.Tensor([   2    7   15    7 1993    6   16  588    4    3], shape=(10,), dtype=int64)
tf.Tensor([   2  126    5 1757   15   10  646    4    0    0], shape=(10,), dtype=int64)
tf.Tensor([ 126    5 1757   15   10  646    4    3    0    0], shape=(10,), dtype=int64)


In [15]:
# Width passed to Translator below; the Encoder/Decoder use it as the embedding
# size and GRU unit count, and CrossAttention uses it as the attention key_dim.
UNITS = 516
class Encoder(tf.keras.layers.Layer):
    """Embeds context token ids and runs them through a bidirectional GRU."""

    def __init__(self, text_processor, units):
        """Build the embedding and recurrent layers.

        Args:
            text_processor: Vectorization layer for the context text; supplies
                the vocabulary size and is reused by ``convert_input``.
            units: Embedding size and number of GRU units.
        """
        super().__init__()
        self.text_processor = text_processor
        self.vocab_size = text_processor.vocabulary_size()
        self.units = units
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, units, mask_zero=True)
        # The forward and backward passes are merged by summation
        # (merge_mode='sum'), and the full output sequence is returned.
        gru = tf.keras.layers.GRU(units, return_sequences=True, recurrent_initializer='glorot_uniform')
        self.rnn = tf.keras.layers.Bidirectional(layer=gru, merge_mode='sum')

    def call(self, x):
        """Return the recurrent layer's output sequence for token ids ``x``."""
        return self.rnn(self.embedding(x))

    def convert_input(self, texts):
        """Encode raw text: accepts a single string or a batch of strings."""
        texts = tf.convert_to_tensor(texts)
        if len(texts.shape) == 0:
            # A scalar string becomes a batch containing one element.
            texts = texts[tf.newaxis]
        token_ids = self.text_processor(texts).to_tensor()
        return self(token_ids)
        

In [17]:
class CrossAttention(tf.keras.layers.Layer):
    """Multi-head attention from ``x`` onto ``context`` with a residual add and layer norm."""

    def __init__(self, units, **kwargs):
        """Create the sub-layers.

        Args:
            units: Used as ``key_dim`` of the multi-head attention layer.
            **kwargs: Forwarded to ``tf.keras.layers.MultiHeadAttention``.
        """
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=5, key_dim=units, **kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    def call(self, x, context):
        """Attend from ``x`` (query) to ``context`` (value) and return the normalized sum.

        As a side effect, the attention scores averaged over axis 1
        (presumably the per-head axis) are stored in ``last_attention_weights``.
        """
        attended, raw_scores = self.mha(query=x, value=context, return_attention_scores=True)
        self.last_attention_weights = tf.reduce_mean(raw_scores, axis=1)
        residual = self.add([x, attended])
        return self.layernorm(residual)

In [19]:
class Decoder(tf.keras.layers.Layer):
    """GRU decoder that attends over the encoder output and emits vocabulary logits.

    Besides the training-time ``call``, it provides the helpers used for
    step-by-step generation: ``get_initial_state``, ``get_next_token`` and
    ``tokens_to_text``.
    """

    def __init__(self, text_processor, units):
        """Build lookup tables and layers.

        Args:
            text_processor: Vectorization layer for the target text; supplies
                the vocabulary used for the word/id lookups and output size.
            units: Embedding size, number of GRU units and attention width.
        """
        super(Decoder, self).__init__()
        self.text_processor = text_processor
        self.vocab_size = text_processor.vocabulary_size()
        # Forward (word -> id) and inverse (id -> word) lookups over the same vocabulary.
        self.word_to_id = tf.keras.layers.StringLookup(vocabulary=text_processor.get_vocabulary(), mask_token='', oov_token='[UNK]')
        self.id_to_word = tf.keras.layers.StringLookup(vocabulary=text_processor.get_vocabulary(), mask_token='', oov_token='[UNK]', invert=True)
        self.start_token = self.word_to_id('[START]')
        self.end_token = self.word_to_id('[END]')
        self.units = units
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, units, mask_zero=True)
        self.rnn = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
        self.attention = CrossAttention(units)
        self.output_layer = tf.keras.layers.Dense(self.vocab_size)

    def call(self, context, x, state=None, return_state=False):
        """Compute logits for target token ids ``x`` given the encoded ``context``.

        Args:
            context: Encoder output to attend over.
            x: Target token ids fed to the decoder.
            state: Optional initial GRU state.
            return_state: When true, also return the final GRU state.

        Returns:
            ``logits``, or ``(logits, state)`` when ``return_state`` is true.
        """
        x = self.embedding(x)
        x, state = self.rnn(x, initial_state=state)
        x = self.attention(x, context)
        # Expose the attention weights of the most recent call.
        self.last_attention_weights = self.attention.last_attention_weights
        logits = self.output_layer(x)
        if return_state:
            return logits, state
        return logits

    def get_initial_state(self, context):
        """Return ``(start_tokens, done, state)`` to begin generation.

        ``start_tokens`` is a ``[batch_size, 1]`` tensor filled with the start
        token id, ``done`` is an all-false boolean tensor of the same shape, and
        ``state`` is the GRU's initial state. The batch size is taken from the
        first dimension of ``context``.
        """
        batch_size = tf.shape(context)[0]
        start_tokens = tf.fill([batch_size, 1], self.start_token)
        done = tf.zeros([batch_size, 1], dtype=tf.bool)
        embedded = self.embedding(start_tokens)
        return start_tokens, done, self.rnn.get_initial_state(embedded)[0]  # [batch_size, units]

    def tokens_to_text(self, tokens):
        """Convert token ids to space-joined strings without the start/end markers."""
        words = self.id_to_word(tokens)
        result = tf.strings.reduce_join(words, axis=-1, separator=' ')
        # Raw strings: '\[' is not a valid Python escape, so the non-raw form
        # triggers an invalid-escape warning; the regex itself is unchanged.
        result = tf.strings.regex_replace(result, r'^ *\[START\] *', '')
        result = tf.strings.regex_replace(result, r' *\[END\] *$', '')
        return result

    def get_next_token(self, context, next_token, done, state,temperature=0.0):
        """Run one generation step.

        Args:
            context: Encoder output to attend over.
            next_token: Token ids produced by the previous step.
            done: Boolean flags marking sequences that have already finished.
            state: GRU state from the previous step.
            temperature: ``0.0`` picks the arg-max token; any other value
                samples from the last step's logits divided by ``temperature``.

        Returns:
            ``(next_token, done, state)`` for the following step.
        """
        logits, state = self(context, next_token, state=state, return_state=True)
        if temperature == 0.0:
            next_token = tf.argmax(logits, axis=-1)
        else:
            logits = logits[:, -1, :]/temperature
            next_token = tf.random.categorical(logits, num_samples=1)
        # A sequence is finished once it has produced the end token.
        done |= (next_token == self.end_token)
        # Finished sequences (including ones that finished on this step) emit id 0.
        next_token = tf.where(done, tf.constant(0, dtype=tf.int64), next_token)
        return next_token, done, state

In [21]:
class Translator(tf.keras.Model):
    """Encoder-decoder model with a training ``call`` and a generation loop in ``translate``."""

    def __init__(self, units, context_text_processor, target_text_processor):
        """Create the encoder and decoder, both sized by ``units``."""
        super().__init__()
        self.encoder = Encoder(context_text_processor, units)
        self.decoder = Decoder(target_text_processor, units)

    def call(self, inputs):
        """Return decoder logits for ``inputs = (context_tokens, target_input_tokens)``."""
        context, x = inputs
        encoded = self.encoder(context)
        logits = self.decoder(encoded, x)
        # Remove the Keras mask attached to the logits when there is one;
        # a missing attribute is not an error.
        try:
            del logits._keras_mask
        except AttributeError:
            pass
        return logits

    def translate(self, texts, max_length=500, temperature=tf.constant(0.0)):
        """Generate output text for ``texts``, one token per step.

        Generation stops after ``max_length`` steps or as soon as every sequence
        in the batch is done. ``temperature`` is forwarded to
        ``Decoder.get_next_token``.
        """
        context = self.encoder.convert_input(texts)
        next_token, done, state = self.decoder.get_initial_state(context)
        generated = tf.TensorArray(tf.int64, size=1, dynamic_size=True)
        for step in tf.range(max_length):
            next_token, done, state = self.decoder.get_next_token(
                context, next_token, done, state, temperature)
            generated = generated.write(step, next_token)
            if tf.reduce_all(done):
                break
        # Stacked steps come out time-major; move the batch axis first and drop
        # the trailing singleton axis.
        token_matrix = einops.rearrange(generated.stack(), 't batch 1 -> batch t')
        return self.decoder.tokens_to_text(token_matrix)

In [23]:
def masked_loss(y_true, y_pred):
    """Mean sparse categorical cross-entropy over positions whose label is not 0.

    Args:
        y_true: Integer label ids; id 0 marks positions to ignore.
        y_pred: Logits over the vocabulary.

    Returns:
        Sum of the per-position losses at non-zero labels divided by the number
        of such positions.
    """
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    per_position = cross_entropy(y_true, y_pred)
    keep = tf.cast(y_true != 0, dtype=per_position.dtype)
    return tf.reduce_sum(per_position * keep) / tf.reduce_sum(keep)

def masked_acc(y_true, y_pred):
    """Token accuracy computed only over positions whose label is not 0.

    Args:
        y_true: Integer label ids; id 0 marks positions to ignore.
        y_pred: Logits over the vocabulary.

    Returns:
        Number of correct predictions at non-zero labels divided by the number
        of non-zero labels.
    """
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)
    mask = tf.cast(y_true != 0, tf.float32)
    # Apply the mask to the matches as well: otherwise positions labelled 0
    # where the model also predicts 0 are counted in the numerator but not in
    # the denominator, which inflates the metric (it can even exceed 1.0).
    match = tf.cast(y_true == y_pred, tf.float32) * mask
    return tf.reduce_sum(match) / tf.reduce_sum(mask)

In [None]:
model = Translator(UNITS, context_text_processor, target_text_processor)
model.compile(
    optimizer='adam',
    loss=masked_loss,
    metrics=[masked_acc, masked_loss],
)
# Training data repeats indefinitely, so an "epoch" is 100 batches; validation
# uses 20 batches, and EarlyStopping (patience 5) ends training early.
history = model.fit(
    train_ds.repeat(),
    epochs=100,
    steps_per_epoch=100,
    validation_data=val_ds,
    validation_steps=20,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5)],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

In [None]:
# Smoke-test the trained model on a few context sentences; the trailing comment
# on each line gives the expected translation.
inputs = [
    'Hace mucho frio aqui.', # "It's really cold here."
    'Esta es mi vida.', # "This is my life."
    'Su cuarto es un desastre.' # "His room is a mess"
]
# translate() accepts the whole list as one batch and returns the generated strings.
result = model.translate(inputs)
print(result)