In [1]:
import tensorflow as tf
import numpy as np
import pathlib
import warnings
import tensorflow_text as tf_text
warnings.filterwarnings('ignore')

2024-03-12 06:01:45.765641: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-12 06:01:46.279634: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-12 06:01:46.279671: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-12 06:01:46.288292: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-12 06:01:46.310346: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-12 06:01:46.311250: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
# Download (or reuse the Keras-cached copy of) the Spanish-English sentence-pair
# archive. HTTPS is used so the download cannot be tampered with in transit.
archive_path = tf.keras.utils.get_file(
    "spa-eng.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
# The text file is looked up in the "spa-eng" folder that sits beside the downloaded zip.
path_to_file = pathlib.Path(archive_path).parent / "spa-eng" / "spa.txt"
path_to_file

PosixPath('/home/krsethur/.keras/datasets/spa-eng/spa.txt')

In [3]:
def load_data(path):
    """Read a tab-separated parallel corpus into two NumPy arrays.

    Each line of the file must hold exactly two tab-separated fields:
    the target sentence first, then the context sentence.

    Args:
        path: object exposing ``read_text(encoding=...)`` (e.g. ``pathlib.Path``).

    Returns:
        ``(context, target)``: two arrays with one string per line of the file.

    Raises:
        ValueError: if a line does not split into exactly two fields.
    """
    raw = path.read_text(encoding="utf-8")
    rows = [row.split("\t") for row in raw.splitlines()]

    targets, contexts = [], []
    # Unpacking enforces the "exactly two columns" layout of every row.
    for target_sentence, context_sentence in rows:
        targets.append(target_sentence)
        contexts.append(context_sentence)

    return np.array(contexts), np.array(targets)

In [4]:
# Parse the file into two parallel arrays: context_raw holds the second
# tab-separated column of each line, target_raw the first.
context_raw, target_raw = load_data(path_to_file)

In [5]:
# One entry per line of the source file.
context_raw.shape

(118964,)

In [6]:
# Context sentence taken from the last line of the file.
context_raw[-1]

'Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.'

In [7]:
# Target sentence taken from the last line of the file (pairs with context_raw[-1]).
target_raw[-1]

'If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.'

In [8]:
BUFFER_SIZE = len(target_raw)  # shuffle buffer covers the whole corpus
BATCH_SIZE = 64
SPLIT_SEED = 42  # fixed seed so the train/validation split is the same on every run

# Boolean mask over all sentence pairs: True (~80%) -> training, False -> validation.
# A dedicated seeded generator makes the split reproducible after a kernel restart,
# so the vocabularies adapted on the training split stay the same as well.
split_rng = np.random.default_rng(SPLIT_SEED)
is_train = split_rng.uniform(size=(len(target_raw),)) < 0.8

train_raw = (
    tf.data.Dataset.from_tensor_slices((context_raw[is_train], target_raw[is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)
val_raw = (
    tf.data.Dataset.from_tensor_slices((context_raw[~is_train], target_raw[~is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

In [9]:
# Pull a single batch and show its first five raw string pairs. The loop
# variables stay bound after the loop and are reused by later cells.
for (example_context_strings, example_target_strings) in train_raw.take(1):
    print(example_context_strings[:5])
    print(example_target_strings[:5])

tf.Tensor(
[b'\xc3\x89l es un hombre cruel.' b'El ruido me pone nervioso.'
 b'Ella tembl\xc3\xb3 de miedo.' b'Apreciamos su paciencia.'
 b'El avi\xc3\xb3n despegar\xc3\xa1 en diez minutos.'], shape=(5,), dtype=string)
tf.Tensor(
[b'He is a cruel person.' b'The noise gets on my nerves.'
 b'She trembled with fear.' b'We appreciate your patience.'
 b'The plane takes off in ten minutes.'], shape=(5,), dtype=string)


In [10]:
def tf_lower_and_split_punct(text):
    """Standardize raw sentences before tokenization.

    Steps, in order: NFKD-normalize, lowercase, drop every character other
    than space, ``a-z`` and the marks ``. ? ! , ¿``, pad each of those marks
    with a space on both sides, strip outer whitespace, and finally wrap the
    result in ``[START]`` / ``[END]`` markers.

    Args:
        text: string tensor of sentences.

    Returns:
        String tensor of standardized sentences.
    """
    normalized = tf_text.normalize_utf8(text, "NFKD")
    lowered = tf.strings.lower(normalized)
    # Anything outside the allowed character set is removed outright.
    filtered = tf.strings.regex_replace(lowered, '[^ a-z.?!,¿]', '')
    # Separate punctuation from neighbouring words.
    spaced = tf.strings.regex_replace(filtered, '[.?!,¿]', r' \0 ')
    stripped = tf.strings.strip(spaced)
    return tf.strings.join(['[START]', stripped, '[END]'], separator=' ')

In [11]:
# Sanity-check the standardizer on a sentence with accents and an inverted question mark.
example_text = tf.constant('¿Todavía está en casa?')
print(tf_lower_and_split_punct(example_text).numpy().decode('UTF-8'))

[START] ¿ todavia esta en casa ? [END]


In [12]:
# Upper bound on the size of each vocabulary.
max_vocab_size=5000
# Context-side vectorizer: applies the custom standardizer and emits ragged token-ID rows.
context_text_processor = tf.keras.layers.TextVectorization(max_tokens=max_vocab_size, standardize=tf_lower_and_split_punct, ragged=True)
# Build the vocabulary from the context half of the training split only.
context_text_processor.adapt(train_raw.map(lambda context, target : context))

In [13]:
# Target-side vectorizer, configured the same way as the context one.
target_text_processor = tf.keras.layers.TextVectorization(max_tokens=max_vocab_size, standardize=tf_lower_and_split_punct, ragged=True)
# Build the vocabulary from the target half of the training split only.
target_text_processor.adapt(train_raw.map(lambda context, target : target))

In [14]:
# Token IDs for the first five example context sentences (one ragged row per sentence).
context_text_processor(example_context_strings[:5])

<tf.RaggedTensor [[2, 7, 15, 16, 145, 2368, 4, 3], [2, 7, 570, 18, 1203, 1356, 4, 3],
 [2, 28, 1, 6, 360, 4, 3], [2, 1, 25, 1224, 4, 3],
 [2, 7, 482, 1, 14, 271, 364, 4, 3]]>

In [15]:
# Token IDs for the first five example target sentences (one ragged row per sentence).
target_text_processor(example_target_strings[:5])

<tf.RaggedTensor [[2, 13, 12, 10, 1899, 295, 4, 3], [2, 5, 653, 557, 34, 24, 2601, 4, 3],
 [2, 26, 1, 36, 1006, 4, 3], [2, 32, 1328, 33, 1458, 4, 3],
 [2, 5, 597, 620, 164, 14, 354, 434, 4, 3]]>

In [16]:
# First ten entries of the context vocabulary.
context_text_processor.get_vocabulary()[:10]

['', '[UNK]', '[START]', '[END]', '.', 'que', 'de', 'el', 'a', 'no']

In [17]:
# First ten entries of the target vocabulary.
target_text_processor.get_vocabulary()[:10]

['', '[UNK]', '[START]', '[END]', '.', 'the', 'i', 'to', 'you', 'tom']

In [18]:
def process_text(context, target):
    """Turn a batch of raw string pairs into model inputs and labels.

    Args:
        context: batch of context strings.
        target: batch of target strings.

    Returns:
        ``((context_tokens, target_in), target_out)`` as dense tensors.
        ``target_in`` is the target token sequence without its last token and
        ``target_out`` is the same sequence without its first token.
    """
    context_tokens = context_text_processor(context).to_tensor()
    target_tokens = target_text_processor(target)
    # Shift by one position: the decoder reads tokens [:-1] and is trained to emit [1:].
    decoder_input = target_tokens[:, :-1].to_tensor()
    decoder_labels = target_tokens[:, 1:].to_tensor()
    return (context_tokens, decoder_input), decoder_labels

In [19]:
# Tokenize both splits; tf.data picks the level of map parallelism.
train_ds = train_raw.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)

In [20]:
# Take one processed batch and print the first row of each tensor. The loop
# variables stay bound afterwards and are reused as example inputs by later cells.
for (ex_context_tok, ex_tar_in), ex_tar_out in train_ds.take(1):
    print(ex_context_tok[0])
    print(ex_tar_in[0])
    print(ex_tar_out[0])

tf.Tensor(
[   2   10   34   32   64 1923   38   11  134   34    7  100    4    3
    0    0    0    0    0    0    0], shape=(21,), dtype=int64)
tf.Tensor([  2   9  40  31  28  57 660  57 160  40 108   4   0   0   0   0   0   0], shape=(18,), dtype=int64)
tf.Tensor([  9  40  31  28  57 660  57 160  40 108   4   3   0   0   0   0   0   0], shape=(18,), dtype=int64)


In [21]:
# Shape of the example batch of context token IDs.
ex_context_tok.shape

TensorShape([64, 21])

In [22]:
# Shape of the example batch of decoder-input token IDs.
ex_tar_in.shape

TensorShape([64, 18])

In [23]:
# Shape of the example batch of label token IDs (matches ex_tar_in).
ex_tar_out.shape

TensorShape([64, 18])

In [24]:
# Shared width passed to the encoder, decoder, attention and embedding layers in this notebook.
UNITS=256

In [25]:
class Encoder(tf.keras.layers.Layer):
    """Embeds context token IDs and runs them through a bidirectional GRU."""

    def __init__(self, text_processor, units):
        """Build the embedding and recurrent sub-layers.

        Args:
            text_processor: vectorizer whose ``vocabulary_size()`` sizes the embedding.
            units: width of the embedding and of each GRU direction.
        """
        super().__init__()
        self.text_processor = text_processor
        self.vocab_size = text_processor.vocabulary_size()
        self.units = units

        # mask_zero=True marks token ID 0 as padding for downstream layers.
        self.embedding = tf.keras.layers.Embedding(
            input_dim=self.vocab_size,
            output_dim=self.units,
            mask_zero=True,
        )

        gru = tf.keras.layers.GRU(
            self.units,
            return_sequences=True,
            recurrent_initializer='glorot_uniform',
        )
        # Forward and backward outputs are summed, so the output width stays `units`.
        self.rnn = tf.keras.layers.Bidirectional(layer=gru, merge_mode='sum')

    def call(self, context):
        """Return the per-position encoding of a batch of context token IDs."""
        embedded = self.embedding(context)
        return self.rnn(embedded)

In [26]:
# Stand-alone encoder instance used to try the layer out on the example batch.
encoder = Encoder(context_text_processor, UNITS)

In [27]:
# Encode the example batch of context token IDs.
ex_context=encoder(ex_context_tok)

In [28]:
# Shape of the encoder output for the example batch.
ex_context.shape

TensorShape([64, 21, 256])

In [29]:
class CrossAttention(tf.keras.layers.Layer):
    """Single-head attention from a query sequence onto the encoded context,
    followed by a residual add and layer normalization."""

    def __init__(self, units):
        """Create the attention, normalization and residual-add sub-layers.

        Args:
            units: ``key_dim`` of the attention head.
        """
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(key_dim=units, num_heads=1)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    def call(self, x, context):
        """Attend from ``x`` onto ``context`` and return the normalized residual sum.

        The attention scores, averaged over axis 1, are stored on
        ``self.last_attention_weights`` as a side effect.
        """
        attended, scores = self.mha(
            query=x,
            value=context,
            return_attention_scores=True,
        )
        self.last_attention_weights = tf.reduce_mean(scores, axis=1)
        residual = self.add([x, attended])
        return self.layernorm(residual)

In [30]:
# Stand-alone attention layer used to try the layer out on example tensors.
attention_layer = CrossAttention(UNITS)


In [31]:
# Embed the example decoder inputs to obtain a query tensor for exercising the attention layer.
embed = tf.keras.layers.Embedding(input_dim=target_text_processor.vocabulary_size(), output_dim=UNITS)
ex_embed = embed(ex_tar_in)
ex_embed.shape

TensorShape([64, 18, 256])

In [32]:
# Attend from the embedded target tokens onto the encoded context; show the output shape.
result=attention_layer(ex_embed, ex_context)
result.shape

TensorShape([64, 18, 256])

In [33]:
# For the first example, sum the attention weights over the last axis; each sum should be ~1.
attention_layer.last_attention_weights[0].numpy().sum(axis=-1)

array([1.        , 1.        , 0.9999999 , 0.9999999 , 1.0000001 ,
       0.9999999 , 1.        , 0.9999999 , 1.        , 0.9999999 ,
       0.99999994, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        ], dtype=float32)

In [34]:
class Decoder(tf.keras.layers.Layer):
    """Decoder layer whose methods are attached incrementally via ``add_method``."""

    @classmethod
    def add_method(cls, fun):
        """Decorator: bind ``fun`` onto the class under its own ``__name__``.

        Returns ``fun`` unchanged, so the decorated module-level name still
        refers to the plain function.
        """
        method_name = fun.__name__
        setattr(cls, method_name, fun)
        return fun

In [35]:
@Decoder.add_method
def __init__(self, text_processor, units):
    """Build the decoder's lookup tables and trainable sub-layers.

    Args:
        text_processor: target-side vectorizer supplying the vocabulary.
        units: width of the embedding, GRU and attention layers.
    """
    super(Decoder, self).__init__()
    self.text_processor = text_processor
    self.units = units
    self.vocab_size = text_processor.vocabulary_size()

    # Forward (word -> ID) and inverse (ID -> word) lookups over the same vocabulary.
    vocabulary = text_processor.get_vocabulary()
    self.word_to_id = tf.keras.layers.StringLookup(
        vocabulary=vocabulary, mask_token='', oov_token='[UNK]'
    )
    self.id_to_word = tf.keras.layers.StringLookup(
        vocabulary=vocabulary, mask_token='', oov_token='[UNK]', invert=True
    )

    # IDs of the sequence delimiters, used while generating text.
    self.start_token = self.word_to_id('[START]')
    self.end_token = self.word_to_id('[END]')

    self.embedding = tf.keras.layers.Embedding(
        input_dim=self.vocab_size, output_dim=self.units, mask_zero=True
    )
    self.rnn = tf.keras.layers.GRU(
        units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )
    self.attention = CrossAttention(self.units)
    # Projects each position onto one logit per vocabulary entry.
    self.output_layer = tf.keras.layers.Dense(self.vocab_size)

In [36]:
@Decoder.add_method
def call(self, context, x, state=None, return_state=False):
    """Compute vocabulary logits for target tokens ``x`` given the encoded ``context``.

    Args:
        context: encoder output to attend over.
        x: target token IDs.
        state: optional initial GRU state.
        return_state: when true, also return the final GRU state.

    Returns:
        ``logits``, or ``(logits, state)`` when ``return_state`` is true.
    """
    embedded = self.embedding(x)
    rnn_output, state = self.rnn(embedded, initial_state=state)
    attended = self.attention(rnn_output, context)
    logits = self.output_layer(attended)
    return (logits, state) if return_state else logits

In [37]:
# Stand-alone decoder instance used to try the layer out on the example batch.
decoder = Decoder(target_text_processor, UNITS)

In [38]:
# Shape of the logits for the example batch; the last axis has one entry per vocabulary item.
decoder(ex_context,ex_tar_in).shape

TensorShape([64, 18, 5000])

In [39]:
@Decoder.add_method
def get_initial_state(self, context):
    """Return the starting values for step-by-step generation.

    Args:
        context: encoder output; only its leading (batch) dimension is used.

    Returns:
        ``(start_tokens, done, state)``: a column of ``[START]`` IDs, an
        all-False "finished" column, and the GRU's initial state.
    """
    n_sequences = tf.shape(context)[0]
    start_tokens = tf.fill([n_sequences, 1], self.start_token)
    embedded = self.embedding(start_tokens)
    initial_state = self.rnn.get_initial_state(embedded)[0]
    done = tf.zeros([n_sequences, 1], dtype=tf.bool)
    return start_tokens, done, initial_state

In [40]:
@Decoder.add_method
def tokens_to_text(self, tokens):
    """Convert token IDs back into space-joined sentences.

    Args:
        tokens: integer tensor of token IDs; the last axis is the sequence.

    Returns:
        String tensor with the last axis joined into a single string and any
        leading ``[START]`` / trailing ``[END]`` marker removed.
    """
    words = self.id_to_word(tokens)
    result = tf.strings.reduce_join(words, axis=-1, separator=' ')
    # Raw strings: "\[" is not a valid Python escape, so a plain literal
    # triggers an invalid-escape warning. The regex passed on is unchanged.
    result = tf.strings.regex_replace(result, r'^ *\[START\] *', '')
    result = tf.strings.regex_replace(result, r' *\[END\] *$', '')
    return result

In [41]:
@Decoder.add_method
def get_next_token(self, context, next_token, done, state, temperature=0.0):
    """Advance generation by one step.

    Args:
        context: encoder output to attend over.
        next_token: token IDs produced by the previous step.
        done: boolean tensor flagging sequences that have already emitted ``[END]``.
        state: GRU state from the previous step.
        temperature: ``0.0`` selects the arg-max token; any other value
            samples from the logits divided by it.

    Returns:
        ``(next_token, done, state)`` for the following step.
    """
    logits, state = self(context, next_token, state, return_state=True)

    if temperature != 0.0:
        scaled_logits = logits[:, -1, :] / temperature
        next_token = tf.random.categorical(scaled_logits, num_samples=1)
    else:
        next_token = tf.argmax(logits, axis=-1)

    # A sequence is finished once it has produced [END] at any step.
    done = done | (next_token == self.end_token)
    # Finished sequences emit ID 0 from then on.
    padding_id = tf.constant(0, dtype=tf.int64)
    next_token = tf.where(done, padding_id, next_token)
    return next_token, done, state
        

In [42]:
# Smoke-test the generation loop with the untrained stand-alone decoder.
next_token, done, state = decoder.get_initial_state(ex_context)
tokens=[]

# Ten steps with the default temperature (arg-max), feeding each step's token and state into the next.
for n in range(10):
    next_token, done, state = decoder.get_next_token(ex_context, next_token, done, state)
    tokens.append(next_token)

# Join the per-step tokens along the last axis and map the IDs back to text.
tokens = tf.concat(tokens, axis=-1)
result=decoder.tokens_to_text(tokens)
result[:3]

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'midnight model cottage high bride against foot hiking banging skirts',
       b'midnight model cottage high bride against foot hiking banging skirts',
       b'midnight model cottage high bride against consciousness wisely dreaming crowded'],
      dtype=object)>

In [43]:
class Translator(tf.keras.Model):
    """End-to-end model: an ``Encoder`` feeding a ``Decoder``."""

    def __init__(self, units, context_text_processor, target_text_processor):
        """Create the encoder/decoder pair.

        Args:
            units: layer width shared by the encoder and decoder.
            context_text_processor: vectorizer for the context language.
            target_text_processor: vectorizer for the target language.
        """
        super().__init__()
        self.encoder = Encoder(context_text_processor, units)
        self.decoder = Decoder(target_text_processor, units)

    def call(self, inputs):
        """Return logits for a ``(context_tokens, target_in_tokens)`` pair."""
        context_tokens, target_in = inputs
        encoded_context = self.encoder(context_tokens)
        return self.decoder(encoded_context, target_in)

In [45]:
# Build the full model and run one forward pass on the example batch.
model = Translator(UNITS, context_text_processor, target_text_processor)
logits = model((ex_context_tok, ex_tar_in))

In [46]:
# Shape of the model's logits for the example batch.
logits.shape

TensorShape([64, 18, 5000])

In [53]:
def masked_loss(y_true, y_pred):
    """Mean sparse categorical cross-entropy over positions where ``y_true != 0``.

    Args:
        y_true: integer token IDs; positions equal to 0 are excluded.
        y_pred: logits with the vocabulary on the last axis.

    Returns:
        Scalar: summed loss of the kept positions divided by their count.
    """
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none'
    )
    per_token_loss = cross_entropy(y_true, y_pred)
    keep = tf.cast(y_true != 0, per_token_loss.dtype)
    masked_total = tf.reduce_sum(per_token_loss * keep)
    return masked_total / tf.reduce_sum(keep)

In [57]:
def masked_acc(y_true, y_pred):
    """Token-level accuracy over positions where ``y_true != 0``.

    Args:
        y_true: integer token IDs; positions equal to 0 are excluded.
        y_pred: logits/scores with the vocabulary on the last axis.

    Returns:
        Scalar float32: number of correct predictions at kept positions
        divided by the number of kept positions.
    """
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)
    match = tf.cast(y_pred == y_true, tf.float32)
    mask = tf.cast(y_true != 0, tf.float32)
    # Apply the mask to the numerator too. Otherwise a prediction of 0 at an
    # excluded position counts as a hit while the denominator ignores that
    # position, which inflates the metric (it can even exceed 1).
    match *= mask
    return tf.reduce_sum(match) / tf.reduce_sum(mask)

In [58]:
# Train with Adam on the masked loss; report masked accuracy and masked loss as metrics.
model.compile(optimizer='adam', loss=masked_loss, metrics=[masked_acc, masked_loss])

In [59]:
# Baseline before any training: evaluate the freshly initialized model on 20 validation batches.
model.evaluate(val_ds, steps=20, return_dict=True)



{'loss': 8.531594276428223,
 'masked_acc': 0.00018946568889077753,
 'masked_loss': 8.531578063964844}

In [60]:
# Each "epoch" is capped at 100 batches, so 100 epochs can request far more
# batches than a single pass over train_ds provides. repeat() keeps the
# dataset from running dry mid-run, leaving EarlyStopping (patience of 3
# epochs on its default monitored quantity) to decide when training ends.
history = model.fit(
    train_ds.repeat(),
    epochs=100,
    steps_per_epoch=100,
    validation_data=val_ds,
    validation_steps=20,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
