- Paper: https://arxiv.org/abs/1409.0473
- Paper: https://arxiv.org/pdf/1409.0473

The use of a fixed-length vector is a bottleneck in improving the performance of the basic encoder-decoder architecture, and attention model improves this by allowing a model to automatically (soft-)search for parts of a source sentence that are relevant to predicting a target word, without having to form these parts as a hard segment explicitly. 

In [1]:
import os
import unicodedata
import io
import re
import time

In [2]:
import tensorflow as tf
import tensorflow.keras.layers as L

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split

In [4]:
path_to_zip = tf.keras.utils.get_file('spa-eng.zip',
                                      origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
                                      extract=True)

path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

In [5]:
path_to_file

'/Users/vinodgattani/.keras/datasets/spa-eng/spa.txt'

In [6]:
!wc -l $path_to_file

  118964 /Users/vinodgattani/.keras/datasets/spa-eng/spa.txt


In [7]:
!head -5 $path_to_file

Go.	Ve.
Go.	Vete.
Go.	Vaya.
Go.	Váyase.
Hi.	Hola.


In [8]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

unicode_to_ascii function converts character like "é", "á", "ü" to english characters like "e", "a" and "u"

In [9]:
with open(path_to_file, "r") as f:
    for i in range(118964):
        eng, esp = f.readline().split("\t")
        eng_lower = eng.lower().strip()
        eng_process = unicode_to_ascii(eng_lower)
        if eng_lower != eng_process:
            print(eng_lower, eng_process, sep="\t")

tom is my fiancé.	tom is my fiance.
mary is my fiancée.	mary is my fiancee.
we'd like a bottle of rosé.	we'd like a bottle of rose.
excuse me, where is the café?	excuse me, where is the cafe?
i was in são paulo in february.	i was in sao paulo in february.
what do you want for an entrée?	what do you want for an entree?
agnès looked at the brown boats.	agnes looked at the brown boats.
máire was injured in an accident.	maire was injured in an accident.
we are three hours from são paulo.	we are three hours from sao paulo.
i'm listening to björk's latest song.	i'm listening to bjork's latest song.
the girl dressed in white is his fiancée.	the girl dressed in white is his fiancee.
máire married the man who got first prize.	maire married the man who got first prize.
we went to the café that i told you about.	we went to the cafe that i told you about.
we went to the café that i told you about.	we went to the cafe that i told you about.
i have a vietnamese friend. her name is tiên.	i have a vie

In [10]:
with open(path_to_file, "r") as f:
    lines = f.readlines()

In [11]:
esp_sentences = [x.split("\t")[1].lower().strip() for x in lines]
len(esp_sentences)

118964

In [12]:
unicode_esp_sentences = [unicode_to_ascii(x)  for x in esp_sentences]
len(esp_sentences)

118964

In [13]:
changed_sentences = [[x, y] for x, y in zip(esp_sentences, unicode_esp_sentences) if x!= y]
len(changed_sentences)

73360

In [14]:
changed_sentences[-5:]

[['hay cuatro causas principales de muertes relacionadas con el alcohol. lesión por un accidente automovilístico o violencia es una. enfermedades como cirrosis del hígado, cáncer, enfermedades del corazón y del sistema circulatorio son las otras.',
  'hay cuatro causas principales de muertes relacionadas con el alcohol. lesion por un accidente automovilistico o violencia es una. enfermedades como cirrosis del higado, cancer, enfermedades del corazon y del sistema circulatorio son las otras.'],
 ['hay madres y padres que se quedan despiertos después de que sus hijos se hayan dormido y se preguntan cómo conseguir pagar la hipoteca o las facturas del médico, o cómo ahorrar el suficiente dinero para la educación universitaria de sus hijos.',
  'hay madres y padres que se quedan despiertos despues de que sus hijos se hayan dormido y se preguntan como conseguir pagar la hipoteca o las facturas del medico, o como ahorrar el suficiente dinero para la educacion universitaria de sus hijos.'],
 [

In [15]:
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [16]:
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]

    return zip(*word_pairs)

In [17]:
en, sp = create_dataset(path_to_file, None)

In [18]:
type(en), len(sp)

(tuple, 118964)

In [19]:
print(en[8], sp[8], sep="\n")

<start> fire ! <end>
<start> fuego ! <end>


In [20]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                           padding='post')

    return tensor, lang_tokenizer

In [21]:
def load_dataset(path, tokenizer, num_examples=None):
    # creating cleaned input, output pairs
    targ_lang, inp_lang = create_dataset(path, num_examples)

    input_tensor, inp_lang_tokenizer = tokenizer(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenizer(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [22]:
# Try experimenting with the size of that dataset
num_examples = 300
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, tokenize, num_examples)

In [23]:
type(input_tensor), target_tensor.shape, type(inp_lang)

(numpy.ndarray, (300, 6), keras_preprocessing.text.Tokenizer)

In [24]:
input_tensor.shape

(300, 8)

In [25]:
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
max_length_targ, max_length_inp

(6, 8)

In [26]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))


240 240 60 60


In [27]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))

In [28]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
232 ----> esposale
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
136 ----> cuff
58 ----> him
3 ----> .
2 ----> <end>


### create batch data

In [29]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024

In [30]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024

vocab_inp_size = len(inp_lang.word_index)+1 # + 1 is because of reserving padding (i.e. index zero).
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [31]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 8]), TensorShape([64, 6]))

In [32]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = L.Embedding(vocab_size, embedding_dim)
        self.gru = L.GRU(self.enc_units,
                       return_sequences=True,
                       return_state=True,
                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [33]:
encoder = Encoder(10, 32, 15, 2)
output = encoder(np.array([[1,2,4,5],[1,2,4,5]]), hidden=encoder.initialize_hidden_state())
[x.shape for x in output]

[TensorShape([2, 4, 15]), TensorShape([2, 15])]

In [34]:
encoder.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  320       
_________________________________________________________________
gru (GRU)                    multiple                  2205      
Total params: 2,525
Trainable params: 2,525
Non-trainable params: 0
_________________________________________________________________


Parameters:

- GRU
    - Update Gate: (15*32 + 15) + (15*15 + 15)
    - reset Gate: (15*32 + 15) + (15*15 + 15)
    - candidate activation: (15*32 + 15) + (15*15 + 15)
    - 3×(𝑛2+𝑛𝑚+2𝑛) #𝑚  is the input dimension and 𝑛 is the output dimension
    
- Embedding
    - 10*32 = 320

In [35]:
vocab_inp_size, embedding_dim, units, BATCH_SIZE

(303, 256, 1024, 64)

In [37]:
units=1024
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 8, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


Since input is length 16, for every input word we have encoded vector with size 1024

In [115]:
units = 10
W1 = L.Dense(units)
query = tf.random.uniform([64,1024])
query_with_time_axis = tf.expand_dims(query, 1)
W1(query_with_time_axis).shape

TensorShape([64, 1, 10])

In [116]:
units = 10
W1 = L.Dense(units)
value = tf.random.uniform([64,16,1024])
W1(value).shape

TensorShape([64, 16, 10])

In [45]:
(W1(query_with_time_axis) + W1(value)).shape

TensorShape([64, 16, 10])

In [47]:
V = L.Dense(1)
_score = tf.nn.tanh(W1(query_with_time_axis) + W1(value))
score = V(_score)
score.shape

TensorShape([64, 16, 1])

In [53]:
attention_weights = tf.nn.softmax(score, axis=1)
attention_weights.shape

TensorShape([64, 16, 1])

In [55]:
context_vector = attention_weights * value
context_vector.shape, value.shape

(TensorShape([64, 16, 1024]), TensorShape([64, 16, 1024]))

In [60]:
context_vector = tf.reduce_sum(context_vector, axis=1)
context_vector.shape

TensorShape([64, 1024])

In [39]:
class BahdanauAttention(L.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = L.Dense(units)
        self.W2 = L.Dense(units)
        self.V = L.Dense(1)

    def call(self, query, values):
        '''
        query:
            Desc: hidden output at time stamp t in decoder network
            Shape: (batch_size, hidden_state)
        Value:
            Desc: Encoded vector for every word
            Shape: (batch_size, max_length, hidden size)
            
        '''
        #query_with_time_axis shape == (batch_size, 1, hidden size)
        # Broadcast hidden state along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        
        #_score shape = (batch_size, max_length, 10)
        _score = tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values))
        
        # score shape == (batch_size, max_length, 1)
        score = self.V(_score)

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # context_vector shape: (batch_size, max_length, hiddent_size)
        context_vector = attention_weights * values
        
        # context_vector shape: == (batch_size, hidden_size)
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [40]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 8, 1)


- Here, conventional encoder generates encoder vector for every word in the input
- Decoder: at step 1 take the hidden state from last word in input
- And in subsequent layer it used the hidden state from previous word in the target
- This hidden step is concatenated with context vector


-- How is this contect vector generated
- For predicting word at time stamp t.
- Hidden state at time stamp (t-1) in the decoder is combined with encoder vector for every word
- This generate attention weights for every word
- Attention weights are multiplied with encoder vector to get context vector

- You then use convectional decoder on this context vector to get prediction at time stamp t

- From paper

The probability αij , or its associated energy eij ,(attention weights) reflects the importance of the annotation hj with respect to the previous hidden state si−1 in deciding the next state si and generating yi. 


Intuitively, this implements a mechanism of attention in the decoder. The decoder decides parts of the source
sentence to pay attention to. By letting the decoder have an attention mechanism, we relieve the
encoder from the burden of having to encode all information in the source sentence into a fixedlength vector. With this new approach the information can be spread throughout the sequence of
annotations, which can be selectively retrieved by the decoder accordingly.

In [41]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        
        self.embedding = L.Embedding(vocab_size, embedding_dim)
        self.gru = L.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = L.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [42]:
vocab_inp_size, embedding_dim, units, BATCH_SIZE

(303, 256, 1024, 64)

In [43]:
decoder = Decoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [44]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [45]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
          # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [62]:
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [66]:
EPOCHS = 3

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    #   if (epoch + 1) % 2 == 0:
    #     checkpoint.save(file_prefix = checkpoint_prefix)
    checkpoint.save(file_prefix = checkpoint_prefix)
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.9749
Epoch 1 Loss 1.9583
Time taken for 1 epoch 2.2345409393310547 sec

Epoch 2 Batch 0 Loss 1.9417
Epoch 2 Loss 1.8796
Time taken for 1 epoch 1.5570471286773682 sec

Epoch 3 Batch 0 Loss 1.8214
Epoch 3 Loss 1.8280
Time taken for 1 epoch 1.5627319812774658 sec



In [48]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
        return result, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [129]:
inp_lang.word_index

{'<start>': 1,
 '<end>': 2,
 '.': 3,
 '!': 4,
 'a': 5,
 '¿': 6,
 '?': 7,
 'lo': 8,
 'de': 9,
 'tom': 10,
 'me': 11,
 'vete': 12,
 'estoy': 13,
 'el': 14,
 'yo': 15,
 'se': 16,
 'que': 17,
 'soy': 18,
 'tomas': 19,
 'quien': 20,
 'la': 21,
 'aqui': 22,
 ',': 23,
 'no': 24,
 'orale': 25,
 'bien': 26,
 'hola': 27,
 'espera': 28,
 'en': 29,
 'por': 30,
 'largate': 31,
 'largo': 32,
 'ya': 33,
 'hasta': 34,
 'he': 35,
 'ganado': 36,
 'tengo': 37,
 'corrio': 38,
 'agarra': 39,
 'un': 40,
 'ayudame': 41,
 'despierta': 42,
 've': 43,
 'vaya': 44,
 'vayase': 45,
 'corre': 46,
 'ayuda': 47,
 'auxilio': 48,
 'salte': 49,
 'continua': 50,
 'corria': 51,
 'sali': 52,
 'dimito': 53,
 'ninguna': 54,
 'manera': 55,
 'ni': 56,
 'gracias': 57,
 'estate': 58,
 'sean': 59,
 'llamame': 60,
 'entre': 61,
 'entren': 62,
 'luego': 63,
 'vista': 64,
 'fuertemente': 65,
 'renuncio': 66,
 'acuerdo': 67,
 'gordo': 68,
 'gorda': 69,
 'viejo': 70,
 'abre': 71,
 'nos': 72,
 'comio': 73,
 'gano': 74,
 'las': 75,
 'pr

In [68]:
checkpoint.restore('./training_checkpoints/ckpt-7')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x15f206c90>

In [69]:
checkpoint.restore('./training_checkpoints/ckpt-8')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x16171cb90>

In [67]:
checkpoint_dir

'./training_checkpoints'

In [65]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x15edb4f10>

In [52]:
evaluate(u'me vete estoy')

('<end> ',
 '<start> me vete estoy <end>',
 array([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.06035014, 0.06662343, 0.07268456, 0.08010215, 0.08937129,
         0.14522991, 0.20881045, 0.27682808]]))

In [50]:
tf.saved_model.save(encoder, "encoder.model")

TypeError: call() missing 1 required positional argument: 'hidden'

In [53]:
type(encoder)

__main__.Encoder

In [58]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

In [60]:
encoder.save(checkpoint_prefix)

TypeError: call() missing 1 required positional argument: 'hidden'

In [135]:
encoder.save_weights("encoder.weights")
decoder.save_weights("decoder.weights")

In [136]:
units=1024
encoder_1 = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder_1 = Decoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [137]:
encoder_1.load_weights("encoder.weights")
decoder_1.load_weights("decoder.weights")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x14e9da9d0>

In [138]:
def evaluate_1(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder_1(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder_1(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
        return result, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [139]:
evaluate_1(u'me vete estoy')

('<end> ',
 '<start> me vete estoy <end>',
 array([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.05204115, 0.05777236, 0.06524841, 0.07147697, 0.08904393,
         0.14842041, 0.21881667, 0.29718018]]))