## Attention models : 4

### Data prep

In [1]:
import pandas as pd
import numpy as np

In [2]:
path = 'fra.txt'
eng_texts, fra_texts = [], []
num_samples = 10000

In [3]:
with open(path, 'r', encoding = 'UTF-8') as f : 
    lines = f.read().split('\n')

    for line in lines[:num_samples] : 
        eng_words, fra_words = line.split('\t')[:2]

        fra_words = '\t' + fra_words + '\n'

        eng_texts.append(eng_words)
        fra_texts.append(fra_words)

In [7]:
max_eng_word = max([len(word) for word in eng_texts])
max_fra_word = max([len(words) for words in fra_texts])
print(f"{max_eng_word} :: {max_fra_word}")

14 :: 59


### tokenizers

In [9]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### eng tokenizer

In [10]:
eng_token = Tokenizer(filters = ' ', lower = True)
eng_token.fit_on_texts(eng_texts)
eng_sequences = eng_token.texts_to_sequences(eng_texts)

In [13]:
encoder_input_data = pad_sequences(eng_sequences, maxlen = max_eng_word, padding = 'pre')

In [15]:
encoder_input_data.shape

(10000, 14)

### fra tokenizer

In [16]:
fra_token = Tokenizer(filters = ' ', lower = True)
fra_token.fit_on_texts(fra_texts)
decoder_input_data = fra_token.texts_to_sequences(fra_texts)

In [17]:
decoder_input_data = pad_sequences(decoder_input_data, maxlen = max_fra_word, padding = 'pre')

In [19]:
decoder_input_data.shape

(10000, 59)

In [20]:
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

In [23]:
decoder_target_data.shape

(10000, 59)

In [24]:
decoder_input_data

array([[   0,    0,    0, ...,    0,   92,    2],
       [   0,    0,    0, ...,    0,    0,  991],
       [   0,    0,    0, ...,  145,  334,    2],
       ...,
       [   0,    0,    0, ...,   20, 5804,  380],
       [   0,    0,    0, ...,   20, 5805,  380],
       [   0,    0,    0, ..., 2286,  893,  456]])

In [25]:
decoder_target_data

array([[   0,    0,    0, ...,   92,    2,    0],
       [   0,    0,    0, ...,    0,  991,    0],
       [   0,    0,    0, ...,  334,    2,    0],
       ...,
       [   0,    0,    0, ..., 5804,  380,    0],
       [   0,    0,    0, ..., 5805,  380,    0],
       [   0,    0,    0, ...,  893,  456,    0]])

### vocabs

In [26]:
eng_vocab = len(eng_token.word_index) + 1
fra_vocab = len(fra_token.word_index) + 1
print(f"{eng_vocab} :: {fra_vocab}")

2707 :: 5806


### building the encoders

In [27]:
from tensorflow.keras.layers import Input, LSTM, Embedding

In [28]:
latent_dim = 256

In [39]:
enc_inputs = Input(shape = (max_eng_word, ))
enc_emb = Embedding(eng_vocab, latent_dim)(enc_inputs)
enc_lstm = LSTM(latent_dim, return_state = True, return_sequences = True)
enc_outputs, state_h, state_c = enc_lstm(enc_emb)

### decoders

In [40]:
dec_inputs = Input(shape = (max_fra_word, ))
dec_emb_layer = Embedding(fra_vocab, latent_dim)
dec_emb = dec_emb_layer(dec_inputs)
dec_lstm = LSTM(latent_dim, return_state = True, return_sequences = True)

### Bahdanau class

In [31]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Layer, Concatenate

In [44]:
class BahdanauAttention(Layer) : 
    def __init__(self, units) : 
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values) : 
        query_with_time_axis = tensorflow.expand_dims(query, 1)
        score = self.V(tensorflow.nn.tanh(self.W1(values)) + self.W2(query_with_time_axis))
        attention_weights = tensorflow.nn.softmax(score, 1)
        context_vector = attention_weights * score
        context_vector = tensorflow.reduce_sum(context_vector, 1)

        return context_vector, attention_weights

In [None]:
attention = BahdanauAttention(latent_dim)
all_outputs = []
decoder_state_h, decoder_state_c = state_h, state_c
for t in range(max_fra_word) : 
    context_vector, _ = attention(decoder_state_h, decoder_state_c)
    X = tensorflow.expand_dims(dec_emb[:, t], 1)
    X = Concatenate(axis = -1)([context_vector[:, None, :], X])
    outputs, decoder_state_h, decoder_state_c = dec_lstm(X, initial_state = [decoder_state_h, decoder_state_c])
    outputs = Dense(fra_vocab, activation = 'softmax')(outputs)

    all_outputs.append(outputs)

decoder_outputs = tensorflow.concat(all_outputs, axis = 1)

### final model

In [48]:
model = Model()

In [49]:
model = Model([enc_inputs, dec_inputs], decoder_outputs)
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [50]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 14)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 14, 256)      692992      ['input_3[0][0]']                
                                                                                                  
 input_4 (InputLayer)           [(None, 59)]         0           []                               
                                                                                                  
 lstm_2 (LSTM)                  [(None, 14, 256),    525312      ['embedding_2[0][0]']            
                                 (None, 256),                                               

### final training

In [52]:
model.fit(
    [encoder_input_data, decoder_input_data],
    np.expand_dims(decoder_target_data, -1),
    batch_size = 128,
    epochs = 3,
    validation_split = 0.2
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x21345e29bd0>