# <u>Neural Language Translator</u>

Sequence to sequence model for language translation using Deep $LSTM$ network. 
<br>The following model translates from $English$ to $French$. 

In [1]:
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import TensorBoard, ModelCheckpoint
import numpy as np
import os.path
from lang_trans_utility import *

Using TensorFlow backend.


In [2]:
# Hyperparameters and data location used throughout the notebook.
batch_size, num_epochs = 64, 70  # samples per gradient step, training epochs
latent_dim = 256  # number of LSTM units (size of the encoder/decoder state)
num_samples = 10000  # how many samples to train on

# file the dataset is loaded from
data_path = r'data/fra.txt'

In [3]:
# Empty containers; they are reassigned when the dataset is loaded.
input_texts, target_texts = [], []  # input / target text data
input_chars, target_chars = set(), set()  # unique chars in input / target text data

In [4]:
# Placeholder values; the real ones are assigned when the dataset is loaded.
encoder_unique_tokens = decoder_unique_tokens = 0  # unique tokens: encoder input / decoder output
Tx = Ty = 0  # max sequence length: encoder input / decoder output

### Data Preprocessing

In [5]:
# Load the data and the statistics derived from it in one call.
(
    encoder_unique_tokens, decoder_unique_tokens,
    input_chars, target_chars,
    input_texts, target_texts,
    Tx, Ty,
) = load_dataset(data_path, num_samples)

# quick summary of what was loaded
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', encoder_unique_tokens)
print('Number of unique output tokens:', decoder_unique_tokens)
print('Max sequence length for inputs:', Tx)
print('Max sequence length for Target outputs:', Ty)

Total no. of lines of Original Text data: 149862
Number of samples: 10000
Number of unique input tokens: 71
Number of unique output tokens: 94
Max sequence length for inputs: 16
Max sequence length for Target outputs: 59


In [6]:
# show both character sets, separated by a few blank lines
print(input_chars, '\n\n', target_chars, sep='\n')

[' ', '!', '$', '&', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '’']



['\t', '\n', ' ', '!', '$', '&', "'", '(', ')', ',', '-', '.', '0', '1', '3', '5', '6', '8', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '«', '»', 'À', 'Ç', 'É', 'Ê', 'à', 'â', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ô', 'ù', 'û', 'œ', '\u2009', '‘', '’', '\u202f']


In [7]:
# Build the char -> index and index -> char lookup tables for both sides.
(
    input_char_idx, input_idx_char,
    target_char_idx, target_idx_char,
) = create_mappings(input_chars, target_chars)

print('No. of keys in input data char to idx dict: ', len(input_char_idx), sep='')
print(input_char_idx)
print('\n\n')
print('No. of keys in target data char to idx dict: ', len(target_char_idx), sep='')
print(target_char_idx)

No. of keys in input data char to idx dict: 71
{' ': 0, '!': 1, '$': 2, '&': 3, "'": 4, ',': 5, '-': 6, '.': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, ':': 18, '?': 19, 'A': 20, 'B': 21, 'C': 22, 'D': 23, 'E': 24, 'F': 25, 'G': 26, 'H': 27, 'I': 28, 'J': 29, 'K': 30, 'L': 31, 'M': 32, 'N': 33, 'O': 34, 'P': 35, 'Q': 36, 'R': 37, 'S': 38, 'T': 39, 'U': 40, 'V': 41, 'W': 42, 'Y': 43, 'a': 44, 'b': 45, 'c': 46, 'd': 47, 'e': 48, 'f': 49, 'g': 50, 'h': 51, 'i': 52, 'j': 53, 'k': 54, 'l': 55, 'm': 56, 'n': 57, 'o': 58, 'p': 59, 'q': 60, 'r': 61, 's': 62, 't': 63, 'u': 64, 'v': 65, 'w': 66, 'x': 67, 'y': 68, 'z': 69, '’': 70}



No. of keys in target data char to idx dict: 94
{'\t': 0, '\n': 1, ' ': 2, '!': 3, '$': 4, '&': 5, "'": 6, '(': 7, ')': 8, ',': 9, '-': 10, '.': 11, '0': 12, '1': 13, '3': 14, '5': 15, '6': 16, '8': 17, '9': 18, ':': 19, '?': 20, 'A': 21, 'B': 22, 'C': 23, 'D': 24, 'E': 25, 'F': 26, 'G': 27, 'H': 28, 'I': 29, 'J': 30, 

In [8]:
# Allocate the (all-zero) one-hot tensors that are filled in further down.
m = len(input_texts)  # no. of training examples

# encoder input: (examples, input time steps, input tokens)
enc_input_data = np.zeros((m, Tx, encoder_unique_tokens), dtype='float32')
# decoder input: (examples, output time steps, output tokens)
dec_input_data = np.zeros((m, Ty, decoder_unique_tokens), dtype='float32')
# decoder target: same shape and dtype as the decoder input
dec_target_data = np.zeros_like(dec_input_data)

### Create training examples
Training examples will be of format (X, Y), where X is input and Y is target output.

For X we will take text sentences from **input_texts** and for Y we will take text sentences from **target_texts**.<br>
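For machine translation we use an encoder–decoder architecture: the output of the encoder network is given to the decoder network, which uses it to produce the target output for the 1st time step. That produced output is then fed back into the decoder network as the input for the next time step, and this continues until we get **'\n'** or exceed the max sequence length. (During training the decoder is fed the true target characters rather than its own predictions, which is why separate decoder input and decoder target arrays are built below.)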

For the decoder network the output at each time step is one time step ahead of the input. The first input to the decoder is the **start_char**, and the output for each time step is the input for the next time step.

For input we will be using **One Hot encoding(OHE)** for the encoder network. Similarly for the decoder network we will be using **OHE** for input and output representations.

In [9]:
# Fill the one-hot training tensors.
#
# The decoder is trained to predict the next character: its target at time
# step t is its input at time step t + 1. The first character of each target
# text therefore appears only in the decoder input, never in the decoder
# target (presumably it is the start character '\t' added by load_dataset --
# TODO confirm).
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    # encoder input: mark the index of every input character as hot (1)
    for curr_timestep, char in enumerate(input_text):
        enc_input_data[i, curr_timestep, input_char_idx[char]] = 1

    for curr_timestep, char in enumerate(target_text):
        # decoder input: the target character at this time step
        dec_input_data[i, curr_timestep, target_char_idx[char]] = 1
        # decoder target: the same character, one time step earlier.
        # Step 0 has no earlier step; writing it anyway would land on index -1,
        # i.e. wrongly mark the last time step of the target tensor.
        if curr_timestep > 0:
            dec_target_data[i, curr_timestep - 1, target_char_idx[char]] = 1

## <u>Model</u>

The layers are declared once, at the top level of the notebook, so that the same weights are used at every time step and the same layer objects can be reused later when building the models for prediction.

Also we return the state information from the encoder network and use that information for the decoder network.

In [10]:
# ENCODER network
# Input tensor for the encoder: a sequence (length left unspecified) of
# one-hot vectors of size encoder_unique_tokens.
encoder_inputs = Input(shape=(None, encoder_unique_tokens))
# We use a single LSTM layer; return_state=True makes it also return its
# final states.
encoder_lstm = LSTM(latent_dim, return_state=True)
# Keep only the activation (hidden) and cell memory states of the encoder;
# the encoder's output itself is not needed and is discarded.
_, activation, cell_mem = encoder_lstm(encoder_inputs)
encoder_states = [activation, cell_mem]

# DECODER network
# The encoder states are used as the initial state of the decoder.
decoder_inputs = Input(shape = (None, decoder_unique_tokens))
# return_sequences=True gives an output for every time step; return_state=True
# also returns the decoder states, which are not used during training but are
# needed later when the same layer is reused for making predictions.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# get the LSTM outputs (the returned states are ignored here)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# softmax layer producing a probability for each of the decoder tokens
decoder_dense = Dense(decoder_unique_tokens, activation = 'softmax')
# final output: the softmax layer applied to the LSTM outputs
decoder_outputs = decoder_dense(decoder_outputs)

In [11]:
# Training model: encoder and decoder inputs in, decoder softmax outputs out.
model = Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
)
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
)

In [12]:
# Load previously saved weights, if the weights file exists; otherwise the
# model keeps its freshly initialised weights.
# model_path is also the file the training cell saves the weights to.
model_path = r'models/fra_eng_orig_wt.h5'
if os.path.exists(model_path):
    model.load_weights(model_path)

In [14]:
# print a layer-by-layer overview of the training model
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 71)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 94)     0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 335872      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  359424      input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          

In [73]:
# Make sure the directory for the weights file exists *before* the long
# training run, so that the save at the end cannot fail on a missing
# directory and throw away the trained weights.
weights_dir = os.path.dirname(model_path)
if weights_dir:  # empty when model_path has no directory component
    os.makedirs(weights_dir, exist_ok=True)

# start training the model; the last 20% of the samples are held out
# for validation
model.fit( [enc_input_data, dec_input_data], dec_target_data,
         batch_size = batch_size, epochs = num_epochs, 
         validation_split = 0.2)
# save the trained weights
model.save_weights(model_path)

Train on 8000 samples, validate on 2000 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


### Prediction and Sampling
To see how well the model is working, we sample outputs from it.

Our Model can be divided into two parts: Encoder and Decoder. <br>
1. We pass the text input through the encoder network to get the states.
2. We use the encoder states as initial states for the decoder network.
3. We feed start character '\t' as the input for the first time step to the decoder network and then the predicted output is fed as input to the next time step.
4. We do this till we get '\n' or exceed max char length.

In [13]:
# Encoder part for prediction.
# This reuses the encoder of the training model: the input is encoder_inputs
# and the output is encoder_states (the encoder's activation and cell memory
# states).
encoder_model = Model( encoder_inputs, encoder_states)

# Decoder part for prediction.
# The decoder's initial states are supplied through two extra inputs so that
# they can be passed in from outside on every call.
decoder_activation_state_input = Input(shape=(latent_dim,))
decoder_mem_state_input = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_activation_state_input, decoder_mem_state_input]
# Reuse the decoder LSTM layer created for the training model.
# NOTE: this rebinds decoder_outputs, which previously held the output tensor
# of the training model.
decoder_outputs, activation_state, mem_state = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)

decoder_states = [activation_state, mem_state]
# the LSTM outputs go through the same softmax layer as in the training model
decoder_outputs = decoder_dense(decoder_outputs)
# Build (not compile) the decoder model: it takes the decoder input plus the
# two states and returns the softmax output plus the two updated states.
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

### Make Translations
Now we will make translations for the input sequence. For that we first pass the input sequence through the encoder and then pass its state info to the decoder network and do decoding.

In [15]:
# decode the output from the encoder network
def do_translation(input_text):
    """Decode a translation, one character at a time, for an encoded input.

    ``input_text`` is passed unchanged to ``encoder_model.predict``; the
    callers in this notebook pass a one-hot array holding a single sequence.

    Decoding starts from the start character '\\t' and always picks the most
    probable next character. It stops once '\\n' has been produced or the
    text has become longer than ``Ty`` characters.

    Returns the decoded string, including the final character that stopped
    the loop.
    """
    def as_one_hot(token_idx):
        # single time step, single sample, with only `token_idx` set to 1
        step = np.zeros((1, 1, decoder_unique_tokens), dtype = 'float32')
        step[0, 0, token_idx] = 1
        return step

    # the encoder states are the decoder's initial states
    states = encoder_model.predict(input_text)
    step_input = as_one_hot(target_char_idx['\t'])

    translated_text = ''
    while True:
        probs, acti, mem = decoder_model.predict([step_input] + states)

        # the output layer is a softmax, so take the most probable token
        best_idx = np.argmax(probs[0, -1, :])
        best_char = target_idx_char[best_idx]
        translated_text = translated_text + best_char

        if best_char == '\n' or len(translated_text) > Ty:
            break

        # the character just produced is the decoder input for the next step,
        # together with the updated states
        step_input = as_one_hot(best_idx)
        states = [acti, mem]

    return translated_text

In [16]:
# Spot-check: decode a few sentences taken from the training set.
indices = [2, 11, 500, 2001, 3478]
for seq_index in indices:
    # Take one sequence (part of the training set) for trying out decoding.
    # Slicing (rather than indexing) keeps the leading axis, of length 1.
    input_seq = enc_input_data[seq_index: seq_index + 1]
    decoded_sentence = do_translation(input_seq)
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

Input sentence: Run!
Decoded sentence: Atteque la votte !

Input sentence: Wait!
Decoded sentence: Attaquez !

Input sentence: Grab this.
Decoded sentence: Attrape ça !

Input sentence: I felt safe.
Decoded sentence: Je me suis sentie freille.

Input sentence: I went twice.
Decoded sentence: Je vous ai eu.



In [17]:
# takes user input in english and translates the phrase to french
def take_input():
    """Read an English phrase from the user and print its French translation.

    The phrase is converted with ``to_OHE`` and decoded with
    ``do_translation``. Nothing is returned; the prompt, the phrase and the
    decoded sentence are printed.
    """
    print('Enter in English (max length ', Tx, ')', sep='')
    phrase = input()

    # One Hot representation of the phrase, as expected by do_translation
    encoded_phrase = to_OHE(phrase, Tx, encoder_unique_tokens, input_char_idx)

    print('Input English sentence: ', phrase)
    print('Decoded French sentence: ', do_translation(encoded_phrase))

In [18]:
# run one interactive translation (waits for the user to type a phrase)
take_input()

Enter in English (max length 16)
hello
Input English sentence:  hello
Decoded French sentence:  Tiens ceci !



**For longer phrases the translations can be far off, since the model was trained on very little data.**

### <u>Credits:</u>
The code is heavily based on 
https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html