Importing Libraries:

In [2]:
import pandas as pd
import string
import re
import io
import numpy as np
from unicodedata import normalize
import keras, tensorflow
from keras.models import Model
from keras.layers import Input, LSTM, Dense

Using TensorFlow backend.


Reading data:

In [0]:
def read_data(file, encoding='utf-8'):
    """Read a text file and return its lines with surrounding whitespace stripped.

    Args:
        file: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list with one string per line of the file, in file order. Blank
        lines are kept as empty strings.
    """
    # A distinct handle name avoids shadowing the `file` path argument.
    with io.open(file, 'r', encoding=encoding) as handle:
        return [entry.strip() for entry in handle]

In [0]:
# Load the raw sentence pairs: one stripped line of the file per list element.
data = read_data('/content/bilingual_pairs.txt')

In [6]:
data[90:100]

['Come on.\tViens !',
 'Come on.\tVenez !',
 'Drop it!\tLaisse tomber !',
 'Drop it!\tLaissez tomber !',
 'Drop it!\tLaisse-le tomber !',
 'Drop it!\tLaissez-le tomber !',
 'Get out!\tSortez\u202f!',
 'Get out!\tSors !',
 'Get out!\tSortez !',
 'Get out.\tSors.']

In [7]:
len(data)

145437

In [0]:
# Keep only the first 140000 pairs; everything downstream works on this subset.
data=data[:140000]

Splitting our data into English and French sentences:

In [0]:
def build_english_french_sentences(data):
    """Split tab-separated sentence pairs into parallel English and French lists.

    Each entry is expected to look like ``"<english>\\t<french>"``; any further
    tab-separated fields are ignored. Blank entries are skipped so that both
    returned lists stay aligned.

    Args:
        data: Iterable of strings, one sentence pair per element.

    Returns:
        A tuple ``(english_sentences, french_sentences)`` of equal-length lists.

    Raises:
        IndexError: If a non-blank entry contains no tab separator.
    """
    english_sentences = []
    french_sentences = []
    for data_point in data:
        if not data_point.strip():
            # A blank line carries no pair; indexing its fields would fail.
            continue
        # Split once and reuse the fields instead of splitting per column.
        fields = data_point.split("\t")
        english, french = fields[0], fields[1]
        english_sentences.append(english)
        french_sentences.append(french)
    return english_sentences, french_sentences

In [0]:
english_sentences, french_sentences = build_english_french_sentences(data)

In [11]:
len(english_sentences),len(french_sentences)

(140000, 140000)

Data Cleaning:

In [0]:
# Built once at definition time: both objects are identical for every call,
# and clean_sentences runs once per sentence of the corpus.
_NON_PRINTABLE_RE = re.compile('[^%s]' % re.escape(string.printable))
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_sentences(sentence):
    """Normalize a sentence to lowercase, ASCII-only, alphabetic words.

    Steps, in order:
      1. Decompose accented characters (NFD) and drop everything non-ASCII,
         so ``'ê'`` becomes ``'e'``.
      2. Split on whitespace and lowercase each word.
      3. Delete ASCII punctuation inside each word (``"you're"`` becomes
         ``"youre"``, ``"laisse-le"`` becomes ``"laissele"``).
      4. Delete any remaining non-printable characters.
      5. Drop words that are not purely alphabetic (e.g. numbers or words
         left empty by the previous steps).

    Args:
        sentence: The raw sentence string.

    Returns:
        The cleaned words joined by single spaces; may be an empty string.
    """
    ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
    cleaned_words = []
    for word in ascii_text.split():
        word = word.lower().translate(_PUNCTUATION_TABLE)
        word = _NON_PRINTABLE_RE.sub('', word)
        if word.isalpha():
            cleaned_words.append(word)
    return ' '.join(cleaned_words)

In [0]:
def build_clean_english_french_sentences(english_sentences, french_sentences):
    """Apply ``clean_sentences`` to every English and French sentence.

    Args:
        english_sentences: Iterable of raw English sentences.
        french_sentences: Iterable of raw French sentences.

    Returns:
        A tuple ``(english_sentences_cleaned, french_sentences_cleaned)`` of
        lists, each in the same order as its input.
    """
    cleaned_french = [clean_sentences(raw) for raw in french_sentences]
    cleaned_english = [clean_sentences(raw) for raw in english_sentences]
    return cleaned_english, cleaned_french

In [0]:
english_sentences_cleaned, french_sentences_cleaned = build_clean_english_french_sentences(english_sentences, french_sentences)

In [15]:
english_sentences_cleaned[4020]

'youre early'

In [16]:
french_sentences_cleaned[4020]

'vous etes matinal'

Building our input and target variables:

In [0]:
def build_data(english_sentences_cleaned, french_sentences_cleaned):
    """Build the model's input/target strings and their character vocabularies.

    French sentences are used unchanged as inputs. Each English sentence is
    wrapped as ``"\\t" + sentence + "\\n"`` to form a target, so the tab and
    newline characters become part of the target vocabulary.

    Args:
        english_sentences_cleaned: Iterable of cleaned English sentences.
        french_sentences_cleaned: Iterable of cleaned French sentences.

    Returns:
        A tuple ``(input_dataset, target_dataset, input_characters,
        target_characters)``: the two lists of strings followed by the sorted
        lists of distinct characters occurring in each.
    """
    input_dataset = [sentence for sentence in french_sentences_cleaned]
    target_dataset = ["\t" + sentence + "\n" for sentence in english_sentences_cleaned]

    input_vocab = {ch for text in input_dataset for ch in text}
    target_vocab = {ch for text in target_dataset for ch in text}

    return input_dataset, target_dataset, sorted(input_vocab), sorted(target_vocab)

In [0]:
input_dataset, target_dataset, input_characters, target_characters = build_data(english_sentences_cleaned,  french_sentences_cleaned)

In [0]:
def build_metadata(input_dataset, target_dataset, input_characters, target_characters):
    """Compute and print vocabulary sizes and maximum sequence lengths.

    Args:
        input_dataset: List of input strings.
        target_dataset: List of target strings.
        input_characters: Collection of distinct input characters.
        target_characters: Collection of distinct target characters.

    Returns:
        A tuple ``(num_encoder_tokens, num_decoder_tokens,
        max_encoder_seq_length, max_decoder_seq_length)``.

    Raises:
        ValueError: If either dataset is empty (``max`` of an empty sequence).
    """
    encoder_vocab_size = len(input_characters)
    decoder_vocab_size = len(target_characters)
    longest_input = max(map(len, input_dataset))
    longest_target = max(map(len, target_dataset))

    summary = (
        ('Number of data points:', len(input_dataset)),
        ('Number of unique input tokens:', encoder_vocab_size),
        ('Number of unique output tokens:', decoder_vocab_size),
        ('Maximum sequence length for inputs:', longest_input),
        ('Maximum sequence length for outputs:', longest_target),
    )
    for label, value in summary:
        print(label, value)

    return encoder_vocab_size, decoder_vocab_size, longest_input, longest_target

In [20]:
num_encoder_tokens, num_decoder_tokens, max_encoder_seq_length, max_decoder_seq_length = build_metadata(input_dataset,
                                                                                                        target_dataset,
                                                                                                        input_characters,
                                                                                                        target_characters)

Number of data points: 140000
Number of unique input tokens: 27
Number of unique output tokens: 29
Maximum sequence length for inputs: 117
Maximum sequence length for outputs: 58


Building character-to-index and index-to-character mappings:

In [0]:
def build_indices(input_characters, target_characters):
    """Create character<->index lookup tables for both vocabularies.

    Indices follow the iteration order of the given character sequences.

    Args:
        input_characters: Sequence of input characters.
        target_characters: Sequence of target characters.

    Returns:
        A tuple ``(input_char_to_idx, input_idx_to_char, target_char_to_idx,
        target_idx_to_char)`` of dictionaries.
    """
    input_idx_to_char = dict(enumerate(input_characters))
    input_char_to_idx = {ch: position for position, ch in input_idx_to_char.items()}

    target_idx_to_char = dict(enumerate(target_characters))
    target_char_to_idx = {ch: position for position, ch in target_idx_to_char.items()}

    return input_char_to_idx, input_idx_to_char, target_char_to_idx, target_idx_to_char

input_char_to_idx, input_idx_to_char, target_char_to_idx, target_idx_to_char = build_indices(input_characters,
                                                                                             target_characters)

In [22]:
def build_data_structures(length_input_dataset, max_encoder_seq_length, max_decoder_seq_length, num_encoder_tokens, num_decoder_tokens):
    """Allocate zero-filled float32 arrays for the one-hot encoded data.

    Args:
        length_input_dataset: Number of samples (first axis of every array).
        max_encoder_seq_length: Second axis of the encoder input array.
        max_decoder_seq_length: Second axis of both decoder arrays.
        num_encoder_tokens: Third axis of the encoder input array.
        num_decoder_tokens: Third axis of both decoder arrays.

    Returns:
        A tuple ``(encoder_input_data, decoder_input_data,
        decoder_target_data)`` of independent NumPy arrays. Their shapes are
        also printed.
    """
    encoder_shape = (length_input_dataset, max_encoder_seq_length, num_encoder_tokens)
    decoder_shape = (length_input_dataset, max_decoder_seq_length, num_decoder_tokens)

    encoder_input_data = np.zeros(encoder_shape, dtype='float32')
    # Two separate allocations: input and target must not share memory.
    decoder_input_data = np.zeros(decoder_shape, dtype='float32')
    decoder_target_data = np.zeros(decoder_shape, dtype='float32')

    print("Dimensionality of encoder input data is : ", encoder_input_data.shape)
    print("Dimensionality of decoder input data is : ", decoder_input_data.shape)
    print("Dimensionality of decoder target data is : ", decoder_target_data.shape)

    return encoder_input_data, decoder_input_data, decoder_target_data

encoder_input_data, decoder_input_data, decoder_target_data = build_data_structures(len(input_dataset), 
                                                                                    max_encoder_seq_length, 
                                                                                    max_decoder_seq_length, 
                                                                                    num_encoder_tokens, 
                                                                                    num_decoder_tokens)

Dimensionality of encoder input data is :  (140000, 117, 27)
Dimensionality of decoder input data is :  (140000, 58, 29)
Dimensionality of decoder target data is :  (140000, 58, 29)


In [0]:
def add_data_to_data_structures(input_dataset, target_dataset, encoder_input_data, decoder_input_data, decoder_target_data,
                                input_char_index=None, target_char_index=None):
    """One-hot encode the input and target strings into the given arrays.

    The arrays are filled in place and also returned. For sample ``i`` and
    character position ``t``:
      * ``encoder_input_data[i, t, idx]`` is set for each input character;
      * ``decoder_input_data[i, t, idx]`` is set for each target character;
      * ``decoder_target_data[i, t - 1, idx]`` is set for each target
        character except the first, i.e. the target is the decoder input
        shifted one step earlier and without the start character.

    Args:
        input_dataset: Sequence of input strings.
        target_dataset: Sequence of target strings, paired with the inputs.
        encoder_input_data: Array indexed as ``[sample, timestep, token]``.
        decoder_input_data: Array indexed as ``[sample, timestep, token]``.
        decoder_target_data: Array indexed as ``[sample, timestep, token]``.
        input_char_index: Optional mapping from input character to token
            index. Defaults to the module-level ``input_char_to_idx``.
        target_char_index: Optional mapping from target character to token
            index. Defaults to the module-level ``target_char_to_idx``.

    Returns:
        The tuple ``(encoder_input_data, decoder_input_data,
        decoder_target_data)`` (the same array objects that were passed in).

    Raises:
        KeyError: If a character is missing from its mapping.
    """
    # Fall back to the notebook-level mappings so existing calls keep working.
    if input_char_index is None:
        input_char_index = input_char_to_idx
    if target_char_index is None:
        target_char_index = target_char_to_idx

    for i, (input_data_point, target_data_point) in enumerate(zip(input_dataset, target_dataset)):
        for t, char in enumerate(input_data_point):
            encoder_input_data[i, t, input_char_index[char]] = 1.
        for t, char in enumerate(target_data_point):
            token = target_char_index[char]
            decoder_input_data[i, t, token] = 1.
            if t > 0:
                # The target is one timestep ahead of the decoder input and
                # therefore never contains the start character.
                decoder_target_data[i, t - 1, token] = 1.
    return encoder_input_data, decoder_input_data, decoder_target_data

In [0]:
encoder_input_data, decoder_input_data, decoder_target_data = add_data_to_data_structures(input_dataset, 
                                                                                          target_dataset, 
                                                                                          encoder_input_data, 
                                                                                          decoder_input_data, 
                                                                                          decoder_target_data)

Defining our model hyperparameters

In [0]:
# Batch size passed to model.fit.
batch_size = 512
# Number of epochs requested from model.fit.
epochs = 100
# Number of units in both the encoder and decoder LSTM layers; also the
# width of the state inputs of the inference decoder.
latent_dim = 256

Encoder:

In [0]:
# Encoder input: a sequence of vectors of width num_encoder_tokens; the
# sequence length is left unspecified (None).
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# return_state=True makes the layer return its final states along with its output.
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# Only the two states are kept; they become the decoder's initial state.
encoder_states = [state_h, state_c]

Decoder:

In [0]:
# Decoder input: a sequence of vectors of width num_decoder_tokens; the
# sequence length is left unspecified (None).
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# The layer object is kept in a variable because the inference decoder
# built later in the notebook calls the same layer again.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# During training the decoder starts from the encoder's final states; the
# decoder's own returned states are discarded here.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax over the num_decoder_tokens target characters.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

Building model:

In [0]:
# Training model: takes the encoder and decoder input sequences and
# produces the decoder's softmax outputs.
model = Model(inputs=[encoder_inputs, decoder_inputs], 
              outputs=decoder_outputs)

In [29]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
# No extra metrics are requested, so only the loss is tracked.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 27)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 29)     0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 290816      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  292864      input_2[0][0]                    
                                                                 lstm_1[0][1]               

Model Training:

In [32]:
# Train on the one-hot arrays, holding out 20% of the samples for validation.
# NOTE(review): Keras documents validation_split as taking the *last*
# fraction of the provided samples, before any shuffling. If the pairs file
# is ordered (e.g. by sentence length - TODO confirm), the validation set is
# not representative of the training set; consider shuffling the data first.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Train on 112000 samples, validate on 28000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/

<keras.callbacks.callbacks.History at 0x7f9ac5d9c400>

In [0]:
# Persist the trained training-time model (HDF5 file in the working directory).
model.save('neural_machine_translation_french_to_english.h5')

In [0]:
# Inference encoder: maps an input sequence to the encoder's two final states.
encoder_model = Model(encoder_inputs, encoder_states)

# At inference time the decoder's initial states are fed in explicitly.
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input_h = Input(shape=(latent_dim,))
# Order is [h, c], matching encoder_states.
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder layers. NOTE: this rebinds decoder_outputs,
# state_h and state_c, which earlier cells bound to the training graph;
# re-running the training-model cell after this one would pick up these
# inference tensors instead.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Inference decoder: (decoder input, h, c) -> (token probabilities, new h, new c).
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs] + decoder_states)

In [0]:
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a target string.

    The input is run through ``encoder_model`` once; the resulting states
    seed ``decoder_model``, which is then called one character at a time.
    At each step the highest-scoring character is appended to the result and
    fed back as the next decoder input.

    Decoding stops once a newline has been produced or the result is longer
    than ``max_decoder_seq_length`` characters.

    Args:
        input_seq: Encoder input passed directly to ``encoder_model.predict``.

    Returns:
        The decoded string, including the terminating ``'\\n'`` if one was
        produced.
    """
    # The encoder's output states are the decoder's starting states.
    states_value = encoder_model.predict(input_seq)

    # First decoder input: the one-hot start character ('\t').
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_char_to_idx['\t']] = 1.

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: take the most probable token of the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = target_idx_to_char[sampled_token_index]
        decoded_sentence += sampled_char

        too_long = len(decoded_sentence) > max_decoder_seq_length
        if sampled_char == '\n' or too_long:
            break

        # Feed the chosen character and the updated states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return decoded_sentence

In [0]:
def decode(seq_index):
    """Decode the training sample at ``seq_index`` and print the result.

    Prints a separator line, the original input sentence from
    ``input_dataset`` and the sentence produced by ``decode_sequence``.

    Args:
        seq_index: Index of the sample in ``encoder_input_data`` /
            ``input_dataset``.
    """
    # Slice (rather than index) so the sample keeps its leading batch axis.
    single_sample = encoder_input_data[seq_index: seq_index + 1]
    translation = decode_sequence(single_sample)
    source_sentence = input_dataset[seq_index]
    print('-')
    print('Input sentence:', source_sentence)
    print('Decoded sentence:', translation)

In [46]:
decode(14441)

-
Input sentence: essayez de ne pas rire
Decoded sentence: try to relax anything



In [47]:
decode(12345)

-
Input sentence: ton chien est la
Decoded sentence: your father is in the room

