In [25]:
import os, sys
from tqdm import tqdm
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [116]:
BATCH_SIZE = 16
EPOCHS = 20
LSTM_NODES =256
NUM_SENTENCES = 5000
MAX_SENTENCE_LENGTH = 50
MAX_NUM_WORDS = 20000
EMBEDDING_SIZE = 100

In [117]:
with open(r'D:/Personal Coding/NLP - Machine Translation - FRA to ENG/fra.txt') as players_data:
    players_data.read()

In [118]:
import csv
with open(r'D:/Personal Coding/NLP - Machine Translation - FRA to ENG/fra.txt', newline = '') as players_data:                                                                                          
    dt = csv.reader(players_data, delimiter='\t')
    dataset = []
    for sent in tqdm(dt):
        dataset.append(sent)

177210it [00:00, 259961.15it/s]


In [119]:
dataset[0:2]

[['Go.',
  'Va !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'],
 ['Hi.',
  'Salut !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)']]

In [120]:
input_sentences = []
output_sentences = []
output_sentences_inputs = []
for i in tqdm(range(NUM_SENTENCES)):
    input_sentences.append(dataset[i][0])
    output_sentences.append(dataset[i][1] + ' <eos>')
    output_sentences_inputs.append('<sos> ' + dataset[i][1])

100%|██████████| 5000/5000 [00:00<00:00, 1002894.17it/s]


In [121]:
print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 5000
num samples output: 5000
num samples output input: 5000


In [122]:
print(input_sentences[172])
print(output_sentences[172])
print(output_sentences_inputs[172])

I'm hit!
Je suis touchÃ©e ! <eos>
<sos> Je suis touchÃ©e !


In [123]:
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 1246
Length of longest sentence in input: 4


In [124]:
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

num_words_output = len(word2idx_outputs) + 1
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 3379
Length of longest sentence in the output: 11


In [125]:
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[172]:", encoder_input_sequences[172])

encoder_input_sequences.shape: (5000, 4)
encoder_input_sequences[172]: [  0   0   2 254]


In [126]:
print(word2idx_inputs["i'm"])
print(word2idx_inputs["ill"])

2
224


In [127]:
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[172]:", decoder_input_sequences[172])

decoder_input_sequences.shape: (5000, 11)
decoder_input_sequences[172]: [   2    3    5 1383    4    0    0    0    0    0    0]


In [128]:
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_output_sequences.shape:", decoder_output_sequences.shape)
print("decoder_output_sequences[172]:", decoder_output_sequences[172])

decoder_output_sequences.shape: (5000, 11)
decoder_output_sequences[172]: [   3    5 1383    4    1    0    0    0    0    0    0]


In [129]:
print(word2idx_outputs["<sos>"])
print(word2idx_outputs["je"])
print(word2idx_outputs["suis"])
print(word2idx_outputs["malade."])

2
3
5
114


In [130]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open(r'D:/Personal Coding/NLP - Machine Translation - FRA to ENG/glove.6B.100d.txt', encoding="utf8")

for line in tqdm(glove_file):
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

400000it [00:13, 30568.44it/s]


In [131]:
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in tqdm(word2idx_inputs.items()):
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

100%|██████████| 1246/1246 [00:00<00:00, 249909.28it/s]


In [132]:
print(embeddings_dictionary["ill"])

[ 0.12648    0.1366     0.22192   -0.025204  -0.7197     0.66147
  0.48509    0.057223   0.13829   -0.26375   -0.23647    0.74349
  0.46737   -0.462      0.20031   -0.26302    0.093948  -0.61756
 -0.28213    0.1353     0.28213    0.21813    0.16418    0.22547
 -0.98945    0.29624   -0.62476   -0.29535    0.21534    0.92274
  0.38388    0.55744   -0.14628   -0.15674   -0.51941    0.25629
 -0.0079678  0.12998   -0.029192   0.20868   -0.55127    0.075353
  0.44746   -0.71046    0.75562    0.010378   0.095229   0.16673
  0.22073   -0.46562   -0.10199   -0.80386    0.45162    0.45183
  0.19869   -1.6571     0.7584    -0.40298    0.82426   -0.386
  0.0039546  0.61318    0.02701   -0.3308    -0.095652  -0.082164
  0.7858     0.13394   -0.32715   -0.31371   -0.20247   -0.73001
 -0.49343    0.56445    0.61038    0.36777   -0.070182   0.44859
 -0.61774   -0.18849    0.65592    0.44797   -0.10469    0.62512
 -1.9474    -0.60622    0.073874   0.50013   -1.1278    -0.42066
 -0.37322   -0.50538    0

In [133]:
print(embedding_matrix[539])

[-0.050131    0.66355997  0.84315002 -0.75295001 -0.42848     0.161
 -0.03673     0.074777    0.35251001 -0.28556001  0.20641001 -0.043102
 -0.37977999 -0.052587    0.29311001 -0.45879    -0.47363001  0.20305
 -0.45185     0.81729001  0.16955     0.55387998 -0.56667    -0.68040001
  0.10162     0.43305999 -0.041035   -0.33195999  0.39855    -0.42923999
  0.16037001  0.67054999  0.56774998 -0.036795    0.36761999  0.20562001
  0.48372    -0.0023657   0.30895999 -0.55975002 -0.093893    0.045387
  0.13003001 -0.47782999 -0.62755001 -0.92216998  0.034079   -0.23645
 -0.79759997 -1.01370001  0.11884     0.051346    0.18324     0.44277999
 -0.47314    -1.55309999  0.56762999  0.63796997  0.64317     0.21479
 -0.20058     0.28966001 -0.66742003 -0.77429998  0.70982999  0.25972
  0.45919999  0.27178001 -0.57331002 -0.072424    0.36847001  0.46355
 -0.018428   -0.41951999  0.13414     0.33214    -0.033299    0.10074
 -0.43860999 -0.37786001  0.33419999 -0.66377997  0.20946001 -0.36172
 -1.2676

In [134]:
embedding_layer = Embedding(num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=max_input_len)

In [135]:
decoder_targets_one_hot = np.zeros((
        len(input_sentences),
        max_out_len,
        num_words_output
    ),
    dtype='float32'
)
decoder_targets_one_hot.shape

(5000, 11, 3380)

In [136]:
for i, d in enumerate(decoder_output_sequences):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

In [137]:
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [138]:
decoder_inputs_placeholder = Input(shape=(max_out_len,))

decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [139]:
model = Model([encoder_inputs_placeholder,
  decoder_inputs_placeholder], decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [140]:
r = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [141]:
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

In [142]:
decoder_state_input_h = Input(shape=(LSTM_NODES,))
decoder_state_input_c = Input(shape=(LSTM_NODES,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [143]:
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

In [144]:
decoder_outputs, h, c = decoder_lstm(decoder_inputs_single_x, initial_state=decoder_states_inputs)

In [145]:
decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)

In [146]:
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

In [147]:
idx2word_input = {v:k for k, v in word2idx_inputs.items()}
idx2word_target = {v:k for k, v in word2idx_outputs.items()}

In [148]:
def translate_sentence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']
    eos = word2idx_outputs['<eos>']
    output_sentence = []

    for _ in range(max_out_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        idx = np.argmax(output_tokens[0, 0, :])

        if eos == idx:
            break

        word = ''

        if idx > 0:
            word = idx2word_target[idx]
            output_sentence.append(word)

        target_seq[0, 0] = idx
        states_value = [h, c]

    return ' '.join(output_sentence)

In [150]:
i = np.random.choice(len(input_sentences))
input_seq = encoder_input_sequences[i:i+1]
translation = translate_sentence(input_seq)
print('-')
print('Input:', input_sentences[i])
print('Response:', translation)

-
Input: I often read.
Response: je sais des elle arrãªte
