In [None]:
import collections

import os
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, LSTM
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [None]:
def load_data(path):
    """Read a text file and return its contents as a list of lines.

    Args:
        path: Location of the text file (string or path-like object).

    Returns:
        list[str]: The file contents split on newline characters. A file
        that ends with a newline therefore yields a trailing empty string.
    """
    # Decode explicitly as UTF-8: the French corpus contains accented
    # characters and the platform default encoding is not UTF-8 everywhere.
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()

    return data.split('\n')

In [None]:
# Load the English and French halves of the corpus, one file per language
english_sentences, french_sentences = (
    load_data(corpus_path) for corpus_path in ('small_vocab_en', 'small_vocab_fr')
)

In [None]:
# Show the first line of each file side by side as a quick sanity check
first_pairs = zip(english_sentences[:1], french_sentences[:1])
for line_no, (english_line, french_line) in enumerate(first_pairs, start=1):
    print('small_vocab_en Line {}:  {}'.format(line_no, english_line))
    print('small_vocab_fr Line {}:  {}'.format(line_no, french_line))

small_vocab_en Line 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
small_vocab_fr Line 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .


In [None]:
# Word frequencies per language; sentences are split on whitespace
english_words_counter = collections.Counter(word for sentence in english_sentences for word in sentence.split())
french_words_counter = collections.Counter(word for sentence in french_sentences for word in sentence.split())

# The total word count is the sum of the per-word counts, so the corpus
# does not have to be flattened into a list a second time just to take len().
print('{} total English words.'.format(sum(english_words_counter.values())))
print('{} unique English words.'.format(len(english_words_counter)))
print()
print('{} total French words.'.format(sum(french_words_counter.values())))
print('{} unique French words.'.format(len(french_words_counter)))

1823250 total English words.
227 unique English words.

1961295 total French words.
355 unique French words.


In [None]:
def tokenize(x):
    """Fit a word-level Keras Tokenizer on x and encode x with it.

    Args:
        x: Collection of sentences (strings) to fit on and to encode.

    Returns:
        tuple: (sequences, tokenizer) where sequences is the result of
        texts_to_sequences(x) and tokenizer is the fitted Tokenizer.
    """
    word_tokenizer = Tokenizer(char_level=False, split=' ')
    word_tokenizer.fit_on_texts(x)
    encoded = word_tokenizer.texts_to_sequences(x)
    return encoded, word_tokenizer

In [None]:
def pad(x, length=None):
    """Pad or truncate every sequence in x to one common length.

    Args:
        x: Collection of integer sequences.
        length: Target length. When None, the length of the longest
            sequence in x is used.

    Returns:
        The array returned by pad_sequences; padding and truncation are
        both applied at the end of each sequence ('post').
    """
    target_length = max(len(sentence) for sentence in x) if length is None else length
    return pad_sequences(x, maxlen=target_length, padding='post', truncating='post')

In [None]:
def preprocess(x, y):
    """Tokenize and pad the source sentences x and the target sentences y.

    Args:
        x: Source-language sentences (strings).
        y: Target-language sentences (strings).

    Returns:
        tuple: (padded x ids, padded y ids with a trailing axis of size 1,
        tokenizer fitted on x, tokenizer fitted on y).
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    x_padded = pad(x_ids)
    y_padded = pad(y_ids)

    # Keras's sparse_categorical_crossentropy wants 3-D labels, so append a
    # trailing axis of size 1 to the padded target ids.
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tk, y_tk

In [None]:
preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(english_sentences, french_sentences)

max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
# Keras Tokenizer ids start at 1 and id 0 is used for padding, so the largest
# id equals len(word_index). The Embedding table and the softmax output layer
# both need "largest id + 1" entries; without the + 1 the highest-numbered word
# of each language is out of range for the models built from these sizes.
english_vocab_size = len(english_tokenizer.word_index) + 1
french_vocab_size = len(french_tokenizer.word_index) + 1

print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


In [None]:
# Shape check: (number of sentences, padded English sequence length)
print(preproc_english_sentences.shape)

(137861, 15)


In [None]:
def logits_to_text(logits, tokenizer):
    """Decode per-position scores into a space-separated sentence.

    Args:
        logits: 2-D array-like. The argmax is taken along axis 1, so each
            row holds the scores over token ids for one position.
        tokenizer: Object exposing a word_index mapping of word -> integer id.

    Returns:
        str: The highest-scoring word of every row joined by single spaces,
        with '<PAD>' (id 0) entries left out.
    """
    # Invert word -> id into id -> word and register id 0 as padding
    id_to_word = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
    id_to_word[0] = '<PAD>'

    best_ids = np.argmax(logits, axis=1)
    words = (id_to_word[token_id] for token_id in best_ids)
    return ' '.join(word for word in words if word != '<PAD>')

## Model 1: Single-layer LSTM on raw token ids

In [None]:
def token_to_words(sequence, tokenizer):
    """Map a sequence of token ids back to their words.

    Args:
        sequence: Iterable of integer token ids.
        tokenizer: Object exposing a word_index mapping of word -> integer id.

    Returns:
        list[str]: The word for every id in order, with '<PAD>' (id 0)
        entries left out.
    """
    # Invert word -> id into id -> word and register id 0 as padding
    id_to_word = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
    id_to_word[0] = '<PAD>'

    words = [id_to_word[token] for token in sequence]
    return [word for word in words if word != '<PAD>']

In [None]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile a single-layer LSTM model with a per-step softmax.

    Args:
        input_shape: Shape of the training input, (samples, time steps,
            features). Everything after the first entry is used as the
            per-sample shape when building the model.
        output_sequence_length: Not used by this model; the LSTM emits one
            output per input time step.
        english_vocab_size: Not used by this model (it has no embedding layer).
        french_vocab_size: Number of softmax classes; must be at least the
            largest label id + 1.

    Returns:
        The compiled and built Sequential model.
    """
    learning_rate = 0.001

    model = Sequential()

    model.add(LSTM(64, return_sequences=True, activation="tanh"))
    model.add(TimeDistributed(Dense(french_vocab_size, activation="softmax")))

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])

    # Build from the caller's per-sample shape (batch dimension left open)
    # instead of a hard-coded (None, 21, 1), so summary() works for any data.
    model.build((None,) + tuple(input_shape[1:]))

    return model

In [None]:
# Pad the English ids to the French sequence length, then append a feature
# axis so the input is 3-D (batch size, time steps, 1) as the LSTM expects
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape(*tmp_x.shape, 1)

# Build and train the network
simple_rnn_model = simple_model(
    input_shape=tmp_x.shape,
    output_sequence_length=max_french_sequence_length,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size)

simple_rnn_model.summary()

simple_rnn_model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=1024,
    epochs=20,
    validation_split=0.2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 21, 64)            16896     
                                                                 
 time_distributed (TimeDistr  (None, 21, 344)          22360     
 ibuted)                                                         
                                                                 
Total params: 39,256
Trainable params: 39,256
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f2d7cf65890>

In [None]:
def translate(prediction, french_sentence):
    """Print the reference French sentence followed by the model's prediction.

    Args:
        prediction: Batch of model outputs; only the first element is decoded.
        french_sentence: Batch of reference label arrays; only the first
            element is used, and column 0 of it is read as the token ids.
    """
    translation = logits_to_text(prediction[0], french_tokenizer)
    reference_ids = french_sentence[0][:, 0]
    standard = ' '.join(token_to_words(reference_ids, french_tokenizer))

    # One print call; the empty strings produce the blank separator lines
    print('---- French Sentence ----', standard, '',
          '---- Prediction ----', translation, '',
          sep='\n')

In [None]:
# Decode the first input sentence (feature column 0) and compare the model's
# output for it with the reference translation
print('---- English Sentence ----',
      ' '.join(token_to_words(tmp_x[0, :, 0], english_tokenizer)),
      '',
      sep='\n')
translate(simple_rnn_model.predict(tmp_x[:1]), preproc_french_sentences[:1])

---- English Sentence ----
new jersey is sometimes quiet during autumn and it is snowy in april

---- French Sentence ----
new jersey est parfois calme pendant l' automne et il est neigeux en avril

---- Prediction ----
new jersey est parfois chaud en l' et il est est en en



## Model 2: Embedding + LSTM

In [None]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile an Embedding -> LSTM -> per-step softmax model.

    Args:
        input_shape: Shape of the training input, (samples, time steps). The
            time-step count is used as the embedding's input length.
        output_sequence_length: Not used by this model: the LSTM emits one
            output per input time step, so the caller must pad the input to
            the target sequence length.
        english_vocab_size: Number of rows in the embedding table; must be at
            least the largest input token id + 1 (id 0 is the padding id).
        french_vocab_size: Number of softmax classes; must be at least the
            largest label id + 1.

    Returns:
        The compiled Sequential model.
    """
    learning_rate = 0.001
    embedding_size = 256 # dimensions of word vectors

    model = Sequential()

    # The embedding consumes the input, so its length comes from the input
    # shape (as in model_final) rather than from the output sequence length.
    model.add(Embedding(input_dim = english_vocab_size, output_dim = embedding_size,
                           input_length = input_shape[1]))

    model.add(LSTM(64, return_sequences=True, activation="tanh"))

    model.add(TimeDistributed(Dense(french_vocab_size, activation="softmax")))

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])

    return model

In [None]:
# Pad the English ids to the French sequence length; the Embedding layer takes
# the 2-D (batch size, time steps) ids directly, so no extra axis is needed
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)

# Build and train the network
embed_rnn_model = embed_model(
    input_shape=tmp_x.shape,
    output_sequence_length=max_french_sequence_length,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size)

embed_rnn_model.summary()

embed_rnn_model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=1024,
    epochs=20,
    validation_split=0.2)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 21, 256)           50944     
                                                                 
 lstm_1 (LSTM)               (None, 21, 64)            82176     
                                                                 
 time_distributed_1 (TimeDis  (None, 21, 344)          22360     
 tributed)                                                       
                                                                 
Total params: 155,480
Trainable params: 155,480
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f2d78adc550>

In [None]:
# Decode the first input sentence and compare the model's output for it
# with the reference translation
print('---- English Sentence ----',
      ' '.join(token_to_words(tmp_x[0], english_tokenizer)),
      '',
      sep='\n')
translate(embed_rnn_model.predict(tmp_x[:1]), preproc_french_sentences[:1])

---- English Sentence ----
new jersey is sometimes quiet during autumn and it is snowy in april

---- French Sentence ----
new jersey est parfois calme pendant l' automne et il est neigeux en avril

---- Prediction ----
new jersey est parfois calme en l' automne et il est neigeux en avril



## Model 3: Embedding + bidirectional LSTM

In [None]:
def emb_bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile an Embedding -> bidirectional LSTM -> per-step softmax model.

    Args:
        input_shape: Shape of the training input, (samples, time steps). The
            time-step count is used as the embedding's input length.
        output_sequence_length: Not used by this model: the recurrent layer
            emits one output per input time step, so the caller must pad the
            input to the target sequence length.
        english_vocab_size: Number of rows in the embedding table; must be at
            least the largest input token id + 1 (id 0 is the padding id).
        french_vocab_size: Number of softmax classes; must be at least the
            largest label id + 1.

    Returns:
        The compiled Sequential model.
    """
    learning_rate = 0.001
    embedding_size = 256

    model = Sequential()

    # The embedding consumes the input, so its length comes from the input
    # shape (as in model_final) rather than from the output sequence length.
    model.add(Embedding(input_dim = english_vocab_size, output_dim = embedding_size,
                           input_length = input_shape[1]))

    model.add(Bidirectional(LSTM(64, return_sequences=True, activation="tanh")))

    model.add(TimeDistributed(Dense(french_vocab_size, activation="softmax")))

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])

    return model

In [None]:
# Pad the English ids to the French sequence length
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)

# Build and train the network
emb_bd_rnn_model = emb_bd_model(
    input_shape=tmp_x.shape,
    output_sequence_length=max_french_sequence_length,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size)

emb_bd_rnn_model.summary()

emb_bd_rnn_model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=1024,
    epochs=20,
    validation_split=0.2)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 21, 256)           50944     
                                                                 
 bidirectional (Bidirectiona  (None, 21, 128)          164352    
 l)                                                              
                                                                 
 time_distributed_2 (TimeDis  (None, 21, 344)          44376     
 tributed)                                                       
                                                                 
Total params: 259,672
Trainable params: 259,672
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/

<keras.callbacks.History at 0x7f2d788b91d0>

In [None]:
# Decode the first input sentence and compare the model's output for it
# with the reference translation
print('---- English Sentence ----',
      ' '.join(token_to_words(tmp_x[0], english_tokenizer)),
      '',
      sep='\n')
translate(emb_bd_rnn_model.predict(tmp_x[:1]), preproc_french_sentences[:1])

---- English Sentence ----
new jersey is sometimes quiet during autumn and it is snowy in april

---- French Sentence ----
new jersey est parfois calme pendant l' automne et il est neigeux en avril

---- Prediction ----
new jersey est parfois calme en automne et il est neigeux en avril



## Model 4: Encoder-decoder
Background reading on the encoder-decoder LSTM architecture used below: https://machinelearningmastery.com/encoder-decoder-long-short-term-memory-networks/

In [None]:
def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile an encoder-decoder translation model.

    Args:
        input_shape: Shape of the training input, (samples, time steps). The
            time-step count is used as the embedding's input length.
        output_sequence_length: Number of decoder time steps; the encoder
            output is repeated this many times.
        english_vocab_size: Number of rows in the embedding table.
        french_vocab_size: Number of softmax classes of the output layer.

    Returns:
        The compiled Sequential model.
    """
    learning_rate = 6e-3
    embedding_size = 256
    units = 256

    # Encoder: embed the token ids and summarise the whole sentence as a
    # single vector (return_sequences=False).
    encoder_layers = [
        Embedding(input_dim=english_vocab_size, output_dim=embedding_size,
                  input_length=input_shape[1], name="Embedding_layer"),
        Bidirectional(LSTM(units, return_sequences=False), name='Bi_LSTM_Encoder'),
    ]

    # Bridge: the encoder output is 2-D but the decoder needs a 3-D input, so
    # the vector is repeated once per output time step, giving
    # (batchsize, seq_length, num_units).
    bridge_layers = [
        RepeatVector(output_sequence_length, name='RepeatVector'),
    ]

    # Decoder: one softmax over the target vocabulary per output time step.
    decoder_layers = [
        LSTM(units, return_sequences=True, name='LSTM_Decoder'),
        TimeDistributed(Dense(french_vocab_size, activation='softmax'), name='Dense'),
    ]

    model = Sequential(encoder_layers + bridge_layers + decoder_layers)

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])

    return model


In [None]:
# Pad the input sequences to the output sequence length
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)

# Build and train the network
final_model = model_final(
    input_shape=tmp_x.shape,
    output_sequence_length=max_french_sequence_length,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size)

final_model.summary()

final_model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=1024,
    epochs=20,
    validation_split=0.2)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Embedding_layer (Embedding)  (None, 21, 256)          50944     
                                                                 
 Bi_LSTM_Encoder (Bidirectio  (None, 512)              1050624   
 nal)                                                            
                                                                 
 RepeatVector (RepeatVector)  (None, 21, 512)          0         
                                                                 
 LSTM_Decoder (LSTM)         (None, 21, 256)           787456    
                                                                 
 Dense (TimeDistributed)     (None, 21, 344)           88408     
                                                                 
Total params: 1,977,432
Trainable params: 1,977,432
Non-trainable params: 0
____________________________________________

<keras.callbacks.History at 0x7f2d775a0250>

In [None]:
# Decode the first input sentence and compare the model's output for it
# with the reference translation
print('---- English Sentence ----',
      ' '.join(token_to_words(tmp_x[0], english_tokenizer)),
      '',
      sep='\n')
translate(final_model.predict(tmp_x[:1]), preproc_french_sentences[:1])

---- English Sentence ----
new jersey is sometimes quiet during autumn and it is snowy in april

---- French Sentence ----
new jersey est parfois calme pendant l' automne et il est neigeux en avril

---- Prediction ----
new jersey est parfois calme à l'automne et il est neigeux en avril

