## Introduction

In [1]:
%load_ext autoreload
%aimport helper, tests
%autoreload 1

In [2]:
import collections
import numpy as np
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from tensorflow.python.client import device_lib
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

import helper
import project_tests as tests

Using TensorFlow backend.


### Verify access to the GPU

In [3]:
# List every device TensorFlow can see; a GPU entry in the output confirms GPU access.
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1866131090633625833
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 4817000684872770058
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 11183506563668443864
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14728907981
locality {
  bus_id: 1
  links {
  }
}
incarnation: 11111021372968035024
physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
]


## Dataset

In [4]:
# Directory that holds the parallel corpus, and the English / French files inside it.
data_dir = './data'
en_filepath, fr_filepath = (
    os.path.join(data_dir, corpus_name)
    for corpus_name in ('small_vocab_en', 'small_vocab_fr')
)

In [5]:
# Both corpora are read with the project helper. Later cells index the two results
# in parallel and call .split() on each entry.
# (assumes helper.load_data returns one sentence string per entry — TODO confirm)
# Load English data
english_sentences = helper.load_data(en_filepath)
# Load French data
french_sentences = helper.load_data(fr_filepath)

print('Dataset Loaded')

Dataset Loaded


In [6]:
# Print the first two sentence pairs, numbering them from 1.
for sample_i in (0, 1):
    line_number = sample_i + 1
    print('small_vocab_en Line {}:  {}'.format(line_number, english_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(line_number, french_sentences[sample_i]))

small_vocab_en Line 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
small_vocab_fr Line 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
small_vocab_en Line 2:  the united states is usually chilly during july , and it is usually freezing in november .
small_vocab_fr Line 2:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


In [7]:
def _print_word_stats(language, words_counter):
    """
    Print the total, unique and ten most frequent words for one language.
    :param language: Language name inserted into the printed messages
    :param words_counter: collections.Counter mapping each word to its number of occurrences
    """
    # Every occurrence was counted exactly once, so the counts add up to the corpus size
    # without tokenising the sentences a second time.
    print('{} {} words.'.format(sum(words_counter.values()), language))
    print('{} unique {} words.'.format(len(words_counter), language))
    print('10 Most common words in the {} dataset:'.format(language))
    # Unpack the (word, count) pairs directly; zip(*pairs)[0] raises IndexError on an empty counter.
    print('"' + '" "'.join(word for word, _ in words_counter.most_common(10)) + '"')


# Feed the counters from generators so the full word lists are never materialised.
english_words_counter = collections.Counter(word for sentence in english_sentences for word in sentence.split())
french_words_counter = collections.Counter(word for sentence in french_sentences for word in sentence.split())

_print_word_stats('English', english_words_counter)
print()
_print_word_stats('French', french_words_counter)

1823250 English words.
227 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


## Preprocess

In [8]:
def tokenize(x):
    """
    Fit a fresh Keras Tokenizer on x and encode x with it
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (x converted to sequences of word ids, the fitted tokenizer)
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

In [9]:
# Check the tokenize implementation with the project's test module.
tests.test_tokenize(tokenize)

In [10]:
# Tokenize Example output
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
# Show each sentence next to its id sequence, numbering from 1.
for sequence_number, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized), start=1):
    print('Sequence {} in x'.format(sequence_number))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


In [11]:
def pad(x, length=None):
    """
    Pad every sequence in x at its end so that all share one length
    :param x: List of sequences.
    :param length: Target length.  When None, the longest sequence in x sets it.
    :return: Padded numpy array of sequences
    """
    if length is None:
        length = max(len(sequence) for sequence in x)

    return pad_sequences(x, maxlen=length, padding='post')

In [12]:
# Check the pad implementation with the project's test module.
tests.test_pad(pad)

In [13]:
# Pad Tokenized output
test_pad = pad(text_tokenized)
# Show each id sequence before and after padding, numbering from 1.
for sequence_number, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad), start=1):
    print('Sequence {} in x'.format(sequence_number))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


In [14]:
def preprocess(x, y):
    """
    Tokenize and pad the feature and label sentences
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    x_sequences, x_tk = tokenize(x)
    y_sequences, y_tk = tokenize(y)

    padded_x = pad(x_sequences)
    padded_y = pad(y_sequences)

    # sparse_categorical_crossentropy needs 3-dimensional labels, so append an axis of size 1
    labels = padded_y.reshape(padded_y.shape + (1,))

    return padded_x, labels, x_tk, y_tk

# Encode both corpora once; every model cell below reuses these arrays and tokenizers.
preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =\
    preprocess(english_sentences, french_sentences)
    
# Axis 1 of the padded arrays is the sequence length.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
# NOTE(review): these sizes count only the entries of word_index. The tokenizer example earlier
# in this notebook shows word ids starting at 1, and padding uses id 0, so ids span
# 0..len(word_index) — one more value than the size computed here. The models below size
# their Embedding/Dense layers from these numbers; confirm the highest word id is
# representable before retraining (the saved checkpoints match the current sizes).
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


## Models

In [15]:
def logits_to_text(logits, tokenizer):
    """
    Decode a matrix of per-step scores into a space-separated sentence
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    # Invert the tokenizer's word -> id mapping, then show id 0 as the padding marker.
    id_to_word = dict((word_id, word) for word, word_id in tokenizer.word_index.items())
    id_to_word[0] = '<PAD>'

    # The highest-scoring id along axis 1 is the predicted word for that row.
    best_ids = np.argmax(logits, 1)
    return ' '.join(id_to_word[word_id] for word_id in best_ids)

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


### Model 1: RNN

In [16]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build a basic RNN: one GRU whose per-step outputs are projected onto the French vocabulary
    :param input_shape: Tuple of input shape; its first entry is dropped before it is given to Input
    :param output_sequence_length: Length of output sequence (part of the shared signature, not read here)
    :param english_vocab_size: Number of unique English words in the dataset; used as the GRU width
    :param french_vocab_size: Number of unique French words in the dataset; width of the output layer
    :return: Keras model built, but not trained
    """
    inputs = Input(input_shape[1:])
    hidden_states = GRU(english_vocab_size, return_sequences=True)(inputs)
    step_scores = TimeDistributed(Dense(french_vocab_size))(hidden_states)
    step_probabilities = Activation('softmax')(step_scores)

    model = Model(inputs, step_probabilities)

    learning_rate = 0.05
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )

    return model

In [17]:
# Pad the English ids out to the French length (this model emits one output step per input
# step), then add a trailing axis of size 1 so each time step carries a single feature.
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

simple_rnn_model = simple_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

simple_rnn_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 21, 1)             0         
_________________________________________________________________
gru_1 (GRU)                  (None, 21, 199)           119997    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 21, 344)           68800     
_________________________________________________________________
activation_1 (Activation)    (None, 21, 344)           0         
Total params: 188,797
Trainable params: 188,797
Non-trainable params: 0
_________________________________________________________________


In [18]:
checkpoint_path = "models/simple_rnn.ckpt"

# Restore saved weights from the checkpoint file; no training is run in this notebook.
simple_rnn_model.load_weights(checkpoint_path)

# Decode the model's prediction for the first sentence only.
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est parfois calme en l' de mai est il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


### Model 2: Embedding

In [19]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build an RNN that looks word ids up in an Embedding layer before the GRU
    :param input_shape: Tuple of input shape; its first entry is dropped before it is given to Input
    :param output_sequence_length: Length of output sequence; passed to Embedding as its second argument
    :param english_vocab_size: Number of unique English words in the dataset; first Embedding argument and GRU width
    :param french_vocab_size: Number of unique French words in the dataset; width of the output layer
    :return: Keras model built, but not trained
    """
    learning_rate = 0.01

    word_ids = Input(input_shape[1:])
    # NOTE(review): the embedding width is taken from output_sequence_length — confirm this is intended.
    word_vectors = Embedding(english_vocab_size, output_sequence_length)(word_ids)
    hidden_states = GRU(english_vocab_size, return_sequences=True)(word_vectors)
    step_scores = TimeDistributed(Dense(french_vocab_size))(hidden_states)
    step_probabilities = Activation('softmax')(step_scores)

    model = Model(word_ids, step_probabilities)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )

    return model

In [20]:
# Pad to the French length only; there is no reshape here because the model's first
# layer after Input is an Embedding, which takes the 2-D array of word ids.
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)

embed_rnn_model = embed_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

embed_rnn_model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 21)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 21, 21)            4179      
_________________________________________________________________
gru_2 (GRU)                  (None, 21, 199)           131937    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 21, 344)           68800     
_________________________________________________________________
activation_2 (Activation)    (None, 21, 344)           0         
Total params: 204,916
Trainable params: 204,916
Non-trainable params: 0
_________________________________________________________________


In [21]:
checkpoint_path = "models/embed_rnn.ckpt"

# Restore saved weights from the checkpoint file; no training is run in this notebook.
embed_rnn_model.load_weights(checkpoint_path)

# Decode the model's prediction for the first sentence only.
print(logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est parfois calme en l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


### Model 3: Bidirectional RNNs

In [22]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build a bidirectional RNN model: a GRU run in both directions, projected per step onto the French vocabulary
    :param input_shape: Tuple of input shape; its first entry is dropped before it is given to Input
    :param output_sequence_length: Length of output sequence (part of the shared signature, not read here)
    :param english_vocab_size: Number of unique English words in the dataset; used as the GRU width per direction
    :param french_vocab_size: Number of unique French words in the dataset; width of the output layer
    :return: Keras model built, but not trained
    """

    inputs = Input(input_shape[1:])

    # No `input_shape` on the GRU: the Input tensor above already fixes the shape, and
    # `input_shape` as received here still carries the leading batch entry, which a
    # layer-level input_shape must not include.
    outputs = Bidirectional(GRU(english_vocab_size, return_sequences=True))(inputs)
    outputs = TimeDistributed(Dense(french_vocab_size))(outputs)
    outputs = Activation('softmax')(outputs)

    model = Model(inputs=inputs, outputs=outputs)

    learning_rate = 0.002

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])

    return model

In [23]:
# Pad to the French length and add a trailing axis of size 1, as for the basic RNN
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

# Build the bidirectional model; this cell does not train it
bd_rnn_model = bd_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

bd_rnn_model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 21, 1)             0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 21, 398)           239994    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 21, 344)           137256    
_________________________________________________________________
activation_3 (Activation)    (None, 21, 344)           0         
Total params: 377,250
Trainable params: 377,250
Non-trainable params: 0
_________________________________________________________________


In [24]:
checkpoint_path = "models/bd_rnn.ckpt"

# Restore saved weights from the checkpoint file; no training is run in this notebook.
bd_rnn_model.load_weights(checkpoint_path)

# Decode the model's prediction for the first sentence only.
print(logits_to_text(bd_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est parfois calme en mois et il est est en en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


### Model 4: Encoder-Decoder

In [25]:
def encdec_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build an encoder-decoder model: a GRU encoder whose final output is repeated and decoded by a second GRU
    :param input_shape: Tuple of input shape; its first entry is dropped before it is given to Input
    :param output_sequence_length: Length of output sequence; number of times the encoder output is repeated
    :param english_vocab_size: Number of unique English words in the dataset; used as the width of both GRUs
    :param french_vocab_size: Number of unique French words in the dataset; width of the output layer
    :return: Keras model built, but not trained
    """

    # Encoder: without return_sequences the GRU yields a single vector per input sequence.
    encoder_inputs = Input(input_shape[1:])
    encoder_outputs = GRU(english_vocab_size)(encoder_inputs)

    # Present that vector to the decoder once for every output step.
    decoder_inputs = RepeatVector(output_sequence_length)(encoder_outputs)

    # Decoder: one GRU state per output step, projected onto the French vocabulary.
    decoder_outputs = GRU(english_vocab_size, return_sequences=True)(decoder_inputs)
    decoder_outputs = TimeDistributed(Dense(french_vocab_size))(decoder_outputs)
    decoder_outputs = Activation('softmax')(decoder_outputs)

    model = Model(encoder_inputs, decoder_outputs)

    learning_rate = 0.005

    # Pass the rate explicitly, as the other model builders do. The string 'adam' would
    # construct the optimizer with its default rate and leave learning_rate unused.
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])

    return model

In [26]:
# The encoder input keeps the English length; the output length is handed to the model
# separately (max_french_sequence_length). A trailing axis of size 1 is added for the GRU.
tmp_x = pad(preproc_english_sentences, max_english_sequence_length)
tmp_x = tmp_x.reshape((-1, max_english_sequence_length, 1))

encdec_rnn_model = encdec_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

encdec_rnn_model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 15, 1)             0         
_________________________________________________________________
gru_4 (GRU)                  (None, 199)               119997    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 21, 199)           0         
_________________________________________________________________
gru_5 (GRU)                  (None, 21, 199)           238203    
_________________________________________________________________
time_distributed_4 (TimeDist (None, 21, 344)           68800     
_________________________________________________________________
activation_4 (Activation)    (None, 21, 344)           0         
Total params: 427,000
Trainable params: 427,000
Non-trainable params: 0
_____________________________________________________

In [27]:
checkpoint_path = "models/encdec_rnn.ckpt"

# Restore saved weights from the checkpoint file; no training is run in this notebook.
encdec_rnn_model.load_weights(checkpoint_path)

# Decode the model's prediction for the first sentence only.
print(logits_to_text(encdec_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est généralement agréable en l' et il est est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


### Model 5: Custom

In [28]:
def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build a model that combines embedding, encoder-decoder, and bidirectional RNN layers
    :param input_shape: Tuple of input shape; its first entry is dropped before it is given to Input
    :param output_sequence_length: Length of output sequence; used both as the second Embedding
        argument and as the number of decoder steps
    :param english_vocab_size: Number of unique English words in the dataset; first Embedding argument
    :param french_vocab_size: Number of unique French words in the dataset; width of the output layer
    :return: Keras model built, but not trained
    """
    learning_rate = 0.01

    word_ids = Input(shape=input_shape[1:])
    word_vectors = Embedding(english_vocab_size, output_sequence_length)(word_ids)

    # Encoder: a bidirectional GRU that yields one vector per sentence.
    sentence_vector = Bidirectional(GRU(128))(word_vectors)

    # Decoder: repeat that vector for every output step and run a second bidirectional GRU over it.
    decoder_inputs = RepeatVector(output_sequence_length)(sentence_vector)
    decoder_states = Bidirectional(GRU(64, return_sequences=True))(decoder_inputs)

    step_scores = TimeDistributed(Dense(french_vocab_size))(decoder_states)
    step_probabilities = Activation('softmax')(step_scores)

    model = Model(word_ids, step_probabilities)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )

    return model

In [29]:
# Input stays at the English length and is left 2-D: the model starts with an Embedding,
# and the output length is passed separately as max_french_sequence_length.
tmp_x = pad(preproc_english_sentences, max_english_sequence_length)

custom_model = model_final(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

custom_model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 15)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 15, 21)            4179      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               115200    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 21, 256)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 21, 128)           123264    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 21, 344)           44376     
_________________________________________________________________
activation_5 (Activation)    (None, 21, 344)           0   

In [30]:
checkpoint_path = "models/custom_rnn.ckpt"

# Restore saved weights from the checkpoint file; no training is run in this notebook.
custom_model.load_weights(checkpoint_path)

# Decode the model's prediction for the first sentence only.
print(logits_to_text(custom_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


## Prediction

In [31]:
def final_predictions(x, y, x_tk, y_tk, model):
    """
    Gets predictions using the final model and prints them next to reference translations.

    Two samples are printed. Sample 1 is the hard-coded sentence 'he saw a old yellow truck',
    followed by a fixed reference string. Sample 2 is the first row of x, followed by the
    first row of y decoded back to words. Nothing is returned.

    :param x: Preprocessed English data; the size of its last axis is the padding length for
        the hard-coded sentence, and x[0] is used as the second sample
    :param y: Preprocessed French data; each entry of y[0] is reduced with np.max before the
        lookup (assumes each entry holds a single word id — TODO confirm against the caller)
    :param x_tk: English tokenizer; its word_index is subscripted with every word of the
        hard-coded sentence, so a word missing from it raises (KeyError for a dict)
    :param y_tk: French tokenizer; its word_index is inverted to map ids back to words, with
        id 0 shown as '<PAD>'
    :param model: Object whose predict method receives both samples and their count as the
        second positional argument (presumably the batch size — TODO confirm); each row of
        each prediction is decoded with np.argmax
    """

    ## DON'T EDIT ANYTHING BELOW THIS LINE
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'

    sentence = 'he saw a old yellow truck'
    sentence = [x_tk.word_index[word] for word in sentence.split()]
    sentence = pad_sequences([sentence], maxlen=x.shape[-1], padding='post')
    sentences = np.array([sentence[0], x[0]])
    predictions = model.predict(sentences, len(sentences))

    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[0]]))
    print('Il a vu un vieux camion jaune')
    print('Sample 2:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[1]]))
    print(' '.join([y_id_to_word[np.max(x)] for x in y[0]]))

In [32]:
# Print the custom model's two sample translations next to their references.
final_predictions(
    preproc_english_sentences, 
    preproc_french_sentences, 
    english_tokenizer, 
    french_tokenizer,
    custom_model)

Sample 1:
il a vu un vieux camion jaune <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Il a vu un vieux camion jaune
Sample 2:
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


## Submission

In [33]:
# Export notebooks with nbconvert; the glob matches every .ipynb file in the working directory.
!!python -m nbconvert *.ipynb

['[NbConvertApp] Converting notebook Machine Translation-1.ipynb to html',
 '[NbConvertApp] Writing 316915 bytes to Machine Translation-1.html',
 '[NbConvertApp] Converting notebook Machine Translation-2.ipynb to html',
 '[NbConvertApp] Writing 317656 bytes to Machine Translation-2.html',
 '[NbConvertApp] Converting notebook Machine Translation-3.ipynb to html',
 '[NbConvertApp] Writing 317348 bytes to Machine Translation-3.html',
 '[NbConvertApp] Converting notebook Machine Translation-4.ipynb to html',
 '[NbConvertApp] Writing 318540 bytes to Machine Translation-4.html',
 '[NbConvertApp] Converting notebook Machine Translation-5.ipynb to html',
 '[NbConvertApp] Writing 318803 bytes to Machine Translation-5.html',
 '[NbConvertApp] Converting notebook Machine Translation.ipynb to html',
 '[NbConvertApp] Writing 355051 bytes to Machine Translation.html']