# Preprocessing

In [2]:
from __future__ import print_function, division
from builtins import range, input

import os
import sys
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam, SGD

try:
  import tensorflow.keras.backend as K
  if len(K.tensorflow_backend._get_available_gpus()) > 0:
    from tensorflow.keras.layers import CuDNNLSTM as LSTM
    from tensorflow.keras.layers import CuDNNGRU as GRU
except:
  pass

In [3]:
# config
BATCH_SIZE = 64  # Batch size for training
EPOCHS = 50  # Number of epochs to train for
LATENT_DIM = 256  # Latent dimensionality of the encoding space
NUM_SAMPLES = 20000  # Number of samples to train on
MAX_NUM_WORDS = 40000
EMBEDDING_DIM = 100

In [3]:
input_texts = [] # sentence in original language
target_texts = [] # sentence in target language
target_texts_inputs = [] # sentence in target language offset by 1 for later teacher forcing

In [4]:
# load in the data
# download the data at: http://www.manythings.org/anki/
t = 0
for line in open('fra.txt'):
  # only keep a limited number of samples
  t += 1
  if t > NUM_SAMPLES:
    break

  # input and target are separated by tab
  if '\t' not in line:
    continue

  # split up the input and translation
  input_text, translation, *rest = line.rstrip().split('\t')

  # make the target input and output
  # recall we'll be using teacher forcing
  target_text = translation + ' <eos>'
  target_text_input = '<sos> ' + translation

  input_texts.append(input_text)
  target_texts.append(target_text)
  target_texts_inputs.append(target_text_input)

print("num samples:", len(input_texts))

num samples: 20000


In [6]:
# Tokenization
# For translation task, we have 2 languages, so we need 2 tokenizers for inputs and outputs respectively

# tokenize the inputs
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

# get the word to index mapping for input language
word2idx_inputs = tokenizer_inputs.word_index
print('Found %s unique input tokens.' % len(word2idx_inputs))

# determine maximum length input sequence
max_len_input = max(len(s) for s in input_sequences)


# tokenize the outputs
# compared to input tokenizer, we specify filters='' bc we don't want to filter out special characters <SOS> and <EOS>
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
tokenizer_outputs.fit_on_texts(target_texts + target_texts_inputs)
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(target_texts_inputs)

# get the word to index mapping for output language
word2idx_outputs = tokenizer_outputs.word_index
print('Found %s unique output tokens.' % len(word2idx_outputs))

# store number of output words for later
# remember to add 1 since indexing starts at 1
num_words_output = len(word2idx_outputs) + 1

# determine maximum length output sequence
max_len_target = max(len(s) for s in target_sequences)

Found 3342 unique input tokens.
Found 9437 unique output tokens.


In [7]:
# pad the sequences
# pad inputs, targets, targets teacher forcing input respectively
# padding for inputs uses default as 'pre' and padding for outputs uses padding='post', so that the output of the encoder corresponds to just when you see the last word of the input sentence, and the decoder produces the output imnediately upon seeing the encoder state, rather than having to go through a bunch of zeros first

# pad the inputs
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)
print("encoder_inputs.shape:", encoder_inputs.shape)
print("encoder_inputs[0]:", encoder_inputs[0])

# pad the teacher forcing inputs for the decoder
decoder_inputs = pad_sequences(target_sequences_inputs, maxlen=max_len_target, padding='post')
print("decoder_inputs[0]:", decoder_inputs[0])
print("decoder_inputs.shape:", decoder_inputs.shape)

# pad the target
decoder_targets = pad_sequences(target_sequences, maxlen=max_len_target, padding='post')

encoder_inputs.shape: (20000, 5)
encoder_inputs[0]: [ 0  0  0  0 17]
decoder_inputs[0]: [ 2 47  4  0  0  0  0  0  0  0  0  0]
decoder_inputs.shape: (20000, 12)


In [8]:
# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
f = open("glove.6B.100d.txt", encoding="utf-8")
  # is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [9]:
# prepare embedding matrix
print('Filling pre-trained embeddings...')
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx_inputs.items():
  if i < MAX_NUM_WORDS:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      # words not found in embedding index will be all zeros.
      embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings...


In [10]:
# create embedding layer
embedding_layer = Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=max_len_input,
  # trainable=True
)

In [11]:
# one-hot targets, since we cannot use sparse categorical cross entropy when we have sequences
decoder_targets_one_hot = np.zeros(
  (
    len(input_texts),
    max_len_target,
    num_words_output
  ),
  dtype='float32'
)

# assign the values
for i, d in enumerate(decoder_targets):
  for t, word in enumerate(d):
    if word != 0:
      decoder_targets_one_hot[i, t, word] = 1

# Model

In [12]:
# encoder
encoder_inputs_placeholder = Input(shape=(max_len_input,))
x = embedding_layer(encoder_inputs_placeholder)

encoder = LSTM(LATENT_DIM, return_state=True)
encoder_outputs, h, c = encoder(x)
encoder_states = [h, c] # keep only the states to pass into decoder

# decoder, using [h, c] as initial state.
decoder_inputs_placeholder = Input(shape=(max_len_target,)) # this is the targets input we will use for teacher forcing
decoder_embedding = Embedding(num_words_output, EMBEDDING_DIM) # word embedding here will not use pre-trained vectors
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Create the model object
model = Model([encoder_inputs_placeholder, decoder_inputs_placeholder], decoder_outputs)

In [14]:
# model training

def custom_loss(y_true, y_pred):
  # both are of shape N x T x K
  mask = K.cast(y_true > 0, dtype='float32')
  out = mask * y_true * K.log(y_pred)
  return -K.sum(out) / K.sum(mask)

def acc(y_true, y_pred):
  # both are of shape N x T x K
  targ = K.argmax(y_true, axis=-1)
  pred = K.argmax(y_pred, axis=-1)
  correct = K.cast(K.equal(targ, pred), dtype='float32')

  # 0 is padding, don't include those
  mask = K.cast(K.greater(targ, 0), dtype='float32')
  n_correct = K.sum(mask * correct)
  n_total = K.sum(mask)
  return n_correct / n_total

model.compile(optimizer='adam', loss=custom_loss, metrics=[acc])

r = model.fit(
  [encoder_inputs, decoder_inputs], decoder_targets_one_hot,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=0.2,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
# prediction model
# we need to create another model that can take in the RNN state and previous word as input and accept a T=1 sequence.

# encoder
# this is to take in the eng sentence, and return the final LSTM h and c
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

# decoder
# using existing layers

# initial h and c representations
decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_state_input_c = Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# input of sequence of 1 since we are only generating one word at a time
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

decoder_outputs, h, c = decoder_lstm(decoder_inputs_single_x, initial_state=decoder_states_inputs)
decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs_single] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# map indexes back into real words
idx2word_eng = {v:k for k, v in word2idx_inputs.items()}
idx2word_trans = {v:k for k, v in word2idx_outputs.items()}

In [16]:
def decode_sequence(input_seq):
  # encode model takes in the input sentence
  states_value = encoder_model.predict(input_seq)

  # Generate empty target sequence of length 1
  # input is 1 x 1, bc 1 sample and 1 time step
  target_seq = np.zeros((1, 1))
  target_seq[0, 0] = word2idx_outputs['<sos>']

  # if we get EOS we break
  eos = word2idx_outputs['<eos>']

  output_sentence = []
  for _ in range(max_len_target):
    # the decoder model makes prediction
    # now we have output probabilities and the new RNN states (h and c)
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

    # Get next word taking the greedy approach
    idx = np.argmax(output_tokens[0, 0, :])

    # End sentence of EOS
    if eos == idx:
      break

    word = ''
    if idx > 0:
      word = idx2word_trans[idx]
      output_sentence.append(word)

    # Update the decoder input, which is just the word just generated
    target_seq[0, 0] = idx

    # Update states
    states_value = [h, c]

  return ' '.join(output_sentence)

In [25]:
# testing
i = np.random.choice(len(input_texts))
print("Random idx: ", i)
input_seq = encoder_inputs[i:i+1]
print("input_seq: ", input_seq)
translation = decode_sequence(input_seq)
print('-')
print('Input:', input_texts[i])
print('Translation:', translation)

Random idx:  7291
input_seq:  [[   0    0    0   21 1887]]
-
Input: Don't despair.
Translation: ne dã©sespã©rez pas !
