In [1]:
import nltk 
import numpy as np
import pandas as pd
import tensorflow as tf
import re 
import os 
import unicodedata
import zipfile
import data_load

### Load Data

In [3]:
data_load.clean_dir('data')

Cleaning directory data...


In [4]:
download_url = "http://www.manythings.org/anki/fra-eng.zip"
en_sents, fr_sents_in, fr_sents_out = data_load.download_and_read_url(download_url, num_sent_pairs=10000)

### Attention layer

In [36]:
class AdditiveAttention(tf.keras.layers.Layer):
    def __init__(self, num_units):
        super(AdditiveAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(num_units)
        self.W2 = tf.keras.layers.Dense(num_units)
        self.V  = tf.keras.layers.Dense(1)
        
    def call(self, query, values):
        query_with_timesteps = tf.expand_dims(query, axis=1)
        score = self.V(
            tf.keras.activations.tanh(self.W1(query_with_timesteps) + self.W2(values))
        )
        alignment = tf.nn.softmax(score, axis=1)
        context = tf.reduce_sum(
            tf.linalg.matmul(
                tf.linalg.matrix_transpose(alignment),
                values
            ), axis=1
        )
        context = tf.expand_dims(context, axis=1)
        return context, alignment 
    
class Encoder(tf.keras.models.Model):
    def __init__(self, vocab_size, embedding_dim, num_timesteps, encoder_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.encoder_dim = encoder_dim
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=num_timesteps)
        self.rnn = tf.keras.layers.GRU(encoder_dim, return_sequences=True, return_state=True)
        
    def call(self, x, state):
        x = self.embedding(x)
        x, state = self.rnn(x, initial_state=state)
        return x, state
    
    def init_state(self, batch_size):
        return tf.zeros((batch_size, self.encoder_dim))
    
class Decoder(tf.keras.models.Model):
    def __init__(self, vocab_size, embedding_dim, num_timesteps, decoder_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.decoder_dim = decoder_dim
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=num_timesteps)
        self.attention = AdditiveAttention(embedding_dim)
        self.rnn = tf.keras.layers.GRU(decoder_dim, return_sequences=True, return_state=True)
        self.Wc = tf.keras.layers.Dense(decoder_dim, activation='tanh')
        self.Ws = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x, state, encoder_out):
        x = self.embedding(x)
        context, alignment = self.attention(x, encoder_out)
        x = tf.expand_dims(
            tf.concat([
                x, tf.squeeze(context, axis=1)
            ], axis=1),
        axis=1)
        x, state = self.rnn(x, state)
        x = self.Wc(x)
        x = self.Ws(x)
        return x, state, alignment
    
def loss_fn(ytrue, ypred):
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    mask = tf.math.logical_not(tf.math.equal(ytrue, 0))
    mask = tf.cast(mask, dtype=tf.int64)
    loss = scce(ytrue, ypred, sample_weight=mask)
    return loss

@tf.function
def train_step(encoder_in, decoder_in, decoder_out, encoder_state):
    with tf.GradientTape() as tape:
        encoder_out, encoder_state = encoder(encoder_in, encoder_state)
        decoder_state = encoder_state
        
        loss = 0
        for t in range(decoder_out.shape[1]):
            decoder_in_t = decoder_in[:, t]
            decoder_pred_t, decoder_state, _ = decoder(decoder_in_t, decoder_state, encoder_out)
            loss += loss_fn(decoder_out[:, t], decoder_pred_t)
    
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    return loss / decoder_out.shape[1]

### Tokenization and data set up 

In [21]:
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="", lower=False)
en_tokenizer.fit_on_texts(en_sents)
en_data = en_tokenizer.texts_to_sequences(en_sents)
en_data = tf.keras.preprocessing.sequence.pad_sequences(en_data, padding='post')

fr_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="", lower=False)
fr_tokenizer.fit_on_texts(fr_sents_in)
fr_tokenizer.fit_on_texts(fr_sents_out)
fr_data_in = fr_tokenizer.texts_to_sequences(fr_sents_in)
fr_data_in = tf.keras.preprocessing.sequence.pad_sequences(fr_data_in, padding='post')
fr_data_out = fr_tokenizer.texts_to_sequences(fr_sents_out)
fr_data_out = tf.keras.preprocessing.sequence.pad_sequences(fr_data_out, padding='post')

en_maxlen = en_data.shape[1]
fr_maxlen = fr_data_out.shape[1]
print(f"seqlen (en): {en_maxlen}, seqlen (fr): {fr_maxlen}")

seqlen (en): 6, seqlen (fr): 14


In [16]:
NUM_SENT_PAIRS = 10000
EMBEDDING_DIM = 32
ENCODER_DIM, DECODER_DIM = 64, 64
BATCH_SIZE = 16
NUM_EPOCHS = 3

In [20]:
tf.random.set_seed(123)

batch_size = BATCH_SIZE
dataset = tf.data.Dataset.from_tensor_slices((en_data, fr_data_in, fr_data_out))
dataset = dataset.shuffle(10000)
test_size = NUM_SENT_PAIRS // 4
test_dataset = dataset.take(test_size).batch(batch_size, drop_remainder=True)
train_dataset = dataset.skip(test_size).batch(batch_size, drop_remainder=True)

en_vocab_size = len(en_tokenizer.word_index)
fr_vocab_size = len(fr_tokenizer.word_index)
en_word2idx = en_tokenizer.word_index
en_idx2word = {idx:word for word, idx in en_word2idx.items()}
fr_word2idx = fr_tokenizer.word_index
fr_idx2word = {idx:word for word, idx in fr_word2idx.items()}
print(f"vocab size (en): {en_vocab_size}, vocab size (fr): {fr_vocab_size}")

vocab size (en): 1988, vocab size (fr): 3758


In [None]:
encoder = Encoder(en_vocab_size+1, embedding_dim=EMBEDDING_DIM, num_timesteps=en_maxlen, encoder_dim=ENCODER_DIM)
decoder = Decoder(fr_vocab_size+1, embedding_dim=EMBEDDING_DIM, num_timesteps=fr_maxlen, decoder_dim=DECODER_DIM)

### Test code to examine dimensions

In [32]:
for encoder_in, decoder_in, decoder_out in train_dataset:
    print("inputs:", encoder_in.shape, decoder_in.shape, decoder_out.shape)
    encoder_state = encoder.init_state(batch_size)
    encoder_out, encoder_state = encoder(encoder_in, encoder_state)
    decoder_state = encoder_state
    decoder_pred = []
    for t in range(decoder_out.shape[1]):
        decoder_in_t = decoder_in[:, t]
        decoder_pred_t, decoder_state, _ = decoder(decoder_in_t,
            decoder_state, encoder_out)
        decoder_pred.append(decoder_pred_t.numpy())
    decoder_pred = tf.squeeze(np.array(decoder_pred), axis=2)
    break
print("encoder input          :", encoder_in.shape)
print("encoder output         :", encoder_out.shape, "state:", encoder_state.shape)
print("decoder output (logits):", decoder_pred.shape, "state:", decoder_state.shape)
print("decoder output (labels):", decoder_out.shape)

inputs: (16, 6) (16, 14) (16, 14)
encoder input          : (16, 6)
encoder output         : (16, 6, 64) state: (16, 64)
decoder output (logits): (14, 16, 3759) state: (16, 64)
decoder output (labels): (16, 14)


### Training

In [37]:
optimizer = tf.keras.optimizers.Adam()
num_epochs = NUM_EPOCHS

for e in range(num_epochs):
    encoder_state = encoder.init_state(batch_size)
    
    for batch, data in enumerate(train_dataset):
        encoder_in, decoder_in, decoder_out = data
        loss = train_step(encoder_in, decoder_in, decoder_out, encoder_state)
        
    print("EPOCH: {}, LOSS: {:.4f}".format(e+1, loss.numpy()))

EPOCH: 1, LOSS: 1.6801
EPOCH: 2, LOSS: 1.3345
EPOCH: 3, LOSS: 1.2794
