In [1]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

In [2]:
# One dataset element per line of the preprocessed corpus file.
DATASET_PATH = "./Dataset/preprocessed_data.txt"
text_dataset = tf.data.TextLineDataset(DATASET_PATH)

In [3]:
# Hyperparameters
# Tokenisation: vocabulary cap and fixed token length per language.
VOCAB_SIZE = 20_000
ENGLISH_SEQUENCE_LENGTH = 32
FRENCH_SEQUENCE_LENGTH = 32

# Model width and input-pipeline batch size.
EMBEDDING_DIM = 256
BATCH_SIZE = 32

In [4]:
# Vectorization Layer
# Both languages share the same tokenizer settings; only the output length
# constant differs.
_vectorizer_options = dict(
    standardize="lower_and_strip_punctuation",
    max_tokens=VOCAB_SIZE,
    output_mode='int',
)

english_vectorization_layer = tf.keras.layers.TextVectorization(
    output_sequence_length=ENGLISH_SEQUENCE_LENGTH,
    **_vectorizer_options,
)

french_vectorization_layer = tf.keras.layers.TextVectorization(
    output_sequence_length=FRENCH_SEQUENCE_LENGTH,
    **_vectorizer_options,
)

In [5]:
# Preprocess Text
def split_text(text):
    """Split one tab-separated line into (source, marked target) strings.

    The first field is returned as-is; the second field is wrapped in the
    literal 'starttoken' / 'endtoken' markers. Both results are length-1
    slices of the split fields rather than scalars.
    """
    fields = tf.strings.split(text, '\t')
    source = fields[:1]
    target = fields[1:2]
    return source, 'starttoken ' + target + ' endtoken'

def vectorize(text):
    """Turn one tab-separated line into ({'input_1', 'input_2'}, labels).

    'input_1' is the vectorized first field, 'input_2' the second field
    prefixed with 'starttoken' (decoder input), and the labels are the second
    field suffixed with 'endtoken' (decoder target).
    """
    fields = tf.strings.split(text, '\t')
    input_1 = fields[:1]
    target = fields[1:2]
    start_input = 'starttoken ' + target
    end_input = target + ' endtoken'
    # Inside Dataset.map this runs while the function is traced, so it prints
    # symbolic tensors once rather than the text of every line.
    print(f"Vectorization -- Start Input: {start_input} End Input: {end_input}")
    features = {
        'input_1': english_vectorization_layer(input_1),
        'input_2': french_vectorization_layer(start_input),
    }
    labels = french_vectorization_layer(end_input)
    return features, labels

In [6]:
# Preprocessing
splitted_dataset = text_dataset.map(split_text)

# Create training data
# Each vectorization layer learns its vocabulary from its own side of the
# corpus before any line is converted to token ids.
print("Creating english training data and vectorization layer...")
english_training_data = splitted_dataset.map(lambda x, y: x)
english_vectorization_layer.adapt(english_training_data)

print("Creating french training data and vectorization layer...")
french_training_data = splitted_dataset.map(lambda x, y: y)
french_vectorization_layer.adapt(french_training_data)

# Map Shuffle
dataset = text_dataset.map(vectorize)

# Shuffling dataset and Batching dataset
# reshuffle_each_iteration=False keeps the shuffled order identical on every
# pass over `dataset`. The train/validation split is made later with
# take()/skip(), each of which iterates this pipeline independently; with the
# default per-iteration reshuffle the two halves would be cut from different
# orders and could share examples.
dataset = (
    dataset
    .shuffle(200, reshuffle_each_iteration=False)
    .unbatch()  # each mapped element carries a leading axis of size 1
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

Creating english training data and vectorization layer...
Creating french training data and vectorization layer...
Vectorization -- Start Input: Tensor("add:0", shape=(None,), dtype=string) End Input: Tensor("add_1:0", shape=(None,), dtype=string)


In [7]:
# Map token id -> word for the target-side vocabulary.
target_vocabulary = french_vectorization_layer.get_vocabulary()
len_japan_vocab = len(target_vocabulary)
index_to_word = dict(enumerate(target_vocabulary))
print(index_to_word)

{0: '', 1: '[UNK]', 2: 'starttoken', 3: 'endtoken', 4: 'я', 5: 'не', 6: 'том', 7: 'что', 8: 'в', 9: 'это', 10: 'ты', 11: 'вы', 12: 'мне', 13: 'на', 14: 'у', 15: 'с', 16: 'мэри', 17: 'он', 18: 'мы', 19: 'тома', 20: 'меня', 21: 'как', 22: 'и', 23: 'тому', 24: 'бы', 25: 'тебе', 26: 'сказал', 27: 'был', 28: 'чтобы', 29: 'то', 30: 'тебя', 31: 'очень', 32: 'вам', 33: 'она', 34: 'по', 35: 'его', 36: 'так', 37: 'думаю', 38: 'было', 39: 'за', 40: 'знаю', 41: 'почему', 42: 'есть', 43: 'вас', 44: 'хочу', 45: 'когда', 46: 'всё', 47: 'они', 48: 'сделать', 49: 'ещё', 50: 'здесь', 51: 'кто', 52: 'о', 53: 'знал', 54: 'нам', 55: 'из', 56: 'томом', 57: 'этого', 58: 'хотел', 59: 'могу', 60: 'нужно', 61: 'никогда', 62: 'будет', 63: 'больше', 64: 'где', 65: 'делать', 66: 'надо', 67: 'нет', 68: 'нибудь', 69: 'уже', 70: 'все', 71: 'к', 72: 'нас', 73: 'этом', 74: 'сделал', 75: 'если', 76: 'сегодня', 77: 'пожалуйста', 78: 'может', 79: 'быть', 80: 'её', 81: 'сколько', 82: 'от', 83: 'для', 84: 'хочет', 85: 'мног

In [8]:
# Splitting Dataset
# `dataset` is already batched, so this counts batches. The count is obtained
# by iterating the whole pipeline once.
dataset_len = 0
for _ in dataset:
    dataset_len += 1

# First 90% of the batches train the model, the remainder validates it.
train_batch_count = int(0.9 * dataset_len)
train_dataset = dataset.take(train_batch_count)
val_dataset = dataset.skip(train_batch_count)

In [9]:
class Encoder(tf.keras.Model):
    """Embeds source token ids and runs them through an LSTM.

    The LSTM returns its output at every time step, so the result keeps the
    time axis of the input.
    """

    def __init__(self, vocab_size, embedding_dim, units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim
        )
        self.lstm = tf.keras.layers.LSTM(units=units, return_sequences=True)

    def call(self, x):
        """Return the per-time-step LSTM outputs for the token ids in `x`."""
        embedded = self.embedding(x)
        encoder_states = self.lstm(embedded)
        return encoder_states

In [10]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = w_output(tanh(w_1(state) + w_2(enc_states)))."""

    def __init__(self, units):
        super().__init__()
        self.w_1 = tf.keras.layers.Dense(units)
        self.w_2 = tf.keras.layers.Dense(units)
        self.w_output = tf.keras.layers.Dense(1)

    def call(self, prev_dec_state, enc_states):
        """Return (context_vector, attention_weights).

        The softmax and the weighted sum both run over axis 1 of
        `enc_states` (presumably the source time axis, as produced by the
        encoder in this notebook).
        """
        # Insert an axis before the last one so the projected decoder state
        # broadcasts against every encoder position.
        projected_state = self.w_1(tf.expand_dims(prev_dec_state, -2))
        projected_encoder = self.w_2(enc_states)
        scores = self.w_output(tf.nn.tanh(projected_state + projected_encoder))

        attention_weights = tf.nn.softmax(scores, axis=1)
        context_vector = tf.reduce_sum(attention_weights * enc_states, axis=1)
        return context_vector, attention_weights

In [11]:
class Decoder(tf.keras.Model):
    """Attention decoder that unrolls a GRU for a fixed number of steps.

    At each step the attention context (computed from the current hidden
    state and the encoder outputs) is added to the embedding of the
    teacher-forcing token, fed to the GRU, and the GRU output is collected.
    A softmax Dense layer maps the collected outputs to vocabulary
    probabilities.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, sequence_length):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.attention = BahdanauAttention(dec_units)
        self.gru = tf.keras.layers.GRU(dec_units, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size, activation='softmax')
        self.sequence_length = sequence_length

    def call(self, x, hidden, shifted_target):
        """Decode `sequence_length` steps with teacher forcing.

        Args:
            x: encoder outputs that attention is computed over. They are added
                element-wise to the target embeddings, so their last
                dimension must match `embedding_dim`.
            hidden: initial decoder state, either one row shared by the whole
                batch (shape (1, dec_units)) or one row per example.
            shifted_target: token ids of the decoder input sequence.

        Returns:
            (outputs, attention_weights): `outputs` holds the per-step
            vocabulary probabilities with time on axis 1;
            `attention_weights` are the weights of the LAST decoding step
            only, as before.
        """
        outputs = []
        target_embeddings = self.embedding(shifted_target)

        # The GRU needs one state row per example, so expand a shared
        # (1, dec_units) initial state to the runtime batch size.
        batch_size = tf.shape(x)[0]
        hidden = tf.broadcast_to(hidden, [batch_size, self.gru.units])

        for t in range(self.sequence_length):
            context_vector, attention_weights = self.attention(hidden, x)
            dec_input = context_vector + target_embeddings[:, t]
            # Carry the state across steps. Without initial_state the GRU
            # restarted from zeros at every step, so the decoder had no
            # memory of the tokens it had already consumed.
            output, hidden = self.gru(tf.expand_dims(dec_input, 1), initial_state=hidden)
            outputs.append(output[:, 0])

        # (time, batch, units) -> (batch, time, units)
        outputs = tf.convert_to_tensor(outputs)
        outputs = tf.transpose(outputs, perm=[1, 0, 2])

        outputs = self.dense(outputs)
        return outputs, attention_weights
     

In [12]:
# Model Creation
# EMBEDDING_DIM comes from the hyperparameter cell; it is not redefined here
# so there is a single place to change it.
HIDDEN_UNITS = 256

### ENCODER
# Named `encoder_input` rather than `input` so the Python builtin input() is
# not shadowed for the rest of the notebook session.
encoder_input = tf.keras.layers.Input(
    shape=(ENGLISH_SEQUENCE_LENGTH,), dtype="int64", name="input_1"
)
encoder = Encoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS)
encoder_output = encoder(encoder_input)

### DECODER
shifted_target = tf.keras.layers.Input(
    shape=(FRENCH_SEQUENCE_LENGTH,), dtype="int64", name="input_2"
)
decoder = Decoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS, FRENCH_SEQUENCE_LENGTH)
# The decoder starts from an all-zero hidden state.
initial_decoder_state = tf.zeros([1, HIDDEN_UNITS])
decoder_output, attention_weights = decoder(
    encoder_output, initial_decoder_state, shifted_target
)

# Input names match the 'input_1' / 'input_2' keys produced by vectorize().
bahdanau = tf.keras.Model([encoder_input, shifted_target], decoder_output)
bahdanau.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 encoder (Encoder)              (None, 32, 256)      5645312     ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 decoder (Decoder)              ((None, 32, 20000),  10786593    ['encoder[0][0]',                
                                 (None, 32, 1))                   'input_2[0][0]']            

In [13]:
# Labels are integer token ids and the decoder already applies softmax, so
# the sparse categorical loss is used on probabilities.
bahdanau.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for a single pass over the training batches, validating afterwards.
history = bahdanau.fit(
    train_dataset,
    epochs=1,
    validation_data=val_dataset,
)

In [None]:
# evaluate() returns the loss followed by the compiled metrics (accuracy).
loss, acc = bahdanau.evaluate(x=val_dataset)

In [None]:
# Rebuild the token id -> word table used by translate().
target_vocabulary = french_vectorization_layer.get_vocabulary()
len_japan_vocab = len(target_vocabulary)
index_to_word = {index: word for index, word in enumerate(target_vocabulary)}
print(index_to_word)

In [None]:
def translate(sentences):
    """Greedily decode a translation for one source sentence.

    Args:
        sentences: a single source-language string.

    Returns:
        The predicted target words joined by single spaces. Decoding stops at
        the first 'endtoken' or after FRENCH_SEQUENCE_LENGTH steps.
    """
    source_tokens = english_vectorization_layer([sentences])

    # Decoder input buffer of shape (1, FRENCH_SEQUENCE_LENGTH); vectorizing
    # the bare start marker yields [starttoken, pad, pad, ...]. np.array makes
    # a writable copy.
    decoder_tokens = np.array(french_vectorization_layer(['starttoken']))
    translated_words = []

    for i in range(FRENCH_SEQUENCE_LENGTH):
        # Call the model directly: Model.predict is built for large batched
        # inputs and adds heavy per-call overhead inside a token loop.
        probabilities = bahdanau([source_tokens, decoder_tokens], training=False)
        word_index = int(tf.argmax(probabilities[0, i], axis=-1).numpy())
        current_word = index_to_word[word_index]
        if current_word == "endtoken":
            break

        translated_words.append(current_word)

        # Feed the predicted id straight back as the next decoder input.
        # Rebuilding the input from a string dropped a predicted padding
        # token ('') on re-tokenisation, which shifted every later position.
        if i + 1 < FRENCH_SEQUENCE_LENGTH:
            decoder_tokens[0, i + 1] = word_index

    return ' '.join(translated_words)

In [None]:
# Smoke-test the trained model on a short English sentence.
translate(sentences='My name is Vincent')