In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [24]:
# Read the sentence pairs and discard every row that has a missing value,
# then preview the first rows as the cell output.
dataset = pd.read_csv("dictionary.csv", encoding='utf-8').dropna()
dataset.head()

Unnamed: 0,en,de
0,Go.,Geh.
1,Run!,Lauf!
2,Wow!,Donnerwetter!
3,Help!,Hilfe!
4,Stop!,Stopp!


In [25]:
# --- Hyper-parameters -------------------------------------------------------
VOCABULARY_SIZE = 5000      # cap on the vocabulary learned by each vectorizer
MAX_SEQUENCE_LENGTH = 32    # every sentence is padded/truncated to this many tokens
UNITS = 256
EMBEDDING_DIM = 256
BUFFER = 5000               # shuffle buffer size for the training tf.data pipeline
BATCH_SIZE = 32

# --- Text vectorizers -------------------------------------------------------
# `max_tokens` bounds the vocabulary size, while `output_sequence_length`
# bounds the sentence length. The vocabulary cap must therefore be
# VOCABULARY_SIZE: using MAX_SEQUENCE_LENGTH here would limit each language to
# 32 vocabulary entries (including the padding and [UNK] slots) and map almost
# every word to [UNK].
english_vectorizer = tf.keras.layers.TextVectorization(
    standardize = "lower_and_strip_punctuation",
    max_tokens = VOCABULARY_SIZE,
    output_mode = 'int',
    output_sequence_length = MAX_SEQUENCE_LENGTH
)

german_vectorizer = tf.keras.layers.TextVectorization(
    standardize = "lower_and_strip_punctuation",
    max_tokens = VOCABULARY_SIZE,
    output_mode = 'int',
    output_sequence_length = MAX_SEQUENCE_LENGTH
)

In [26]:
def standardize_text(text, is_target):
    """Normalise one sentence before it is handed to a vectorizer.

    The sentence is lower-cased, every run of spaces and/or double-quote
    characters is collapsed into a single space, and surrounding whitespace
    is removed.

    Args:
        text: The raw sentence.
        is_target: When truthy, the result is wrapped as
            ``'starttoken <sentence> endtoken'``.

    Returns:
        The normalised sentence, wrapped in the start/end markers when
        ``is_target`` is truthy.
    """
    # The character class deliberately contains both '"' and ' ', so quotes
    # are turned into spaces as part of the collapsing step.
    collapsed = re.sub(r'[" "]+', " ", text.lower())
    cleaned = collapsed.strip()

    if not is_target:
        return cleaned
    return 'starttoken ' + cleaned + ' endtoken'

# Add normalised copies of both columns; only the German (target) side is
# wrapped in the starttoken/endtoken markers.
dataset["eng"] = dataset["en"].apply(standardize_text, is_target=False)
dataset["ger"] = dataset["de"].apply(standardize_text, is_target=True)

# Plain Python lists are what the split and the vectorizers consume below.
english_lang, german_lang = (dataset[column].tolist() for column in ("eng", "ger"))

# Quick sanity check: show the first three sentences of each language.
for sentences in (english_lang, german_lang):
    print(sentences[:3])

['go.', 'run!', 'wow!']
['starttoken geh. endtoken', 'starttoken lauf! endtoken', 'starttoken donnerwetter! endtoken']


In [27]:
# Hold out 20% of the sentence pairs for validation; the fixed seed keeps the
# split reproducible and both lists are split with the same permutation, so
# English/German pairs stay aligned.
english_train, english_val, german_train, german_val = train_test_split(
    english_lang, german_lang, test_size=0.2, random_state=42
)

# Learn each vocabulary from the training portion only.
english_vectorizer.adapt(english_train)
german_vectorizer.adapt(german_train)

# Each language must be encoded with its own vectorizer: english_vectorizer
# was adapted on English text only, so running German sentences through it
# would encode them against the wrong vocabulary.
input_vector = english_vectorizer(english_train)
output_vector = german_vectorizer(german_train)

In [28]:
# Index -> token lookup tables, built from each vectorizer's learned vocabulary.
input_vocab = dict(enumerate(english_vectorizer.get_vocabulary()))
output_vocab = dict(enumerate(german_vectorizer.get_vocabulary()))

# Show the first 30 entries of each table.
for vocab in (input_vocab, output_vocab):
    print(dict(list(vocab.items())[:30]))

{0: '', 1: '[UNK]', 2: 'tom', 3: 'you', 4: 'i', 5: 'to', 6: 'the', 7: 'a', 8: 'is', 9: 'do', 10: 'that', 11: 'it', 12: 'he', 13: 'me', 14: 'this', 15: 'have', 16: 'was', 17: 'in', 18: 'dont', 19: 'my', 20: 'are', 21: 'of', 22: 'im', 23: 'what', 24: 'your', 25: 'we', 26: 'for', 27: 'like', 28: 'be', 29: 'know'}
{0: '', 1: '[UNK]', 2: 'starttoken', 3: 'endtoken', 4: 'ich', 5: 'tom', 6: 'ist', 7: 'nicht', 8: 'das', 9: 'du', 10: 'sie', 11: 'es', 12: 'zu', 13: 'hat', 14: 'die', 15: 'er', 16: 'wir', 17: 'ein', 18: 'habe', 19: 'der', 20: 'was', 21: 'mir', 22: 'in', 23: 'mich', 24: 'ihr', 25: 'dass', 26: 'war', 27: 'wie', 28: 'eine', 29: 'sich'}


In [29]:
def vectorize(input_text, target_text):
    """Turn sentence pairs into encoder/decoder inputs and decoder targets.

    Args:
        input_text: English sentences, passed to ``english_vectorizer``.
        target_text: German sentences that already carry the
            ``starttoken`` / ``endtoken`` markers.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is a dict with
        ``"input_1"`` (vectorized English sentences) and ``"input_2"``
        (vectorized German sentences with ``" endtoken"`` removed);
        ``labels`` is the vectorized German sentences with
        ``"starttoken "`` removed.
    """
    def _without(marker):
        # Remove every occurrence of `marker` from each target sentence.
        return [sentence.replace(marker, "") for sentence in target_text]

    source_ids = english_vectorizer(input_text)

    # The decoder is fed the sentence without its end marker and is expected
    # to produce the sentence without its start marker.
    teacher_ids = german_vectorizer(_without(" endtoken"))
    label_ids = german_vectorizer(_without("starttoken "))

    # NOTE(review): the dict keys presumably match the model's input layer
    # names -- confirm against the model definition.
    features = {"input_1": source_ids, "input_2": teacher_ids}
    return features, label_ids

In [30]:
# Vectorize both splits into (features, labels) pairs.
x_train, y_train = vectorize(english_train, german_train)
x_val, y_val = vectorize(english_val, german_val)

# Training pipeline: shuffle, batch, and prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(BUFFER)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: same batching, but no shuffling.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((x_val, y_val))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [31]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by a single LSTM layer.

    Args:
        vocabulary_size: Number of distinct token ids the embedding accepts.
        embedding_dim: Size of each embedding vector.
        units: Number of LSTM units.
    """

    def __init__(self, vocabulary_size, embedding_dim, units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocabulary_size,
            output_dim=embedding_dim,
        )
        # Configured to return the per-step outputs as well as the final states.
        self.lstm = tf.keras.layers.LSTM(
            units,
            return_sequences=True,
            return_state=True,
        )

    def call(self, x):
        """Embed ``x`` and run it through the LSTM.

        Returns:
            A tuple ``(output, state_h)``: the LSTM's sequence output and
            its final hidden state.
        """
        embedded = self.embedding(x)
        # The LSTM also yields its final cell state, which is not returned.
        sequence_output, hidden_state, _cell_state = self.lstm(embedded)
        return sequence_output, hidden_state