In [255]:
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [256]:
import pprint as pp
import time

from functools import wraps

import numpy as np
import tensorflow as tf

from tensorflow.keras.layers import Activation, Bidirectional, Dense, Embedding, GRU, Input
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.utils import to_categorical

<IPython.core.display.Javascript object>

In [257]:
N = 200
texts = ["the weather is great", "there is nothing better in the world", "you are great", "why are you so amazing"] * N
targets = ["ο καιρός είναι εκπληκτικός", "δεν υπάρχει τίποτα καλύτερο στον κόσμο", "είσαι εκπληκτικός", "γιατί είσαι τόσο εκπληκτικός"] * N

<IPython.core.display.Javascript object>

In [258]:
NUM_WORDS = 10_000
ANALYZER = None
LOWER = False
OOV_TOKEN = "<OOV>"


tokenizer_en = Tokenizer(num_words=NUM_WORDS, analyzer=ANALYZER, lower=LOWER, oov_token=OOV_TOKEN)
tokenizer_gr = Tokenizer(num_words=NUM_WORDS, analyzer=ANALYZER, lower=LOWER, oov_token=OOV_TOKEN)

<IPython.core.display.Javascript object>

In [259]:
tokenizer_en.fit_on_texts(texts)
tokenizer_gr.fit_on_texts(targets)

<IPython.core.display.Javascript object>

In [260]:
def create_vocab(tokenizer):

  if tokenizer.num_words:
    vocab = {k: v for k, v in tokenizer.word_index.items() if v <= tokenizer.num_words}
  else:
    vocab = tokenizer.word_index

  return vocab


vocab_en = create_vocab(tokenizer_en)
vocab_gr = create_vocab(tokenizer_gr)
pp.pprint(vocab_en)
print("\n")
pp.pprint(vocab_gr)
vocab_gr_inv = {v: k for k, v in vocab_gr.items()}

{'<OOV>': 1,
 'BOS': 2,
 'EOS': 3,
 'amazing': 17,
 'are': 8,
 'better': 12,
 'great': 6,
 'in': 13,
 'is': 5,
 'nothing': 11,
 'so': 16,
 'the': 4,
 'there': 10,
 'weather': 9,
 'why': 15,
 'world': 14,
 'you': 7}


{'<OOV>': 1,
 'BOS': 2,
 'EOS': 3,
 'γιατί': 15,
 'δεν': 9,
 'είναι': 8,
 'είσαι': 5,
 'εκπληκτικός': 4,
 'καιρός': 7,
 'καλύτερο': 12,
 'κόσμο': 14,
 'ο': 6,
 'στον': 13,
 'τίποτα': 11,
 'τόσο': 16,
 'υπάρχει': 10}


<IPython.core.display.Javascript object>

In [261]:
texts_seq = tokenizer_en.texts_to_sequences(texts)
targets_seq = tokenizer_gr.texts_to_sequences(targets)

<IPython.core.display.Javascript object>

In [262]:
PADDING = "post"
TRUNCATING = "post"
VALUE = 0
MAXLEN = 9

padded_texts = pad_sequences(texts_seq, padding=PADDING, truncating=TRUNCATING, value=VALUE, maxlen=MAXLEN)
padded_targets = pad_sequences(targets_seq, padding=PADDING, truncating=TRUNCATING, value=VALUE, maxlen=MAXLEN)

print(padded_texts)
print("\n")
print(padded_targets)

[[ 2  4  9 ...  0  0  0]
 [ 2 10  5 ...  4 14  3]
 [ 2  7  8 ...  0  0  0]
 ...
 [ 2 10  5 ...  4 14  3]
 [ 2  7  8 ...  0  0  0]
 [ 2 15  8 ...  3  0  0]]


[[ 2  6  7 ...  0  0  0]
 [ 2  9 10 ... 14  3  0]
 [ 2  5  4 ...  0  0  0]
 ...
 [ 2  9 10 ... 14  3  0]
 [ 2  5  4 ...  0  0  0]
 [ 2 15  5 ...  0  0  0]]


<IPython.core.display.Javascript object>

# Model

In [263]:
input_seq = np.array(padded_texts)
output_seq = np.array(padded_targets)
output_seq_one_hot= to_categorical(output_seq, num_classes=len(vocab_gr) + 1)


output_seq_one_hot.shape

(800, 9, 17)

<IPython.core.display.Javascript object>

In [264]:
# Define input and output dimensions
INPUT_DIM = len(vocab_en) + 1
OUTPUT_DIM = 16


# Define the encoder inputs and outputs
encoder_inputs = Input(shape=(MAXLEN, ), name="encoder_input_layer")
encoder_embedding = Embedding(input_dim=INPUT_DIM, output_dim=OUTPUT_DIM, input_length=MAXLEN, name="encoder_embedding_layer", mask_zero=False)(encoder_inputs)
encoder_outputs, encoder_states = GRU(units=16, return_sequences=True, return_state=True, name="encoder_gru_layer")(encoder_embedding)

# Define the decoder inputs

# Define the decoder GRU and Dense layers
decoder_gru = GRU(units=16, return_sequences=True, return_state=False, name="decoder_gru_layer")
decoder_dense = Dense(units=len(vocab_gr) + 1, activation="softmax", name="decoder_output_layer")

# Connect the decoder layers
decoder_outputs = decoder_gru(encoder_outputs, initial_state=encoder_states)
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model(encoder_inputs, decoder_outputs)


<IPython.core.display.Javascript object>

In [265]:
OPTIMIZER = Adam(learning_rate=0.01)
LOSS = SparseCategoricalCrossentropy()
METRICS = ["accuracy"]

model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=METRICS)
model.summary()

Model: "model_18"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input_layer (InputLaye  [(None, 9)]         0           []                               
 r)                                                                                               
                                                                                                  
 encoder_embedding_layer (Embed  (None, 9, 16)       288         ['encoder_input_layer[0][0]']    
 ding)                                                                                            
                                                                                                  
 encoder_gru_layer (GRU)        [(None, 9, 16),      1632        ['encoder_embedding_layer[0][0]']
                                 (None, 16)]                                               

<IPython.core.display.Javascript object>

In [266]:
model.fit(x=input_seq, y=output_seq, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x253aaa12f50>

<IPython.core.display.Javascript object>

In [294]:
# n_example = 300
# input_text = texts[n_example]
# print("input_text:", input_text)


input_text = ["<BOS> nothing better the world <EOS>"]
input_seq = tokenizer_en.texts_to_sequences(input_text)
input_seq_padded = pad_sequences(input_seq, padding=PADDING, truncating=TRUNCATING, value=VALUE, maxlen=MAXLEN)
print("input_seq_padded", input_seq_padded)

# input_text = "There is nothing in the world"
preds_seq = model.predict(x=input_seq_padded)
preds = np.argmax(preds_seq, axis=-1).flatten()
prediction = [vocab_gr_inv.get(i) for i in preds]
print(prediction)


input_seq_padded [[ 2 11 12  4 14  3  0  0  0]]
['BOS', 'ο', 'υπάρχει', 'είναι', 'εκπληκτικός', 'EOS', None, None, None]


<IPython.core.display.Javascript object>