In [11]:
# Imports and Dependancies
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Input
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re

In [12]:
# Data Import
data_path = '/content/data.txt'
data = pd.read_csv(data_path, sep='\t', names=['eng', 'spa'])
data.sample
data['eng'] = data['eng'].apply(lambda x: re.sub("[^a-zA-Z]", " ", x.lower()))
data['spa'] = data['spa'].apply(lambda x: re.sub("[^a-zA-Z]", " ", x.lower()))
data['spa'] = data['spa'].apply(lambda x: f"START_ {x} _END")

In [13]:
# Create vocabularies for English and Spanish
eng_vocab = set(word for sentence in data['eng'] for word in sentence.split())
spa_vocab = set(word for sentence in data['spa'] for word in sentence.split())

ing_word2idx = {word: idx + 1 for idx, word in enumerate(sorted(eng_vocab))}
spa_word2idx = {word: idx + 1 for idx, word in enumerate(sorted(spa_vocab))}

eng_idx2word = {idx: word for word, idx in ing_word2idx.items()}
spa_idx2word = {idx: word for word, idx in spa_word2idx.items()}

In [14]:
# Determine max sequence lengths
max_eng_seq_len = max(len(sentence.split()) for sentence in data['eng'])
max_spa_seq_len = max(len(sentence.split()) for sentence in data['spa'])

# Split data into training and testing sets
data_shuffled = shuffle(data, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(data_shuffled['eng'], data_shuffled['spa'], test_size=0.15)

# Vocabulary sizes
num_encoder_tokens = len(ing_word2idx)
num_decoder_tokens = len(spa_word2idx) + 1

In [15]:
# Batch generator
def batch_generator(X, Y, batch_size=128):
    while True:
        for start in range(0, len(X), batch_size):
            end = start + batch_size
            X_batch = X[start:end]
            Y_batch = Y[start:end]

            enc_input = np.zeros((len(X_batch), max_eng_seq_len), dtype="float32")
            dec_input = np.zeros((len(Y_batch), max_spa_seq_len), dtype="float32")
            dec_output = np.zeros((len(Y_batch), max_spa_seq_len, num_decoder_tokens), dtype="float32")

            for i, (eng_sent, spa_sent) in enumerate(zip(X_batch, Y_batch)):
                for t, word in enumerate(eng_sent.split()):
                    enc_input[i, t] = ing_word2idx.get(word, 0)
                for t, word in enumerate(spa_sent.split()):
                    if t < len(spa_sent.split()) - 1:
                        dec_input[i, t] = spa_word2idx.get(word, 0)
                    if t > 0:
                        dec_output[i, t - 1, spa_word2idx.get(word, 0)] = 1

            yield [enc_input, dec_input], dec_output


In [17]:
# MODEL ARCHITECTURE
# Define the model
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=num_encoder_tokens + 1, output_dim=latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=num_decoder_tokens + 1, output_dim=latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [19]:
# Compile the model
nmt_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
nmt_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Callbacks
model_checkpoint = ModelCheckpoint(filepath='models/best_nmt_model.keras', monitor='val_loss', save_best_only=True, verbose=1)  # Changed .h5 to .keras
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, verbose=1, min_delta=1e-4)

# Train the model
batch_size = 128
train_steps = len(x_train) // batch_size
val_steps = len(x_test) // batch_size

In [21]:
history = nmt_model.fit(
    tf.data.Dataset.from_generator(
        lambda: batch_generator(x_train, y_train, batch_size),
        output_signature=(
            (tf.TensorSpec(shape=(None, max_eng_seq_len), dtype=tf.float32),
             tf.TensorSpec(shape=(None, max_spa_seq_len), dtype=tf.float32)),
            tf.TensorSpec(shape=(None, max_spa_seq_len, num_decoder_tokens), dtype=tf.float32)
        )
    ),
    steps_per_epoch=train_steps,
    epochs=50,
    validation_data=tf.data.Dataset.from_generator(
        lambda: batch_generator(x_test, y_test, batch_size),
        output_signature=(
            (tf.TensorSpec(shape=(None, max_eng_seq_len), dtype=tf.float32),
             tf.TensorSpec(shape=(None, max_spa_seq_len), dtype=tf.float32)),
            tf.TensorSpec(shape=(None, max_spa_seq_len, num_decoder_tokens), dtype=tf.float32)
        )
    ),
    validation_steps=val_steps,
    callbacks=[model_checkpoint, reduce_lr],
    verbose=1
)

# Save final weights
nmt_model.save_weights('models/final_nmt_weights.h5')

Epoch 1/50


InvalidArgumentError: Graph execution error:

Detected at node PyFunc defined at (most recent call last):
<stack traces unavailable>
TypeError: `generator` yielded an element that did not match the expected structure. The expected structure was ((tf.float32, tf.float32), tf.float32), but the yielded element was ([array([[9.5120e+03, 9.2950e+03, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.1456e+04, 1.5688e+04, 8.0160e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.5506e+04, 1.8171e+04, 6.2740e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [1.9070e+03, 1.8172e+04, 1.5506e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [9.3730e+03, 9.1500e+03, 2.1803e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.3134e+04, 9.1500e+03, 2.2358e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]], dtype=float32), array([[  1., 270., 236., ...,   0.,   0.,   0.],
       [  1., 270., 236., ...,   0.,   0.,   0.],
       [  1., 270., 236., ...,   0.,   0.,   0.],
       ...,
       [  1., 270., 236., ...,   0.,   0.,   0.],
       [  1., 270., 236., ...,   0.,   0.,   0.],
       [  1., 270., 236., ...,   0.,   0.,   0.]], dtype=float32)], array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)).
Traceback (most recent call last):

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/data/ops/from_generator_op.py", line 204, in generator_py_func
    flattened_values = nest.flatten_up_to(output_types, values)

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/data/util/nest.py", line 237, in flatten_up_to
    return nest_util.flatten_up_to(

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/util/nest_util.py", line 1541, in flatten_up_to
    return _tf_data_flatten_up_to(shallow_tree, input_tree)

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/util/nest_util.py", line 1570, in _tf_data_flatten_up_to
    _tf_data_assert_shallow_structure(shallow_tree, input_tree)

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/util/nest_util.py", line 1444, in _tf_data_assert_shallow_structure
    _tf_data_assert_shallow_structure(

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/util/nest_util.py", line 1414, in _tf_data_assert_shallow_structure
    raise TypeError(

TypeError: If shallow structure is a sequence, input must also be a sequence. Input has type: 'list'.


The above exception was the direct cause of the following exception:


Traceback (most recent call last):

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/ops/script_ops.py", line 270, in __call__
    ret = func(*args)

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/data/ops/from_generator_op.py", line 206, in generator_py_func
    raise TypeError(

TypeError: `generator` yielded an element that did not match the expected structure. The expected structure was ((tf.float32, tf.float32), tf.float32), but the yielded element was ([array([[9.5120e+03, 9.2950e+03, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.1456e+04, 1.5688e+04, 8.0160e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.5506e+04, 1.8171e+04, 6.2740e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [1.9070e+03, 1.8172e+04, 1.5506e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [9.3730e+03, 9.1500e+03, 2.1803e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.3134e+04, 9.1500e+03, 2.2358e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]], dtype=float32), array([[  1., 270., 236., ...,   0.,   0.,   0.],
       [  1., 270., 236., ...,   0.,   0.,   0.],
       [  1., 270., 236., ...,   0.,   0.,   0.],
       ...,
       [  1., 270., 236., ...,   0.,   0.,   0.],
       [  1., 270., 236., ...,   0.,   0.,   0.],
       [  1., 270., 236., ...,   0.,   0.,   0.]], dtype=float32)], array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)).


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_one_step_on_iterator_5535]