<a href="https://colab.research.google.com/github/viniciusrpb/116319_estruturasdedados/blob/main/Untitled31.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [2]:
portuguese_sentences = ["ola mundo", "como voce esta", "bom dia"]
english_sentences = ["hello world", "how are you", "good morning"]

# Add start and end tokens to the English sentences
english_sentences = ["<start> " + sentence + " <end>" for sentence in english_sentences]

# Tokenize the sentences
tokenizer_pt = Tokenizer()
tokenizer_pt.fit_on_texts(portuguese_sentences)
tokenizer_en = Tokenizer(filters='')
tokenizer_en.fit_on_texts(english_sentences)

# Convert sentences to sequences
sequences_pt = tokenizer_pt.texts_to_sequences(portuguese_sentences)
sequences_en = tokenizer_en.texts_to_sequences(english_sentences)

# Pad the sequences
max_len_pt = max([len(seq) for seq in sequences_pt])
max_len_en = max([len(seq) for seq in sequences_en])

sequences_pt = pad_sequences(sequences_pt, maxlen=max_len_pt, padding='post')
sequences_en = pad_sequences(sequences_en, maxlen=max_len_en, padding='post')

# Vocabulary sizes
vocab_size_pt = len(tokenizer_pt.word_index) + 1
vocab_size_en = len(tokenizer_en.word_index) + 1

In [3]:
# Define the model
embedding_dim = 256
units = 512

# Encoder
encoder_inputs = Input(shape=(max_len_pt,))
encoder_embedding = Embedding(vocab_size_pt, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = tf.keras.Input(shape=(max_len_en,))
decoder_embedding = Embedding(vocab_size_en, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_en, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
opt = Adam(learning_rate=0.0001)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [4]:
# Prepare the target data
# Shift the target sequences by one to the right
target_data = np.zeros_like(sequences_en)
target_data[:, :-1] = sequences_en[:, 1:]

# Train the model
model.fit([sequences_pt, sequences_en], target_data, batch_size=32, epochs=100, validation_split=0.2)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.1000 - loss: 2.3025 - val_accuracy: 0.2000 - val_loss: 2.3054
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 666ms/step - accuracy: 0.2000 - loss: 2.2979 - val_accuracy: 0.4000 - val_loss: 2.3034
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step - accuracy: 0.4000 - loss: 2.2932 - val_accuracy: 0.4000 - val_loss: 2.3014
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step - accuracy: 0.5000 - loss: 2.2885 - val_accuracy: 0.4000 - val_loss: 2.2994
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 290ms/step - accuracy: 0.5000 - loss: 2.2838 - val_accuracy: 0.4000 - val_loss: 2.2973
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 312ms/step - accuracy: 0.5000 - loss: 2.2790 - val_accuracy: 0.4000 - val_loss: 2.2952
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x7f42fca66ce0>

In [5]:
# Encoder model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model
decoder_state_input_h = Input(shape=(units,))
decoder_state_input_c = Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [6]:
def decode_sequence(input_seq):
    # Encode the input sequence to get the internal state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1 with only the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_en.word_index['<start>']

    # Loop for generating the translated sentence
    stop_condition = False
    translated_sentence = ''
    while False:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer_en.index_word[sampled_token_index]
        translated_sentence += ' ' + sampled_word

        # Exit condition: either hit max length or find stop token
        if (sampled_word == '<end>' or len(translated_sentence.split()) > max_len_en):
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return translated_sentence

# Translate a new sentence
input_seq = pad_sequences(tokenizer_pt.texts_to_sequences(["como esta mundo"]), maxlen=max_len_pt, padding='post')
translated_sentence = decode_sequence(input_seq)
print('Translated sentence:', translated_sentence)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step
Translated sentence: 
