<a href="https://colab.research.google.com/github/Kothai-g/Generative-AI-Workshop-/blob/main/Encoder_and_Decoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
input_texts = ["hello", "good morning", "thank you", "how are you", "see you soon"]
target_texts = ["வணக்கம்", "காலை வணக்கம்", "நன்றி", "நீங்கள் எப்படி இருக்கிறீர்கள்?", "வேகமாக பார்க்கலாம்"]

# Ensure '\t' and '\n' are added for marking start and end of sentences
target_texts = ['\t' + text + '\n' for text in target_texts]

In [3]:
# Tokenization
input_tokenizer = Tokenizer(char_level=True)  # Character-level tokenization
input_tokenizer.fit_on_texts(input_texts)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
max_input_len = max(len(seq) for seq in input_sequences)

target_tokenizer = Tokenizer(char_level=True)
target_tokenizer.fit_on_texts(target_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)
max_target_len = max(len(seq) for seq in target_sequences)

In [4]:
# Padding
encoder_input_data = pad_sequences(input_sequences, maxlen=max_input_len, padding='post')
decoder_input_data = pad_sequences(target_sequences, maxlen=max_target_len, padding='post')

# One-hot encoding for decoder output
num_decoder_tokens = len(target_tokenizer.word_index) + 1
decoder_output_data = np.zeros((len(target_texts), max_target_len, num_decoder_tokens), dtype='float32')

for i, seq in enumerate(target_sequences):
    for t, char_index in enumerate(seq):
        if t > 0:  # Shift left for decoder output
            decoder_output_data[i, t - 1, char_index] = 1.0

# Model parameters
latent_dim = 256

In [5]:
# Encoder
encoder_inputs = Input(shape=(max_input_len,))
encoder_embedding = Embedding(len(input_tokenizer.word_index) + 1, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

In [6]:
# Decoder
decoder_inputs = Input(shape=(max_target_len,))
decoder_embedding_layer = Embedding(num_decoder_tokens, latent_dim)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [7]:
# Define full seq2seq Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [8]:
# Train the model
batch_size = 64
epochs = 500

In [9]:
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)

Epoch 1/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step - loss: 1.4926 - val_loss: 1.9519
Epoch 2/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 217ms/step - loss: 1.4838 - val_loss: 1.9443
Epoch 3/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 296ms/step - loss: 1.4745 - val_loss: 1.9354
Epoch 4/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step - loss: 1.4639 - val_loss: 1.9243
Epoch 5/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step - loss: 1.4509 - val_loss: 1.9092
Epoch 6/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 336ms/step - loss: 1.4341 - val_loss: 1.8863
Epoch 7/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 227ms/step - loss: 1.4096 - val_loss: 1.8465
Epoch 8/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 304ms/step - loss: 1.3731 - val_loss: 1.8024
Epoch 9/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x7cc66e71fa10>

In [10]:
# Inference Models

# Encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model for inference
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [11]:
# New input layer for inference decoding
decoder_inputs_infer = Input(shape=(1,))
decoder_embedding_infer = decoder_embedding_layer(decoder_inputs_infer)  # Corrected embedding usage
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding_infer, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs_infer] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [12]:
# Translation function
def translate_sentence(input_sentence):
    input_seq = pad_sequences(input_tokenizer.texts_to_sequences([input_sentence]), maxlen=max_input_len, padding='post')
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_tokenizer.word_index.get('\t', 1)  # Fix applied here

    output_sentence = ''
    for _ in range(max_target_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = {index: char for char, index in target_tokenizer.word_index.items()}.get(sampled_token_index, '')

        if sampled_char == '\n' or sampled_char == '':
            break

        output_sentence += sampled_char

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return output_sentence

In [13]:
# Test translation
print("English: hello")
print("Tamil Translation:", translate_sentence("hello"))

print("English: good morning")
print("Tamil Translation:", translate_sentence("good morning"))

print("English: thank you")
print("Tamil Translation:", translate_sentence("thank you"))

English: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 410ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
Tamil Translation: வணக்கம்
English: good morning
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/