In [74]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
import os
import pandas as pd

In [76]:
data_path = "/content/drive/MyDrive/2025 - 6CS012 - AI and ML - Student/Week9/npi.txt"

In [77]:
root_path = "/content/drive/MyDrive/2025 - 6CS012 - AI and ML - Student/Week9"

In [78]:
# Read the .txt file into a DataFrame
lines = pd.read_table(data_path, names=['source', 'target', 'comments'])

# Drop the 'comments' column if it exists
if 'comments' in lines.columns:
    lines = lines.drop(columns=['comments'])

# Path to save the cleaned CSV file
csv_path = os.path.join("/content/drive/MyDrive/2025 - 6CS012 - AI and ML - Student/Week9", "raw_data.csv")

# Save as CSV if it doesn't already exist
if not os.path.exists(csv_path):
    lines.to_csv(csv_path, index=False, encoding='utf-8')
    print(f"✅ CSV file saved to: {csv_path}")
else:
    print(f"⚠️ CSV file already exists at: {csv_path}")


⚠️ CSV file already exists at: /content/drive/MyDrive/2025 - 6CS012 - AI and ML - Student/Week9/raw_data.csv


In [79]:
raw_data = pd.read_csv("/content/drive/MyDrive/2025 - 6CS012 - AI and ML - Student/Week9/raw_data.csv")
raw_data.sample(6)

Unnamed: 0,source,target
1021,I'm also from Australia.,म पनि अस्ट्रेलिया बाट हुँ ।
1584,The attackers escaped easily.,आक्रमणकारीहरू सजिलै भागे।
1644,I don't have a microwave oven.,मसँग माइक्रोवेभ छैन।
1344,Can you give me an example?,के तपाईं मलाई एउटा उदाहरण दिन सक्नुहुन्छ?
413,What did you say?,तपाईले के भन्नु भयो ?
1893,You need to respect the elderly.,ज्येष्ठ नागरिकलाई सम्मान गर्नुपर्छ।


# Clean, Normalize and Prepare Target Sentences.

In [80]:
import re
import string
import os
from string import digits

def clean_text_data(df, output_path=None):
    """
    Cleans source and target text columns in a DataFrame for translation tasks.

    Parameters:
    -----------
    df : pandas.DataFrame
        A DataFrame with 'source' and 'target' columns.
        - 'source': English sentences
        - 'target': Translated Nepali sentences

    output_path : str, optional
        If provided, saves the cleaned DataFrame as CSV with 'cleaned_source' and 'cleaned_target'.
        Will not overwrite if file already exists.

    Returns:
    --------
    pandas.DataFrame
        Cleaned data with minimal changes to preserve sentence meaning.
    """

    # Lowercase both columns
    df.source = df.source.apply(lambda x: x.lower())
    df.target = df.target.apply(lambda x: x.lower())

    # Remove stray apostrophes or quotes
    df.source = df.source.apply(lambda x: re.sub(r"[\"’‘“”']", '', x))
    df.target = df.target.apply(lambda x: re.sub(r"[\"’‘“”']", '', x))

    # Remove digits only
    df.source = df.source.apply(lambda x: re.sub(r"\d+", '', x))
    df.target = df.target.apply(lambda x: re.sub(r"\d+", '', x))

    # Normalize whitespace
    df.source = df.source.apply(lambda x: re.sub(r"\s+", " ", x.strip()))
    df.target = df.target.apply(lambda x: re.sub(r"\s+", " ", x.strip()))

    # Add START_ and _END to target text
    df.target = df.target.apply(lambda x: f"START_ {x} _END")

    # Rename the cleaned columns
    df.rename(columns={"source": "cleaned_source", "target": "cleaned_target"}, inplace=True)

    # Save cleaned file (if not exists)
    if output_path:
        if not os.path.exists(output_path):
            df.to_csv(output_path, index=False, encoding='utf-8')
            print(f"✅ Cleaned data saved to: {output_path}")
        else:
            print(f"⚠️ File already exists. Skipping save: {output_path}")

    return df


In [81]:
cleaned_data_path = os.path.join(root_path, "cleaned_data_translation.csv")
cleaned_lines = clean_text_data(raw_data, cleaned_data_path)
# Check the column names
print(cleaned_lines.columns)

⚠️ File already exists. Skipping save: /content/drive/MyDrive/2025 - 6CS012 - AI and ML - Student/Week9/cleaned_data_translation.csv
Index(['cleaned_source', 'cleaned_target'], dtype='object')


### For sanity - ReLoad the Cleaned Dataset.

For consistency, we reload the dataset from the saved CSV. The earlier extraction and saving steps will not be rerun moving forward

In [82]:
cleaned_data = pd.read_csv("/content/drive/MyDrive/2025 - 6CS012 - AI and ML - Student/Week9/cleaned_data_translation.csv")

In [83]:
cleaned_data.sample(6)

Unnamed: 0,cleaned_source,cleaned_target
2360,tom is all alone with no one to talk to.,START_ टम एक्लै छ जससँग कुरा गर्न कोही छैन। _END
2153,tom doesnt like milk in his coffee.,START_ टमलाई आफ्नो कफीमा दूध मन पर्दैन। _END
2398,"without me, you wont be able to do that.","START_ म बिना, तपाईं त्यो गर्न सक्षम हुनुहुने ..."
1871,tom is still too young to drive.,START_ टम अझै पनि ड्राइभ गर्न को लागी धेरै जवा...
1474,id rather die than give up.,START_ म हार मान्नु भन्दा मर्न रुचाउँछु । _END
1206,he used to be a gentleman.,START_ उहाँ पहिले सज्जन हुनुहुन्थ्यो। _END


### Vocabulary Extractions:

We put all the words from source[English] to a list called source vocabulary.

We put all the words from target[Nepali] to a list called target vocabulary.

In [84]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter

In [85]:
all_source_words = set()
for source in cleaned_data.cleaned_source:
    for word in source.split():
        all_source_words.add(word)

all_target_words = set()
for target in cleaned_data.cleaned_target:
    for word in target.split():
        all_target_words.add(word)

source_words = sorted(list(all_source_words))
target_words = sorted(list(all_target_words))
print(len(target_words))

3269


### Sentence Length Calculation:

Finding longest sentence both in Source and Target.


In [86]:
#Find maximum sentence length in  the source and target data
source_length_list=[]
for l in cleaned_data.cleaned_source:
    source_length_list.append(len(l.split(' ')))
max_source_length= max(source_length_list)
print(" Max length of the source sentence",max_source_length)
target_length_list=[]
for l in cleaned_data.cleaned_target:
    target_length_list.append(len(l.split(' ')))
max_target_length= max(target_length_list)
print(" Max length of the target sentence",max_target_length)

 Max length of the source sentence 25
 Max length of the target sentence 22


### Word - to - Index and Index - to - Word Mapping

Creating a Look Up table.
  1.   We create a dicitionary word2indx both for source and target.
  2.   We will also Creata reverse dicitionary indx2word for both source and target.

In [87]:
# Define special tokens
PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'

# Create word-to-index dictionaries
source_word2idx = {PAD_TOKEN: 0, UNK_TOKEN: 1} | dict([(word, i+2) for i, word in enumerate(source_words)])
target_word2idx = {PAD_TOKEN: 0, UNK_TOKEN: 1} | dict([(word, i+2) for i, word in enumerate(target_words)])

# Create index-to-word dictionaries
source_idx2word = {i: word for word, i in source_word2idx.items()}
target_idx2word = {i: word for word, i in target_word2idx.items()}

# Check if the dictionaries have been properly created
print(source_word2idx)
print(len(target_word2idx))
print(source_idx2word)
print(len(target_idx2word))

{'<PAD>': 0, '<UNK>': 1, ',': 2, '.': 3, ':': 4, ':.': 5, '?': 6, 'a': 7, 'able': 8, 'aboard.': 9, 'about': 10, 'about?': 11, 'above': 12, 'abroad': 13, 'abroad.': 14, 'abused': 15, 'accept': 16, 'accept.': 17, 'accepted': 18, 'accident.': 19, 'accountant?': 20, 'accurate': 21, 'accurately': 22, 'across': 23, 'act': 24, 'active.': 25, 'actor?': 26, 'actually': 27, 'actually,': 28, 'add?': 29, 'addict.': 30, 'address.': 31, 'admit': 32, 'admitted': 33, 'advance': 34, 'advance.': 35, 'adventures': 36, 'advice,': 37, 'advice.': 38, 'afraid': 39, 'after': 40, 'afternoon.': 41, 'afternoon?': 42, 'again': 43, 'again,': 44, 'again.': 45, 'again?': 46, 'age.': 47, 'aggressive.': 48, 'ago.': 49, 'agree': 50, 'agreed': 51, 'ahead': 52, 'airport': 53, 'alcohol?': 54, 'alice': 55, 'all': 56, 'all.': 57, 'all?': 58, 'allergic': 59, 'allergies.': 60, 'allowed': 61, 'almost': 62, 'alone': 63, 'alone!': 64, 'alone.': 65, 'alone?': 66, 'along': 67, 'already': 68, 'also': 69, 'alternative.': 70, 'always

### Shuffle and Split:

In [88]:
#Shuffle the data
lines = shuffle(cleaned_data)
# Train - Test Split
X, y = cleaned_data.cleaned_source, cleaned_data.cleaned_target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
X_train.shape, X_test.shape

((2420,), (269,))

In [89]:
# Input tokens for encoder
num_encoder_tokens=len(source_word2idx)
# Input tokens for decoder zero padded
num_decoder_tokens=len(target_idx2word)
print(num_decoder_tokens)

3271


## Generate in Batch:

To manage our memory we will create and input data pipeline in batches.

In [90]:
def generate_batch(X, y, batch_size=128):
    while True:
        for j in range(0, len(X), batch_size):
            batch_X = X[j:j + batch_size]
            batch_y = y[j:j + batch_size]

            encoder_input_data = np.zeros((len(batch_X), max_source_length), dtype='float32')
            decoder_input_data = np.zeros((len(batch_X), max_target_length), dtype='float32')
            decoder_target_data = np.zeros((len(batch_X), max_target_length, num_decoder_tokens), dtype='float32')

            for i, (input_text, target_text) in enumerate(zip(batch_X, batch_y)):
                input_seq = [source_word2idx.get(word, source_word2idx[UNK_TOKEN]) for word in input_text.split()]
                target_seq = [target_word2idx.get(word, target_word2idx[UNK_TOKEN]) for word in target_text.split()]

                encoder_input_data[i] = pad_sequences([input_seq], maxlen=max_source_length, padding='post')[0]
                decoder_input_data[i] = pad_sequences([target_seq], maxlen=max_target_length, padding='post')[0]

                for t in range(1, len(target_seq)):
                    decoder_target_data[i, t - 1, target_seq[t]] = 1.

            # Yield as expected structure: ((inputs), targets)
            yield ((encoder_input_data, decoder_input_data), decoder_target_data)

In [91]:
def create_tf_dataset(X, y, batch_size=128):
    output_signature = (
        (tf.TensorSpec(shape=(None, max_source_length), dtype=tf.float32),  # encoder_input_data
         tf.TensorSpec(shape=(None, max_target_length), dtype=tf.float32)),  # decoder_input_data
        tf.TensorSpec(shape=(None, max_target_length, num_decoder_tokens), dtype=tf.float32)  # decoder_target_data
    )
    return tf.data.Dataset.from_generator(
        lambda: generate_batch(X, y, batch_size),  # Lambda to call the generator function
        output_signature=output_signature  # Defining the output signature for the dataset
    )


# Model Building:

1. encoder inputs: The 2D array will be of shape (batch_size, max source sentence length). For a batch_size of 128 and a max source sentence length of 47, the shape of encoder_input will be (128,47)

2. decoder inputs: The 2D array will be of shape (batch_size, max target sentence length). For a batch_size of 128 and a max target sentence length of 55, the shape of decoder inputs will be (128,55)

3. decoder outputs: The 3D array will be of shape (batch_size, max target sentence length, number of unique words in target sentences). For a batch_size of 128 and a max target sentence length of 55, the shape of decoder output will be (128,55, 27200).

## Encoder Architecture:
Encoder encodes the input sentence.
1. It takes the input source tokens from input layer.
2. Embedding layer then translates sparse vectors into a dense lower dimesional space preserving teh semantic realtionships.
3. Create the LSTM layer and only set return_state to True, because we want hidden state and cell state, as an input to decoder.

In [92]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 32
epochs = 100
latent_dim=256

In [93]:
def define_encoder(input_shape, num_encoder_tokens, latent_dim):
    """
    Defines the encoder architecture for a sequence-to-sequence model.

    The encoder processes input sequences through an embedding layer and LSTM,
    returning the final states that capture the encoded information.
    Parameters:
    -----------
    input_shape : tuple
        Shape of the input tensor (max_sequence_length,) for variable-length sequences
    num_encoder_tokens : int
        Size of the source vocabulary (including special tokens)
    latent_dim : int
        Dimensionality of the embedding and LSTM layers
    Returns:
    --------
    tuple: (encoder_inputs, encoder_states)
        encoder_inputs : keras.Input
            Input layer for the encoder
        encoder_states : list
            Final states [hidden_state, cell_state] from the LSTM
    """
    encoder_inputs = Input(shape=input_shape, name='encoder_inputs')
    enc_emb = Embedding(num_encoder_tokens, latent_dim, mask_zero=True, name='encoder_embedding')(encoder_inputs)
    encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
    encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
    encoder_states = [state_h, state_c]

    return encoder_inputs, encoder_states

## Decoder Architecture.

1. Decoder uses hidden state and cell state from encoder and from embedding layer as an input.

2. Decoder returns output sentence and also hidden and cell states.

3. The final layer in decoder is linear layer(dense) with softmax activation function used for predictions.


In [94]:
def define_decoder(latent_dim, num_decoder_tokens, encoder_states, max_target_length):
    """
    Defines the decoder architecture for a sequence-to-sequence model.

    The decoder processes target sequences through an embedding layer and LSTM,
    using the encoder states as initial state, and outputs probability distributions
    over the target vocabulary via a dense softmax layer.

    Parameters:
    -----------
    latent_dim : int
        Dimensionality of the embedding and LSTM layers (must match encoder)
    num_decoder_tokens : int
        Size of the target vocabulary (including special tokens)
    encoder_states : list
        Final states [hidden_state, cell_state] from the encoder LSTM
    max_target_length : int
        Maximum length of target sequences (for shape reference)

    Returns:
    --------
    tuple: (decoder_inputs, decoder_outputs)
        decoder_inputs : keras.Input
            Input layer for the decoder (teacher forcing inputs)
        decoder_outputs : keras.Layer
            Output tensor containing sequence of vocabulary probabilities
    """
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True, name='decoder_embedding')
    dec_emb = dec_emb_layer(decoder_inputs)

    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
    decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

    decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
    decoder_outputs = decoder_dense(decoder_outputs)

    return decoder_inputs, decoder_outputs, dec_emb_layer, decoder_lstm, decoder_dense

### Building Seq - to - Seq Model:
This function creates a complete model that:

1. Encodes input sequences into context vectors
2. Decodes the context vectors into target sequences
3. Outputs probability distributions over the target vocabulary

In [95]:
def build_seq2seq_model(input_shape, num_encoder_tokens, num_decoder_tokens, latent_dim, max_target_length):
    """
    Constructs an end-to-end sequence-to-sequence model combining encoder and decoder.
    Parameters:
    -----------
    input_shape : tuple
        Shape of the input sequences (max_sequence_length,)
    num_encoder_tokens : int
        Size of the source vocabulary (including special tokens)
    num_decoder_tokens : int
        Size of the target vocabulary (including special tokens)
    latent_dim : int
        Dimensionality of the embedding and LSTM layers
    max_target_length : int
        Maximum length of target sequences (for reference)

    Returns:
    --------
    keras.Model
        A compiled seq2seq model with encoder and decoder components
    """
    encoder_inputs, encoder_states = define_encoder(input_shape, num_encoder_tokens, latent_dim)
    decoder_inputs, decoder_outputs, dec_emb_layer, decoder_lstm, decoder_dense = define_decoder(latent_dim, num_decoder_tokens, encoder_states, max_target_length)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model, encoder_inputs, decoder_inputs, encoder_states, dec_emb_layer, decoder_lstm, decoder_dense

In [96]:
# Create dataset
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

train_dataset = create_tf_dataset(X_train, y_train, batch_size=batch_size)
val_dataset = create_tf_dataset(X_test, y_test, batch_size=batch_size)

# Test the first batch
for batch in train_dataset.take(1):
    print(f"Input data shape: {batch[0][0].shape}, {batch[0][1].shape}")  # encoder_input_data, decoder_input_data
    print(f"Target data shape: {batch[1].shape}")  # decoder_target_data

Input data shape: (32, 25), (32, 22)
Target data shape: (32, 22, 3271)


In [97]:
# Prepare the dataset for training
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)  # Optimizing for performance
val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)

# Build and compile the model
input_shape = (None,)  # Variable-length input sequence (e.g., (None,))
latent_dim = 256  # Latent dimension for LSTM
model, encoder_inputs, decoder_inputs, encoder_states, dec_emb_layer, decoder_lstm, decoder_dense = build_seq2seq_model(
    input_shape=(max_source_length,),
    num_encoder_tokens=num_encoder_tokens,
    num_decoder_tokens=num_decoder_tokens,
    latent_dim=latent_dim,
    max_target_length=max_target_length
)
model.summary()

In [98]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# 1. Define callbacks
callbacks = [
    # Stop training if val_loss doesn't improve for 3 consecutive epochs
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1),

    # Save the model with the best validation accuracy
    ModelCheckpoint('best_seq2seq_model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
]

# 2. Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# 3. Training parameters
train_samples = len(X_train)
val_samples = len(X_test)

steps_per_epoch = train_samples // batch_size
validation_steps = val_samples // batch_size

# 4. Train the model with callbacks
model.fit(
    train_dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    validation_data=val_dataset,
    validation_steps=validation_steps,
    callbacks=callbacks
)

# 5. Save final model (optional, in case best wasn't triggered)
model.save('final_seq2seq_model.h5')

Epoch 1/100
[1m73/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.0437 - loss: 6.1739
Epoch 1: val_accuracy improved from -inf to 0.05433, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 31ms/step - accuracy: 0.0440 - loss: 6.1506 - val_accuracy: 0.0543 - val_loss: 5.0801
Epoch 2/100
[1m73/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.0546 - loss: 4.8939
Epoch 2: val_accuracy improved from 0.05433 to 0.05522, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.0546 - loss: 4.8897 - val_accuracy: 0.0552 - val_loss: 4.9793
Epoch 3/100
[1m73/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.0556 - loss: 4.6676
Epoch 3: val_accuracy improved from 0.05522 to 0.05682, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - accuracy: 0.0556 - loss: 4.6645 - val_accuracy: 0.0568 - val_loss: 4.9326
Epoch 4/100
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.0578 - loss: 4.4755
Epoch 4: val_accuracy improved from 0.05682 to 0.05859, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 34ms/step - accuracy: 0.0578 - loss: 4.4747 - val_accuracy: 0.0586 - val_loss: 4.8479
Epoch 5/100
[1m73/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - accuracy: 0.0623 - loss: 4.2739
Epoch 5: val_accuracy improved from 0.05859 to 0.06286, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.0624 - loss: 4.2715 - val_accuracy: 0.0629 - val_loss: 4.7494
Epoch 6/100
[1m73/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.0709 - loss: 4.0728
Epoch 6: val_accuracy improved from 0.06286 to 0.06463, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.0710 - loss: 4.0701 - val_accuracy: 0.0646 - val_loss: 4.6866
Epoch 7/100
[1m73/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.0776 - loss: 3.8757
Epoch 7: val_accuracy improved from 0.06463 to 0.06516, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 34ms/step - accuracy: 0.0776 - loss: 3.8735 - val_accuracy: 0.0652 - val_loss: 4.6591
Epoch 8/100
[1m73/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.0816 - loss: 3.6941
Epoch 8: val_accuracy improved from 0.06516 to 0.06623, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 34ms/step - accuracy: 0.0817 - loss: 3.6921 - val_accuracy: 0.0662 - val_loss: 4.6453
Epoch 9/100
[1m74/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - accuracy: 0.0873 - loss: 3.5202
Epoch 9: val_accuracy improved from 0.06623 to 0.07013, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - accuracy: 0.0873 - loss: 3.5187 - val_accuracy: 0.0701 - val_loss: 4.6288
Epoch 10/100
[1m74/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 25ms/step - accuracy: 0.0920 - loss: 3.3462
Epoch 10: val_accuracy improved from 0.07013 to 0.07422, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.0920 - loss: 3.3449 - val_accuracy: 0.0742 - val_loss: 4.6230
Epoch 11/100
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.0973 - loss: 3.2042
Epoch 11: val_accuracy improved from 0.07422 to 0.07564, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.0973 - loss: 3.2033 - val_accuracy: 0.0756 - val_loss: 4.6262
Epoch 12/100
[1m73/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.1028 - loss: 3.0111
Epoch 12: val_accuracy improved from 0.07564 to 0.07848, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.1029 - loss: 3.0086 - val_accuracy: 0.0785 - val_loss: 4.6387
Epoch 13/100
[1m74/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.1098 - loss: 2.8116
Epoch 13: val_accuracy improved from 0.07848 to 0.08043, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - accuracy: 0.1098 - loss: 2.8104 - val_accuracy: 0.0804 - val_loss: 4.6721
Epoch 14/100
[1m74/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - accuracy: 0.1165 - loss: 2.6256
Epoch 14: val_accuracy did not improve from 0.08043
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.1165 - loss: 2.6248 - val_accuracy: 0.0801 - val_loss: 4.7863
Epoch 15/100
[1m74/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.1218 - loss: 2.4927
Epoch 15: val_accuracy improved from 0.08043 to 0.08381, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - accuracy: 0.1218 - loss: 2.4914 - val_accuracy: 0.0838 - val_loss: 4.7576
Epoch 16/100
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.1294 - loss: 2.3362
Epoch 16: val_accuracy improved from 0.08381 to 0.08594, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.1294 - loss: 2.3355 - val_accuracy: 0.0859 - val_loss: 4.7492
Epoch 17/100
[1m73/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.1380 - loss: 2.1837
Epoch 17: val_accuracy improved from 0.08594 to 0.08647, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.1381 - loss: 2.1821 - val_accuracy: 0.0865 - val_loss: 4.7919
Epoch 18/100
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.1486 - loss: 2.0178
Epoch 18: val_accuracy improved from 0.08647 to 0.08913, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.1486 - loss: 2.0175 - val_accuracy: 0.0891 - val_loss: 4.8607
Epoch 19/100
[1m73/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.1577 - loss: 1.8794
Epoch 19: val_accuracy did not improve from 0.08913
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.1579 - loss: 1.8785 - val_accuracy: 0.0890 - val_loss: 4.9370
Epoch 20/100
[1m73/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.1662 - loss: 1.7456
Epoch 20: val_accuracy improved from 0.08913 to 0.09002, saving model to best_seq2seq_model.h5




[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.1663 - loss: 1.7447 - val_accuracy: 0.0900 - val_loss: 4.9915
Epoch 20: early stopping
Restoring model weights from the end of the best epoch: 10.




In [99]:
# Encoder model (same as in training)
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup for inference
# These inputs will hold the LSTM states for each timestep
decoder_state_input_h = Input(shape=(latent_dim,), name='decoder_input_h')
decoder_state_input_c = Input(shape=(latent_dim,), name='decoder_input_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Embedding layer reused from training
dec_emb2 = dec_emb_layer(decoder_inputs)

# Reuse the LSTM layer and pass in the previous states
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2, initial_state=decoder_states_inputs
)
decoder_states2 = [state_h2, state_c2]

# Reuse the dense softmax layer
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Final inference decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)

### Function for Quick Predictions.

In [100]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of
    #target sequence with the start character.
    target_seq[0, 0] = target_word2idx['START_']
# Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
# Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word =target_idx2word[sampled_token_index]
        decoded_sentence += ' '+ sampled_word
# Exit condition: either hit max length
        # or find stop character.
        if (sampled_word == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True
# Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
# Update states
        states_value = [h, c]
    return decoded_sentence

In [101]:
train_gen = generate_batch(X_train, y_train, batch_size=1)
k = -1

In [102]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input Source sentence:', X_train[k:k+1].values[0])
print('Actual Target Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Target Translation:', decoded_sentence[:-4])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Input Source sentence: i like white roses better than red ones.
Actual Target Translation:  मलाई रातो गुलाब भन्दा सेतो गुलाब मन पर्छ। 
Predicted Target Translation:  मलाई थाहा छ मलाई थाहा छ टम गर्न पर्छ। 


In [103]:
!pip install gradio



In [107]:
import gradio as gr
import numpy as np
import tensorflow as tf

# ---- Helper Function to Decode Sequences ----
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1 with only the start token.
    target_seq = np.zeros((1, 1), dtype='int32')
    target_seq[0, 0] = target_word2idx.get('<PAD>')  # or <START> if you have one

    # Output sequence
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = target_idx2word.get(sampled_token_index, '<UNK>')

        if sampled_word == '<PAD>' or len(decoded_sentence.split()) > max_target_length:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

            # Update target sequence (of length 1)
            target_seq = np.zeros((1, 1), dtype='int32')
            target_seq[0, 0] = sampled_token_index

            # Update states
            states_value = [h, c]

    return decoded_sentence.strip()

# ---- Preprocessing for the input sentence ----
def preprocess_input(sentence):
    sequence = [source_word2idx.get(word, source_word2idx['<UNK>']) for word in sentence.lower().split()]
    sequence = pad_sequences([sequence], maxlen=max_source_length, padding='post')
    return sequence

# ---- Main function for Gradio ----
def translate_sentence(input_sentence):
    preprocessed_input = preprocess_input(input_sentence)
    translated_output = decode_sequence(preprocessed_input)
    return translated_output

# ---- Build Gradio Interface ----
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", neutral_hue="blue")) as demo:
    gr.Markdown(
        """<h1 style='text-align: center; color: #00BFFF;'>Seq2Seq Translator</h1>
        <h3 style='text-align: center; color: white;'>Enter a sentence to translate</h3>""",
    )
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Sentence", placeholder="Type your source sentence here...")
            translate_button = gr.Button("Translate", elem_id="translate-btn")
        with gr.Column():
            output_text = gr.Textbox(label="Translated Sentence", placeholder="Translation will appear here...")

    translate_button.click(fn=translate_sentence, inputs=input_text, outputs=output_text)

# ---- Custom CSS to set background ----
demo.launch(share=True, inline=False, server_name="0.0.0.0", server_port=7960)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2008a4e472f37908c0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


