In [None]:
# Numerical operations and array handling
import numpy as np

# DataFrame handling (dataset is assumed to be a pandas DataFrame)
import pandas as pd

# Converts text into integer sequences (word → index)
from tensorflow.keras.preprocessing.text import Tokenizer

# Makes all sequences same length by padding with zeros
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Core neural network layers
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Used to build models using Functional API (needed for encoder–decoder)
from tensorflow.keras.models import Model

In [None]:
def clean_text(text):
    """
    Minimal cleaning for translation tasks.
    We avoid removing punctuation aggressively because it
    can change sentence meaning in translation.
    """
    if not isinstance(text, str):
        return ""

    # Lowercase for consistency, strip removes extra spaces
    return text.lower().strip()

In [None]:
dataset = pd.read_csv('/content/Hindi_English_Truncated_Corpus.csv')
dataset = dataset.dropna().drop_duplicates()
dataset = dataset[dataset['source'] == 'ted'][['english_sentence','hindi_sentence']]

In [None]:
# Clean English sentences (encoder input)
dataset['english_sentence'] = dataset['english_sentence'].apply(clean_text)

# Add start_ and _end tokens to Hindi sentences
# Decoder learns where to start and stop generation
dataset['hindi_sentence'] = dataset['hindi_sentence'].apply(
    lambda x: 'start_ ' + clean_text(x) + ' _end'
)

In [None]:
# English tokenizer
# - num_words limits vocabulary size
# - <OOV> handles unseen words
eng_token = Tokenizer(num_words=15000, oov_token="<OOV>")
eng_token.fit_on_texts(dataset['english_sentence'])

# Hindi tokenizer
# - filters='' prevents accidental removal of Devanagari characters
hin_token = Tokenizer(num_words=15000, filters='', oov_token="<OOV>")
hin_token.fit_on_texts(dataset['hindi_sentence'])

In [None]:
# Convert English sentences into sequences of token IDs
eng_seq = eng_token.texts_to_sequences(dataset['english_sentence'])

# Convert Hindi sentences into sequences of token IDs
hin_seq = hin_token.texts_to_sequences(dataset['hindi_sentence'])

In [None]:
# Find maximum sequence lengths
max_eng_len = max(len(seq) for seq in eng_seq)
max_hin_len = max(len(seq) for seq in hin_seq)

# Pad English sequences (encoder input)
encoder_input = pad_sequences(
    eng_seq,
    maxlen=max_eng_len,
    padding='post'  # add zeros at the end
)

# Pad Hindi sequences (decoder input)
decoder_input = pad_sequences(
    hin_seq,
    maxlen=max_hin_len,
    padding='post'
)

In [None]:
# Create empty target array
# Shape: (samples, time_steps, 1)
decoder_target = np.zeros(
    (decoder_input.shape[0], decoder_input.shape[1], 1)
)

# Shift decoder input by one timestep (Teacher Forcing Learning)
# Decoder learns: given word_t → predict word_(t+1)
decoder_target[:, :-1, 0] = decoder_input[:, 1:]

In [None]:
# Vocabulary size = number of unique words + padding token
eng_vocab_size = len(eng_token.word_index) + 1
hin_vocab_size = len(hin_token.word_index) + 1

# Latent dimension controls embedding size and LSTM memory capacity
latent_dim = 256

In [None]:
# Encoder input receives a sequence of English token IDs
encoder_inputs = Input(shape=(None,))

# Embedding layer converts token IDs → dense vectors
# mask_zero=True ensures padding tokens are ignored
enc_emb = Embedding(
    eng_vocab_size,
    latent_dim,
    mask_zero=True
)(encoder_inputs)

# LSTM processes the embedded sequence
# return_state=True returns final hidden and cell states
_, state_h, state_c = LSTM(
    latent_dim,
    return_state=True
)(enc_emb)

# Encoder states summarize the entire input sentence
encoder_states = [state_h, state_c]

In [None]:
# Decoder input receives Hindi token IDs
decoder_inputs = Input(shape=(None,))

# Shared embedding layer for decoder
dec_emb_layer = Embedding(
    hin_vocab_size,
    latent_dim,
    mask_zero=True
)

# Convert decoder input tokens to vectors
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM generates output at each timestep
dec_lstm = LSTM(
    latent_dim,
    return_sequences=True,  # output at every timestep
    return_state=True
)

# Initialize decoder LSTM with encoder states
dec_outputs, _, _ = dec_lstm(
    dec_emb,
    initial_state=encoder_states
)

# Dense layer converts LSTM output → vocabulary probabilities
dec_dense = Dense(hin_vocab_size, activation='softmax')
dec_outputs = dec_dense(dec_outputs)

In [None]:
# Full training model (encoder + decoder)
model = Model(
    [encoder_inputs, decoder_inputs],
    dec_outputs
)

# Compile model
# - rmsprop works well for RNNs
# - sparse_categorical_crossentropy because targets are integer IDs
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [23]:
# Train using teacher forcing
model.fit(
    [encoder_input, decoder_input],
    decoder_target,
    batch_size=64,
    epochs=1,
    validation_split=0.2
)

[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3686s[0m 8s/step - accuracy: 0.6755 - loss: 6.9035 - val_accuracy: 0.7446 - val_loss: 5.7782


<keras.src.callbacks.history.History at 0x7f6fa86d7dd0>

In [None]:
# During inference, encoder outputs only the final states
encoder_model_inf = Model(
    encoder_inputs,
    encoder_states
)

In [None]:
# Inputs for previous decoder states
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

decoder_states_inputs = [
    decoder_state_input_h,
    decoder_state_input_c
]

# Embed the current decoder input token
dec_inf_emb = dec_emb_layer(decoder_inputs)

# Run one timestep of decoder LSTM
dec_outputs_inf, state_h_inf, state_c_inf = dec_lstm(
    dec_inf_emb,
    initial_state=decoder_states_inputs
)

# Convert output to word probabilities
decoder_outputs_inf = dec_dense(dec_outputs_inf)

# Inference decoder model
decoder_model_inf = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_inf, state_h_inf, state_c_inf]
)

In [26]:
# Map token IDs back to words for readable output
reverse_eng = {v: k for k, v in eng_token.word_index.items()}
reverse_hin = {v: k for k, v in hin_token.word_index.items()}

In [27]:
def translate(sentence):
    # Clean input sentence
    sentence = clean_text(sentence)

    # Convert sentence to token sequence
    seq = eng_token.texts_to_sequences([sentence])

    # Pad to encoder input length
    padded = pad_sequences(seq, maxlen=max_eng_len, padding='post')

    # Encode sentence → initial decoder states
    states = encoder_model_inf.predict(padded)

    # Start decoding with start_ token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hin_token.word_index['start_']

    decoded_words = []

    while True:
        # Predict next word
        output, h, c = decoder_model_inf.predict(
            [target_seq] + states
        )

        # Choose word with highest probability
        token_index = np.argmax(output[0, -1, :])
        word = reverse_hin.get(token_index, '')

        # Stop if end token or max length reached
        if word == '_end' or len(decoded_words) >= max_hin_len:
            break

        decoded_words.append(word)

        # Feed predicted word back into decoder
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = token_index

        # Update decoder states
        states = [h, c]

    return ' '.join(decoded_words)

In [28]:
print("English:", "And")
print("Hindi:",
      translate("And")
     )

English: And
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 326ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Hindi: और
