In [1]:
!pip install tensorflow==2.12.0
!pip install pandas
!pip install gdown

Collecting tensorflow==2.12.0
  Downloading tensorflow-2.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.12.0)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloading keras-2.12.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting numpy<1.24,>=1.22 (from tensorflow==2.12.0)
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow==2.12.0)
  Downloading protobuf-4.25.6-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting tensorboard<2.13,>=2.12 (from tensorflow==2.12.0)
  Downloading tensorboard-2.12.3-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-estimator<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloading tensorflow_estimator-2.12.0-py2.py3-none-an



In [37]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, Dropout, Concatenate, Activation, Dot, Lambda
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [38]:
def load_data(file_path, num_samples=None):
    input_texts = []
    target_texts = []

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    for line in lines[:num_samples]:
        if not line.strip():
            continue

        parts = line.split('\t')
        if len(parts) >= 2:
            input_text = parts[1].strip().lower()
            target_text = parts[0].strip()

            if input_text and target_text:
                input_texts.append(input_text)
                target_texts.append('\t' + target_text + '\n')

    print(f"Loaded {len(input_texts)} samples from {file_path}")
    return input_texts, target_texts

train_input_texts, train_target_texts = load_data('hi.translit.sampled.train.tsv', num_samples=20000)
val_input_texts, val_target_texts = load_data('hi.translit.sampled.dev.tsv', num_samples=1000)
test_input_texts, test_target_texts = load_data('hi.translit.sampled.test.tsv', num_samples=1000)

Loaded 20000 samples from hi.translit.sampled.train.tsv
Loaded 1000 samples from hi.translit.sampled.dev.tsv
Loaded 1000 samples from hi.translit.sampled.test.tsv


In [39]:
def build_vocab(texts, min_count=2):
    char_counts = {}
    for text in texts:
        for char in text:
            char_counts[char] = char_counts.get(char, 0) + 1

    all_chars = [char for char, count in char_counts.items() if count >= min_count]
    all_chars = sorted(all_chars)
    all_chars.extend(['<UNK>', '<PAD>'])

    char2idx = {char: idx for idx, char in enumerate(all_chars)}
    idx2char = {idx: char for char, idx in char2idx.items()}
    return char2idx, idx2char

target_char2idx, target_idx2char = build_vocab(train_target_texts)
input_char2idx, input_idx2char = build_vocab(train_input_texts)

print(f"Input vocab size: {len(input_char2idx)}")
print(f"Target vocab size: {len(target_char2idx)}")

Input vocab size: 28
Target vocab size: 65


In [40]:
max_encoder_seq_length = max(len(txt) for txt in train_input_texts) + 2
max_decoder_seq_length = max(len(txt) for txt in train_target_texts) + 2

def vectorize(input_texts, target_texts):
    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length), dtype='float32')
    decoder_input_data = np.zeros((len(target_texts), max_decoder_seq_length), dtype='float32')
    decoder_target_data = np.zeros((len(target_texts), max_decoder_seq_length, len(target_char2idx)), dtype='float32')

    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t] = input_char2idx.get(char, input_char2idx['<UNK>'])
        for t in range(len(input_text), max_encoder_seq_length):
            encoder_input_data[i, t] = input_char2idx['<PAD>']

        for t, char in enumerate(target_text):
            decoder_input_data[i, t] = target_char2idx.get(char, target_char2idx['<UNK>'])
            if t > 0:
                decoder_target_data[i, t-1, target_char2idx.get(char, target_char2idx['<UNK>'])] = 1.0
        for t in range(len(target_text), max_decoder_seq_length):
            decoder_input_data[i, t] = target_char2idx['<PAD>']
            if t > 0:
                decoder_target_data[i, t-1, target_char2idx['<PAD>']] = 1.0

    return encoder_input_data, decoder_input_data, decoder_target_data

encoder_input_data, decoder_input_data, decoder_target_data = vectorize(train_input_texts, train_target_texts)
val_encoder_input_data, val_decoder_input_data, val_decoder_target_data = vectorize(val_input_texts, val_target_texts)
test_encoder_input_data, _, _ = vectorize(test_input_texts, test_target_texts)


In [41]:
embedding_dim = 256
hidden_dim = 512

# Custom Attention Layer
def attention_layer(decoder_hidden, encoder_output):
    # Shape: (batch, dec_seq_len, enc_seq_len)
    scores = Dot(axes=[2, 2])([decoder_hidden, encoder_output])
    attention_weights = Activation('softmax')(scores)
    context = Dot(axes=[2, 1])([attention_weights, encoder_output])
    return context

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(len(input_char2idx), embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(hidden_dim, return_sequences=True, return_state=True,
                   dropout=0.4, recurrent_dropout=0.4)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(len(target_char2idx), embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(hidden_dim, return_sequences=True, return_state=True,
                   dropout=0.4, recurrent_dropout=0.4)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Apply custom attention
attention_context = attention_layer(decoder_outputs, encoder_outputs)
decoder_concat = Concatenate(axis=-1)([decoder_outputs, attention_context])

# Output
decoder_dense = Dense(len(target_char2idx), activation='softmax')(decoder_concat)

model = Model([encoder_inputs, decoder_inputs], decoder_dense)
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()


In [21]:
history = model.fit(
    [X_train, decoder_input_train],
    decoder_output_train,
    batch_size=64,
    epochs=30,
    validation_data=([X_dev, decoder_input_dev], decoder_output_dev),
    verbose=1
)

Epoch 1/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.7069 - loss: 1.2913 - val_accuracy: 0.7648 - val_loss: 0.8550
Epoch 2/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.7625 - loss: 0.8656 - val_accuracy: 0.7893 - val_loss: 0.7465
Epoch 3/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7935 - loss: 0.7267 - val_accuracy: 0.8427 - val_loss: 0.5482
Epoch 4/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 8ms/step - accuracy: 0.8475 - loss: 0.5228 - val_accuracy: 0.8800 - val_loss: 0.4013
Epoch 5/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.8836 - loss: 0.3849 - val_accuracy: 0.9035 - val_loss: 0.3179
Epoch 6/30
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.9067 - loss: 0.3050 - val_accuracy: 0.9150 - val_loss: 0.2790
Epoch 7/30
[1m691/691[0

In [26]:
def decode_sequence(input_seq, encoder_model, decoder_model,
                   latin_tokenizer, devanagari_tokenizer, max_length=20):
    # Encode input
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Generate empty target sequence with START token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = devanagari_tokenizer.word_index['<start>']  # Changed from '' to '<start>'

    reverse_target_char_index = {i: char for char, i in devanagari_tokenizer.word_index.items()}

    decoded_sentence = []
    for _ in range(max_length):
        output_tokens, *states_value = decoder_model.predict(
            [target_seq] + states_value, verbose=0)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence.append(sampled_char)

        # Stop when we hit END token
        if sampled_char == '<end>':  # Changed from '' to '<end>'
            break

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    # Remove the end token if present
    if decoded_sentence and decoded_sentence[-1] == '<end>':
        decoded_sentence = decoded_sentence[:-1]

    return ''.join(decoded_sentence)
# Evaluate on test set
test_loss, test_acc = model.evaluate([X_test, decoder_input_test], decoder_output_test, verbose=0)
print(f"Test Accuracy: {test_acc:.4f}")


Test Accuracy: 0.9448


In [43]:
from IPython.display import display, Markdown

for i in range(5):
    input_seq = X_test[i:i+1]
    decoded = decode_sequence(
        input_seq, encoder_model, decoder_model,
        latin_tokenizer, devanagari_tokenizer)

    original_input = latin_tokenizer.sequences_to_texts([X_test[i]])[0]
    original_target = devanagari_tokenizer.sequences_to_texts([decoder_input_test[i]])[0]

    display(Markdown(f"""
**Sample {i+1}**
- **Input (Latin):** `{original_input}`
- **Target:** `{original_target}`
- **Predicted:** `{decoded}`
"""))
print(f"Test Accuracy: {test_acc:.4f}")


**Sample 1**  
- **Input (Latin):** `a n k`  
- **Target:** `<start> अ ं क <end>`  
- **Predicted:** `अंक`  



**Sample 2**  
- **Input (Latin):** `a n k a`  
- **Target:** `<start> अ ं क <end>`  
- **Predicted:** `अंका`  



**Sample 3**  
- **Input (Latin):** `a n k i t`  
- **Target:** `<start> अ ं क ि त <end>`  
- **Predicted:** `अंकित`  



**Sample 4**  
- **Input (Latin):** `a n a k o n`  
- **Target:** `<start> अ ं क ो ं <end>`  
- **Predicted:** `अनाकों`  



**Sample 5**  
- **Input (Latin):** `a n k h o n`  
- **Target:** `<start> अ ं क ो ं <end>`  
- **Predicted:** `अंखों`  


Test Accuracy: 0.9448
