In [5]:
import json

path = '/content/drive/MyDrive/HTIC - Assignments/Dataset/hin/hin_train.json'  # adjust as needed
data = []

# Read line by line
with open(path, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

# Check a few examples
print(len(data))
print(data[0])


1299155
{'unique_identifier': 'hin1', 'native word': 'जन्मदिवस', 'english word': 'janamdivas', 'source': 'Dakshina', 'score': None}


In [6]:
from collections import Counter

def build_vocab(pairs):
    src_chars = Counter()  # Romanized (English letters)
    tgt_chars = Counter()  # Devanagari
    for p in pairs:
        src_chars.update(list(p["english word"]))
        tgt_chars.update(list(p["native word"]))

    src_vocab = ['<PAD>', '<SOS>', '<EOS>'] + sorted(src_chars)
    tgt_vocab = ['<PAD>', '<SOS>', '<EOS>'] + sorted(tgt_chars)

    src_to_ix = {ch: i for i, ch in enumerate(src_vocab)}
    tgt_to_ix = {ch: i for i, ch in enumerate(tgt_vocab)}
    ix_to_tgt = {i: ch for ch, i in tgt_to_ix.items()}

    return src_to_ix, tgt_to_ix, ix_to_tgt

src_to_ix, tgt_to_ix, ix_to_tgt = build_vocab(data)

print(f"Source vocab size: {len(src_to_ix)}")
print(f"Target vocab size: {len(tgt_to_ix)}")


Source vocab size: 29
Target vocab size: 72


In [7]:
def encode_sequence(text, mapping):
    return [mapping['<SOS>']] + [mapping[ch] for ch in text if ch in mapping] + [mapping['<EOS>']]

encoded_data = [
    (encode_sequence(d["english word"], src_to_ix),
     encode_sequence(d["native word"], tgt_to_ix))
    for d in data
]

print(f"Encoded {len(encoded_data)} pairs")
print("Example input encoding:", encoded_data[0][0])
print("Example target encoding:", encoded_data[0][1])


Encoded 1299155 pairs
Example input encoding: [1, 12, 3, 16, 3, 15, 6, 11, 24, 3, 21, 2]
Example target encoding: [1, 26, 38, 68, 43, 36, 56, 48, 51, 2]


In [8]:
!pip install torch
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# ---- Step 1: Pad and prepare dataset ----
class TransliterationDataset(Dataset):
    def __init__(self, data_pairs):
        self.data_pairs = data_pairs

    def __len__(self):
        return len(self.data_pairs)

    def __getitem__(self, idx):
        src_seq, tgt_seq = self.data_pairs[idx]
        return torch.tensor(src_seq, dtype=torch.long), torch.tensor(tgt_seq, dtype=torch.long)

# Collate function for DataLoader
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=src_to_ix['<PAD>'], batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=tgt_to_ix['<PAD>'], batch_first=True)
    return src_batch, tgt_batch

# Use a small sample for quick testing (you can increase later)
sampled_data = encoded_data[:5000]

dataset = TransliterationDataset(sampled_data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

for src, tgt in dataloader:
    print("Source batch shape:", src.shape)
    print("Target batch shape:", tgt.shape)
    break


Source batch shape: torch.Size([32, 16])
Target batch shape: torch.Size([32, 13])


In [9]:
import torch.nn as nn

class EncoderRNN(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, cell_type='gru'):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        if cell_type == 'gru':
            self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        else:
            self.rnn = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.hidden_dim = hidden_dim

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, hidden = self.rnn(embedded)
        return hidden


class DecoderRNN(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim, cell_type='gru'):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, embed_dim)
        if cell_type == 'gru':
            self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        else:
            self.rnn = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token.unsqueeze(1))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        hidden = self.encoder(src)
        input_token = tgt[:, 0]  # <SOS> token

        for t in range(1, tgt_len):
            output, hidden = self.decoder(input_token, hidden)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input_token = tgt[:, t] if teacher_force else top1

        return outputs


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_dim = len(src_to_ix)
output_dim = len(tgt_to_ix)
embed_dim = 256
hidden_dim = 512

encoder = EncoderRNN(input_dim, embed_dim, hidden_dim)
decoder = DecoderRNN(output_dim, embed_dim, hidden_dim)
model = Seq2Seq(encoder, decoder, device).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=tgt_to_ix['<PAD>'])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

sampled_data = encoded_data[:30000]
n_epochs = 20

for epoch in range(n_epochs):
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)
        output_dim = output.shape[-1]

        loss = criterion(output[:, 1:].reshape(-1, output_dim), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {total_loss/len(dataloader):.4f}")


Epoch [1/20], Loss: 2.7156
Epoch [2/20], Loss: 1.6927
Epoch [3/20], Loss: 1.2587
Epoch [4/20], Loss: 0.9923
Epoch [5/20], Loss: 0.8100
Epoch [6/20], Loss: 0.6634
Epoch [7/20], Loss: 0.4992
Epoch [8/20], Loss: 0.3933
Epoch [9/20], Loss: 0.3069
Epoch [10/20], Loss: 0.2259
Epoch [11/20], Loss: 0.1425
Epoch [12/20], Loss: 0.0937
Epoch [13/20], Loss: 0.0619
Epoch [14/20], Loss: 0.0502
Epoch [15/20], Loss: 0.0409
Epoch [16/20], Loss: 0.0311
Epoch [17/20], Loss: 0.0268
Epoch [18/20], Loss: 0.0292
Epoch [19/20], Loss: 0.0414
Epoch [20/20], Loss: 0.1212


In [11]:
def decode_sequence(seq, ix_to_char):
    return ''.join([ix_to_char[i] for i in seq if i not in [tgt_to_ix['<PAD>'], tgt_to_ix['<SOS>'], tgt_to_ix['<EOS>']]])

model.eval()
with torch.no_grad():
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        output = model(src, tgt, teacher_forcing_ratio=0.0)
        preds = output.argmax(2)
        print("Romanized:", decode_sequence(src[0].cpu().numpy(), {v: k for k, v in src_to_ix.items()}))
        print("Expected:", decode_sequence(tgt[0].cpu().numpy(), ix_to_tgt))
        print("Predicted:", decode_sequence(preds[0].cpu().numpy(), ix_to_tgt))
        break


Romanized: ganwai
Expected: गंवाई
Predicted: गंवाई


In [12]:
# --- Step 8: Interactive prediction ---
def transliterate_word(model, word, src_to_ix, tgt_to_ix, ix_to_tgt, device, max_len=30):
    model.eval()

    # Encode input sequence
    input_seq = [src_to_ix['<SOS>']] + [src_to_ix[ch] for ch in word if ch in src_to_ix] + [src_to_ix['<EOS>']]
    input_tensor = torch.tensor(input_seq, dtype=torch.long).unsqueeze(0).to(device)

    # Encode
    hidden = model.encoder(input_tensor)

    # Start decoding
    input_token = torch.tensor([tgt_to_ix['<SOS>']], dtype=torch.long).to(device)
    decoded_chars = []

    for _ in range(max_len):
        output, hidden = model.decoder(input_token, hidden)
        pred_token = output.argmax(1).item()

        if pred_token == tgt_to_ix['<EOS>']:
            break

        decoded_chars.append(ix_to_tgt[pred_token])
        input_token = torch.tensor([pred_token], dtype=torch.long).to(device)

    return ''.join(decoded_chars)

# Test manually
test_words = ["ghar", "janamdivas", "pustak", "ladka", "dil"]
for w in test_words:
    print(f"{w} → {transliterate_word(model, w, src_to_ix, tgt_to_ix, ix_to_tgt, device)}")


ghar → घार
janamdivas → जन्दिमिस
pustak → पुस्टका
ladka → लक्ड़ा
dil → दिल


In [13]:
while True:
    user_input = input("Enter a Romanized Hindi word (or 'quit' to stop): ").strip()
    if user_input.lower() == "quit":
        break
    result = transliterate_word(model, user_input, src_to_ix, tgt_to_ix, ix_to_tgt, device)
    print(f"🔤 {user_input} → {result}\n")


Enter a Romanized Hindi word (or 'quit' to stop): quit


In [14]:
from tqdm import tqdm

# Create reverse vocab for target
tgt_vocab = type('', (), {})()  # simple object with .itos attribute
tgt_vocab.itos = ix_to_tgt

# ✅ Accuracy Evaluation Function
def evaluate_accuracy(model, dataloader, tgt_vocab, device='cuda'):
    model.eval()
    total_chars = 0
    correct_chars = 0
    total_words = 0
    correct_words = 0

    with torch.no_grad():
        for src, tgt in tqdm(dataloader, desc="Evaluating"):
            src, tgt = src.to(device), tgt.to(device)
            outputs = model(src, tgt, teacher_forcing_ratio=0.0)
            preds = outputs.argmax(2)

            for i in range(len(preds)):
                pred_seq = [tgt_vocab.itos[idx] for idx in preds[i].cpu().numpy()]
                true_seq = [tgt_vocab.itos[idx] for idx in tgt[i].cpu().numpy()]

                # Remove special tokens
                for tok in ['<PAD>', '<SOS>', '<EOS>']:
                    if tok in pred_seq:
                        pred_seq = pred_seq[:pred_seq.index(tok)]
                    if tok in true_seq:
                        true_seq = true_seq[:true_seq.index(tok)]

                # --- Character-level accuracy ---
                min_len = min(len(pred_seq), len(true_seq))
                correct_chars += sum(1 for a, b in zip(pred_seq[:min_len], true_seq[:min_len]) if a == b)
                total_chars += max(len(pred_seq), len(true_seq))

                # --- Word-level accuracy ---
                if ''.join(pred_seq) == ''.join(true_seq):
                    correct_words += 1
                total_words += 1

    char_acc = correct_chars / total_chars if total_chars > 0 else 0
    word_acc = correct_words / total_words if total_words > 0 else 0

    print(f"\n✅ Character Accuracy: {char_acc * 100:.2f}%")
    print(f"✅ Word Accuracy: {word_acc * 100:.2f}%")

    return char_acc, word_acc


# ✅ Call the function
char_acc, word_acc = evaluate_accuracy(model, dataloader, tgt_vocab, device=device)


Evaluating: 100%|██████████| 157/157 [00:10<00:00, 15.54it/s]


✅ Character Accuracy: 0.00%
✅ Word Accuracy: 100.00%



