<a href="https://colab.research.google.com/github/therisbh/Machine_Translation/blob/main/Natural_machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install & Imports

In [None]:
!pip install -q sentencepiece datasets sacrebleu



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

import sentencepiece as spm
from datasets import load_dataset
import random


In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


## Load & Subset Dataset

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Fetch the `cfilt/iitb-english-hindi` English–Hindi parallel corpus via Hugging Face `datasets`.
dataset = load_dataset("cfilt/iitb-english-hindi")

In [None]:
# Peek at one raw record to confirm the schema: {"translation": {"en": ..., "hi": ...}}.
print(dataset["train"][0])

{'translation': {'en': 'Give your application an accessibility workout', 'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'}}


In [None]:
MAX_LINES = 150000

# Take the first MAX_LINES pairs with a single slice. Slicing a `datasets.Dataset`
# returns a dict of columns and stops at the end of the split, so this avoids
# MAX_LINES single-row lookups and does not raise if the split is shorter.
pairs = dataset["train"][:MAX_LINES]["translation"]

en_all = [pair["en"] for pair in pairs]
hi_all = [pair["hi"] for pair in pairs]

# Hold out 10% of the sentence pairs for evaluation; the fixed seed keeps the split reproducible.
en_train, en_test, hi_train, hi_test = train_test_split(
    en_all, hi_all, test_size=0.1, random_state=42
)

print("Train size:", len(en_train))
print("Test size:", len(en_test))


Train size: 135000
Test size: 15000


In [None]:
# Same record as printed before the split; the subsetting above only reads from `dataset`.
print(dataset["train"][0])

{'translation': {'en': 'Give your application an accessibility workout', 'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'}}


## Prepare Data for SentencePiece

In [None]:
# Dump the training sentences to one text file, one sentence per line, alternating
# English and Hindi, so a single tokenizer can be trained over both languages.
with open("spm_train.txt", "w", encoding="utf-8") as corpus:
    for pair in zip(en_train, hi_train):
        corpus.writelines(text.strip() + "\n" for text in pair)


## Train SentencePiece

In [None]:
# Train one shared BPE vocabulary on spm_train.txt, which holds both the English and
# the Hindi training sentences. Writes spm_bpe.model and spm_bpe.vocab.
spm.SentencePieceTrainer.train(
    input="spm_train.txt",
    model_prefix="spm_bpe",
    vocab_size=10000,
    model_type="bpe",
    character_coverage=1.0,
    # These special-token ids must stay in sync with the PAD/UNK/BOS/EOS constants
    # defined when the model is loaded.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3
)


## Load SentencePiece Model

In [None]:
sp = spm.SentencePieceProcessor()
sp.load("spm_bpe.model")

PAD, UNK, BOS, EOS = 0, 1, 2, 3

# The ids above are hard-coded copies of the trainer arguments. Fail fast if a stale or
# differently configured spm_bpe.model is on disk, instead of silently padding and
# framing sequences with the wrong ids.
assert (sp.pad_id(), sp.unk_id(), sp.bos_id(), sp.eos_id()) == (PAD, UNK, BOS, EOS), (
    "special-token ids in spm_bpe.model do not match PAD/UNK/BOS/EOS"
)

VOCAB_SIZE = sp.get_piece_size()

print("Vocab size:", VOCAB_SIZE)


Vocab size: 10000


## Dataset Class

In [None]:
class TranslationDataset(Dataset):
    """Parallel sentence pairs encoded as SentencePiece id tensors.

    Args:
        src: sequence of source-language sentences (strings).
        tgt: sequence of target-language sentences, aligned with ``src``.
        max_len: maximum encoded length per sentence, including BOS and EOS.
    """

    def __init__(self, src, tgt, max_len=30):
        self.src = src
        self.tgt = tgt
        self.max_len = max_len

    def encode(self, sentence):
        """Return ``[BOS] + piece ids + [EOS]`` for ``sentence``.

        Over-long sentences are shortened by dropping trailing piece ids, not by
        slicing the framed sequence, so the result always ends with EOS. The
        result always contains at least BOS and EOS, even when ``max_len < 2``.
        """
        # Reserve two slots for the BOS/EOS frame.
        budget = max(self.max_len - 2, 0)
        pieces = sp.encode(sentence, out_type=int)[:budget]
        return [BOS] + pieces + [EOS]

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        """Return the ``(source_ids, target_ids)`` tensor pair at ``idx``."""
        return (
            torch.tensor(self.encode(self.src[idx])),
            torch.tensor(self.encode(self.tgt[idx]))
        )


## Collate Function

In [None]:
def collate_fn(batch):
    """Pad a list of ``(src, tgt)`` tensor pairs into two batch-first tensors.

    Each side is right-padded with ``PAD`` to the longest sequence on that side.

    Returns:
        tuple: ``(src, tgt)`` padded tensors with the batch as the first dimension.
    """
    sources, targets = zip(*batch)
    padded_src, padded_tgt = (
        pad_sequence(seqs, batch_first=True, padding_value=PAD)
        for seqs in (sources, targets)
    )
    return padded_src, padded_tgt


## DataLoader

In [None]:
train_dataset = TranslationDataset(en_train, hi_train)
test_dataset  = TranslationDataset(en_test, hi_test)

# Shuffle only the training data; evaluation keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn
)

# `loader` used to wrap the raw Hugging Face `dataset` object with `collate_fn`, which
# expects tensor pairs, so it could not be iterated and its length was unrelated to the
# number of training batches. Keep the name for code that still refers to it, but point
# it at the real training loader.
loader = train_loader


## Encoder

In [None]:
class Encoder(nn.Module):
    """Embedding layer followed by a single-layer, batch-first GRU.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_dim: embedding width, which is also the GRU input size.
        hid_dim: GRU hidden size.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=PAD,
        )
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hid_dim, batch_first=True)

    def forward(self, x):
        """Embed the token ids ``x`` and return the GRU's ``(outputs, hidden)`` pair."""
        return self.rnn(self.embedding(x))


In [None]:
# Encoder for the vanilla model: 256-dim embeddings, 512-dim GRU hidden state.
# NOTE(review): the name `encoder` is rebound to a fresh network in the later
# "Initialize Model" cell.
encoder = Encoder(VOCAB_SIZE, 256, 512).to(device)
print(encoder)

Encoder(
  (embedding): Embedding(10000, 256, padding_idx=0)
  (rnn): GRU(256, 512, batch_first=True)
)


## Vanilla Decoder

In [None]:
class VanillaDecoder(nn.Module):
    """Attention-free GRU decoder: embedding -> GRU -> linear projection to the vocabulary.

    Args:
        vocab_size: size of the embedding table and of the output logits.
        emb_dim: embedding width, which is also the GRU input size.
        hid_dim: GRU hidden size.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=PAD,
        )
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hid_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hid_dim, out_features=vocab_size)

    def forward(self, x, hidden):
        """Run one decoding step on token ids ``x`` and return ``(logits, new_hidden)``."""
        step_out, next_hidden = self.rnn(self.embedding(x), hidden)
        # Drop the time axis (callers feed one token per step) before projecting.
        logits = self.fc(step_out.squeeze(1))
        return logits, next_hidden


In [None]:
vanilla_decoder = VanillaDecoder(VOCAB_SIZE, 256, 512).to(device)

# A single Adam optimiser updates the encoder and the vanilla decoder together.
optimizer_vanilla = optim.Adam(
    [*encoder.parameters(), *vanilla_decoder.parameters()],
    lr=0.001,
)

def train_epoch_vanilla():
    """Run one teacher-forced training epoch of the vanilla encoder–decoder.

    Uses the notebook-level ``encoder``, ``vanilla_decoder``, ``optimizer_vanilla``,
    ``criterion``, ``train_loader`` and ``device`` (``criterion`` must be defined
    before this function is called).

    Returns:
        float: the per-batch loss averaged over the batches in ``train_loader``,
        where each batch loss is the sum of the per-timestep cross-entropy values.
    """
    encoder.train()
    vanilla_decoder.train()
    total_loss = 0

    # Clip the gradient norm of the whole model. Clipping only the encoder would leave
    # the decoder's gradients unbounded.
    model_params = list(encoder.parameters()) + list(vanilla_decoder.parameters())

    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer_vanilla.zero_grad()

        # The encoder's final hidden state initialises the decoder.
        enc_out, hidden = encoder(src)
        input_tok = tgt[:, 0].unsqueeze(1)

        loss = 0
        for t in range(1, tgt.size(1)):
            output, hidden = vanilla_decoder(input_tok, hidden)
            loss += criterion(output, tgt[:, t])
            # Teacher forcing: feed the reference token, not the prediction.
            input_tok = tgt[:, t].unsqueeze(1)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_params, 1.0)
        optimizer_vanilla.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)


## Train Vanilla Model

In [None]:
# Token-level cross-entropy; positions whose target is PAD do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD)

In [None]:
print("Training Vanilla Encoder–Decoder")

# The reported loss is the per-batch sum over timesteps averaged over batches
# (see train_epoch_vanilla), so it is not a per-token value.
for epoch in range(10):
    loss = train_epoch_vanilla()
    print(f"[Vanilla] Epoch {epoch+1} | Loss: {loss:.4f}")


Training Vanilla Encoder–Decoder
[Vanilla] Epoch 1 | Loss: 79.4926
[Vanilla] Epoch 2 | Loss: 25.4579
[Vanilla] Epoch 3 | Loss: 13.3244
[Vanilla] Epoch 4 | Loss: 8.8935
[Vanilla] Epoch 5 | Loss: 6.7948
[Vanilla] Epoch 6 | Loss: 6.1329
[Vanilla] Epoch 7 | Loss: 5.3259
[Vanilla] Epoch 8 | Loss: 4.9544
[Vanilla] Epoch 9 | Loss: 4.8974
[Vanilla] Epoch 10 | Loss: 4.8000


In [None]:
def evaluate_token_accuracy(encoder, decoder, test_loader):
    """Measure position-wise token accuracy under greedy decoding.

    For every batch the decoder starts from the first target token and is then fed
    its own argmax prediction at each step. The prediction at step ``t`` is compared
    with the reference token at position ``t``; PAD positions are ignored.

    Args:
        encoder: module returning ``(encoder_outputs, hidden)`` for a source batch.
        decoder: a ``VanillaDecoder`` (called as ``decoder(tok, hidden)``) or an
            attention decoder (called as ``decoder(tok, hidden, encoder_outputs)``).
        test_loader: iterable of ``(src, tgt)`` batches.

    Returns:
        float: fraction of non-PAD target tokens predicted correctly, or ``0.0`` when
        there are no tokens to score.
    """
    encoder.eval()
    decoder.eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for src, tgt in test_loader:
            src, tgt = src.to(device), tgt.to(device)

            enc_out, hidden = encoder(src)
            input_tok = tgt[:, 0].unsqueeze(1)

            for t in range(1, tgt.size(1)):
                if isinstance(decoder, VanillaDecoder):
                    output, hidden = decoder(input_tok, hidden)
                else:
                    output, hidden = decoder(input_tok, hidden, enc_out)

                preds = output.argmax(1)
                mask = tgt[:, t] != PAD

                correct += (preds[mask] == tgt[:, t][mask]).sum().item()
                total += mask.sum().item()

                # Free-running decode: the next input is the model's own prediction.
                input_tok = preds.unsqueeze(1)

    # An empty loader (or targets with nothing but PAD after the first token) leaves
    # nothing to score; report 0.0 instead of raising ZeroDivisionError.
    if total == 0:
        return 0.0

    return correct / total


## Vanilla Test Evaluation

In [None]:
print("Evaluating Vanilla Encoder–Decoder on Test Set...")

# Accuracy is measured with the model's own predictions fed back as inputs
# (see evaluate_token_accuracy), not with teacher forcing.
vanilla_acc = evaluate_token_accuracy(
    encoder,
    vanilla_decoder,
    test_loader
)

print(f"Vanilla Seq2Seq Test Accuracy: {vanilla_acc * 100:.2f}%")


Evaluating Vanilla Encoder–Decoder on Test Set...
Vanilla Seq2Seq Test Accuracy: 3.64%


## Luong Attention

In [None]:
class LuongAttention(nn.Module):
    """Parameter-free dot-product attention over the encoder outputs."""

    def forward(self, decoder_hidden, encoder_outputs):
        """Return the attention-weighted sum of ``encoder_outputs``.

        Args:
            decoder_hidden: current decoder state, shape ``(B, H)``.
            encoder_outputs: encoder states, shape ``(B, T, H)``.

        Returns:
            Context vectors of shape ``(B, H)``.
        """
        # Dot product of the query with every encoder state: (B, T, H) x (B, H, 1) -> (B, T)
        query = decoder_hidden.unsqueeze(2)
        scores = torch.bmm(encoder_outputs, query).squeeze(2)

        # Normalise over the source positions.
        attn_weights = scores.softmax(dim=1)

        # Weighted sum of the encoder states: (B, 1, T) x (B, T, H) -> (B, 1, H)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)
        return context.squeeze(1)


## Decoder with Luong Attention

In [None]:
class Decoder(nn.Module):
    """GRU decoder that uses a ``LuongAttention`` context before and after the GRU step.

    Args:
        vocab_size: size of the embedding table and of the output logits.
        emb_dim: embedding width.
        hid_dim: GRU hidden size; the encoder outputs must have the same width.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=PAD,
        )
        self.attn = LuongAttention()
        # The GRU consumes the token embedding concatenated with the context vector.
        self.rnn = nn.GRU(input_size=emb_dim + hid_dim, hidden_size=hid_dim, batch_first=True)
        # The projection sees the GRU output concatenated with the same context vector.
        self.fc = nn.Linear(in_features=hid_dim * 2, out_features=vocab_size)

    def forward(self, x, hidden, encoder_outputs):
        """Run one decoding step and return ``(logits, new_hidden)``."""
        token_emb = self.embedding(x).squeeze(1)
        # The context is computed from the previous hidden state.
        context = self.attn(hidden.squeeze(0), encoder_outputs)

        step_input = torch.cat((token_emb, context), dim=1).unsqueeze(1)
        step_out, next_hidden = self.rnn(step_input, hidden)

        features = torch.cat((step_out.squeeze(1), context), dim=1)
        return self.fc(features), next_hidden


## Initialize Model

In [None]:
# `encoder` is about to be rebound to a fresh, untrained network. Keep a named handle on
# the encoder that was trained together with `vanilla_decoder`; otherwise it can no longer
# be reached by name, and the vanilla decoder can only be paired with the attention
# model's encoder.
vanilla_encoder = encoder

encoder = Encoder(VOCAB_SIZE, 256, 512).to(device)
decoder = Decoder(VOCAB_SIZE, 256, 512).to(device)

# A single Adam optimiser updates the new encoder and the attention decoder together.
optimizer = optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()),
    lr=0.001
)

# Token-level cross-entropy that ignores PAD targets.
criterion = nn.CrossEntropyLoss(ignore_index=PAD)


## Training Function

In [None]:
def train_epoch():
    """Run one teacher-forced training epoch of the attention encoder–decoder.

    Uses the notebook-level ``encoder``, ``decoder``, ``optimizer``, ``criterion``,
    ``train_loader`` and ``device``.

    Returns:
        float: the per-batch loss averaged over the batches in ``train_loader``,
        where each batch loss is the sum of the per-timestep cross-entropy values.
    """
    encoder.train()
    decoder.train()
    total_loss = 0

    # Clip the gradient norm of the whole model, not just the encoder.
    model_params = list(encoder.parameters()) + list(decoder.parameters())

    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()

        enc_out, hidden = encoder(src)
        input_tok = tgt[:, 0].unsqueeze(1)

        loss = 0
        for t in range(1, tgt.size(1)):
            output, hidden = decoder(input_tok, hidden, enc_out)
            loss += criterion(output, tgt[:, t])
            # Teacher forcing: feed the reference token, not the prediction.
            input_tok = tgt[:, t].unsqueeze(1)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_params, 1.0)
        optimizer.step()

        total_loss += loss.item()

    # Average over the batches that were actually iterated. Dividing by the length of an
    # unrelated loader makes the number incomparable with train_epoch_vanilla's.
    return total_loss / len(train_loader)


## Train Model

In [None]:
# Number of passes over the training set for the attention model.
EPOCHS = 10

for epoch in range(EPOCHS):
    loss = train_epoch()
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {loss:.4f}")


Epoch 1/10 | Loss: 158269.2581
Epoch 2/10 | Loss: 54377.7760
Epoch 3/10 | Loss: 33292.8532
Epoch 4/10 | Loss: 25145.8430
Epoch 5/10 | Loss: 21582.9462
Epoch 6/10 | Loss: 19697.3691
Epoch 7/10 | Loss: 20280.4798
Epoch 8/10 | Loss: 19168.1870
Epoch 9/10 | Loss: 20376.8965
Epoch 10/10 | Loss: 19516.3468


## Attention Accuracy Test

In [None]:
print("Evaluating Seq2Seq + Luong Attention on Test Set...")

# Same metric and test loader as the vanilla evaluation; `encoder` is now the encoder
# that was trained together with the attention decoder.
attention_acc = evaluate_token_accuracy(
    encoder,
    decoder,   # Luong attention decoder
    test_loader
)

print(f"Luong Attention Test Accuracy: {attention_acc * 100:.2f}%")


Evaluating Seq2Seq + Luong Attention on Test Set...
Luong Attention Test Accuracy: 57.58%


## Testing on my custom text

In [None]:
def translate_custom_sentence(sentence, encoder, decoder, max_len=30):
    """Greedily translate one sentence and return the decoded text.

    Args:
        sentence: source text; tokenised with the notebook-level ``sp`` model.
        encoder: module returning ``(encoder_outputs, hidden)``.
        decoder: a ``VanillaDecoder`` (called without encoder outputs) or an
            attention decoder (called with them).
        max_len: maximum number of decoding steps.

    Returns:
        str: ``sp.decode`` of the predicted ids, excluding BOS and EOS.
    """
    encoder.eval()
    decoder.eval()

    needs_encoder_outputs = not isinstance(decoder, VanillaDecoder)
    output_ids = []

    with torch.no_grad():
        # Frame the source with BOS/EOS and add a batch dimension of one.
        framed_ids = [BOS, *sp.encode(sentence, out_type=int), EOS]
        src = torch.tensor(framed_ids).unsqueeze(0).to(device)

        enc_out, hidden = encoder(src)

        # Decoding starts from BOS and feeds each prediction back in.
        prev_id = BOS
        for _ in range(max_len):
            input_tok = torch.tensor([[prev_id]]).to(device)

            if needs_encoder_outputs:
                output, hidden = decoder(input_tok, hidden, enc_out)
            else:
                output, hidden = decoder(input_tok, hidden)

            prev_id = output.argmax(1).item()
            if prev_id == EOS:
                break  # EOS ends the sentence and is not emitted

            output_ids.append(prev_id)

    return sp.decode(output_ids)


In [None]:
custom_sentence = "I am learning machine learning"

print("Input (English):", custom_sentence)

# NOTE(review): by this point `encoder` is the network created in the "Initialize Model"
# cell and trained with the attention decoder. The vanilla decoder below is therefore fed
# hidden states from an encoder it was not trained with, so its output does not reflect
# the vanilla model as trained.
print("\nVanilla Seq2Seq Output:")
print(translate_custom_sentence(custom_sentence, encoder, vanilla_decoder))

print("\nLuong Attention Output:")
print(translate_custom_sentence(custom_sentence, encoder, decoder))


Input (English): I am learning machine learning

Vanilla Seq2Seq Output:
उपफ़ोल्डर रखने के लिए, सूचना के गुण में बताता हैं% s

Luong Attention Output:
स्ट्रिंग स्थानीय


In [None]:
test_sentences = [
    "I am going to school",
    "She is reading a book",
    "Machine learning is very interesting",
    "I love studying artificial intelligence"
]

# NOTE(review): `encoder` here is the encoder trained with the attention decoder (it was
# re-created in the "Initialize Model" cell), so the vanilla decoder is paired with an
# encoder it was not trained with.
for s in test_sentences:
    # Label each line after the decoder that produced it: the first call uses the
    # vanilla decoder, the second the Luong-attention decoder.
    print("\nEnglish:", s)
    print("Hindi (Vanilla):", translate_custom_sentence(s, encoder, vanilla_decoder))

    print("\nEnglish:", s)
    print("Hindi (Luong):", translate_custom_sentence(s, encoder, decoder))



English: I am going to school
Hindi (Attention): ेटा हुआ है.

English: I am going to school
Hindi (Loung): रद्दी में फ़ाइल को रद्दी और फिर से चालू करे

English: She is reading a book
Hindi (Attention): इस लिंक _ भ्रिकोण

English: She is reading a book
Hindi (Loung): किताब किताब% s

English: Machine learning is very interesting
Hindi (Attention): स्वतः आकार की जाँच करें

English: Machine learning is very interesting
Hindi (Loung): संदेश में भेज रहा है शब्द को आयात करने की जरूरत है

English: I love studying artificial intelligence
Hindi (Attention): % s में आ रहा है.

English: I love studying artificial intelligence
Hindi (Loung): यह संचेस के पास कि कोई भी अतिरिक्त में जोड़ें


## Saving the Models


In [None]:
# NOTE(review): `encoder` was rebound in the "Initialize Model" cell, so this checkpoint
# stores the attention model's encoder weights next to the vanilla decoder's weights,
# not the encoder the vanilla decoder was trained with.
torch.save({
    "encoder": encoder.state_dict(),
    "decoder": vanilla_decoder.state_dict()
}, "vanilla_seq2seq.pth")


In [None]:
# Checkpoint of the attention model: state dicts of the encoder and the Luong-attention
# decoder under the keys "encoder" and "decoder".
torch.save({
    "encoder": encoder.state_dict(),
    "decoder": decoder.state_dict()
}, "attention_seq2seq.pth")


In [None]:
# Colab-only: mount Google Drive at /content/drive so the following cells can copy
# artifacts into it.
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
# Copy the attention checkpoint and the tokenizer files to Drive.
# NOTE(review): vanilla_seq2seq.pth is saved above but is not copied here.
!cp attention_seq2seq.pth /content/drive/MyDrive/
!cp spm_bpe.model /content/drive/MyDrive/
!cp spm_bpe.vocab /content/drive/MyDrive/


## Save Model Configuration

In [None]:
# Hyperparameters needed to rebuild the attention model before loading its state dicts.
# The embedding and hidden sizes repeat the literals passed to Encoder/Decoder above.
model_config = {
    "vocab_size": VOCAB_SIZE,
    "embedding_dim": 256,
    "hidden_dim": 512,
    "model_type": "Seq2Seq + Luong Attention",
    "tokenizer": "SentencePiece BPE"
}

# Stored with torch.save and copied to Drive next to the checkpoint.
torch.save(model_config, "model_config.pth")
!cp model_config.pth /content/drive/MyDrive/
