In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import adamw
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import time
from nltk.translate.bleu_score import corpus_bleu
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vaibh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Check CUDA

In [2]:
# Check if CUDA is available
device = torch.device(None if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


## Get Data and Preprocess

In [3]:
# Load and preprocess data
df = pd.read_csv(None)
df.dropna(inplace =True)
df = df.sample(None,random_state=42)
src_lang = None.astype(str).tolist()
tgt_lang = None.astype(str).tolist()

ValueError: Invalid file path or buffer object type: <class 'NoneType'>

## Create Vocabulary

In [None]:
# Create vocabulary
def create_vocab(sentences):
    vocab = set()
    for sentence in sentences:
        vocab.update(str(sentence).split())
    return vocab

src_vocab = create_vocab(src_lang)
tgt_vocab = create_vocab(tgt_lang)
src_vocab_size = len(src_vocab)
tgt_vocab_size = len(tgt_vocab)

## Word to Index

In [None]:
# Create word to index mappings
src_word2idx = {word: idx for idx, word in enumerate(src_vocab)}
tgt_word2idx = {word: idx for idx, word in enumerate(tgt_vocab)}
src_idx2word = {idx: word for word, idx in src_word2idx.items()}
tgt_idx2word = {idx: word for word, idx in tgt_word2idx.items()}

# Convert sentences to indices
def sentence_to_indices(sentence, word2idx):
    return [word2idx.get(word, 0) for word in str(sentence).split()]

src_indices = [sentence_to_indices(sentence, src_word2idx) for sentence in src_lang]
tgt_indices = [sentence_to_indices(sentence, tgt_word2idx) for sentence in tgt_lang]

# Pad sequences
max_src_len = max(len(s) for s in src_indices)
max_tgt_len = max(len(s) for s in tgt_indices)

src_indices = [s + [0] * (max_src_len - len(s)) for s in src_indices]
tgt_indices = [s + [0] * (max_tgt_len - len(s)) for s in tgt_indices]

## Creation of Dataset and Train-Test Split

In [None]:
# Create dataset
class TranslationDataset(Dataset):
    def __init__(self, src, tgt):
        self.src = src
        self.tgt = tgt

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return torch.tensor(self.src[idx]), torch.tensor(self.tgt[idx])

# Split data
X_train, X_test, y_train, y_test = train_test_split(src_indices, tgt_indices, test_size=0.2, random_state=42)

# Create dataloaders
train_dataset = TranslationDataset(X_train, y_train)
test_dataset = TranslationDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=None, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=None, shuffle=False)

## Model Definition

In [None]:
# Define the model
class Seq2SeqLSTM(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, hidden_size):
        super(Seq2SeqLSTM, self).__init__()
        self.encoder = nn.LSTM(src_vocab_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(tgt_vocab_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, tgt_vocab_size)

    def forward(self, src, tgt):
        _, (hidden, cell) = self.encoder(src)
        output, _ = self.decoder(tgt, (hidden, cell))
        return self.fc(output)

In [None]:
class Seq2SeqGRU(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, hidden_size):
        super(Seq2SeqGRU, self).__init__()
        self.encoder = nn.GRU(src_vocab_size, hidden_size, batch_first=True)
        self.decoder = nn.GRU(tgt_vocab_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, tgt_vocab_size)

    def forward(self, src, tgt):
        _, hidden = self.encoder(src)
        output, _ = self.decoder(tgt, hidden)
        return self.fc(output)

In [None]:
class Seq2SeqTransformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, hidden_size, nheads, num_layers):
        super(Seq2SeqTransformer, self).__init__()
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size, nhead=nheads
        )
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        self.decoder_layer = nn.TransformerDecoderLayer(
            d_model=hidden_size, nhead=nheads
        )
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_layers)

        self.src_embed = nn.Embedding(src_vocab_size, hidden_size)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, hidden_size)
        self.fc = nn.Linear(hidden_size, tgt_vocab_size)

    def forward(self, src, tgt):
        src_embed = self.src_embed(src)
        tgt_embed = self.tgt_embed(tgt)
        memory = self.encoder(src_embed)
        output = self.decoder(tgt_embed, memory)
        return self.fc(output)

## Model Initialization

In [None]:
# Initialize model and optimizer
model = Seq2SeqLSTM(None, None, hidden_size=256).to(device)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
gru_model = Seq2SeqGRU(src_vocab_size, tgt_vocab_size, hidden_size=256).to(device)
gru_optimizer = torch.optim.Adam(gru_model.parameters())
gru_criterion = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
transformer_model = Seq2SeqTransformer(
    src_vocab_size, tgt_vocab_size, hidden_size=256, nheads=8, num_layers=3
).to(device)
transformer_optimizer = torch.optim.Adam(transformer_model.parameters())
transformer_criterion = nn.CrossEntropyLoss(ignore_index=0)

## Training Initialization

In [None]:
# Training loop
num_epochs = 5
best_loss = float('inf')

## Training Loop

## LSTM

In [None]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    start_time = time.time()

    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch_idx, (src, tgt) in progress_bar:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        src_onehot = nn.functional.one_hot(src, num_classes=src_vocab_size).float().to(device)
        tgt_onehot = nn.functional.one_hot(tgt, num_classes=tgt_vocab_size).float().to(device)
        output = model(src_onehot, tgt_onehot[:, :-1])
        loss = criterion(output.reshape(-1, tgt_vocab_size), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        avg_loss = total_loss / (batch_idx + 1)

        # Update progress bar
        progress_bar.set_postfix({
            'Loss': f'{avg_loss:.4f}',
            'Batch': f'{batch_idx+1}/{len(train_loader)}'
        })

    epoch_loss = total_loss / len(train_loader)
    epoch_time = time.time() - start_time

    print(f"Epoch {epoch+1}/{num_epochs} completed in {epoch_time:.2f}s")
    print(f"Average Loss: {epoch_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, tgt in test_loader:
            src, tgt = src.to(device), tgt.to(device)
            src_onehot = nn.functional.one_hot(src, num_classes=src_vocab_size).float().to(device)
            tgt_onehot = nn.functional.one_hot(tgt, num_classes=tgt_vocab_size).float().to(device)
            output = model(src_onehot, tgt_onehot[:, :-1])
            loss = criterion(output.reshape(-1, tgt_vocab_size), tgt[:, 1:].reshape(-1))
            val_loss += loss.item()

    val_loss /= len(test_loader)
    print(f"Validation Loss: {val_loss:.4f}")

    # Save the best model
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), 'best_translation_model.pth')
        print("New best model saved!")

    print("-" * 50)

## GRU

In [None]:
def train_model_gru(model, optimizer, criterion, train_loader, num_epochs=2):
    best_loss = float("inf")
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        start_time = time.time()

        progress_bar = tqdm(
            enumerate(train_loader),
            total=len(train_loader),
            desc=f"GRU Epoch {epoch+1}/{num_epochs}",
        )

        for batch_idx, (src, tgt) in progress_bar:
            src, tgt = src.to(device), tgt.to(device)
            optimizer.zero_grad()
            src_onehot = (
                nn.functional.one_hot(src, num_classes=src_vocab_size)
                .float()
                .to(device)
            )
            tgt_onehot = (
                nn.functional.one_hot(tgt, num_classes=tgt_vocab_size)
                .float()
                .to(device)
            )
            output = model(src_onehot, tgt_onehot[:, :-1])
            loss = criterion(output.reshape(-1, tgt_vocab_size), tgt[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            avg_loss = total_loss / (batch_idx + 1)
            progress_bar.set_postfix({"Loss": f"{avg_loss:.4f}"})

        epoch_loss = total_loss / len(train_loader)
        print(f"GRU Epoch {epoch+1}/{num_epochs} completed. Avg Loss: {epoch_loss:.4f}")

In [None]:
train_model_gru(gru_model, gru_optimizer, gru_criterion, train_loader)

## Transformer

In [None]:
def train_model_transformer(model, optimizer, criterion, train_loader, num_epochs=2):
    best_loss = float("inf")
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        start_time = time.time()

        progress_bar = tqdm(
            enumerate(train_loader),
            total=len(train_loader),
            desc=f"Transformer Epoch {epoch+1}/{num_epochs}",
        )

        for batch_idx, (src, tgt) in progress_bar:
            src, tgt = src.to(device), tgt.to(device)
            optimizer.zero_grad()
            output = model(src, tgt[:, :-1])
            loss = criterion(output.reshape(-1, tgt_vocab_size), tgt[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            avg_loss = total_loss / (batch_idx + 1)
            progress_bar.set_postfix({"Loss": f"{avg_loss:.4f}"})

        epoch_loss = total_loss / len(train_loader)
        print(
            f"Transformer Epoch {epoch+1}/{num_epochs} completed. Avg Loss: {epoch_loss:.4f}"
        )

In [None]:
train_model_transformer(
    transformer_model, transformer_optimizer, transformer_criterion, train_loader
)

## Inference

In [None]:
def translate(model, test_loader, src_idx2word, tgt_idx2word, src_vocab_size, tgt_vocab_size, device, max_tgt_len):
    model.eval()
    all_translations = []
    all_references = []

    progress_bar = tqdm(test_loader, desc="Translating")

    for src, tgt in progress_bar:
        src, tgt = src.to(device), tgt.to(device)

        for i in range(len(src)):
            # Convert indices to words, then back to indices to ensure we have integers
            src_words = [src_idx2word.get(idx.item(), "") for idx in src[i] if idx.item() != 0]
            tgt_words = [tgt_idx2word.get(idx.item(), "") for idx in tgt[i] if idx.item() != 0]

            src_sentence = ' '.join(src_words)
            tgt_sentence = ' '.join(tgt_words)

            # Convert words back to indices, using 0 for unknown words
            src_indices = [src_word2idx.get(word, 0) for word in src_words]
            src_indices = src_indices + [0] * (src.size(1) - len(src_indices))

            src_tensor = torch.tensor(src_indices, dtype=torch.long, device=device).unsqueeze(0)
            src_onehot = torch.nn.functional.one_hot(src_tensor, num_classes=src_vocab_size).float()

            with torch.no_grad():
                _, (hidden, cell) = model.encoder(src_onehot)
                tgt_tensor = torch.zeros(1, 1, tgt_vocab_size, device=device)
                output_sentence = []

                for _ in range(max_tgt_len):
                    output, (hidden, cell) = model.decoder(tgt_tensor, (hidden, cell))
                    output = model.fc(output)
                    predicted = output.argmax(2).item()
                    if predicted == 0:
                        break
                    output_sentence.append(tgt_idx2word.get(predicted, ""))
                    tgt_tensor = torch.nn.functional.one_hot(torch.tensor([[predicted]], device=device), num_classes=tgt_vocab_size).float()

            translated = ' '.join(filter(None, output_sentence))
            all_translations.append(translated)
            all_references.append(tgt_sentence)

    return all_translations, all_references

## Loading Saved Model

In [None]:
# Load the best model for inference and BLEU score calculation
model.load_state_dict(torch.load('best_translation_model.pth'))
model.eval()

## Translation by LSTM

In [None]:
# Translate and calculate BLEU score
translations, references = translate(model, test_loader, src_idx2word, tgt_idx2word, src_vocab_size, tgt_vocab_size, device, max_tgt_len)

## BLEU Score  and Output

In [None]:
# Tokenize translations and references
processed_translations = [nltk.word_tokenize(t.lower()) for t in translations]
processed_references = [[nltk.word_tokenize(r.lower())] for r in references]

# Calculate BLEU score
bleu_score = corpus_bleu(processed_references, processed_translations)
print(f"BLEU Score: {bleu_score:.4f}")

# Print some example translations
num_examples = 5
print("\nExample Translations:")
for i in range(min(num_examples, len(translations))):
    print(f"Source: {references[i]}")
    print(f"Translation: {translations[i]}")
    print(f"Reference: {references[i]}")
    print()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


BLEU Score: 0.0003

Example Translations:
Source: दक्षिणी ' विमान ' मंदिरों के मूल ततऋ-ऊण्श्छ्ष्-वों की ऐसी सामानऋ-ऊण्श्छ्ष्-य जानकारी को समझना आवशऋ-ऊण्श्छ्ष्-यक होगा कऋ-ऊण्श्छ्ष्-योंकि इस कडऋई में आने वाली मंदिरों का , उनके सर्वाधिक विशिषऋ-ऊण्श्छ्ष्-ट लक्षणों के अतिरिकऋ-ऊण्श्छ्ष्-त उनके विवरण दे पाना वऋ-ऊण्श्छ्ष्-यावहारिक नहीं होगा.पूर्वकालीन समसऋ-ऊण्श्छ्ष्-त और पाशऋ-ऊण्श्छ्ष्-चातकालीन अधिकांश ' शिलऋ-ऊण्श्छ्ष्-प ' और ' आगम ' ग्रंढथों एवं अनेक समसामयिक अभिलेखों के अनुसार ' विमान ' शबऋ-ऊण्श्छ्ष्-द ' उपान ' या ' अधिषऋ-ऊण्श्छ्ष्-ठान ' के निमऋ-ऊण्श्छ्ष्-नतम गढऋन , या मंच से ' सऋ-ऊण्श्छ्ष्-तूपी ' या उचऋ-ऊण्श्छ्ष्-चतम कलश तक संपूर्ण भवन का द्योतक है .
Translation: के लिए एक बार एक दूसरे के लिए एक ही है . ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' 

In [None]:
gru_translations, gru_references = translate(
    gru_model,
    test_loader,
    src_idx2word,
    tgt_idx2word,
    src_vocab_size,
    tgt_vocab_size,
    device,
    max_tgt_len,
)
gru_bleu_score = corpus_bleu(
    [[nltk.word_tokenize(ref.lower())] for ref in gru_references],
    [nltk.word_tokenize(trans.lower()) for trans in gru_translations],
)
print(f"GRU BLEU Score: {gru_bleu_score:.4f}")

## Print some example translations by GRU model

In [None]:
num_examples = 5
print("\nExample Translations:")
for i in range(min(num_examples, len(gru_translations))):
    print(f"Source: {gru_references[i]}")
    print(f"Translation: {gru_translations[i]}")
    print(f"Reference: {gru_references[i]}")
    print()

In [None]:
transformer_translations, transformer_references = translate(
    transformer_model,
    test_loader,
    src_idx2word,
    tgt_idx2word,
    src_vocab_size,
    tgt_vocab_size,
    device,
    max_tgt_len,
)
transformer_bleu_score = corpus_bleu(
    [[nltk.word_tokenize(ref.lower())] for ref in transformer_references],
    [nltk.word_tokenize(trans.lower()) for trans in transformer_translations],
)
print(f"Transformer BLEU Score: {transformer_bleu_score:.4f}")

## Print some example translations by Transformer model

In [None]:
num_examples = 5
print("\nExample Translations:")
for i in range(min(num_examples, len(transformer_translations))):
    print(f"Source: {transformer_references[i]}")
    print(f"Translation: {transformer_translations[i]}")
    print(f"Reference: {transformer_references[i]}")
    print()