In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from collections import Counter
from tqdm import tqdm


# --- Model sizes -------------------------------------------------------------
latent_dim = 128      # hidden size handed to the encoder/decoder LSTMs
embedding_dim = 128   # width of the token embeddings

# --- Sequence / vocabulary limits ---------------------------------------------
max_encoder_seq_length = 20   # source sentences are truncated/padded to this length
max_decoder_seq_length = 20   # target sentences are truncated/padded to this length
vocab_size_limit = 20_000     # upper bound on each vocabulary, special tokens included

# --- Training set-up ----------------------------------------------------------
sample_size = int(0.8 * 900_000)   # max number of lines read from each corpus file
batch_size = 64
epochs = 20

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def load_data(english_file, hindi_file, sample_size):
    """Read the first ``sample_size`` lines of a line-aligned parallel corpus.

    Args:
        english_file: Path to the UTF-8 English side (one segment per line).
        hindi_file: Path to the UTF-8 Hindi side (one segment per line).
        sample_size: Maximum number of lines to read from each file.
            ``None`` reads everything; a negative value raises ``ValueError``.

    Returns:
        ``(english_texts, hindi_texts)``: two lists of strings with the
        trailing newline removed.  Each list holds at most ``sample_size``
        entries; a file shorter than that simply yields fewer lines.
    """
    from itertools import islice

    def read_lines(path):
        # Iterate the file instead of ``read().splitlines()``:
        #  * only real newlines end a line.  ``str.splitlines`` also breaks on
        #    U+2028, U+0085, \x0b, \x0c, ..., which would turn one segment into
        #    several and shift every later line out of alignment with the
        #    other language;
        #  * reading stops after ``sample_size`` lines, so the whole file is
        #    never held in memory just to be sliced.
        with open(path, 'r', encoding='utf-8') as handle:
            return [line.rstrip('\n') for line in islice(handle, sample_size)]

    return read_lines(english_file), read_lines(hindi_file)

# Load both sides of the GNOME en-hi corpus, capped at `sample_size` lines each.
english_texts, hindi_texts = load_data(
    english_file="GNOME.en-hi.en",
    hindi_file="GNOME.en-hi.hi",
    sample_size=sample_size,
)


def build_tokenizer(texts, vocab_size_limit):
    """Build a whitespace-token vocabulary from ``texts``.

    Index 0 is reserved for ``"<PAD>"`` and index 1 for ``"<START>"``; the
    remaining ids are handed out to the most frequent tokens in descending
    frequency order, starting at 2.

    Args:
        texts: Iterable of strings; each is split on whitespace.
        vocab_size_limit: Maximum vocabulary size *including* the two
            reserved tokens.

    Returns:
        ``(word_to_index, index_to_word)``: a dict and its inverse.  The ids
        always form the contiguous range ``0 .. len(word_to_index) - 1``, so
        ``len(word_to_index)`` is a safe embedding-table size.
    """
    reserved_tokens = ("<PAD>", "<START>")
    word_to_index = {token: index for index, token in enumerate(reserved_tokens)}

    # Skip corpus words that are spelled like a reserved token.  Otherwise such
    # a word would first be given a ranked id and then be overwritten by the
    # reserved id, leaving a hole in the id range: the largest id would then be
    # >= len(word_to_index) and index past the end of the embedding table.
    token_counts = Counter(
        word
        for text in texts
        for word in text.split()
        if word not in word_to_index
    )

    free_slots = max(vocab_size_limit - len(reserved_tokens), 0)
    for word, _ in token_counts.most_common(free_slots):
        word_to_index[word] = len(word_to_index)

    index_to_word = {index: word for word, index in word_to_index.items()}
    return word_to_index, index_to_word

# One vocabulary per language, each capped at `vocab_size_limit` entries.
input_word_to_index, input_index_to_word = build_tokenizer(
    texts=english_texts,
    vocab_size_limit=vocab_size_limit,
)
target_word_to_index, target_index_to_word = build_tokenizer(
    texts=hindi_texts,
    vocab_size_limit=vocab_size_limit,
)


def texts_to_sequences(texts, word_to_index, max_length):
    """Encode whitespace-tokenised texts as a fixed-width matrix of token ids.

    Each text is split on whitespace, truncated to ``max_length`` tokens and
    right-padded with the ``"<PAD>"`` id.

    Note:
        Words missing from ``word_to_index`` are encoded with the ``"<PAD>"``
        id as well, so they cannot be told apart from padding downstream.

    Args:
        texts: Sequence (or any iterable) of strings.
        word_to_index: Token -> id mapping; must contain ``"<PAD>"``.
        max_length: Number of columns in the result.

    Returns:
        ``numpy.ndarray`` of dtype ``int64`` and shape
        ``(len(texts), max_length)``, also when ``texts`` is empty.

    Raises:
        KeyError: If ``word_to_index`` has no ``"<PAD>"`` entry.
    """
    pad_index = word_to_index["<PAD>"]  # looked up once instead of once per word

    if not hasattr(texts, "__len__"):
        texts = list(texts)  # generators are accepted too; their size is needed up front

    # Pre-filling with the pad id gives a well-defined 2-D int64 result even
    # for empty input (``np.array([])`` would be a 1-D float64 array) and
    # avoids building a large intermediate list of lists.
    sequences = np.full((len(texts), max_length), pad_index, dtype=np.int64)
    for row, text in enumerate(texts):
        token_ids = [word_to_index.get(word, pad_index) for word in text.split()[:max_length]]
        if token_ids:
            sequences[row, :len(token_ids)] = token_ids
    return sequences

# Encode both sides of the corpus as fixed-width id matrices.
input_sequences = texts_to_sequences(
    texts=english_texts,
    word_to_index=input_word_to_index,
    max_length=max_encoder_seq_length,
)
target_sequences = texts_to_sequences(
    texts=hindi_texts,
    word_to_index=target_word_to_index,
    max_length=max_decoder_seq_length,
)


class TranslationDataset(Dataset):
    """Index-aligned source/target id sequences served as ``torch.long`` tensors."""

    def __init__(self, encoder_data, decoder_data):
        # Both containers are indexed with the same position in __getitem__.
        self.encoder_data = encoder_data
        self.decoder_data = decoder_data

    def __len__(self):
        # The dataset size is defined by the source side.
        return len(self.encoder_data)

    def __getitem__(self, idx):
        """Return ``(source_ids, target_ids)`` for position ``idx``."""
        source_row = self.encoder_data[idx]
        target_row = self.decoder_data[idx]
        return (
            torch.tensor(source_row, dtype=torch.long),
            torch.tensor(target_row, dtype=torch.long),
        )

# Shuffled mini-batches; the final incomplete batch is dropped.
dataset = TranslationDataset(encoder_data=input_sequences, decoder_data=target_sequences)
data_loader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)


class Attention(nn.Module):
    """Additive (concat) attention over the encoder outputs.

    For every source position the decoder state is concatenated with the
    encoder output at that position, projected through ``tanh(Linear(.))`` and
    reduced to a scalar score with the learned vector ``v``.  Scores are
    soft-maxed over the source positions.

    Args:
        latent_dim: Size of both the decoder state and each encoder output.
        attn_dim: Width of the scoring layer.  Defaults to ``latent_dim``.
            It is unrelated to the source length, so the module works for
            source sequences of any length.
    """

    def __init__(self, latent_dim, attn_dim=None):
        super().__init__()
        if attn_dim is None:
            attn_dim = latent_dim
        self.attn = nn.Linear(latent_dim * 2, attn_dim)
        self.v = nn.Parameter(torch.rand(attn_dim))

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape ``(batch, 1, src_len)``.

        Args:
            hidden: Decoder state, ``(batch, latent_dim)``.
            encoder_outputs: ``(batch, src_len, latent_dim)``.
        """
        # Take the source length from the data rather than a module-level
        # constant, so inputs of a different length do not break the concat.
        src_len = encoder_outputs.size(1)
        # expand() broadcasts the state across positions without copying it.
        hidden = hidden.unsqueeze(1).expand(-1, src_len, -1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attn_energies = torch.sum(self.v * energy, dim=2)  # (batch, src_len)
        return torch.softmax(attn_energies, dim=1).unsqueeze(1)


class Encoder(nn.Module):
    """Embeds source token ids and encodes them with a batch-first LSTM.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Embedding width.
        latent_dim: LSTM hidden size.
    """

    def __init__(self, input_dim, emb_dim, latent_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=latent_dim, batch_first=True)

    def forward(self, src):
        """Return ``(outputs, hidden, cell)`` for a batch of token ids ``src``.

        ``outputs`` holds the LSTM output at every position; ``hidden`` and
        ``cell`` are the final LSTM state.
        """
        outputs, final_state = self.lstm(self.embedding(src))
        hidden, cell = final_state
        return outputs, hidden, cell


class Decoder(nn.Module):
    """Single-step attention decoder.

    Each call consumes one target token per batch element, attends over the
    encoder outputs and returns vocabulary scores for the next token.

    Args:
        output_dim: Target vocabulary size (embedding rows and output scores).
        emb_dim: Embedding width.
        latent_dim: LSTM hidden size; also the size of the attention context.
    """

    def __init__(self, output_dim, emb_dim, latent_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        # The LSTM input is the token embedding concatenated with the context.
        self.lstm = nn.LSTM(input_size=emb_dim + latent_dim, hidden_size=latent_dim, batch_first=True)
        self.attention = Attention(latent_dim)
        # The projection sees the LSTM output concatenated with the context.
        self.fc = nn.Linear(in_features=latent_dim * 2, out_features=output_dim)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Args:
            input: Token ids for this step, one per batch element.
            hidden: Current LSTM hidden state; its last layer drives attention.
            cell: Current LSTM cell state.
            encoder_outputs: Encoder outputs attended over.

        Returns:
            ``(prediction, hidden, cell)``: unnormalised scores over the
            target vocabulary and the updated LSTM state.
        """
        # Add a length-1 time axis so the batch-first LSTM sees a sequence.
        token_embedding = self.embedding(input.unsqueeze(1))

        # Weighted sum of the encoder outputs under the attention weights.
        weights = self.attention(hidden[-1], encoder_outputs)
        context = torch.bmm(weights, encoder_outputs)

        step_input = torch.cat((token_embedding, context), dim=2)
        step_output, (hidden, cell) = self.lstm(step_input, (hidden, cell))

        features = torch.cat((step_output.squeeze(1), context.squeeze(1)), dim=1)
        return self.fc(features), hidden, cell


# Both networks live on `device`; vocabulary sizes come from the tokenizers.
encoder = Encoder(
    input_dim=len(input_word_to_index),
    emb_dim=embedding_dim,
    latent_dim=latent_dim,
).to(device)
decoder = Decoder(
    output_dim=len(target_word_to_index),
    emb_dim=embedding_dim,
    latent_dim=latent_dim,
).to(device)

# Targets equal to index 0 (the "<PAD>" id) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Separate Adam optimizers for the two networks, same learning rate.
encoder_optimizer = optim.Adam(params=encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(params=decoder.parameters(), lr=0.001)


# Training loop.
#
# Per batch: encode the source, then decode step by step starting from
# "<START>", summing the cross-entropy of every time step.  Teacher forcing is
# decided once per batch with probability (1 - epoch / epochs), so it is always
# on in the first epoch and becomes rarer as training progresses.
for epoch in range(epochs):
    total_loss = 0.0
    teacher_forcing_prob = 1 - epoch / epochs

    for encoder_input, decoder_target in tqdm(data_loader):
        encoder_input, decoder_target = encoder_input.to(device), decoder_target.to(device)

        # A mean-reduced CrossEntropyLoss returns NaN when *every* target in the
        # call equals `ignore_index` (0/0).  Short sentences make that the norm
        # for late time steps, which turned the accumulated loss into NaN.  Find
        # the steps that have at least one scored target (one host sync per batch).
        scored_steps = (decoder_target != criterion.ignore_index).any(dim=0).tolist()
        num_scored = sum(scored_steps)
        if num_scored == 0:
            continue  # nothing to learn from a batch of pure padding
        # Steps after the last scored one cannot influence the loss; skip them.
        last_scored = max(t for t, scored in enumerate(scored_steps) if scored)

        encoder_outputs, hidden, cell = encoder(encoder_input)
        # Size the <START> batch from the data, not from the global batch_size,
        # so a smaller final batch would also work.
        decoder_input = torch.full(
            (decoder_target.size(0),),
            target_word_to_index["<START>"],
            dtype=torch.long,
            device=device,
        )

        loss = 0
        use_teacher_forcing = np.random.rand() < teacher_forcing_prob
        # Start at t = 0: the target sequences carry no leading <START>, so
        # position 0 is the first real word and must be predicted from <START>.
        # (Starting at 1 never trained the first word of any sentence.)
        for t in range(last_scored + 1):
            output, hidden, cell = decoder(decoder_input, hidden, cell, encoder_outputs)
            step_target = decoder_target[:, t]
            if scored_steps[t]:
                loss = loss + criterion(output, step_target)

            decoder_input = step_target if use_teacher_forcing else output.argmax(1)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        # Report the average over the time steps that actually contributed.
        total_loss += loss.item() / num_scored

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / max(len(data_loader), 1):.4f}")


def translate(input_text):
    """Greedily decode a translation of ``input_text``.

    The text is encoded with the source vocabulary, run through the encoder,
    and decoded one token at a time starting from ``"<START>"``, always taking
    the highest-scoring token.

    NOTE(review): decoding stops early only when the model emits ``"<PAD>"``;
    otherwise it runs for ``max_decoder_seq_length`` steps.

    Args:
        input_text: Source sentence, tokenised on whitespace.

    Returns:
        The decoded target words joined by single spaces.
    """
    pad_id = target_word_to_index["<PAD>"]
    start_id = target_word_to_index["<START>"]
    decoded_words = []

    with torch.no_grad():
        source_ids = texts_to_sequences([input_text], input_word_to_index, max_encoder_seq_length)
        encoder_outputs, hidden, cell = encoder(torch.tensor(source_ids).to(device))

        decoder_input = torch.tensor([start_id]).to(device)
        for _ in range(max_decoder_seq_length):
            scores, hidden, cell = decoder(decoder_input, hidden, cell, encoder_outputs)
            # The greedy choice is also the next step's input.
            decoder_input = scores.argmax(1)
            token_id = decoder_input.item()
            if token_id == pad_id:
                break
            decoded_words.append(target_index_to_word.get(token_id, ""))

    return ' '.join(decoded_words)


# Read one English sentence from stdin and print its greedy translation.
user_input = input("Enter a sentence in English: ")
print("Hindi Translation:", translate(input_text=user_input))


100%|██████████| 2276/2276 [03:45<00:00, 10.09it/s]


Epoch 1/20, Loss: nan


100%|██████████| 2276/2276 [03:50<00:00,  9.85it/s]


Epoch 2/20, Loss: nan


100%|██████████| 2276/2276 [03:50<00:00,  9.86it/s]


Epoch 3/20, Loss: nan


100%|██████████| 2276/2276 [03:49<00:00,  9.93it/s]


Epoch 4/20, Loss: nan


100%|██████████| 2276/2276 [03:48<00:00,  9.94it/s]


Epoch 5/20, Loss: nan


100%|██████████| 2276/2276 [03:49<00:00,  9.92it/s]


Epoch 6/20, Loss: nan


100%|██████████| 2276/2276 [03:50<00:00,  9.88it/s]


Epoch 7/20, Loss: nan


100%|██████████| 2276/2276 [03:49<00:00,  9.92it/s]


Epoch 8/20, Loss: nan


100%|██████████| 2276/2276 [03:49<00:00,  9.91it/s]


Epoch 9/20, Loss: nan


100%|██████████| 2276/2276 [03:50<00:00,  9.86it/s]


Epoch 10/20, Loss: nan


100%|██████████| 2276/2276 [03:53<00:00,  9.74it/s]


Epoch 11/20, Loss: nan


100%|██████████| 2276/2276 [03:51<00:00,  9.84it/s]


Epoch 12/20, Loss: nan


100%|██████████| 2276/2276 [04:10<00:00,  9.09it/s]


Epoch 13/20, Loss: nan


100%|██████████| 2276/2276 [03:59<00:00,  9.52it/s]


Epoch 14/20, Loss: nan


100%|██████████| 2276/2276 [04:01<00:00,  9.43it/s]


Epoch 15/20, Loss: nan


100%|██████████| 2276/2276 [03:58<00:00,  9.56it/s]


Epoch 16/20, Loss: nan


100%|██████████| 2276/2276 [03:58<00:00,  9.53it/s]


Epoch 17/20, Loss: nan


100%|██████████| 2276/2276 [03:55<00:00,  9.66it/s]


Epoch 18/20, Loss: nan


100%|██████████| 2276/2276 [03:59<00:00,  9.49it/s]


Epoch 19/20, Loss: nan


100%|██████████| 2276/2276 [03:57<00:00,  9.57it/s]


Epoch 20/20, Loss: nan
Hindi Translation: आरंभ करें (_I) gedit.gnome-2-2.hi.po #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-# #-#-#-#-#
