In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

In [3]:
from torchtext.data.metrics import bleu_score

In [4]:
import numpy as np
from tqdm import tqdm
import random
import os

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# PAD_START = ['*']
# PAD_NULL = [' ']
# TOKEN_UNK
TOKEN_PAD = ' '
TOKEN_SOS = '*'
TOKEN_EOS = '^'



chr_eng = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
chr_hin = ['ँ', 'ं', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ऑ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', '़', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॅ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्', 'क़', 'ख़', 'ग़', 'ज़', 'ड़', 'ढ़', 'फ़', '॥']

eng_vocab = [TOKEN_PAD,TOKEN_SOS,TOKEN_EOS]+chr_eng
hin_vocab = [TOKEN_PAD,TOKEN_SOS,TOKEN_EOS]+chr_hin

print(eng_vocab)
print(hin_vocab)

[' ', '*', '^', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[' ', '*', '^', 'ँ', 'ं', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ऑ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', '़', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॅ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्', 'क़', 'ख़', 'ग़', 'ज़', 'ड़', 'ढ़', 'फ़', '॥']


In [7]:
len(eng_vocab), len(hin_vocab)

(29, 71)

In [8]:
# TODO: Data have to same size or do any other things 
# make batch with same size text and random all batchs
# or use pre embadding not post embadding

def add_startToken(texts, start_token='*'):
    return [start_token + text for text in texts]
def add_endToken(texts, end_token='^'):
    return [text + end_token for text in texts]

def preprocesser(texts: list[list], prePadding=True, vocab=eng_vocab, startToken=False, endToken=False, batch_first=False):
    if startToken:
        texts = add_startToken(texts)
    if endToken:
        texts = add_endToken(texts)
    # Convert characters to integers (ASCII - 97)
    text_ints = [[vocab.index(c) for c in text] for text in texts]
    # Apply pre-padding to each sequence
    if prePadding:
        max_length = max(len(seq) for seq in text_ints)
        padded_seqs = pad_sequence([torch.cat([torch.zeros(max_length - len(seq), dtype=torch.int64), torch.LongTensor(seq)]) for seq in text_ints], batch_first=True)
    else:
        padded_seqs = pad_sequence([torch.LongTensor(seq) for seq in text_ints], batch_first=True, padding_value=0)
    
    return padded_seqs.to(device=device) if batch_first else padded_seqs.T.to(device=device)


preprocesser(['hiir', 'laksfffh'], startToken=True, endToken=True, prePadding=False)

tensor([[ 1,  1],
        [10, 14],
        [11,  3],
        [11, 13],
        [20, 21],
        [ 2,  8],
        [ 0,  8],
        [ 0,  8],
        [ 0, 10],
        [ 0,  2]], device='cuda:0')

In [9]:
preprocesser(['', 'का'], vocab=hin_vocab, prePadding=False, startToken=True, endToken=True)

tensor([[ 1,  1],
        [ 2, 17],
        [ 0, 50],
        [ 0,  2]], device='cuda:0')

In [11]:
class CustomDataset(Dataset):
    def __init__(self, batch_size):
        dataset = []
        # with open('dataset_hinglish2hindi.txt', 'r', encoding='utf-8') as f:
        with open('preprocessedHindi2Hinglish.txt', 'r', encoding='utf-8') as f:
            for line in tqdm(f.readlines()):
                stop = False
                for i in line:
                    if i not in chr_eng+chr_hin+['\t', '\n']: 
                        stop = True
                        break
                if stop: continue
                dataset.append(line.split()[::-1])
        dataset.sort(key=lambda x: len(x[0]))
        
        batched = []
        length = len(dataset)
        for i in tqdm(range(0, length, batch_size)):
            if i+batch_size>length: break
            batched.append(self.custom_collate_fn(dataset[i:i+batch_size]))

        self.dataset = batched
        
    @staticmethod
    def custom_collate_fn(batch):
        x = []
        y = []
        for ix, iy in batch:
            x.append(''.join([i for i in ix.lower() if i in chr_eng]))
            y.append(''.join([i for i in iy.lower() if i in chr_hin]))
        x = preprocesser(x, startToken=True, endToken=True, prePadding=False)
        y = preprocesser(y, vocab=hin_vocab, prePadding=False, startToken=True, endToken=True)

        return x, y

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Return a single sequence and its label
        return self.dataset[idx]



# Create a DataLoader with batch size 64
batch_size = 512

# Create an instance of the custom dataset
custom_dataset = CustomDataset(batch_size)

100%|██████████| 244381/244381 [00:04<00:00, 52505.21it/s]
100%|█████████▉| 472/473 [00:08<00:00, 56.87it/s]


In [12]:
data_loader = DataLoader(custom_dataset, shuffle=True, batch_size=1)
# Iterate through the DataLoader
for batch in data_loader:
    sequences, labels = batch
    sequences = sequences.squeeze(0)
    labels = labels.squeeze(0)
    print("Batch Shape - Sequences:", sequences.shape, "Labels:", labels.shape)
    break

Batch Shape - Sequences: torch.Size([8, 512]) Labels: torch.Size([17, 512])


In [13]:
for batch in data_loader:
    sequences, labels = batch
    sequences = sequences.squeeze(0)
    labels = labels.squeeze(0)
    # slow training process very much so improve this or remove custom_collate_fn and set it into data

In [38]:
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(filename, model, optimizer):
    print("=> Loading checkpoint")
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    
    
def bleu(data, model):
    targets = []
    outputs = []

    for (src,trg)  in tqdm(data):
        try:
            prediction = translate_sentence(model, src)
            targets.append([trg])
            outputs.append(prediction)
        except: pass

    return bleu_score(outputs, targets)

def translate_sentence(model, text, max_length=50):
    result = []
    for word in text.split(' '):
        x = preprocesser([word], startToken=True, vocab=eng_vocab, endToken=True)
        stopIdx = hin_vocab.index(TOKEN_EOS)
        outputs = []
        for i in range(max_length):
            y = preprocesser([''.join(outputs)], startToken=True, vocab=hin_vocab, endToken=False)
            # print(y)
            with torch.no_grad():
                output = model(x, y)
            best_guess = output.argmax(2)[-1, :].item()
            # print(best_guess)
            if best_guess == stopIdx: break
            outputs.append(hin_vocab[best_guess])
        result.append(''.join(outputs))
    return ' '.join(result)

In [15]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
        useOneHotEncoder=False
    ):
        super(Transformer, self).__init__()
        if useOneHotEncoder:
            embedding_size = max([src_vocab_size, max_len, trg_vocab_size])
            embedding_size = max([src_vocab_size, max_len, trg_vocab_size])
            embedding_size += embedding_size%8
            print('Embedding size : ', embedding_size)
            self.dropout = lambda x: x
            self.src_word_embedding = lambda x: F.one_hot(x, embedding_size).float()
            self.src_position_embedding = lambda x: F.one_hot(x, embedding_size).float()
            self.trg_word_embedding = lambda x: F.one_hot(x, embedding_size).float()
            self.trg_position_embedding = lambda x: F.one_hot(x, embedding_size).float()
        else:
            self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
            self.src_position_embedding = nn.Embedding(max_len, embedding_size)
            self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
            self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        src_mask = src.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        src_positions = (
            torch.arange(0, src_seq_length)
            .unsqueeze(1)
            .expand(src_seq_length, N)
            .to(self.device)
        )

        trg_positions = (
            torch.arange(0, trg_seq_length)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
            .to(self.device)
        )

        embed_src = self.dropout(
            (self.src_word_embedding(src) + self.src_position_embedding(src_positions))
        )
        embed_trg = self.dropout(
            (self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))
        )

        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out

In [20]:
# We're ready to define everything we need for training our Seq2Seq model

load_model = False
save_model = True

# Training hyperparameters
num_epochs = 50
learning_rate = 3e-4
batch_size = 512*4

# Model hyperparameters
src_vocab_size = len(eng_vocab)
trg_vocab_size = len(hin_vocab)
embedding_size = 128
num_heads = 8
assert embedding_size%num_heads == 0
num_encoder_layers = 2
num_decoder_layers = 2
dropout = 0.10
max_len = 50
forward_expansion = 4
src_pad_idx = eng_vocab.index(TOKEN_SOS)

src_pad_idx

1

In [17]:
# Create an instance of the custom dataset
custom_dataset = CustomDataset(batch_size)

train_iterator = DataLoader(custom_dataset, batch_size=1, shuffle=True) # batch_size == 1 as we set it in data_loader

100%|██████████| 244381/244381 [00:04<00:00, 54056.53it/s]
 99%|█████████▉| 118/119 [00:08<00:00, 14.65it/s]


In [18]:
for batch_idx, (batch_src, batch_trg) in enumerate(train_iterator):
    batch_src = batch_src.squeeze(0)
    batch_trg = batch_trg.squeeze(0)
    break
batch_trg.shape, batch_src.shape

(torch.Size([17, 2048]), torch.Size([12, 2048]))

In [22]:
# embedding_size = None
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

pad_idx = hin_vocab.index(TOKEN_PAD)

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(filename=f'model\\hinglish2hindi_epoch-50.pth.tar', model=model, optimizer=optimizer)

In [23]:
sentence = "mera naam laksh kumar sisodiya hai"

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    model.eval()
    translated_sentence = translate_sentence(
        model, sentence, max_length=max_len
    )
# 
    print(f"Translated example sentence: \n {translated_sentence}")
    model.train()
    losses = []
    
    total_length = train_iterator.__len__()

    for batch_idx, (batch_src, batch_trg) in tqdm(enumerate(train_iterator), total=total_length):
        # removed as we fix this in own dataset if batch_idx == total_length - 1: break # stop 1 before as something is wrong for small batch size
        batch_src = batch_src.squeeze(0)
        batch_trg = batch_trg.squeeze(0)
        # Get input and targets and get to cuda
        inp_data = batch_src.to(device)
        target = batch_trg.to(device)

        # Forward prop
        output = model(inp_data, target[:-1, :])

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin.
        # Let's also remove the start token while we're at it
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        losses.append(loss.item())

        # Back prop
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)
    
    if save_model and (epoch+1)%10==0 and epoch!=0:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint, filename=f'model\\hinglish2hindi_epoch-{epoch+1}.pth.tar')

[Epoch 0 / 50]
Translated example sentence: 
 गझक़ँपसँँदझक़रजपररँक़ँँँँरईँड़चरझक़ँँखपरईऊठँँपृृृृृक़ँचओ गँँँबईँँदझढ़पररईँँक़ँँँँरईँड़चरँँँँहँहईऊृृठपृृृृृृक़रओ गग़ँँकँँँबूकठततपरजपृतजँरईँड़तरछजज़तहकहगरईँबगतजँगँँँचर गझक़ँपसँछदझकक़ररईँपक़ँँबकँक़ँड़चरछजृक़ँपरईसक़ँबगँपृृृृक़ँओ गठसँकपससढ़झकठअरईड़ँपृृृृृृृृृृृृृअड़कहईऊृृठपृृृृृृक़ँठ गठसँकजँड़ढ़झढ़ठततठरजपृृृकँगँड़तठअजजँहकहईसईँँहतजँगँँँहत


100%|██████████| 118/118 [00:08<00:00, 13.17it/s]


=> Saving checkpoint
[Epoch 1 / 50]
Translated example sentence: 
 मरार नामा लाक कुमर सिसियिया हि


100%|██████████| 118/118 [00:08<00:00, 13.41it/s]


[Epoch 2 / 50]
Translated example sentence: 
 मेरा नाम लक्श कुमर सिसोडिया हाइ


100%|██████████| 118/118 [00:08<00:00, 13.42it/s]


[Epoch 3 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमार सिसोडिया हैई


100%|██████████| 118/118 [00:08<00:00, 13.38it/s]


[Epoch 4 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमार सिसोडिया है


100%|██████████| 118/118 [00:08<00:00, 13.26it/s]


[Epoch 5 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:08<00:00, 13.26it/s]


[Epoch 6 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:08<00:00, 13.38it/s]


[Epoch 7 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:08<00:00, 13.25it/s]


[Epoch 8 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:08<00:00, 13.25it/s]


[Epoch 9 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:08<00:00, 13.34it/s]


[Epoch 10 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:08<00:00, 13.37it/s]


=> Saving checkpoint
[Epoch 11 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:09<00:00, 12.79it/s]


[Epoch 12 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:11<00:00, 10.68it/s]


[Epoch 13 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:10<00:00, 10.83it/s]


[Epoch 14 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:10<00:00, 10.82it/s]


[Epoch 15 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:11<00:00, 10.51it/s]


[Epoch 16 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:11<00:00, 10.55it/s]


[Epoch 17 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:11<00:00, 10.70it/s]


[Epoch 18 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:11<00:00, 10.35it/s]


[Epoch 19 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.23it/s]


[Epoch 20 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:11<00:00, 10.42it/s]


=> Saving checkpoint
[Epoch 21 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.69it/s]


[Epoch 22 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:11<00:00, 10.44it/s]


[Epoch 23 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.60it/s]


[Epoch 24 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.66it/s]


[Epoch 25 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:11<00:00, 10.70it/s]


[Epoch 26 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:10<00:00, 10.81it/s]


[Epoch 27 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.66it/s]


[Epoch 28 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.51it/s]


[Epoch 29 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.36it/s]


[Epoch 30 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:11<00:00, 10.53it/s]


=> Saving checkpoint
[Epoch 31 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.46it/s]


[Epoch 32 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:09<00:00, 12.32it/s]


[Epoch 33 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:10<00:00, 11.17it/s]


[Epoch 34 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:10<00:00, 10.88it/s]


[Epoch 35 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:10<00:00, 10.97it/s]


[Epoch 36 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:10<00:00, 10.75it/s]


[Epoch 37 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:10<00:00, 10.76it/s]


[Epoch 38 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.70it/s]


[Epoch 39 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.70it/s]


[Epoch 40 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.27it/s]


=> Saving checkpoint
[Epoch 41 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:11<00:00, 10.40it/s]


[Epoch 42 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.71it/s]


[Epoch 43 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:10<00:00, 10.83it/s]


[Epoch 44 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.56it/s]


[Epoch 45 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:11<00:00, 10.55it/s]


[Epoch 46 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:10<00:00, 11.26it/s]


[Epoch 47 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:10<00:00, 10.87it/s]


[Epoch 48 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:09<00:00, 12.59it/s]


[Epoch 49 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:08<00:00, 13.29it/s]


In [24]:
checkpoint = {
    "state_dict": model.state_dict(),
    "optimizer": optimizer.state_dict(),    
}
save_checkpoint(checkpoint, filename=f'model\\hinglish2hindi_epoch-50.pth.tar')

=> Saving checkpoint


In [39]:
load_checkpoint(filename=f'model\\hinglish2hindi_epoch-50.pth.tar', model=model, optimizer=optimizer)

=> Loading checkpoint


In [43]:
model.eval()
translated_sentence = translate_sentence(
    model, "mera naam laksh kumar sisodiya hai", max_length=50
)

print(f"Translated example sentence: \n {translated_sentence}")

Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


In [27]:
lines = []
with open('dataset.txt', 'r', encoding='utf') as f:
    for line in f.readlines():
        lines.append(line)
random.shuffle(lines)
test_data = []
for i in lines:
    test_data.append(i.split())


In [28]:
# running on entire test data takes a while
score = bleu(test_data[1:500], model)
print(f"Bleu score {score * 100:.2f}") # EPOCH 10 -> 54.79 Now at EPOCH 110 -> 61.72

100%|██████████| 499/499 [00:15<00:00, 32.33it/s]


Bleu score 59.48


In [29]:
model.eval()
sentence = "tum kya kar rahe ho"
translate_sentence(
    model, sentence, max_length=50
)

'तुम क्या कर रहे हो'

In [37]:
sentence = ''
real = ''

for _ in range(20):
    line = random.choice(lines)
    inp, tar = line.split()
    sentence += inp+' '
    real += tar+' '

sentence = sentence.strip()
real = real.strip()

out = translate_sentence(model, sentence).split()

sentence = sentence.split()
real = real.split()
max_sentence = max([len(i) for i in sentence])

print(f"sentence{' '*(max_sentence-len('sentence'))} modelOutput    real\n")

for i, o in enumerate(out):
    print(f"{sentence[i]}{' '*(max_sentence-len(sentence[i]))} => {o:<10} => {real[i]}")

sentence   modelOutput    real

yaseelaa   => यसीला      => यशीला
of         => ऑफ         => की
fuller     => फुलर       => फुलर
lutari     => लुटरी      => लुटारी
hum        => हुम        => हम
enosh      => एनओश       => एनोश
mukam      => मुकम       => मुकाम
purvon     => पूर्वों    => पुर्वो
elda       => एलडा       => एल्दा
king       => किंग       => राजा
malika     => मलिका      => मलिका
veer       => वीर        => वीर
van        => वन         => वैन
keladhan   => केलाधन     => कैलाघन
charkhi    => चरखी       => चरखी
part       => पार्ट      => ओर
nigtingale => निग्तिंगले => नाइटेंगल
varn       => वर्ण       => वर्न
art        => अर्त       => आर्ट
reservoir  => रिजरवोइर   => रिज़रवायर


# trying with oneHot for educational purposes

In [92]:
# We're ready to define everything we need for training our Seq2Seq model

load_model = False
save_model = True

# Training hyperparameters
num_epochs = 50
learning_rate = 3e-4
batch_size = 512*4

# Model hyperparameters
src_vocab_size = len(eng_vocab)
trg_vocab_size = len(hin_vocab)
num_heads = 8
num_encoder_layers = 2
num_decoder_layers = 2
dropout = 0.10
max_len = 72
forward_expansion = 4
src_pad_idx = eng_vocab.index(TOKEN_SOS)

src_pad_idx

1

In [93]:
src_vocab_size, trg_vocab_size

(29, 71)

In [99]:
embedding_size = max([src_vocab_size, max_len, trg_vocab_size])
embedding_size = max([src_vocab_size, max_len, trg_vocab_size])
print('Embedding size : ', embedding_size)
assert embedding_size%num_heads == 0

Embedding size :  72


In [103]:
# embedding_size = None
modelOneHot = Transformer(
    None,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
    useOneHotEncoder=True
).to(device)

optimizer = optim.Adam(modelOneHot.parameters(), lr=learning_rate)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

pad_idx = hin_vocab.index(TOKEN_PAD)

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(filename=f'model\\hinglish2hindi_oneHot_epoch-50.pth.tar', model=modelOneHot, optimizer=optimizer)

Embedding size :  72


In [104]:
sentence = "mera naam laksh kumar sisodiya hai"

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    modelOneHot.eval()
    translated_sentence = translate_sentence(
        modelOneHot, sentence, max_length=max_len
    )
# 
    print(f"Translated example sentence: \n {translated_sentence}")
    modelOneHot.train()
    losses = []
    
    total_length = train_iterator.__len__()

    for batch_idx, (batch_src, batch_trg) in tqdm(enumerate(train_iterator), total=total_length):
        # removed as we fix this in own dataset if batch_idx == total_length - 1: break # stop 1 before as something is wrong for small batch size
        batch_src = batch_src.squeeze(0)
        batch_trg = batch_trg.squeeze(0)
        # Get input and targets and get to cuda
        inp_data = batch_src.to(device)
        target = batch_trg.to(device)

        # Forward prop
        output = modelOneHot(inp_data, target[:-1, :])

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin.
        # Let's also remove the start token while we're at it
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        losses.append(loss.item())

        # Back prop
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(modelOneHot.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)
    
    if save_model and (epoch+1)%10==0 and epoch!=0:
        checkpoint = {
            "state_dict": modelOneHot.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint, filename=f'model\\hinglish2hindi_oneHot_epoch-{epoch+1}.pth.tar')

[Epoch 0 / 50]
Translated example sentence: 
 ॥ ॥स दऋफभसए  ॥॥थॅ ॥ ॥रररऋफ॥ ॥   ऋस टघ॥थवरदऋरस    रघ  दई    ॥रससदथरदसदस ख़ ॥ ॥थऋञरृयदथऋसऋरऋफ॥रई रररऋफ॥ ॥थररऋऋ थवोवढ़रदऋररअदस रई  दयद   ॥रससदथरदथदस ख़ ॥ ऑस ट॥॥ दथऋ ॥रयऑट॥ ॥रररऋ दईफ़॥ररदई टपदथ*॥फररट॥  दयद  दयद   ॥रदथदथरदएदऋरर ॥॥सस दथ॥ दथऋ ॥रऋफ॥रई रररऋफ॥ ॥थररऋऋञफररऋफ॥फरऋएख़दस ॥  ख़दई   ॥॥रससदऋरदऋफ॥फर   ट॥ ट॥  टदऋ ॥रऋस ॥  रररऋ टदथऋररऋस टदोवढ़रदऋरट॥   रट  दयद   ॥रटटद रद दस र ॥ै॥रञदऋ॥ दथऋञृदऋख़ञृख़ञृदईफ़ृदईृख़दईदईफ़सदोञष॥फभबञृदऋञृदऋसदईदोञ ख़ञष॥ोञरदऋदऋरर


100%|██████████| 118/118 [00:07<00:00, 15.94it/s]


[Epoch 1 / 50]
Translated example sentence: 
 रमर ममाा स्स रार सससस माला


100%|██████████| 118/118 [00:07<00:00, 16.56it/s]


[Epoch 2 / 50]
Translated example sentence: 
 मर्र मान लाला कुर सिसिय्स हा


100%|██████████| 118/118 [00:07<00:00, 16.55it/s]


[Epoch 3 / 50]
Translated example sentence: 
 मरर नाम लास कुर्र सियियो हा


100%|██████████| 118/118 [00:07<00:00, 16.49it/s]


[Epoch 4 / 50]
Translated example sentence: 
 मरार नाम लक्स कुमर सिस्योडी हाई


100%|██████████| 118/118 [00:07<00:00, 16.49it/s]


[Epoch 5 / 50]
Translated example sentence: 
 मरा नाम लक्श कुमर सिसोया हाई


100%|██████████| 118/118 [00:07<00:00, 16.28it/s]


[Epoch 6 / 50]
Translated example sentence: 
 मेरा नाम लक्श कुमर सिसोडी हाई


100%|██████████| 118/118 [00:07<00:00, 16.38it/s]


[Epoch 7 / 50]
Translated example sentence: 
 मेरा नाम लक्श कुमर सिसोडी हाई


100%|██████████| 118/118 [00:07<00:00, 16.43it/s]


[Epoch 8 / 50]
Translated example sentence: 
 मेरा नाम लक्श कुमर सिसोडी हाई


100%|██████████| 118/118 [00:07<00:00, 16.54it/s]


[Epoch 9 / 50]
Translated example sentence: 
 मेरा नाम लक्श कुमर सिसोडी हाई


100%|██████████| 118/118 [00:07<00:00, 16.41it/s]


=> Saving checkpoint
[Epoch 10 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडीया हाई


100%|██████████| 118/118 [00:07<00:00, 15.89it/s]


[Epoch 11 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडीया है


100%|██████████| 118/118 [00:07<00:00, 16.08it/s]


[Epoch 12 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडीया हाई


100%|██████████| 118/118 [00:07<00:00, 16.24it/s]


[Epoch 13 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडीया हाई


100%|██████████| 118/118 [00:07<00:00, 16.39it/s]


[Epoch 14 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडीया है


100%|██████████| 118/118 [00:07<00:00, 16.29it/s]


[Epoch 15 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडीया है


100%|██████████| 118/118 [00:07<00:00, 16.58it/s]


[Epoch 16 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.65it/s]


[Epoch 17 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.53it/s]


[Epoch 18 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.24it/s]


[Epoch 19 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.45it/s]


=> Saving checkpoint
[Epoch 20 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.57it/s]


[Epoch 21 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.58it/s]


[Epoch 22 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.52it/s]


[Epoch 23 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.41it/s]


[Epoch 24 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.44it/s]


[Epoch 25 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.53it/s]


[Epoch 26 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.45it/s]


[Epoch 27 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.52it/s]


[Epoch 28 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.56it/s]


[Epoch 29 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.53it/s]


=> Saving checkpoint
[Epoch 30 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:07<00:00, 16.57it/s]


[Epoch 31 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:07<00:00, 16.49it/s]


[Epoch 32 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.52it/s]


[Epoch 33 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.48it/s]


[Epoch 34 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.46it/s]


[Epoch 35 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.41it/s]


[Epoch 36 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.54it/s]


[Epoch 37 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.47it/s]


[Epoch 38 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.56it/s]


[Epoch 39 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:07<00:00, 16.65it/s]


=> Saving checkpoint
[Epoch 40 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:07<00:00, 16.60it/s]


[Epoch 41 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:07<00:00, 16.61it/s]


[Epoch 42 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:07<00:00, 16.62it/s]


[Epoch 43 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.54it/s]


[Epoch 44 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:07<00:00, 16.47it/s]


[Epoch 45 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.47it/s]


[Epoch 46 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.53it/s]


[Epoch 47 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.51it/s]


[Epoch 48 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोदिया है


100%|██████████| 118/118 [00:07<00:00, 16.53it/s]


[Epoch 49 / 50]
Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


100%|██████████| 118/118 [00:07<00:00, 16.60it/s]


=> Saving checkpoint


In [None]:
checkpoint = {
    "state_dict": modelOneHot.state_dict(),
    "optimizer": optimizer.state_dict(),    
}
save_checkpoint(checkpoint, filename=f'model\\hinglish2hindi_oneHot_epoch-50.pth.tar')

In [105]:
load_checkpoint(filename=f'model\\hinglish2hindi_oneHot_epoch-50.pth.tar', model=modelOneHot, optimizer=optimizer)

=> Loading checkpoint


In [107]:
modelOneHot.eval()
translated_sentence = translate_sentence(
    modelOneHot, "mera naam laksh kumar sisodiya hai", max_length=50
)

print(f"Translated example sentence: \n {translated_sentence}")

Translated example sentence: 
 मेरा नाम लक्ष कुमर सिसोडिया है


In [108]:
lines = []
with open('dataset.txt', 'r', encoding='utf') as f:
    for line in f.readlines():
        lines.append(line)
random.shuffle(lines)
test_data = []
for i in lines:
    test_data.append(i.split())


In [112]:
# running on entire test data takes a while
score = bleu(test_data[1:500], modelOneHot)
print(f"Bleu score {score * 100:.2f}")

100%|██████████| 499/499 [00:17<00:00, 28.47it/s]

Bleu score 49.72





In [113]:
modelOneHot.eval()
sentence = "tum kya kar rahe ho"
translate_sentence(
    modelOneHot, sentence, max_length=50
)

'तुम क्या कर रहे हो'

In [114]:
sentence = ''
real = ''

for _ in range(20):
    line = random.choice(lines)
    inp, tar = line.split()
    sentence += inp+' '
    real += tar+' '

sentence = sentence.strip()
real = real.strip()

out = translate_sentence(modelOneHot, sentence).split()

sentence = sentence.split()
real = real.split()
max_sentence = max([len(i) for i in sentence])

print(f"sentence{' '*(max_sentence-len('sentence'))} modelOutput    real\n")

for i, o in enumerate(out):
    print(f"{sentence[i]}{' '*(max_sentence-len(sentence[i]))} => {o:<10} => {real[i]}")

sentence  modelOutput    real

maria     => मरिया      => मारिया
abdul     => अबुल       => अब्दुल
adimurty  => अदिमुर्त्य => आदिमूर्ति
pahala    => पहला       => पहला
tata      => तता        => टाटा
law       => लव         => कानून
chirag    => चिराग      => चिराग़
gulabganj => गुलबगंज    => गुलाबगंज
museum    => मुसेम      => म्युज़ियम
koen      => कोएन       => कोएन
karter    => कर्तर      => कार्टर
babu      => बबू        => बाबू
baspist   => बसपिस्ट    => बैपटिस्ट
gray      => ग्रय       => कंबरलैंड
ramesh    => रमेश       => रमेश
durg      => दुर्ग      => दुर्ग
kanada    => कनदा       => कनाडा
yvonne    => यूनने      => वॉन्नी
batuque   => बतुकुए     => बैट्यूक
carolina  => करोलिना    => कैरोलिना
