In [None]:
import torch.nn.functional as F
import torch.nn as nn
import torch
import numpy as np
from tqdm.auto import tqdm

from datasets import load_dataset

In [None]:
# Load the "cfilt/iitb-english-hindi" dataset through Hugging Face `datasets`;
# later cells index the result by split name ('train', 'validation', 'test').
dataset = load_dataset("cfilt/iitb-english-hindi")

In [None]:
# dataset['train']['translation'][:10]

# Tokenizer

In [None]:
from transformers import AutoTokenizer

# Pretrained "gpt2" tokenizer. Below it serves as the starting point for
# `train_new_from_iterator` and for a quick tokenization check.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
def training_corpus(dtype='train', lang='hi', batch_size=1000):
    """Yield the sentences of one language from one split, in batches.

    Args:
        dtype: Name of the split in the module-level `dataset` to read.
        lang: Key inside each row's "translation" dict (e.g. 'hi' or 'en').
        batch_size: Maximum number of sentences per yielded list.

    Yields:
        Lists of at most `batch_size` sentences; the last list may be shorter.
    """
    split = dataset[dtype]
    for start in range(0, len(split), batch_size):
        # One slice per batch instead of one lookup per row: row-by-row
        # indexing of the dataset is far slower on a corpus of this size.
        rows = split[start:start + batch_size]["translation"]
        yield [row[lang] for row in rows]


In [None]:
from transformers import AutoTokenizer

# NOTE(review): this cell repeats the earlier `old_tokenizer` loading cell;
# running it again just rebinds `old_tokenizer` to the same "gpt2" tokenizer.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
# Keep only the first batch of each language for the quick inspections below.
# `next(...)` states the intent directly and fails immediately if the corpus
# is empty, instead of silently leaving the name unbound.
hi_data = next(training_corpus(dtype='train', lang='hi'))
en_data = next(training_corpus(dtype='train', lang='en'))

In [None]:
# Tokenize one Hindi sample with the stock GPT-2 tokenizer to see how it
# splits the text before a dedicated tokenizer is trained.
# (`hi_data` is the first Hindi batch taken from `training_corpus` above.)
tokens = old_tokenizer.tokenize(hi_data[0])
len(tokens), tokens

In [None]:
# Target vocabulary sizes for the Hindi and English tokenizers trained below.
HI_VOCAB_SIZE = 75_000
EN_VOCAB_SIZE = 75_000

In [None]:
# Train one tokenizer per language from the training split, starting from the
# GPT-2 tokenizer loaded above.
hi_tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus(dtype='train', lang='hi'),
    vocab_size=HI_VOCAB_SIZE,
)
en_tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus(dtype='train', lang='en'),
    vocab_size=EN_VOCAB_SIZE,
)

In [None]:
# Persist the trained English tokenizer under "eng-tokenizer"; the
# "Load Tokenizers" section reloads it from that same path.
en_tokenizer.save_pretrained("eng-tokenizer")

In [None]:
# Persist the trained Hindi tokenizer under "hindi-tokenizer"; the
# "Load Tokenizers" section reloads it from that same path.
hi_tokenizer.save_pretrained("hindi-tokenizer")

In [None]:
# Round-trip check for the Hindi tokenizer: split a sample sentence into
# tokens, then join the tokens back into a string.
tokens = hi_tokenizer.tokenize(hi_data[2])
print(len(tokens), tokens)
hi_tokenizer.convert_tokens_to_string(tokens)

In [None]:
# Round-trip check for the English tokenizer: split a sample sentence into
# tokens, then join the tokens back into a string.
tokens = en_tokenizer.tokenize(en_data[2])
print(len(tokens),tokens)
en_tokenizer.convert_tokens_to_string(tokens)

### Load saved tokenizers from disk

In [None]:
from transformers import AutoTokenizer

# Reload both tokenizers from the directories written by `save_pretrained`.
hi_tokenizer, en_tokenizer = (
    AutoTokenizer.from_pretrained(saved_dir)
    for saved_dir in ("hindi-tokenizer", "eng-tokenizer")
)

In [None]:
# Register the same pad / cls / eos / bos markers on both tokenizers.
special_tokens = {'pad_token': '[PAD]', 'cls_token': '<cls>', 'eos_token': '<eos>', 'bos_token': '<s>'}

# Each call receives its own copy of the mapping.
hi_tokenizer.add_special_tokens(dict(special_tokens))

en_tokenizer.add_special_tokens(dict(special_tokens))

In [None]:
from tokenizers.processors import TemplateProcessing

# Wrap every single English sequence as "<bos> $A <eos>" at encode time.
bos_text, eos_text = en_tokenizer.bos_token, en_tokenizer.eos_token
en_tokenizer._tokenizer.post_processor = TemplateProcessing(
    single=f"{bos_text} $A {eos_text}",
    special_tokens=[
        (eos_text, en_tokenizer.eos_token_id),
        (bos_text, en_tokenizer.bos_token_id),
    ],
)

In [None]:
# Pick one English sentence to sanity-check the post-processor.
# Index the row first: `dataset['train']['translation']` would materialise the
# entire translation column just to read a single entry.
en_sen = dataset['train'][1]['translation']['en']

In [None]:
# Encode the sample sentence with special tokens requested explicitly.
# Presumably the ids should start/end with the bos/eos ids configured by the
# template post-processor above; confirm against the displayed output.
en_tokenizer.encode(en_sen, add_special_tokens = True)

# Translator - Train

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Restrict the train and validation splits to their first two examples.
subset = list(range(2))
for split_name in ('train', 'validation'):
    dataset[split_name] = torch.utils.data.Subset(dataset[split_name], subset)

In [None]:
BS = 2

# One shuffled DataLoader per split, all sharing the same batch size.
train_loader, val_loader, test_loader = (
    torch.utils.data.DataLoader(dataset[split], batch_size=BS, shuffle=True)
    for split in ('train', 'validation', 'test')
)

In [None]:
# Pull a single sample batch for interactive inspection; fails immediately
# (StopIteration) if the loader is empty instead of leaving `b` unbound.
b = next(iter(train_loader))

In [None]:
# Tokenize the sample batch once as a (Hindi encoding, English encoding) pair.
# The inspection cells in the "Evaluate" section index into `d`, so it has to
# be a live assignment for the notebook to run top to bottom.
d = (
    hi_tokenizer(b['translation']['hi'], padding=True, truncation=True, return_tensors="pt"),
    en_tokenizer(b['translation']['en'], padding=True, truncation=True, return_tensors="pt"),
)

In [None]:
# Negative log-likelihood loss (expects log-probabilities as input); target
# positions equal to the English pad token id are excluded from the loss.
criterion = nn.NLLLoss(ignore_index=en_tokenizer.pad_token_id)

In [None]:
def compute_loss(predictions, targets):
    """Score shifted predictions against shifted targets with `criterion`.

    The last time step of `predictions` and the first token of `targets` are
    dropped, so the prediction at position t is scored against target token
    t + 1. Both tensors are then flattened over batch and time and passed to
    the module-level `criterion`.

    NOTE(review): callers obtain `predictions` from a model call that also
    receives the targets as `labels`; if that call already shifts the decoder
    inputs internally, this extra shift may skip one token - confirm.
    """
    shifted_predictions = predictions[:, :-1, :]
    shifted_targets = targets[:, 1:]

    # Collapse (batch, time) into a single axis; the remaining axis of the
    # predictions is inferred.
    flat_predictions = shifted_predictions.reshape(
        shifted_predictions.shape[0] * shifted_predictions.shape[1], -1
    )
    flat_targets = shifted_targets.reshape(-1)

    return criterion(flat_predictions, flat_targets)

In [None]:
import transformers

# Separate BERT configs so encoder and decoder each get their own vocabulary
# size (Hindi on the encoder side, English on the decoder side).
encoder_config = transformers.BertConfig(vocab_size=len(hi_tokenizer))
decoder_config = transformers.BertConfig(vocab_size=len(en_tokenizer))

config = transformers.EncoderDecoderConfig.from_encoder_decoder_configs(
    encoder_config=encoder_config,
    decoder_config=decoder_config,
)
model = transformers.EncoderDecoderModel(config=config)

In [None]:
# Copy the special-token ids from the English tokenizer onto the model config.
for config_field, token_id in (
    ("decoder_start_token_id", en_tokenizer.cls_token_id),
    ("pad_token_id", en_tokenizer.pad_token_id),
    ("eos_token_id", en_tokenizer.eos_token_id),
    ("bos_token_id", en_tokenizer.bos_token_id),
):
    setattr(model.config, config_field, token_id)

model = model.to(device)

In [None]:
# `transformers.AdamW` is deprecated (and absent from newer releases); use the
# PyTorch implementation instead. `eps` and `weight_decay` are set explicitly
# to the values the transformers optimizer used by default, so the update rule
# stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)

In [None]:
def train_model(train_loader):
    """Run one training epoch and return the mean per-batch loss.

    Each batch is tokenized with the Hindi (source) and English (target)
    tokenizers, passed through the module-level `model`, scored with
    `compute_loss`, and used for one clipped-gradient `optimizer` step.

    Args:
        train_loader: Iterable of batches whose `b['translation']['hi']` and
            `b['translation']['en']` hold the source/target sentences.

    Returns:
        Sum of the batch losses divided by `len(train_loader)`.
    """
    model.train()
    epoch_loss = 0

    num_train_batches = len(train_loader)
    # Wrap the loader itself (not `enumerate(...)`) so tqdm knows the total
    # and can show real progress; this replaces the printed batch counters.
    for b in tqdm(train_loader, total=num_train_batches, desc="train", leave=False):
        optimizer.zero_grad()

        hi_token = hi_tokenizer(b['translation']['hi'], padding=True, truncation=True, return_tensors="pt")
        en_token = en_tokenizer(b['translation']['en'], padding=True, truncation=True, return_tensors="pt")

        hi_input = hi_token['input_ids'].to(device)
        hi_masks = hi_token['attention_mask'].to(device)

        en_output = en_token['input_ids'].to(device)

        out = model(input_ids=hi_input, attention_mask=hi_masks, labels=en_output)
        # Read the logits by name rather than by position in the output tuple.
        predictions = F.log_softmax(out.logits, dim=-1)
        loss = compute_loss(predictions, en_output)

        loss.backward()
        # Clip the global gradient norm to 1.0 before stepping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / num_train_batches

In [None]:
def _greedy_decoding_loss(hi_token, en_token, device):
    """Greedily decode one batch and return the loss of the final pass.

    Starting from the English bos id, one token per step is appended (argmax
    of the last position) for as many steps as the target has columns. Rows
    that have already produced the eos id receive the pad id from then on.
    The log-softmax of the last forward pass is scored with `compute_loss`.
    """
    hi_input = hi_token['input_ids'].to(device)
    hi_masks = hi_token['attention_mask'].to(device)
    en_output = en_token['input_ids'].to(device)

    batch_size = hi_input.shape[0]
    dec_out = torch.full(
        (batch_size, 1), en_tokenizer.bos_token_id, dtype=torch.long, device=device
    )
    # True for rows whose most recent tokens already included eos.
    finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

    for _ in range(en_output.shape[1]):
        output = model(input_ids=hi_input, attention_mask=hi_masks, labels=dec_out)
        pred_words = torch.argmax(output.logits, dim=-1)[:, -1:]
        pred_words[finished] = en_tokenizer.pad_token_id
        dec_out = torch.cat((dec_out, pred_words), dim=1)
        finished |= dec_out[:, -1] == en_tokenizer.eos_token_id

    predictions = F.log_softmax(output.logits, dim=-1)
    return compute_loss(predictions, en_output).item()


def eval_model(val_loader):
    """Return the mean greedy-decoding loss over `val_loader`.

    Puts the module-level `model` in eval mode and runs every batch through
    `_greedy_decoding_loss` with gradient tracking disabled.

    Args:
        val_loader: Iterable of batches whose `b['translation']['hi']` and
            `b['translation']['en']` hold the source/target sentences.

    Returns:
        Sum of the batch losses divided by `len(val_loader)`.
    """
    model.eval()
    epoch_loss = 0

    num_valid_batches = len(val_loader)
    # Evaluation never backpropagates, so skip building autograd graphs.
    with torch.no_grad():
        for b in tqdm(val_loader, total=num_valid_batches, desc="eval", leave=False):
            hi_token = hi_tokenizer(b['translation']['hi'], padding=True, truncation=True, return_tensors="pt")
            en_token = en_tokenizer(b['translation']['en'], padding=True, truncation=True, return_tensors="pt")

            epoch_loss += _greedy_decoding_loss(hi_token, en_token, device)

    return (epoch_loss / num_valid_batches)

eval_model(train_loader)

In [None]:

# Number of batches in the training and validation loaders.
len(train_loader), len(val_loader)

In [None]:
NUM_EPOCHS = 100

for epoch in range(NUM_EPOCHS):
    train_epoch_loss = train_model(train_loader)
    # The value reported as `val_loss` must come from the validation loader,
    # not from the data the model was just trained on.
    val_epoch_loss = eval_model(val_loader)
    print(f"\n\nepoch: {epoch}, train_loss: {train_epoch_loss}, val_loss: {val_epoch_loss}")
    

### Save Model Weights

In [None]:
# File the model's state dict is saved to and later reloaded from.
PATH = "./translate_hin_to_eng.pth"

In [None]:
# Save only the weights (state dict), not the model object itself; loading
# therefore requires rebuilding the same architecture first.
torch.save(model.state_dict(), PATH)

### Load Model

In [None]:
import transformers

# Rebuild the architecture exactly as it was built for training so the saved
# state dict's keys and shapes match.
encoder_config = transformers.BertConfig(vocab_size=len(hi_tokenizer))
decoder_config = transformers.BertConfig(vocab_size = len(en_tokenizer))

config = transformers.EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
model = transformers.EncoderDecoderModel(config)

model.config.decoder_start_token_id = en_tokenizer.cls_token_id
model.config.pad_token_id = en_tokenizer.pad_token_id
model.config.eos_token_id = en_tokenizer.eos_token_id
model.config.bos_token_id = en_tokenizer.bos_token_id

# map_location="cpu": a checkpoint written from CUDA tensors would otherwise
# fail to deserialize on a machine without a GPU. The freshly built model
# lives on the CPU at this point anyway.
model.load_state_dict(torch.load(PATH, map_location="cpu"))
model.eval()

# Evaluate

In [None]:
# Decode each row of the English ids in `d` back to text.
[en_tokenizer.decode(token_ids) for token_ids in d[1]['input_ids']]

In [None]:
# Forward pass with the English ids supplied as labels, then decode the
# per-position argmax of the logits for each row.
out = model(
    input_ids=d[0]['input_ids'],
    attention_mask=d[0]['attention_mask'],
    labels=d[1]['input_ids'],
)

[en_tokenizer.decode(token_ids) for token_ids in out.logits.argmax(dim=-1)]

In [None]:
# Free-running generation from the Hindi ids in `d`. The batch is padded, so
# pass the attention mask explicitly (as the forward-pass cell does) to keep
# the encoder from attending to pad positions.
output = model.generate(
    input_ids=d[0]['input_ids'],
    attention_mask=d[0]['attention_mask'],
    decoder_start_token_id=en_tokenizer.cls_token_id,
)

[en_tokenizer.decode(token_ids) for token_ids in output]

In [None]:
# model.eval()
# epoch_loss = 0


# # optimizer.zero_grad()
# out = model(input_ids=d[0]['input_ids'],
#                          attention_mask = d[0]['attention_mask'],
#                          labels = d[1]['input_ids'])

# prediction_scores = out.logits
# predictions = F.log_softmax(prediction_scores, dim=-1)
# loss = compute_loss(predictions, d[1]['input_ids'])
# epoch_loss += loss.item()

# print("Mean validation loss:", epoch_loss)


In [None]:
# list(map(en_tokenizer.decode, torch.argmax(predictions,dim=-1)))

In [None]:
# Move the reloaded model to the selected device for the decoding cells below.
model = model.to(device)

In [None]:
# Scratch check: select the entries of `unfinished_seq` for rows whose latest
# decoded token is the English eos id.
# NOTE(review): no cell above assigns `unfinished_seq` or `dec_out` at module
# level (they are only locals inside functions); the scratch greedy-decoding
# cell near the end of the notebook does. Presumably this cell was run after
# it - confirm, or move/remove this cell.
unfinished_seq[(dec_out[:,-1] == en_tokenizer.eos_token_id).cpu().numpy()]

In [None]:
# Decode the second row of the Hindi ids in `d` back to text.
hi_tokenizer.decode(d[0]['input_ids'][1])

In [None]:
# Take one batch from the training loader (fails immediately if the loader is
# empty) and tokenize both sides for the decoding experiments below.
b = next(iter(train_loader))
hi_token = hi_tokenizer(b['translation']['hi'], padding=True, truncation=True, return_tensors="pt")
en_token = en_tokenizer(b['translation']['en'], padding=True, truncation=True, return_tensors="pt")


In [None]:
## greedy decoding
def greedy_decoding(hi_token, en_token):
    """Greedily decode a batch, then print its loss and the decoded text.

    Starting from the English bos id, one token per step is appended (argmax
    of the last position) for as many steps as the target has columns. Rows
    that have already produced the eos id receive the pad id from then on.

    Args:
        hi_token: Tokenized Hindi batch with 'input_ids' and 'attention_mask'.
        en_token: Tokenized English batch with 'input_ids'.

    Prints the loss of the final forward pass (via `compute_loss`), followed
    by the decoded predictions and the decoded reference sentences.
    """
    model.eval()

    # Move the inputs to the device once instead of on every decoding step.
    hi_input = hi_token['input_ids'].to(device)
    hi_masks = hi_token['attention_mask'].to(device)
    en_output = en_token['input_ids'].to(device)

    # Derive the batch size from the input so a short final batch also works.
    batch_size = hi_input.shape[0]
    dec_out = torch.full(
        (batch_size, 1), en_tokenizer.bos_token_id, dtype=torch.long, device=device
    )
    finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

    # Decoding never backpropagates, so skip building autograd graphs.
    with torch.no_grad():
        for _ in range(en_output.shape[1]):
            output = model(input_ids=hi_input, attention_mask=hi_masks, labels=dec_out)
            pred_words = torch.argmax(output.logits, dim=-1)[:, -1:]
            pred_words[finished] = en_tokenizer.pad_token_id
            dec_out = torch.cat((dec_out, pred_words), dim=1)
            finished |= dec_out[:, -1] == en_tokenizer.eos_token_id

        predictions = F.log_softmax(output.logits, dim=2)
        loss = compute_loss(predictions, en_output)

    print(loss.item())
    print(list(map(en_tokenizer.decode, dec_out)), '\n', list(map(en_tokenizer.decode, en_token['input_ids'])))

greedy_decoding(hi_token, en_token)

In [None]:
## greedy decoding
# Upper bound on generated tokens, so decoding terminates even if the model
# never emits eos.
MAX_DECODE_STEPS = 128
model.eval()

b = next(iter(train_loader))
hi_token = hi_tokenizer(b['translation']['hi'], padding=True, truncation=True, return_tensors="pt")
en_token = en_tokenizer(b['translation']['en'], padding=True, truncation=True, return_tensors="pt")

hi_input = hi_token['input_ids'].to(device)
hi_masks = hi_token['attention_mask'].to(device)

en_output = en_token['input_ids'].to(device)

# Derive the batch size from the data rather than hard-coding it.
batch_size = hi_input.shape[0]
dec_out = torch.full(
    (batch_size, 1), en_tokenizer.bos_token_id, dtype=torch.long, device=device
)

# 1 while a row is still generating, 0 once it has produced eos.
unfinished_seq = np.ones(batch_size, dtype=int)

with torch.no_grad():
    for _ in range(MAX_DECODE_STEPS):
        if not unfinished_seq.any():
            break
        # Pass the attention mask so the encoder ignores pad positions.
        output = model(input_ids=hi_input, attention_mask=hi_masks, labels=dec_out)
        pred_words = torch.argmax(output.logits, dim=-1)[:, -1:]
        pred_words[unfinished_seq == 0, :] = en_tokenizer.pad_token_id
        dec_out = torch.cat((dec_out, pred_words), dim=1)

        unfinished_seq[(dec_out[:, -1] == en_tokenizer.eos_token_id).cpu().numpy()] = 0

list(map(en_tokenizer.decode, dec_out)), list(map(en_tokenizer.decode, en_output))

In [None]:
# Inspect the raw token ids stored in the second element of `d`
# (presumably the English encoding - confirm where `d` is assigned).
d[1]['input_ids']

In [None]:
# Compare the shape of the most recent `output.logits` with the shape of the
# id tensor in `d[1]`.
output.logits.shape, d[1]['input_ids'].shape

In [None]:
# Total number of elements across all model parameters.
sum(map(torch.Tensor.numel, model.parameters()))