In [1]:
import torch.nn.functional as F
import torch.nn as nn
import torch
import numpy as np

from datasets import load_dataset

In [2]:
# Load the `cfilt/iitb-english-hindi` dataset; later cells read its
# 'train', 'validation' and 'test' splits.
dataset = load_dataset(path="cfilt/iitb-english-hindi")

Found cached dataset parquet (/home/administrator/.cache/huggingface/datasets/cfilt___parquet/cfilt--iitb-english-hindi-911387c6837f8b91/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# dataset['train']['translation'][:10]

# Tokenizer

In [None]:
from transformers import AutoTokenizer

# Base tokenizer that the language-specific tokenizers are trained from further down.
BASE_TOKENIZER_NAME = "gpt2"
old_tokenizer = AutoTokenizer.from_pretrained(BASE_TOKENIZER_NAME)

In [None]:
def training_corpus(dtype='train', lang='hi', batch_size=1000):
    """Yield one language side of the corpus as successive lists of sentences.

    Args:
        dtype: Name of the split of the module-level ``dataset`` to read.
        lang: Key inside each row's ``"translation"`` dict (this notebook
            uses ``'hi'`` and ``'en'``).
        batch_size: Maximum number of sentences per yielded list.

    Yields:
        Lists of sentences; every list holds ``batch_size`` items except
        possibly the last one.
    """
    split = dataset[dtype]
    for start in range(0, len(split), batch_size):
        # One slice fetches the whole batch, instead of one row lookup per sentence.
        rows = split[start:start + batch_size]["translation"]
        yield [row[lang] for row in rows]


In [None]:
# NOTE: this cell repeats the earlier tokenizer-loading cell verbatim; running it
# simply rebinds `old_tokenizer` to a freshly loaded "gpt2" tokenizer.
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
# Keep only the first batch of each language for quick inspection.
# next() pulls exactly one batch from the generator and raises StopIteration
# right away on an empty corpus, instead of leaving the name undefined.
hi_data = next(training_corpus(dtype='train', lang='hi'))
en_data = next(training_corpus(dtype='train', lang='en'))

In [None]:
# Tokenize one Hindi sample with the base tokenizer to inspect how it splits the text.
# `d` is not bound until the training section (and holds encodings there, not text),
# so the sample is taken from the `hi_data` batch extracted in the previous cell.
tokens = old_tokenizer.tokenize(hi_data[0])
len(tokens), tokens

In [None]:
# Target vocabulary sizes passed to the tokenizer training below.
HI_VOCAB_SIZE = 75_000
EN_VOCAB_SIZE = 75_000

In [None]:
# Train one new tokenizer per language from `old_tokenizer`.
hi_corpus = training_corpus(dtype='train', lang='hi')
en_corpus = training_corpus(dtype='train', lang='en')
hi_tokenizer = old_tokenizer.train_new_from_iterator(hi_corpus, vocab_size=HI_VOCAB_SIZE)
en_tokenizer = old_tokenizer.train_new_from_iterator(en_corpus, vocab_size=EN_VOCAB_SIZE)

In [None]:
# Persist the trained English tokenizer under "eng-tokenizer"; the
# "Load Tokenizers" section reads it back from the same location.
en_tokenizer.save_pretrained("eng-tokenizer")

In [None]:
# Persist the trained Hindi tokenizer under "hindi-tokenizer"; the
# "Load Tokenizers" section reads it back from the same location.
hi_tokenizer.save_pretrained("hindi-tokenizer")

In [None]:
# Round-trip check for the Hindi tokenizer: split a sample into tokens,
# then join the tokens back into a string.
hi_sample = hi_data[2]
tokens = hi_tokenizer.tokenize(hi_sample)
print(len(tokens), tokens)
hi_tokenizer.convert_tokens_to_string(tokens)

In [None]:
# Round-trip check for the English tokenizer: split a sample into tokens,
# then join the tokens back into a string.
en_sample = en_data[2]
tokens = en_tokenizer.tokenize(en_sample)
print(len(tokens), tokens)
en_tokenizer.convert_tokens_to_string(tokens)

### Load tokenizers from the saved directories

In [3]:
from transformers import AutoTokenizer

# Reload both trained tokenizers from the locations they were saved to.
hi_tokenizer, en_tokenizer = (
    AutoTokenizer.from_pretrained(saved_dir)
    for saved_dir in ("hindi-tokenizer", "eng-tokenizer")
)

In [4]:
# Both tokenizers receive the same set of special tokens.
SPECIAL_TOKENS = {
    'pad_token': '[PAD]',
    'cls_token': '<cls>',
    'eos_token': '<eos>',
    'bos_token': '<s>',
}

hi_tokenizer.add_special_tokens(SPECIAL_TOKENS)
# Last expression of the cell: the value returned for the English tokenizer is displayed.
en_tokenizer.add_special_tokens(SPECIAL_TOKENS)

3

In [5]:
from tokenizers.processors import TemplateProcessing
# Wrap every single-sentence English encoding as "<bos> sentence <eos>".
en_bos, en_eos = en_tokenizer.bos_token, en_tokenizer.eos_token
en_tokenizer._tokenizer.post_processor = TemplateProcessing(
    single=f"{en_bos} $A {en_eos}",
    special_tokens=[
        (en_eos, en_tokenizer.eos_token_id),
        (en_bos, en_tokenizer.bos_token_id),
    ],
)

In [6]:
# Index the row first: dataset['train']['translation'] would materialise the whole
# column just to read one sentence.
en_sen = dataset['train'][1]['translation']['en']

In [7]:
# Encode the sample with special tokens enabled, so the post-processor template
# set on the English tokenizer (bos + sentence + eos) is applied to the ids.
en_tokenizer.encode(en_sen, add_special_tokens = True)

[75003, 60318, 50652, 32916, 75002]

# Translator - Train

In [8]:

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [36]:
BS = 2
# Only the training split is shuffled. Validation and test keep a fixed order so
# evaluation sees the examples in the same order on every run.
train_loader = torch.utils.data.DataLoader(dataset['train'], batch_size=BS, shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset['validation'], batch_size=BS, shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset['test'], batch_size=BS, shuffle=False)


In [37]:
# Take a single training batch; the training cell below reuses it at every step.
# next(iter(...)) fetches exactly one batch and fails immediately on an empty loader.
b = next(iter(train_loader))

# d[0]: padded Hindi encodings (model input); d[1]: padded English encodings (labels).
d = (
    hi_tokenizer(b['translation']['hi'], padding=True, truncation=True, return_tensors="pt"),
    en_tokenizer(b['translation']['en'], padding=True, truncation=True, return_tensors="pt"),
)

In [21]:
# Negative log-likelihood over log-probabilities, averaged over the targets that are
# not the English padding id.
criterion = nn.NLLLoss(
    ignore_index=en_tokenizer.pad_token_id,
    reduction="mean",
)

In [22]:
def compute_loss(predictions, targets, shift=True):
    """Score decoder log-probabilities against target ids with ``criterion``.

    Args:
        predictions: Log-probabilities indexed as (batch, position, vocab).
        targets: Target token ids indexed as (batch, position).
        shift: When True (the default, and the original behaviour) the
            prediction at position ``t`` is scored against the target at
            ``t + 1``: the last prediction and the first target are dropped.
            Pass False when ``predictions`` are already aligned with
            ``targets`` position by position.

    Returns:
        Whatever the module-level ``criterion`` returns for the flattened
        (N, vocab) predictions and (N,) targets.

    NOTE(review): ``EncoderDecoderModel`` called with only ``labels=`` appears
    to build its decoder inputs by shifting the labels right itself, which
    would make its logits already aligned with the labels. Confirm whether
    the training cell should therefore call this with ``shift=False``.
    """
    if shift:
        predictions = predictions[:, :-1, :]
        targets = targets[:, 1:]

    # Collapse batch and position into one axis, as expected by the criterion.
    flat_predictions = predictions.reshape(-1, predictions.shape[-1])
    flat_targets = targets.reshape(-1)

    return criterion(flat_predictions, flat_targets)

In [23]:
import transformers

# BERT encoder/decoder pair built from configs only (no pretrained weights are loaded
# here), each sized to the vocabulary of its own tokenizer.
hi_vocab, en_vocab = len(hi_tokenizer), len(en_tokenizer)
encoder_config = transformers.BertConfig(vocab_size=hi_vocab)
decoder_config = transformers.BertConfig(vocab_size=en_vocab)

config = transformers.EncoderDecoderConfig.from_encoder_decoder_configs(
    encoder_config=encoder_config,
    decoder_config=decoder_config,
)
model = transformers.EncoderDecoderModel(config=config)

In [24]:
# Register the English tokenizer's special ids on the model config.
special_ids = {
    "decoder_start_token_id": en_tokenizer.cls_token_id,
    "pad_token_id": en_tokenizer.pad_token_id,
    "eos_token_id": en_tokenizer.eos_token_id,
    "bos_token_id": en_tokenizer.bos_token_id,
}
for attr_name, token_id in special_ids.items():
    setattr(model.config, attr_name, token_id)

model = model.to(device)

In [25]:
model.train()

# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the defaults of the transformers
# implementation so the optimisation behaves as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

epochs = 100

# The same batch `d` is reused at every step (a single-batch overfitting check),
# so it is moved to the device once instead of on every iteration.
hi_token = d[0]['input_ids'].to(device)
hi_mask = d[0]['attention_mask'].to(device)
en_token = d[1]['input_ids'].to(device)

for e in range(epochs):
    optimizer.zero_grad()

    out = model(input_ids=hi_token, attention_mask=hi_mask, labels=en_token)
    prediction_scores = out.logits
    predictions = F.log_softmax(prediction_scores, dim=-1)
    # NOTE(review): compute_loss scores position t against target t + 1. If the
    # model already shifts `labels` internally, this is a second shift — confirm.
    loss = compute_loss(predictions, en_token)

    print(f"epoch {e}:", loss.item())
    loss.backward()
    # Clip the global gradient norm to 1.0 before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()



epoch 0: 11.586273193359375
epoch 1: 9.458174705505371
epoch 2: 8.42220687866211
epoch 3: 7.8575568199157715
epoch 4: 7.360096454620361
epoch 5: 6.655338287353516
epoch 6: 6.140342712402344
epoch 7: 5.489269733428955
epoch 8: 4.9837422370910645
epoch 9: 4.242959022521973
epoch 10: 3.792346954345703
epoch 11: 3.4489777088165283
epoch 12: 2.9831364154815674
epoch 13: 2.545281171798706
epoch 14: 2.3930718898773193
epoch 15: 2.0435307025909424
epoch 16: 1.7170997858047485
epoch 17: 1.5399545431137085
epoch 18: 1.369398593902588
epoch 19: 1.1758708953857422
epoch 20: 1.0006738901138306
epoch 21: 0.9694137573242188
epoch 22: 0.8083035945892334
epoch 23: 0.6873987317085266
epoch 24: 0.6448652744293213
epoch 25: 0.48635587096214294
epoch 26: 0.4270227253437042
epoch 27: 0.3203878402709961
epoch 28: 0.2857956290245056
epoch 29: 0.24199660122394562
epoch 30: 0.21654802560806274
epoch 31: 0.17343205213546753
epoch 32: 0.14722836017608643
epoch 33: 0.11802112311124802
epoch 34: 0.10346502810716629

### Save Model Weights

In [None]:
# Location of the saved model weights; written by torch.save and read back by
# torch.load in the following sections.
PATH = "./translate_hin_to_eng.pth"

In [None]:
# Save only the parameters (state dict), not the model object; loading therefore
# requires rebuilding the same architecture first, as the "Load Model" cell does.
torch.save(model.state_dict(), PATH)

### Load Model

In [None]:
import transformers

# Rebuild the same architecture that was used for training, then restore the weights.
encoder_config = transformers.BertConfig(vocab_size=len(hi_tokenizer))
decoder_config = transformers.BertConfig(vocab_size=len(en_tokenizer))

config = transformers.EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
model = transformers.EncoderDecoderModel(config)

model.config.decoder_start_token_id = en_tokenizer.cls_token_id
model.config.pad_token_id = en_tokenizer.pad_token_id
model.config.eos_token_id = en_tokenizer.eos_token_id
model.config.bos_token_id = en_tokenizer.bos_token_id

# map_location="cpu" lets a checkpoint written from a CUDA model load on a machine
# without a GPU. The rebuilt model itself stays on the CPU, as before.
model.load_state_dict(torch.load(PATH, map_location="cpu"))
model.eval()

# Evaluate

In [None]:
# Decode the reference English ids of the sample batch, one string per row.
[en_tokenizer.decode(row_ids) for row_ids in d[1]['input_ids']]

In [None]:
# Forward pass on the sample batch with the reference English ids supplied as labels.
# Gradients are not needed for inspection, and the inputs follow the model's device
# so the cell works whether the model currently lives on the CPU or on a GPU.
with torch.no_grad():
    out = model(input_ids=d[0]['input_ids'].to(model.device),
                attention_mask=d[0]['attention_mask'].to(model.device),
                labels=d[1]['input_ids'].to(model.device))

# Most likely token at every position, decoded row by row.
list(map(en_tokenizer.decode, torch.argmax(out.logits, dim=-1)))

In [None]:
# The Hindi batch is padded, so pass its attention mask explicitly to keep the
# padded positions masked in the encoder; inputs follow the model's device.
with torch.no_grad():
    output = model.generate(
        input_ids=d[0]['input_ids'].to(model.device),
        attention_mask=d[0]['attention_mask'].to(model.device),
        decoder_start_token_id=en_tokenizer.cls_token_id,
    )

list(map(en_tokenizer.decode, output))

In [None]:
# model.eval()
# epoch_loss = 0


# # optimizer.zero_grad()
# out = model(input_ids=d[0]['input_ids'],
#                          attention_mask = d[0]['attention_mask'],
#                          labels = d[1]['input_ids'])

# prediction_scores = out.logits
# predictions = F.log_softmax(prediction_scores, dim=-1)
# loss = compute_loss(predictions, d[1]['input_ids'])
# epoch_loss += loss.item()

# print("Mean validation loss:", epoch_loss)


In [None]:
# list(map(en_tokenizer.decode, torch.argmax(predictions,dim=-1)))

In [None]:
## greedy decoding
model.eval()

src_ids = d[0]['input_ids'].to(model.device)
src_mask = d[0]['attention_mask'].to(model.device)

# The decoder batch has to match the encoder batch, so the size is derived from the
# inputs rather than hard-coded to a value that can disagree with `d`.
BS = src_ids.shape[0]
pred_words = torch.full((BS, 1), en_tokenizer.bos_token_id, dtype=torch.long, device=src_ids.device)
dec_out = pred_words

# 1 while a row is still being generated, 0 once it has emitted EOS.
unfinished_seq = torch.ones(BS, dtype=torch.long, device=src_ids.device)

with torch.no_grad():
    for i in range(20):
        # The partial output is passed as `labels`, the same calling convention as the
        # training cell. NOTE(review): confirm how the model turns `labels` into
        # decoder inputs before changing this to `decoder_input_ids`.
        output = model(input_ids=src_ids, attention_mask=src_mask, labels=dec_out)
        pred_words = torch.argmax(output.logits, dim=-1)[:, -1:]
        # Rows that already finished only receive padding from now on.
        pred_words[unfinished_seq == 0, :] = en_tokenizer.pad_token_id
        dec_out = torch.cat((dec_out, pred_words), dim=1)

        unfinished_seq[dec_out[:, -1] == en_tokenizer.eos_token_id] = 0
        if not unfinished_seq.any():
            # Every row has produced EOS; further steps would only append padding.
            break

list(map(en_tokenizer.decode, dec_out))

In [None]:
# Total number of scalar parameters in the model.
sum(map(torch.numel, model.parameters()))