In [1]:
import numpy as np
import torch
from torch import nn
from transformers import AlbertModel, AlbertTokenizer
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from datasets import load_dataset, DatasetDict
from tqdm import tqdm
from torch.utils.data import DataLoader

## Load Data

In [3]:
# Read the three CSV splits. Every file is requested with split='train',
# which matches the "Generating train split" message printed for each file.
_csv_files = {
    "train": '/kaggle/input/myfiles/team16_ta_train.csv',
    "validation": '/kaggle/input/myfiles/team16_ta_valid.csv',
    "test": '/kaggle/input/myfiles/team16_ta_test.csv',
}

# Bundle the splits in a DatasetDict, then expose each one under its own name.
dataset = DatasetDict({
    split_name: load_dataset('csv', data_files=csv_path, split='train')
    for split_name, csv_path in _csv_files.items()
})
train_dataset = dataset["train"]
validation_dataset = dataset["validation"]
test_dataset = dataset["test"]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

## Output Embeddings

In [4]:
# Load the tokenizer and the encoder from the same local checkpoint directory
# so that token ids produced by `tokenizer` line up with `model`'s embeddings.
tokenizer = AlbertTokenizer.from_pretrained('/kaggle/input/ai4bharat-indic-bert')
model = AlbertModel.from_pretrained('/kaggle/input/ai4bharat-indic-bert')

def tokenize_ta(text):
    """Tokenize target-language text with the module-level `tokenizer`.

    `text` is forwarded as-is to the tokenizer with padding enabled and
    PyTorch tensors requested; the tokenizer's result is returned unchanged
    (callers in this notebook read its 'input_ids' entry).
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True)
    return encoded

def embedding_ta(tokens:dict|torch.Tensor, model=model):
    """Return `model`'s last hidden state for already-tokenized input.

    Args:
        tokens: either a tensor, passed to the model positionally, or a
            mapping of keyword inputs, unpacked into the model call.
        model: encoder to run; defaults to the module-level `model` that
            exists when this function is defined.

    Returns:
        The `last_hidden_state` attribute of the model output. The forward
        pass runs under `torch.no_grad()`, so no gradients are tracked.
    """
    with torch.no_grad():
        output = model(tokens) if isinstance(tokens, torch.Tensor) else model(**tokens)
    return output.last_hidden_state

  return self.fget.__get__(instance, owner)()


## Building Vocabulary for Target Language

In [5]:
def build_vocab_ta():
    """Collect the target-side token ids that occur more than once.

    Every sentence in dataset['train']['target'] is tokenized with
    `tokenize_ta` and the ids of its first (only) row are counted.

    Returns:
        A 1-D tensor of the kept token ids in ascending order.
    """
    # Id 0 starts at a count of 3, so it always passes the "> 1" filter below
    # (presumably the padding id -- TODO confirm against the tokenizer).
    counts = {0: 3}
    for sentence in tqdm(dataset['train']['target']):
        for token in tokenize_ta(sentence)['input_ids'][0]:
            token_id = token.item()
            counts[token_id] = counts.get(token_id, 0) + 1

    # Ids seen only once are dropped from the vocabulary.
    kept_ids = {token_id for token_id, count in counts.items() if count > 1}
    return torch.tensor(np.sort(list(kept_ids)))

# Sorted tensor of target token ids; an id's position in this tensor is used
# later as its class index (see the decoder's argmax lookups).
vocab_ta = build_vocab_ta()

100%|██████████| 70000/70000 [00:49<00:00, 1426.69it/s]


In [10]:
def find_neighbors(tensor, input_number):
    """Return the entries of a sorted tensor that surround `input_number`.

    Args:
        tensor: 1-D tensor (anything with `.tolist()`) sorted in ascending
            order, e.g. `vocab_ta`.
        input_number: value to locate.

    Returns:
        A list with up to two entries before the insertion point of
        `input_number` and up to two entries from the insertion point on
        (the first of which equals `input_number` when it is present).
        The window is clipped at both ends; an empty tensor gives `[]`.
    """
    from bisect import bisect_left

    sorted_list = tensor.tolist()
    # bisect_left gives the index of the first element >= input_number, or
    # len(sorted_list) when every element is smaller. The previous linear scan
    # left the position at 0 in that case and returned the *first* entries.
    position = bisect_left(sorted_list, input_number)
    return sorted_list[max(0, position - 2):min(len(sorted_list), position + 2)]


In [6]:
# Tokenize one sample sentence and display its token ids.
sentence = 'என் நினைவுகள் உதிர்ந்துவிட்டன.'
tks = tokenize_ta(sentence)
tks['input_ids']

tensor([[     2,    396,  13621,  18248, 120345,  13143,    330,   7388, 140713,
              9,      3]])

In [18]:
# Look up which vocabulary ids sit next to token id 120345.
find_neighbors(vocab_ta, 120345)

[120237, 120258, 120345, 120382]

In [19]:
# Hand-written id sequence; compared with the ids displayed for `tks`,
# three entries differ (398, 120382, 326). Decode it to inspect the text.
rks = torch.tensor([    2,    398,  13621,  18248, 120382,  13143,    326,   7388, 140713,  9,      3])
tokenizer.decode(rks)

"[CLS]'நினைவுகள சீதாநdவிடடன.[SEP]"

In [10]:
def build_map(vocab):
    """Pre-compute one embedding per vocabulary token.

    Each token id in `vocab` is fed to the module-level `model` on its own,
    as a (1, 1) `input_ids` tensor with token type 0 and attention mask 1.

    Args:
        vocab: iterable of scalar tensors holding token ids.

    Returns:
        dict mapping token id (int) -> the model's `last_hidden_state` for
        that single-token input.
    """
    # These two inputs are the same for every token, so build them once.
    token_type_ids = torch.tensor(0).reshape(1,1)
    attention_mask = torch.tensor(1).reshape(1,1)

    vocab_map = {}
    with torch.no_grad():
        for token in tqdm(vocab):
            token_id = token.item()
            result = model(
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                input_ids=torch.tensor(token_id).reshape(1,1),
            )
            vocab_map[token_id] = result.last_hidden_state

    return vocab_map

# dict with each token as key and respective embedding as value
map_ta = build_map(vocab_ta)
# Replace the model-computed embedding of token id 0 with an all-zero
# (1, 1, 768) tensor.
map_ta[0] = torch.zeros(1, 1, 768)

100%|██████████| 8074/8074 [03:59<00:00, 33.68it/s]


## Seq2Seq Model

In [19]:
class Encoder(nn.Module):
    """Single-layer LSTM encoder that summarises a source sequence.

    Args:
        input_dim: size of each source embedding vector.
        hidden_dim: size of the LSTM hidden and cell states.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.input_dim, self.hidden_dim = input_dim, hidden_dim
        self.rnn = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            dtype=torch.float32,
        )

    def forward(self, input_seq):
        """Encode source embeddings and return the final LSTM state.

        Args:
            input_seq: source embeddings, (N, seq_len, em_size) or
                (seq_len, em_size).

        Returns:
            The `(h_n, c_n)` tuple produced by the LSTM; the per-step
            outputs are discarded.
        """
        _, final_state = self.rnn(input_seq)
        return final_state
    
class Decoder(nn.Module):
    """Single-layer LSTM decoder that scores the next target token.

    Args:
        input_dim: size of each target-token embedding fed to the LSTM.
        hidden_dim: size of the LSTM hidden and cell states.
        output_dim: number of target classes (vocabulary size).

    Attributes:
        softmax: softmax over the class (last) axis. It is NOT applied in
            `forward`; call `decoder.softmax(prediction)` when normalised
            probabilities are needed.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.rnn = nn.LSTM(input_dim, hidden_dim, batch_first=True, dtype=torch.float32)
        self.linear = nn.Linear(hidden_dim, output_dim, dtype=torch.float32)
        # Normalise over the class axis. The former dim=1 was the sequence
        # axis, which has length 1 for step-wise decoding, so every value
        # came out as exactly 1.0.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, y, prev_state):
        """Run one decoding step.

        Args:
            y: embedding(s) of the previous token, (N, 1, em_size) when
                batched.
            prev_state: `(h, c)` LSTM state to continue from, or None to
                start from zeros.

        Returns:
            (prediction, curr_state): `prediction` holds unnormalised class
            scores (logits) with the same leading dimensions as the LSTM
            output and `output_dim` as last dimension; `curr_state` is the
            updated `(h, c)` tuple.
        """
        output, curr_state = self.rnn(y, prev_state)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself,
        # and argmax-based decoding is unaffected by the normalisation.
        prediction = self.linear(output)
        return prediction, curr_state

In [20]:
class Seq2SeqModel(nn.Module):
    """Encoder-decoder translation model over pre-computed embeddings.

    The encoder consumes source embeddings; the decoder is fed target-token
    embeddings looked up in the module-level `map_ta` and emits one score
    vector over `vocab_ta` per step.

    Args:
        src_em_dim: size of each source embedding vector.
        hidden_dim: LSTM state size shared by encoder and decoder.
        tgt_em_dim: size of each target-token embedding in `map_ta`.
        tgt_dim: number of target classes (length of `vocab_ta`).
    """

    def __init__(self, src_em_dim, hidden_dim, tgt_em_dim, tgt_dim):
        super().__init__()
        self.src_em_dim = src_em_dim
        self.hidden_dim = hidden_dim
        self.tgt_em_dim = tgt_em_dim
        self.tgt_dim = tgt_dim
        self.encoder = Encoder(src_em_dim, hidden_dim)
        self.decoder = Decoder(tgt_em_dim, hidden_dim, tgt_dim)

    def forward(self, source, target=None):
        """Decode a target sequence for each source sequence.

        Args:
            source: tensor of source-token embeddings, batch first.
            target: optional tensor of target token ids, (N, seq_len). Its
                length fixes the number of decoding steps; without it the
                decoder runs for 999 steps.

        Returns:
            Tensor of per-step decoder outputs concatenated along dim 1,
            i.e. (N, steps, tgt_dim) with steps = target length - 1, moved
            to the module-level `device`.
        """
        batch_size = source.shape[0]
        target_len = 1000 if target is None else target.shape[1]
        # Teacher forcing needs reference tokens. Without this guard a model
        # left in training mode crashed on `target[:, t]` when target is None.
        teacher_forcing = self.training and target is not None

        prev_state = self.encoder(source)

        # Embedding of token id 2 opens every sequence and doubles as the
        # fallback for ids missing from `map_ta`.
        start_embedding = map_ta[2]
        # `map_ta` tensors are created on the CPU; move every decoder input
        # to the device the source (and therefore the model) lives on.
        # Shape: (N, 1, em_size).
        decoder_input = torch.tile(start_embedding, (batch_size, 1, 1)).to(source.device)

        outputs = []
        for t in range(1, target_len):
            decoder_output, prev_state = self.decoder(decoder_input, prev_state)
            outputs.append(decoder_output)

            if teacher_forcing:
                # Feed the reference token of this step.
                next_tokens = target[:, t].tolist()
            else:
                # Feed the model's own best guess: class index -> token id.
                class_idx = decoder_output.argmax(dim=-1).reshape(-1).cpu()
                next_tokens = vocab_ta[class_idx].tolist()

            decoder_input = torch.concat(
                [map_ta.get(tk, start_embedding) for tk in next_tokens]
            ).to(source.device)

        # NOTE: `device` is a module-level name defined outside this class.
        return torch.concat(outputs, dim=1).to(device)

## Training

In [21]:
# --- Hyper-parameters ---
num_epochs, learning_rate, batch_size = 5, 0.001, 64

# --- Dimensions ---
en_vocab_size = 0  # unused: no English vocabulary is built in this notebook
en_embedding_size = input_dim
ta_vocab_size = len(vocab_ta)
ta_embedding_size = 768

hidden_dim = en_embedding_size * 2

# --- Model, optimizer, loss ---
machine = Seq2SeqModel(
    src_em_dim=en_embedding_size,
    hidden_dim=hidden_dim,
    tgt_em_dim=ta_embedding_size,
    tgt_dim=ta_vocab_size,
)
# Moving the parent module also moves its encoder and decoder sub-modules.
machine.to(device)

optimizer = torch.optim.Adam(machine.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# --- Dataset output format (same setting for every split) ---
for _split in (train_dataset, validation_dataset, test_dataset):
    _split.set_format(type='torch', columns=['source', 'target'])

def collate_fn(example_list: list):
    """Turn a list of dataset rows into one training batch.

    Args:
        example_list: rows with 'source' and 'target' entries.

    Returns:
        (source_tensor, target_tensor): the sources run through
        `tokenize_en` then `embedding_en`, and the 'input_ids' of the
        targets tokenized with `tokenize_ta`.
    """
    sources, targets = [], []
    for example in example_list:
        sources.append(example['source'])
        targets.append(example['target'])

    return embedding_en(tokenize_en(sources)), tokenize_ta(targets)['input_ids']


In [22]:
def _sequence_loss(outputs, target):
    """Cross-entropy between decoder scores and the reference tokens.

    Args:
        outputs: decoder scores, (N, T-1, num_classes).
        target: reference token ids, (N, T); position 0 (the start token) is
            skipped because the decoder never predicts it.

    Returns:
        Scalar loss tensor averaged over all N * (T-1) positions.
    """
    class_ids = token_to_id(target[:, 1:]).to(outputs.device)
    # nn.CrossEntropyLoss reads dim 1 of a 3-D input as the class axis, so
    # passing (N, T-1, C) directly scored the time axis as classes. Flatten
    # to (N*(T-1), C) scores against (N*(T-1),) class indices instead; this
    # also avoids materialising a large one-hot tensor.
    return criterion(outputs.reshape(-1, machine.tgt_dim), class_ids.reshape(-1))


# The loaders are stateless between epochs (shuffle=True reshuffles on every
# pass), so build them once. Validation order does not affect the mean loss.
train_iterator = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_iterator = DataLoader(dataset=validation_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

for epoch in range(num_epochs):

    # ---- training pass (teacher forcing is active in train mode) ----
    machine.train()
    tqdm_iterator = tqdm(train_iterator)
    for source, target in tqdm_iterator:
        optimizer.zero_grad()
        outputs = machine(source, target)
        loss = _sequence_loss(outputs, target)
        loss.backward()
        optimizer.step()

    # ---- validation pass ----
    machine.eval()
    total_loss = 0.0
    # no_grad + .item(): do not build or retain autograd graphs while
    # evaluating; accumulating loss tensors kept every batch's graph alive.
    with torch.no_grad():
        for source, target in tqdm(valid_iterator):
            outputs = machine(source, target)
            total_loss += _sequence_loss(outputs, target).item()

    # Each batch loss is already a mean, so average over batches, not examples.
    avg_loss = total_loss / max(len(valid_iterator), 1)
    print(f'Epoch [{epoch+1}] ---------------------------------------------- Loss: {avg_loss}')

100%|██████████| 1094/1094 [45:49<00:00,  2.51s/it] 

  9%|▊         | 27/313 [01:18<11:02,  2.32s/it]

## Testing

In [None]:
# Switch to evaluation mode; Seq2SeqModel.forward then feeds the decoder its
# own predictions instead of the reference tokens.
machine.eval()

Seq2SeqModel(
  (encoder): Encoder(
    (rnn): LSTM(50, 100, batch_first=True)
  )
  (decoder): Decoder(
    (rnn): LSTM(768, 100, batch_first=True)
    (fc): Linear(in_features=100, out_features=8074, bias=True)
  )
)

In [None]:
def decode_sentences(outputs):
    """Convert decoder score tensors into decoded strings.

    Args:
        outputs: scores of shape (N, seq_len, vocab_size), or a single
            (seq_len, vocab_size) matrix, which is treated as a batch of one.

    Returns:
        List with one decoded string per sequence. For every step the
        highest-scoring class index is mapped to a token id through
        `vocab_ta`; decoding of a sequence stops at the first token id 0,
        which is not included.
    """
    if outputs.dim() == 2:
        outputs = outputs.unsqueeze(dim=0)

    decoded = []
    for step_scores in outputs:
        token_ids = []
        for scores in step_scores:
            token_id = vocab_ta[torch.argmax(scores)].item()
            if token_id == 0:
                break
            token_ids.append(token_id)
        decoded.append(tokenizer.decode(token_ids))

    return decoded

In [None]:
# Translate one training example and show it next to its reference.
idx = 5

en_sentence = embedding_en(tokenize_en(dataset['train']['source'][idx]))
ta_sentence = dataset['train']['target'][idx]

# No target is passed, so the model decodes freely for its maximum length.
machine.eval()
outputs = machine(en_sentence)

ta_sentence, decode_sentences(outputs)

('எனக்கு ஒன்றும் தோன்ற வில்லை.', ['கக'])

In [None]:
# NOTE(review): the loop body is empty, so this cell only walks through
# test_dataset (showing iteration speed); it computes nothing.
for source, target in tqdm(test_dataset):
    pass

100%|██████████| 10000/10000 [00:01<00:00, 8992.33it/s]


## BLEU Scores

In [None]:
def compute_bleu_scores():
    """Translate the test split and score it with corpus-level BLEU.

    Uses the module-level `machine`, `test_dataset`, `batch_size`,
    `embedding_en`, `tokenize_en` and `decode_sentences`.

    Returns:
        dict with keys 'BLEU@1' .. 'BLEU@4' holding the corpus BLEU score
        for uniform weights over 1- to 4-grams respectively.
    """

    def collate_for_bleu(example_list: list):
        """Embed the sources; keep the targets as raw reference strings."""
        source_list = [example['source'] for example in example_list]
        target_list = [example['target'] for example in example_list]

        source_tensor = embedding_en(tokenize_en(source_list))

        return source_tensor, target_list

    generated_sentences = []
    reference_sentences = []

    # Order is irrelevant for corpus BLEU, so no shuffling is needed.
    iterator = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_for_bleu)

    # Decode in eval mode without autograd bookkeeping, then restore the
    # mode the model was in.
    was_training = machine.training
    machine.eval()
    try:
        with torch.no_grad():
            for source, target in tqdm(iterator):
                # One entry per SENTENCE: each hypothesis is paired with a
                # list holding its single reference. Appending the whole
                # batch at once handed lists (not strings) to word_tokenize.
                reference_sentences.extend([ref] for ref in target)
                generated_sentences.extend(decode_sentences(machine(source)))
    finally:
        machine.train(was_training)

    reference_tokens = [[word_tokenize(ref) for ref in refs] for refs in reference_sentences]
    generated_tokens = [word_tokenize(gen) for gen in generated_sentences]

    corpus_bleu_score_1 = corpus_bleu(reference_tokens, generated_tokens, (1, 0, 0, 0))
    corpus_bleu_score_2 = corpus_bleu(reference_tokens, generated_tokens, (1/2, 1/2, 0, 0))
    corpus_bleu_score_3 = corpus_bleu(reference_tokens, generated_tokens, (1/3, 1/3, 1/3, 0))
    corpus_bleu_score_4 = corpus_bleu(reference_tokens, generated_tokens, (1/4, 1/4, 1/4, 1/4))

    return {
        'BLEU@1': corpus_bleu_score_1,
        'BLEU@2': corpus_bleu_score_2,
        'BLEU@3': corpus_bleu_score_3,
        'BLEU@4': corpus_bleu_score_4
    }

# Score the whole test split and display the resulting BLEU table.
bleu_scores = compute_bleu_scores()
bleu_scores

  4%|▍         | 7/157 [03:02<1:05:06, 26.05s/it]


KeyboardInterrupt: 