In [1]:
!pip install transformers==4.12.5 sentencepiece==0.1.96 sacrebleu

In [2]:
# Clone the project repository (code + preprocessed data).
# SECURITY: a personal access token used to be embedded in this cell. That token must be
# treated as leaked and revoked. Credentials are now read from the GITHUB_TOKEN
# environment variable; when it is unset, an anonymous clone is attempted.
import os
import subprocess

_REPO_DIR = "amr-to-text-indonesia"
_REPO_LOCATION = "github.com/taufiqhusada/amr-to-text-indonesia.git"
_github_token = os.environ.get("GITHUB_TOKEN")
_clone_url = f"https://{_github_token}@{_REPO_LOCATION}" if _github_token else f"https://{_REPO_LOCATION}"

# Skip the clone when the checkout already exists, so the cell can be re-run safely
if not os.path.isdir(_REPO_DIR):
    _clone = subprocess.run(["git", "clone", _clone_url, _REPO_DIR], capture_output=True, text=True)
    if _clone.returncode != 0:
        # Never echo the credential back into the notebook output
        _clone_log = _clone.stderr.replace(_github_token, "***") if _github_token else _clone.stderr
        raise RuntimeError(f"git clone failed:\n{_clone_log}")
    # Do not leave the token stored in the checkout's remote URL
    subprocess.run(["git", "-C", _REPO_DIR, "remote", "set-url", "origin", f"https://{_REPO_LOCATION}"])

In [3]:
# Directory of the preprocessed AMR / sentence files inside the cloned repository.
# NOTE(review): no later cell reads this constant; the data-loading cell defines its own DATA_FOLDER.
PREPROCESSED_DATA_PATH = './amr-to-text-indonesia/data/preprocessed_data'

In [4]:
import pandas as pd
import os
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers.optimization import  AdamW, Adafactor 
import time
import warnings
from tqdm import tqdm
from sacrebleu import corpus_bleu
import random
import numpy as np
warnings.filterwarnings('ignore')

In [5]:
# Pick the compute device once: the first CUDA GPU when available, the CPU otherwise.
_gpu_available = torch.cuda.is_available()
device = torch.device("cuda:0" if _gpu_available else "cpu")
print("Running on the GPU" if _gpu_available else "Running on the CPU")

In [6]:
def set_seed(seed):
    """Seed every random number generator used by the notebook with `seed`.

    Covers Python's `random`, NumPy's global generator, and PyTorch via
    `torch.manual_seed` and `torch.cuda.manual_seed`.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seed_rng in seeders:
        seed_rng(seed)


# Fix the seed for the whole run
set_seed(42)

In [7]:
# Tokenizer and seq2seq model are loaded from the same pretrained checkpoint.
_checkpoint_name = "Wikidepia/IndoT5-base"
tokenizer = AutoTokenizer.from_pretrained(_checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(_checkpoint_name, return_dict=True)

# Move the model to the selected device (GPU/CPU); being the last expression,
# the returned module is also what the cell displays.
model.to(device)

In [8]:
# AMR relation labels; a later cell registers them with the tokenizer as additional special tokens.
AMR_TOKENS = [':ARG0',':ARG1',':mod',':time', ':name', ':location']
# Task prefix placed in front of every linearized AMR input (used by the collate function and by generate()).
T5_PREFIX = 'translate Graph to Indonesian: '

In [9]:
# Register the AMR relation labels as additional special tokens of the tokenizer.
new_tokens_vocab = {'additional_special_tokens': list(AMR_TOKENS)}

num_added_toks = tokenizer.add_special_tokens(new_tokens_vocab)
print(f'added {num_added_toks} tokens')

# Resize the model's token embeddings to the new tokenizer length
model.resize_token_embeddings(len(tokenizer))

In [10]:
from torch.utils.data import Dataset, DataLoader

# class to load preprocessed amr data
class AMRToTextDataset(Dataset):
    """Parallel dataset of linearized AMR inputs and their target sentences.

    Both files are read line by line and stripped; line i of the AMR file is
    paired with line i of the sentence file.

    Args:
        file_amr_path: text file with one AMR input per line.
        file_sent_path: text file with one target sentence per line.
        tokenizer: object exposing ``encode(text, add_special_tokens=False)``.
        split: name of the split; accepted for interface compatibility, not used.

    Attributes:
        data: pandas DataFrame with the string columns ``amr`` and ``sent``.
        tokenizer: the tokenizer passed to the constructor.
    """

    def __init__(self, file_amr_path, file_sent_path, tokenizer, split):
        list_amr_input = self._read_stripped_lines(file_amr_path)
        list_sent_output = self._read_stripped_lines(file_sent_path)

        # zip stops at the shorter file: surplus trailing lines of the longer one are dropped
        self.data = pd.DataFrame(list(zip(list_amr_input, list_sent_output)), columns=['amr', 'sent'])
        self.tokenizer = tokenizer

    @staticmethod
    def _read_stripped_lines(path):
        """Return every line of `path` with leading/trailing whitespace removed."""
        # Explicit UTF-8 so the result does not depend on the platform's default encoding
        with open(path, encoding='utf-8') as f:
            return [line.strip() for line in f]

    def __getitem__(self, index):
        """Return the example at position `index`.

        Returns:
            dict of the form
            ``{'input': {'encoded': [...], 'raw': str}, 'output': {'encoded': [...], 'raw': str}}``
            where ``encoded`` holds the token ids produced by the tokenizer
            without special tokens.

        Raises:
            IndexError: if `index` is out of range.
        """
        # Positional lookup raises IndexError (not KeyError) past the end, which is what
        # Python's sequence-iteration protocol needs for `for item in dataset` to stop.
        row = self.data.iloc[index]
        amr, sent = row['amr'], row['sent']

        return {
            'input': {
                'encoded': self.tokenizer.encode(amr, add_special_tokens=False),
                'raw': amr,
            },
            'output': {
                'encoded': self.tokenizer.encode(sent, add_special_tokens=False),
                'raw': sent,
            },
        }

    def __len__(self):
        """Number of (AMR, sentence) pairs."""
        return len(self.data)
    
## Data loader class
class AMRToTextDataLoader(DataLoader):
    """DataLoader that pads and batches AMR-to-text examples.

    Args:
        max_seq_len: upper bound for the encoder and decoder sequence lengths.
        label_pad_token_id: value used to fill unused label positions.
        model_type: when equal to 'indo-t5', batches are built by `_t5_collate_fn`;
            for any other value the collate function chosen by DataLoader is kept.
        tokenizer: provides `pad_token_id`, `eos_token_id` and `encode`; required
            when model_type is 'indo-t5'.
        *args, **kwargs: forwarded unchanged to `torch.utils.data.DataLoader`.

    Raises:
        ValueError: if model_type is 'indo-t5' and no tokenizer is given.
    """

    def __init__(self, max_seq_len=384, label_pad_token_id=-100, model_type='indo-t5', tokenizer=None, *args, **kwargs):
        super(AMRToTextDataLoader, self).__init__(*args, **kwargs)
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.label_pad_token_id = label_pad_token_id

        # tokenizer defaults to None: keep the ids as None in that case instead of
        # failing with an AttributeError on None
        self.pad_token_id = tokenizer.pad_token_id if tokenizer is not None else None
        # The decoder input starts with the pad token id (presumably following T5's
        # decoder-start convention - confirm against the model config)
        self.bos_token_id = self.pad_token_id
        self.eos_token_id = tokenizer.eos_token_id if tokenizer is not None else None

        if model_type == 'indo-t5':
            if tokenizer is None:
                raise ValueError("a tokenizer is required when model_type is 'indo-t5'")
            # Token ids of the task prefix, prepended to every encoder input
            self.t5_prefix = np.array(self.tokenizer.encode(T5_PREFIX, add_special_tokens=False))
            self.collate_fn = self._t5_collate_fn

    def _t5_collate_fn(self, batch):
        """Pad a list of dataset items into fixed-size numpy arrays.

        Each item must provide ``item['input']['encoded']`` and
        ``item['output']['encoded']`` (sequences of token ids).

        Returns:
            A 5-tuple ``(enc_batch, dec_batch, enc_mask_batch, None, label_batch)``:
            - enc_batch (int64): task prefix + input ids, padded with pad_token_id;
            - dec_batch (int64): bos_token_id + label ids, padded with pad_token_id;
            - enc_mask_batch (float32): 1 on prefix/input positions, 0 on padding;
            - None: placeholder in the decoder-mask slot (no decoder mask is returned);
            - label_batch (int64): label ids + eos_token_id, padded with label_pad_token_id.
            Inputs and labels are truncated so that neither side exceeds max_seq_len.
        """
        batch_size = len(batch)
        prefix_len = len(self.t5_prefix)
        longest_input = max(len(item['input']['encoded']) for item in batch)
        longest_label = max(len(item['output']['encoded']) for item in batch)
        max_enc_len = min(self.max_seq_len, longest_input + prefix_len)
        # +1 leaves room for the start token (decoder input) / EOS token (labels)
        max_dec_len = min(self.max_seq_len, longest_label + 1)

        enc_batch = np.full((batch_size, max_enc_len), self.pad_token_id, dtype=np.int64)
        dec_batch = np.full((batch_size, max_dec_len), self.pad_token_id, dtype=np.int64)
        label_batch = np.full((batch_size, max_dec_len), self.label_pad_token_id, dtype=np.int64)
        enc_mask_batch = np.full((batch_size, max_enc_len), 0, dtype=np.float32)

        for i, item in enumerate(batch):
            input_seq = item['input']['encoded'][:max_enc_len - prefix_len]
            label_seq = item['output']['encoded'][:max_dec_len - 1]
            enc_end = prefix_len + len(input_seq)

            # Encoder: prefix followed by the (possibly truncated) input
            enc_batch[i, :prefix_len] = self.t5_prefix
            enc_batch[i, prefix_len:enc_end] = input_seq
            enc_mask_batch[i, :enc_end] = 1

            # Decoder input: start token followed by the labels
            dec_batch[i, 0] = self.bos_token_id
            dec_batch[i, 1:1 + len(label_seq)] = label_seq

            # Labels: the label ids terminated by EOS
            label_batch[i, :len(label_seq)] = label_seq
            label_batch[i, len(label_seq)] = self.eos_token_id

        return enc_batch, dec_batch, enc_mask_batch, None, label_batch

In [11]:
# Folder of the preprocessed splits inside the cloned repository
DATA_FOLDER = './amr-to-text-indonesia/data/preprocessed_data/'

# One (AMR file, sentence file) pair per split
train_amr_path, train_sent_path = (os.path.join(DATA_FOLDER, name) for name in ('train.amr.txt', 'train.sent.txt'))
dev_amr_path, dev_sent_path = (os.path.join(DATA_FOLDER, name) for name in ('dev.amr.txt', 'dev.sent.txt'))
test_amr_path, test_sent_path = (os.path.join(DATA_FOLDER, name) for name in ('test.amr.txt', 'test.sent.txt'))

train_dataset = AMRToTextDataset(train_amr_path, train_sent_path, tokenizer, 'train')
dev_dataset = AMRToTextDataset(dev_amr_path, dev_sent_path, tokenizer, 'dev')
test_dataset = AMRToTextDataset(test_amr_path, test_sent_path, tokenizer, 'test')

# Batching configuration shared by both loaders
model_type = 'indo-t5'
max_seq_len = 384
batch_size = 4

# NOTE(review): the training loader is not shuffled either - confirm this is intentional.
_loader_options = dict(model_type=model_type, tokenizer=tokenizer, max_seq_len=max_seq_len,
                       batch_size=batch_size, shuffle=False)
train_loader = AMRToTextDataLoader(dataset=train_dataset, **_loader_options)
test_loader = AMRToTextDataLoader(dataset=test_dataset, **_loader_options)

In [12]:
# Number of examples in the train / dev / test splits, one per line
for split_dataset in (train_dataset, dev_dataset, test_dataset):
    print(len(split_dataset))

# Number of training batches
print(len(train_loader))

In [13]:
# AdamW over all model parameters with a fixed learning rate
_learning_rate = 3e-5
_adam_epsilon = 1e-8
optimizer = AdamW(model.parameters(), lr=_learning_rate, eps=_adam_epsilon)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    return None if first_group is None else first_group['lr']

# Number of passes over the training loader in the training cell
n_epochs = 4
# Beam width passed to model.generate() during evaluation and in generate()
num_beams = 5

In [14]:
# train
# Fine-tune the model for n_epochs passes over train_loader, showing the running
# mean loss and the current learning rate in the progress bar.
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)

    total_train_loss = 0

    train_pbar = tqdm(train_loader, leave=True)
    for i, batch_data in enumerate(train_pbar):
        # The collate function yields (encoder ids, decoder ids, encoder mask, None, labels)
        # as numpy arrays. Tensors go to the device selected at the top of the notebook
        # (not an unconditional .cuda()), so the loop also works on a CPU-only machine.
        enc_batch = torch.LongTensor(batch_data[0]).to(device)
        dec_batch = torch.LongTensor(batch_data[1]).to(device)
        enc_mask_batch = torch.FloatTensor(batch_data[2]).to(device)
        label_batch = torch.LongTensor(batch_data[4]).to(device)

        outputs = model(input_ids=enc_batch, attention_mask=enc_mask_batch, decoder_input_ids=dec_batch,
                        decoder_attention_mask=None, labels=label_batch)
        # With labels supplied, the first output element is the loss
        loss = outputs[0]

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
                total_train_loss/(i+1), get_lr(optimizer)))

In [15]:
# test on data test
# Generate a hypothesis for every test example with beam search and collect the
# decoded hypotheses (list_hyp) next to the decoded references (list_label).

model.eval()

list_hyp, list_label = [], []

# Scope gradient disabling to this block instead of switching it off globally,
# so later cells do not inherit a hidden "no grad" state.
with torch.no_grad():
    pbar = tqdm(test_loader, leave=True)
    for batch_data in pbar:
        # Use the device selected at the top of the notebook rather than .cuda(),
        # so evaluation also works on a CPU-only machine.
        enc_batch = torch.LongTensor(batch_data[0]).to(device)
        enc_mask_batch = torch.FloatTensor(batch_data[2]).to(device)
        # Labels are only decoded back to text, so they can stay on the CPU
        label_batch = torch.LongTensor(batch_data[4])

        hyps = model.generate(input_ids=enc_batch, attention_mask=enc_mask_batch, num_beams=num_beams, max_length=max_seq_len,
                              early_stopping=True, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)

        for hyp, label in zip(hyps, label_batch):
            list_hyp.append(tokenizer.decode(hyp, skip_special_tokens=True))
            # Drop the label padding value before decoding the reference
            label = label[label != test_loader.label_pad_token_id]
            list_label.append(tokenizer.decode(label, skip_special_tokens=True))

In [16]:
# Show every generated sentence next to its reference
for position, hypothesis in enumerate(list_hyp):
    print(hypothesis, '----', list_label[position])

In [17]:
## save model
# Only the state dict (parameters/buffers) is written, not the model object itself.
torch.save(model.state_dict(), "4epoch_t5_fixed.th")

In [18]:
def _write_lines(path, lines):
    """Write `lines` to `path`, one entry per line, without a trailing newline."""
    # Explicit UTF-8 so the files do not depend on the platform's default encoding
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))


## save generated outputs
_write_lines('test_generations.txt', list_hyp)

## save label
_write_lines('test_label.txt', list_label)

In [19]:
## BLEU SCORE
from sacrebleu import corpus_bleu

# Corpus-level BLEU of the test hypotheses; the references are passed as a list
# containing a single reference stream.
bleu = corpus_bleu(list_hyp, [list_label])
print(bleu.score)

In [20]:
def generate(text):
    """Generate a sentence for one linearized AMR string.

    The task prefix is prepended to `text`, the result is encoded without special
    tokens, and the model decodes with beam search using the same settings as the
    test-set evaluation (num_beams, max_length=max_seq_len, early stopping).

    Args:
        text: linearized AMR graph.

    Returns:
        The decoded output string with special tokens removed.
    """
    model.eval()
    input_ids = tokenizer.encode(f"{T5_PREFIX}{text}", return_tensors="pt", add_special_tokens=False)  # Batch size 1
    input_ids = input_ids.to(device)
    # Without an explicit max_length, generate() falls back to the model config /
    # library default (20 tokens unless the checkpoint overrides it), which truncates
    # longer sentences and disagrees with the evaluation loop.
    outputs = model.generate(input_ids, num_beams=num_beams, max_length=max_seq_len, early_stopping=True)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [21]:
# Try the fine-tuned model on a hand-written linearized AMR.
# NOTE(review): the sample contains one more ')' than '(' - confirm whether that is intended.
generate("( ketik :ARG0 ( saya ) :ARG1 ( makalah ) ) )")

In [22]:
# Recompute the corpus BLEU on shallow copies of the hypothesis and reference lists
temp_list_hyp = list(list_hyp)
temp_list_label = list(list_label)
bleu = corpus_bleu(temp_list_hyp, [temp_list_label])
print(bleu.score)