In [2]:
!cp -r /kaggle/input/llm-qald-9 .

In [3]:
%cd /kaggle/working/llm-qald-9

/kaggle/working/llm-qald-9


In [7]:
%%writefile run.py
# Read the data first
import re
import json

def preprocess_amr_data(data):
    amr_complete = []
    text_complete = []

    for entry in data.keys():
        amr_graph = data[entry]['amr']
        target_text = data[entry]['text']
        model_input = amr_graph.strip()
        model_output = target_text.strip()
        
        amr_complete.append(model_input)
        text_complete.append(model_output)

    return amr_complete, text_complete

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os

from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, get_scheduler, MBartTokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm import tqdm


# Load AMR data from a file
with open('qald-9-amr-test.txt', 'r', encoding='utf-8') as f:
    amr_data_test = f.read()
with open('qald-9-amr-train.txt','r',encoding='utf-8') as f:
    amr_data_train=f.read()

# with open('massive_amr_test.txt', 'r', encoding='utf-8') as f:
#     amr_data_test = f.read()
# with open('massive_amr.txt','r',encoding='utf-8') as f:
#     amr_data_train=f.read()

import re
def split_amr_strings(amr_text):
    amr_pattern = r"(# ::snt .+?)(?=\n# ::snt|\Z)"
    amr_blocks = re.findall(amr_pattern, amr_text, flags=re.DOTALL)
    return amr_blocks

def create_amr_dict(amr_text):
    amr_strings = split_amr_strings(amr_text)
    amr_dict = {}
    amr_list = []
    text_list = []
    for idx, amr_string in enumerate(amr_strings):
        # Split each block into the sentence part and the AMR part
        snt_part = amr_string.split('\n', 1)[0].strip()  # First line is the sentence part
        text_part = re.sub('# ::snt ','',snt_part)
        amr_part = amr_string[len(snt_part):].strip()  # Rest is the AMR part
        
        # Add to dictionary
        amr_dict[idx] = {
            "amr": amr_part,
            "text": text_part
        }
        amr_list.append(amr_part)
        text_list.append(text_part)
    
    return amr_dict, amr_list, text_list

amr_data_train, _, __ = create_amr_dict(amr_data_train)
amr_data_test, amr_test, text_test = create_amr_dict(amr_data_test)

# Read the data first
import re
import json

def preprocess_amr_data(data):
    amr_complete = []
    text_complete = []

    for entry in data.keys():
        amr_graph = data[entry]['amr']
        target_text = data[entry]['text']
        model_input = amr_graph.strip()
        model_output = target_text.strip()
        
        amr_complete.append(model_input)
        text_complete.append(model_output)

    return amr_complete, text_complete

class QALD_9_AMRDataset(Dataset):
    def __init__(self, data, tokenizer):
     
        #train_amr, train_text = preprocess_amr_data(train)
        amr, text = preprocess_amr_data(data)
        self.amr_len = len(amr)
        self.text_len = len(text)
        prefixed_amr = ["translate AMR to text: " + x for x in amr]

        self.amr = tokenizer(amr, truncation=True, padding='max_length', max_length=384, return_tensors = 'pt')
        #print(self.amr)
        self.text = tokenizer(text, truncation=True, padding='max_length', max_length=384, return_tensors = 'pt')
  
    def __len__(self): # returns the total number of samples in the dataset
        assert self.amr_len == self.text_len
        return self.amr_len
        
    def __getitem__(self, idx):
        return {'amr': self.amr['input_ids'][idx], 
                'att_mask': self.amr['attention_mask'][idx],
                'text': self.text['input_ids'][idx]
               }

def ddp_setup(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    torch.cuda.set_device(rank)
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    
class Trainer:
    def __init__(
        self,
        model: torch.nn.Module,
        train_data: DataLoader,
        val_data: DataLoader,
        tokenizer: BartTokenizer,
        optimizer: torch.optim.Optimizer,
        gpu_id: int,
        save_every: int,
    ) -> None:
        self.gpu_id = gpu_id
        self.model = model.to(gpu_id)
        self.train_data = train_data
        self.val_data = val_data
        self.tokenizer = tokenizer
        self.optimizer = optimizer
        self.save_every = save_every
        self.model = DDP(model, device_ids=[gpu_id])

    def _run_batch(self, input_ids, attention_mask, labels, run_type='train', eval_loss=None):
        if run_type == 'train':
            self.optimizer.zero_grad()
            output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = output.loss
            loss.backward()

            self.optimizer.step()
            return output, eval_loss
        elif run_type == 'validate':
            with torch.no_grad():
                output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                eval_loss += output.loss.item()
            return output, eval_loss

    def _run_epoch(self, epoch, epoch_type='train'):
        
        if epoch_type == 'train':
            print("------ Training! ------")
            data = self.train_data
            eval_loss = None
            print(type(self.train_data), len(self.train_data))

        elif epoch_type == 'validate':
            print("------ Validating! ------")
            data = self.val_data
            eval_loss = 0

        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Steps: {len(data)}")
        data.sampler.set_epoch(epoch)

        for batch_idx, batch in tqdm(enumerate(data), total=len(data)):
            input_ids = batch['amr'].to(self.gpu_id)
            attention_mask = batch['att_mask'].to(self.gpu_id)
            labels = batch['text'].to(self.gpu_id)

            if epoch_type == 'train':
                _, __ = self._run_batch(input_ids=input_ids, attention_mask=attention_mask, labels=labels, eval_loss=eval_loss, run_type='train')
            elif epoch_type == 'validate':
                _, eval_loss = self._run_batch(input_ids=input_ids, attention_mask=attention_mask, labels=labels, eval_loss=eval_loss, run_type='validate')
        if epoch_type == 'validate':
            print(f"Epoch {epoch+1}: Evaluation Loss = {eval_loss / len(self.val_data)}")

    def _save_checkpoint(self, epoch):
        ckp = self.model.module.state_dict()
        PATH = "."
        #         torch.save(ckp, PATH)
        self.model.module.save_pretrained(PATH)
        self.tokenizer.save_pretrained(PATH)

        print(f"Epoch {epoch} | Training checkpoint saved at {PATH}")

    def train(self, max_epochs: int):
        for epoch in range(max_epochs):
            self._run_epoch(epoch=epoch, epoch_type='train')
            self._run_epoch(epoch=epoch, epoch_type='validate')
            if self.gpu_id == 0 and (epoch + 1) % self.save_every == 0:
                self._save_checkpoint(epoch)
                
    def generate_predictions(self, texts):
        model = self.model.module
        model.to('cuda')
        model.eval()
        predictions = []
    
        for text in tqdm(texts):
            inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                output_sequences = model.generate(**inputs.to('cuda'), num_beams=3, max_new_tokens=300, pad_token_id=model.config.eos_token_id)
        
            decoded_preds = self.tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
            print(decoded_preds)
            predictions.extend(decoded_preds)
    
        return predictions 

def load_train_objs(preTmodel):
    if preTmodel == 'bart':
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
        model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
    elif preTmodel == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS (since GPT-2 does not have a dedicated pad token)
        model = GPT2LMHeadModel.from_pretrained("gpt2")
    elif preTmodel == 'T5':
        tokenizer = T5Tokenizer.from_pretrained("t5-small")  # Using t5-base instead of t5-small
        model = T5ForConditionalGeneration.from_pretrained('t5-small')
        
        # Ensure all necessary special tokens are present
        special_tokens = {
            'pad_token': '[PAD]',
            'eos_token': '</s>',
            'bos_token': '<s>',
        }
        tokenizer.add_special_tokens(special_tokens)
        model.resize_token_embeddings(len(tokenizer))
    train_dataset = QALD_9_AMRDataset(amr_data_train, tokenizer)
    val_dataset = QALD_9_AMRDataset(amr_data_test, tokenizer)
    print('erealy', len(train_dataset), len(val_dataset))
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    return train_dataset, val_dataset, model, tokenizer, optimizer


def prepare_dataloader(dataset: Dataset, batch_size: int):
    return DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        shuffle=False,
        sampler=DistributedSampler(dataset)
    )

def run_model(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int):
    ddp_setup(rank, world_size)
    train_dataset, val_dataset, model, tokenizer, optimizer = load_train_objs("bart")
    print('trainlen',len(train_dataset))
    train_data = prepare_dataloader(train_dataset, batch_size)
    val_data = prepare_dataloader(val_dataset, batch_size)
    trainer = Trainer(model, train_data, val_data, tokenizer, optimizer, rank, save_every)
    trainer.train(total_epochs)
    print('Running Evaluation!')
    predictions = trainer.generate_predictions(amr_test)
    # from nltk.translate.bleu_score import corpus_bleu
    # references = [[ref.split()] for ref in text_test]  # Reference texts should be a list of lists of tokens
    # predicted_tokens = [pred.split() for pred in predictions]  # Predictions should also be a list of lists of tokens
    # bleu_score = corpus_bleu(references, predicted_tokens)
    # print(f"BLEU score: {bleu_score:.4f}")
    from nltk.translate.bleu_score import sentence_bleu
    # Ensure text_test and predictions are aligned
    references = [ref.split() for ref in text_test]  # Reference texts (list of tokenized sentences)
    predicted_tokens = [pred.split() for pred in predictions]  # Predictions (list of tokenized sentences)
    # Calculate sentence-level BLEU scores
    sentence_bleu_scores = []
    for ref, pred in zip(references, predicted_tokens):
        score = sentence_bleu([ref], pred)  # `ref` should be wrapped in a list as it's a single reference
        sentence_bleu_scores.append(score)
    
    # Print BLEU scores for each sentence
    for i, score in enumerate(sentence_bleu_scores):
        print(f"Sentence {i + 1} BLEU score: {score:.4f}")
    
    # Optionally, calculate the average of the sentence BLEU scores
    average_bleu_score = sum(sentence_bleu_scores) / len(sentence_bleu_scores)
    print(f"Average sentence BLEU score: {average_bleu_score:.4f}")
    destroy_process_group()


if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    save_every = 2
    total_epochs = 7
    batch_size = 10
    mp.spawn(run_model, args=(world_size, save_every, total_epochs, batch_size), nprocs=world_size)



Overwriting run.py


In [8]:
!python run.py 

erealy 408 150
erealy 408 150
trainlen 408
trainlen 408
------ Training! ------
<class 'torch.utils.data.dataloader.DataLoader'> 21
[GPU1] Epoch 0 | Steps: 21
  0%|                                                    | 0/21 [00:00<?, ?it/s]------ Training! ------
<class 'torch.utils.data.dataloader.DataLoader'> 21
[GPU0] Epoch 0 | Steps: 21
100%|███████████████████████████████████████████| 21/21 [00:24<00:00,  1.19s/it]
------ Validating! ------
[GPU0] Epoch 0 | Steps: 8
100%|███████████████████████████████████████████| 21/21 [00:24<00:00,  1.19s/it]
------ Validating! ------
[GPU1] Epoch 0 | Steps: 8
100%|█████████████████████████████████████████████| 8/8 [00:02<00:00,  3.01it/s]
Epoch 1: Evaluation Loss = 2.918136328458786
------ Training! ------
<class 'torch.utils.data.dataloader.DataLoader'> 21
[GPU0] Epoch 1 | Steps: 21
100%|█████████████████████████████████████████████| 8/8 [00:03<00:00,  2.57it/s]
Epoch 1: Evaluation Loss = 2.918111264705658
------ Training! ------
<class 'torch