In [1]:
!cp -r /kaggle/input/qald-amr-dataset .

In [2]:
%cd qald-amr-dataset/

/kaggle/working/qald-amr-dataset


In [None]:
import json

with open('bfs_train.json', 'r') as f:
    bfs_train = json.load(f)
with open('bfs_test.json', 'r') as f:
    bfs_test = json.load(f) 
with open('dfs_train.json', 'r') as f:
    dfs_train = json.load(f)
with open('dfs_test.json', 'r') as f:
    dfs_test = json.load(f)
   

In [13]:
# Read the data first
import re
import json

def preprocess_amr_data(data):
    """
    Preprocesses AMR data by serializing the AMR graph and pairing it with target text.

    Args:
    - data (list of dict): Each dict contains 'amr' (AMR graph as string) and 'text' (target sentence).

    Returns:
    - preprocessed_data (list of dict): Each dict has 'input' (serialized AMR) and 'output' (target text).
    """
    
    amr_complete = []
    text_complete = []

    for entry in data.keys():
        amr_graph = data[entry]['amr']
        target_text = data[entry]['text']
        
        # Example serialization: simple token-based serialization for AMR (varies by approach)
        #serialized_amr = re.sub(r'\s+', ' ', amr_graph).strip()
        
        # Prepare data format for model input
        #model_input = f"Translate AMR to text: {serialized_amr}"
        model_input = amr_graph.strip()
        model_output = target_text.strip()
        
        amr_complete.append(model_input)
        text_complete.append(model_output)

        #preprocessed_data.append({
        #    'input': model_input,
        #    'output': model_output
        #})

    return amr_complete, text_complete

bfs_train_processed = preprocess_amr_data(bfs_train)
dfs_train_processed = preprocess_amr_data(dfs_train)
bfs_test_processed = preprocess_amr_data(bfs_test)
dfs_test_processed = preprocess_amr_data(dfs_test)

AttributeError: 'list' object has no attribute 'keys'

In [None]:
from transformers import GPT2Tokenizer

def tokenize_amr_data(preprocessed_data, tokenizer):
    """
    Tokenizes the preprocessed AMR data.

    Args:
    - preprocessed_data (list of dict): Each dict contains 'input' (serialized AMR) and 'output' (target text).
    - tokenizer: Tokenizer instance from Hugging Face, like GPT2Tokenizer.

    Returns:
    - tokenized_data (list of dict): Each dict has tokenized 'input_ids' and 'labels' (for the output).
    """

    tokenized_data = []

    for entry in preprocessed_data:
        # Tokenize the input and output separately
        input_ids = tokenizer.encode(entry['input'], add_special_tokens=True)
        labels = tokenizer.encode(entry['output'], return_tensors='pt', add_special_tokens=True).squeeze()
        
        # Store tokenized input and output
        tokenized_data.append({
            'input_ids': input_ids,
            'labels': labels
        })

    return tokenized_data

# Example usage
# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Add any special tokens if necessary, e.g., "<sos>", "<eos>"
tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Tokenize the preprocessed data
tokenized_data = tokenize_amr_data(bfs_train_processed[:24], tokenizer)
for item in tokenized_data:
    print(f"Input IDs: {item['input_ids']}")
    print(f"Labels: {item['labels']}")


In [5]:
%%writefile run.py
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
# from datautils import MyTrainDataset

import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os


from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, get_scheduler, MBartTokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm import tqdm


def ddp_setup(rank, world_size):
    """
    Args:
        rank: Unique identifier of each process
        world_size: Total number of processes
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    torch.cuda.set_device(rank)
    init_process_group(backend="nccl", rank=rank, world_size=world_size)

class QALD_9_AMRDataset(Dataset):
    def __init__(self, dsetType: str, tokenizer, train):
        with open(f'{dsetType}_train.json', 'r') as f:
            train = json.load(f)
        with open(f'{dsetType}_test.json', 'r') as f:
            test = json.load(f) 
     
        bfs_train_processed = preprocess_amr_data(train)
        dfs_train_processed = preprocess_amr_data(train)


        self.bfs_train = tokenizer(source_raw, truncation=True, padding='max_length', max_length=384, return_tensors = 'pt')
        self.bfs_test = tokenizer(target_raw, truncation=True, padding='max_length', max_length=384, return_tensors = 'pt')
        
    def __len__(self): # returns the total number of samples in the dataset
        assert self.source_tok['input_ids'].shape[0] == self.target_tok['input_ids'].shape[0]
        return self.source_tok['input_ids'].shape[0]
        
    def __getitem__(self, idx):
        return {'input_ids': self.source_tok['input_ids'][idx], 
                'attention_mask': self.source_tok['attention_mask'][idx],
                'labels': self.target_tok['input_ids'][idx],            
               }

class Trainer:
    def __init__(
        self,
        model: torch.nn.Module,
        train_data: DataLoader,
        val_data: DataLoader,
        tokenizer: BartTokenizer,
        optimizer: torch.optim.Optimizer,
        gpu_id: int,
        save_every: int,
    ) -> None:
        self.gpu_id = gpu_id
        self.model = model.to(gpu_id)
        self.train_data = train_data
        self.val_data = val_data
        self.tokenizer = tokenizer
        self.optimizer = optimizer
        self.save_every = save_every
        self.model = DDP(model, device_ids=[gpu_id])

    def _run_batch(self, input_ids, attention_mask, labels, run_type='train', eval_loss=None):
        if run_type == 'train':
            self.optimizer.zero_grad()
            # print(input_ids)
            # print(labels)
            # print(input_ids.shape, attention_mask.shape, labels.shape)
            output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # print(output)
            # loss = F.cross_entropy(output, targets)
            loss = output.loss
            # print(loss)
            ### TODO: Add lr_scheduler ###
            loss.backward()

            self.optimizer.step()
            return output, eval_loss
        elif run_type == 'validate':
            with torch.no_grad():
                output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                eval_loss += output.loss.item()
            return output, eval_loss

    def _run_epoch(self, epoch, epoch_type='train'):

        if epoch_type == 'train':
            print("------ Training! ------")
            data = self.train_data
            eval_loss = None

        elif epoch_type == 'validate':
            print("------ Validating! ------")
            data = self.val_data
            eval_loss = 0

        # b_sz = len(next(iter(self.train_data))[0])
        # b_sz = len(next(iter(data))[0])
        # print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Steps: {len(data)}")
        data.sampler.set_epoch(epoch)
        # for source, targets in self.train_data:
        #     source = source.to(self.gpu_id)
        #     targets = targets.to(self.gpu_id)
        #     self._run_batch(source, targets)
        for batch_idx, batch in tqdm(enumerate(data), total=len(data)):
            input_ids = batch['input_ids'].to(self.gpu_id)
            attention_mask = batch['attention_mask'].to(self.gpu_id)
            labels = batch['labels'].to(self.gpu_id)

            if epoch_type == 'train':
                _, __ = self._run_batch(input_ids=input_ids, attention_mask=attention_mask, labels=labels, eval_loss=eval_loss, run_type='train')
            elif epoch_type == 'validate':
                _, eval_loss = self._run_batch(input_ids=input_ids, attention_mask=attention_mask, labels=labels, eval_loss=eval_loss, run_type='validate')
        if epoch_type == 'validate':
            print(f"Epoch {epoch+1}: Evaluation Loss = {eval_loss / len(self.val_data)}")

    def _save_checkpoint(self, epoch):
        ckp = self.model.module.state_dict()
        PATH = "."
        #         torch.save(ckp, PATH)
        self.model.module.save_pretrained(PATH)
        self.tokenizer.save_pretrained(PATH)

        print(f"Epoch {epoch} | Training checkpoint saved at {PATH}")

    def train(self, max_epochs: int):
        for epoch in range(max_epochs):
            self._run_epoch(epoch=epoch, epoch_type='train')
            self._run_epoch(epoch=epoch, epoch_type='validate')
            if self.gpu_id == 0 and (epoch + 1) % self.save_every == 0:
                self._save_checkpoint(epoch)
                
    def generate_predictions(self, texts):
        model = self.model.module
        model.to('cuda')
        model.eval()
        predictions = []
    
        for text in texts:
            inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True).input_ids
            with torch.no_grad():
                output_sequences = model.generate(**inputs.to('cuda'), num_beams=3, max_length=300, pad_token_id=model.config.eos_token_id)
        
            decoded_preds = self.tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
            predictions.extend(decoded_preds)
    
        return predictions 

def load_train_objs(preTmodel):
    # train_set = MyTrainDataset(2048)  # load your dataset
    #     model_path = './model_cache/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177'
    if preTmodel == 'bart':
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
        model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
    elif preTmodel == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS (since GPT-2 does not have a dedicated pad token)
        model = GPT2LMHeadModel.from_pretrained("gpt2")
    new_tokens = ['<H>', '<R>', '<T>']
    tokenizer.add_special_tokens({'additional_special_tokens': new_tokens})
    model.resize_token_embeddings(len(tokenizer))
    dataset_path = './webnlg/'
    train_dataset = QALD_9_AMRDataset('bfs', tokenizer)
    val_dataset = QALD_9_AMRDataset('bfs', tokenizer)
    # model = torch.nn.Linear(20, 1)  # load your model
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
    return train_dataset, val_dataset, model, tokenizer, optimizer


def prepare_dataloader(dataset: Dataset, batch_size: int):
    return DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        shuffle=False,
        sampler=DistributedSampler(dataset)
    )

def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int):
    ddp_setup(rank, world_size)
    train_dataset, val_dataset, model, tokenizer, optimizer = load_train_objs("gpt2")
    train_data = prepare_dataloader(train_dataset, batch_size)
    val_data = prepare_dataloader(val_dataset, batch_size)
    trainer = Trainer(model, train_data, val_data, tokenizer, optimizer, rank, save_every)
    trainer.train(total_epochs)
    with open('./webnlg/'+ 'test_both.source', 'r') as f:
        test_source = f.readlines()
    with open('./webnlg/' + 'test_both.target', 'r') as f:
        test_target = f.readlines()
    predictions = trainer.generate_predictions(test_source)
    from nltk.translate.bleu_score import corpus_bleu
    references = [[ref.split()] for ref in test_target]  # Reference texts should be a list of lists of tokens
    predicted_tokens = [pred.split() for pred in predictions]  # Predictions should also be a list of lists of tokens
    bleu_score = corpus_bleu(references, predicted_tokens)
    print(f"BLEU score: {bleu_score:.4f}")
    destroy_process_group()


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='simple distributed training job')
    parser.add_argument('total_epochs', type=int, help='Total epochs to train the model')
    parser.add_argument('save_every', type=int, help='How often to save a snapshot')
    parser.add_argument('--batch_size', default=8, type=int, help='Input batch size on each device (default: 32)')
    args = parser.parse_args()

    world_size = torch.cuda.device_count()
    mp.spawn(main, args=(world_size, args.save_every, args.total_epochs, args.batch_size), nprocs=world_size)


Overwriting run.py


In [6]:
!python run.py 6 2

W1125 02:16:37.674000 133721619326784 torch/multiprocessing/spawn.py:146] Terminating process 186 via signal SIGTERM
Traceback (most recent call last):
  File "/kaggle/working/qald-amr-dataset/run.py", line 220, in <module>
    mp.spawn(main, args=(world_size, args.save_every, args.total_epochs, args.batch_size), nprocs=world_size)
  File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 282, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method="spawn")
  File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 238, in start_processes
    while not context.join():
  File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 189, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/si