In [2]:
import torch

# Prefer the GPU when PyTorch reports one is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('using', device)

using cuda


In [3]:
from datasets import Dataset


def cached(filename, func, *args):
    """Return the object stored at ``filename``, computing and saving it on a miss.

    Args:
        filename: Path of the cache file, read with ``torch.load`` and written
            with ``torch.save``.
        func: Callable invoked as ``func(*args)`` when the cache file is missing.
        *args: Positional arguments forwarded to ``func``.

    Returns:
        The previously saved object when ``filename`` exists, otherwise the
        fresh result of ``func(*args)`` (which is also written to ``filename``).
    """
    import os  # local import so this helper does not depend on another cell

    try:
        # NOTE: torch.load unpickles the file; only use it on cache files that
        # this notebook wrote itself, never on untrusted data.
        return torch.load(filename)
    except FileNotFoundError:
        # Cache miss: fall through and compute. Calling func outside the
        # handler keeps its own errors from being chained to this one.
        pass

    res = func(*args)

    # torch.save does not create missing parent directories, so a first run
    # against e.g. './cache/...' would fail after the expensive computation.
    if isinstance(filename, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(filename))
        if parent:
            os.makedirs(parent, exist_ok=True)

    torch.save(res, filename)
    return res


def load_segment(segment_id) -> Dataset:
    """Load one tokenized segment from the ``./tokenized_segments`` directory.

    Args:
        segment_id: Identifier interpolated into the ``seg-{segment_id}``
            directory name.

    Returns:
        The dataset returned by ``Dataset.load_from_disk`` for that directory.
    """
    segment_dir = f'./tokenized_segments/seg-{segment_id}'
    segment = Dataset.load_from_disk(segment_dir)
    return segment

    # Disabled variant, kept for reference: convert the columns to tensors on
    # `device` up front and memoize the mapped dataset through `cached`.
    # def wrapped():
    #     return Dataset.load_from_disk(f'./tokenized_segments/seg-{segment_id}', ).map(
    #         lambda x: {
    #             'input_ids': torch.tensor(x['input_ids']).to(device),
    #             'attention_mask': torch.tensor(x['attention_mask']).to(device)
    #         },
    #         batched=True,
    #         batch_size=16,
    #         desc='Mapping ds to device'
    #     )

    # return cached(f'./cache/segment-ds-{segment_id}', wrapped)

In [4]:
from transformers import TrainingArguments, DataCollatorForLanguageModeling, Trainer
import evaluate
import numpy as np
from transformers import AutoTokenizer

# Tokenizer for the CodeT5 checkpoint; the recorded training output reports
# that it resolves to a RobertaTokenizerFast.
tokenizer = AutoTokenizer.from_pretrained(
    "Salesforce/codet5-large",
)

def compute_metrics(eval_preds):
    """Compute token-level accuracy over the positions that carry a label.

    The previous implementation scored predictions with the GLUE/MRPC metric,
    which is meant for sentence-pair classification and was re-loaded on every
    evaluation. The labels here are per-token ids in which ``-100`` marks
    positions that must not be scored, so accuracy is computed directly with
    NumPy over the remaining positions.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` has a trailing
            vocabulary axis; ``labels`` has the same shape as ``logits``
            without that axis.

    Returns:
        ``{'accuracy': float}``: the fraction of labelled positions whose
        arg-max prediction equals the label, or ``0.0`` when no position is
        labelled.
    """
    logits, labels = eval_preds

    # NOTE(review): seq2seq models presumably hand back a tuple of outputs
    # with the logits first -- confirm against the Trainer in use.
    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # -100 is the ignore index written by the language-modeling collator.
    scored = labels != -100
    n_scored = int(scored.sum())
    if n_scored == 0:
        # Nothing to score; avoid dividing by zero.
        return {'accuracy': 0.0}

    n_correct = int((predictions[scored] == labels[scored]).sum())
    return {'accuracy': n_correct / n_scored}

def meta_training_step(meta_id, segment_id, model, *, test_size=0.05, seed=42,
                       **training_overrides):
    """Train ``model`` on one tokenized segment with a held-out evaluation split.

    The segment is loaded with ``load_segment``, split into train/test parts,
    and passed to a ``Trainer`` that uses the module-level ``tokenizer`` (via a
    masked-language-modeling collator) and ``compute_metrics``.

    Args:
        meta_id: Meta-epoch index; used only in the output and logging paths.
        segment_id: Segment to load; also used in the output and logging paths.
        model: Model instance handed to ``Trainer``.
        test_size: Fraction (or count) of the segment held out for evaluation.
        seed: Seed for the train/test shuffle, so the held-out set is the same
            on every run instead of changing between runs.
        **training_overrides: Extra or replacement keyword arguments for
            ``TrainingArguments``; they take precedence over the defaults below.

    Returns:
        None. Checkpoints and logs are written under ``./results`` and ``./logs``.

    NOTE(review): the recorded run of this notebook ended in a CUDA
    out-of-memory error. Memory-related ``TrainingArguments`` (for example
    ``gradient_checkpointing=True`` or ``optim='adafactor'``) can now be passed
    through ``training_overrides``; whether they are sufficient on the target
    GPU is unverified.
    """
    seg = load_segment(segment_id)
    # Seeded so that evaluation numbers are comparable across runs.
    ds = seg.train_test_split(test_size=test_size, seed=seed)

    arg_values = dict(
        output_dir=f'./results/meta-epoch-{meta_id}/segment-{segment_id}',
        num_train_epochs=5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        warmup_steps=500,
        save_steps=50000,
        weight_decay=0.01,
        logging_dir=f'./logs/meta-epoch-{meta_id}/segment-{segment_id}',
        logging_steps=10,
        evaluation_strategy='epoch',
    )
    # Caller-supplied settings win over the defaults above.
    arg_values.update(training_overrides)
    training_args = TrainingArguments(**arg_values)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()

In [5]:
from transformers import T5ForConditionalGeneration

# Load the pretrained CodeT5 weights, then move the module to the selected device.
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-large")
model = model.to(device)

In [6]:
# First meta-epoch, first segment. The recorded run of this cell stopped with
# a CUDA out-of-memory error.
meta_training_step(meta_id=0, segment_id=0, model=model)

***** Running training *****
  Num examples = 1621479
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 8107395
  Number of trainable parameters = 737639424
  0%|          | 0/8107395 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.79 GiB total capacity; 6.32 GiB already allocated; 20.19 MiB free; 6.33 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF