In [25]:
import gc

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [26]:
# Checkpoint name shared by the tokenizer here and the model loaded later.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Only the "ca_test" split of BillSum is loaded; it is divided into train/test afterwards.
billsum = load_dataset("billsum", split="ca_test")

In [27]:
# Hold out 20% of the examples for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so metrics from different runs are comparable.
billsum = billsum.train_test_split(test_size=0.2, seed=42)

In [28]:
prefix = "summarize: "


def preprocess_function(examples, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of documents together with their reference summaries.

    Args:
        examples: Batch mapping with a "text" list (documents) and a
            "summary" list (reference summaries).
        max_input_length: Token limit passed to the tokenizer for the prefixed
            documents; truncation is enabled, so longer inputs are cut.
        max_target_length: Token limit passed to the tokenizer for the
            summaries; truncation is enabled here as well.

    Returns:
        The tokenizer output for the prefixed documents, with an extra
        "labels" entry holding the "input_ids" of the tokenized summaries.
    """
    # Every document is prepended with the module-level task prefix.
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Summaries are tokenized as targets; only their ids are kept as labels.
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [29]:
# Tokenize both splits; batched=True hands preprocess_function a batch of examples per call.
tokenized_billsum = billsum.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [30]:
# Inspect the tokenized splits: 'input_ids', 'attention_mask' and 'labels' should appear next to the raw columns.
tokenized_billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 248
    })
})

In [31]:
# ROUGE scorer used by compute_metrics during evaluation.
rouge = evaluate.load("rouge")

# Collator that assembles tokenized examples into batches for the trainer.
# NOTE(review): `model` receives the checkpoint name (a str) here, not a loaded
# model object - confirm whether the collator is expected to get the model instance.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [32]:
def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple, in which case its first
            element is taken as the generated ids.

    Returns:
        Dict with the values reported by ``rouge.compute`` plus ``gen_len``
        (mean number of non-pad tokens per prediction), each rounded to
        4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a placeholder, not a real token id. Replace it with the pad id in
    # BOTH arrays before decoding, so the tokenizer never sees a negative id and
    # placeholder positions are not counted as generated tokens below.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each prediction, ignoring padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [33]:
# Load the pretrained seq2seq model from the same checkpoint name the tokenizer was created with.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [34]:
# Hyperparameters for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="models/my_awesome_billsum_model",
    # Schedule and checkpoint retention
    num_train_epochs=4,
    evaluation_strategy="epoch",
    save_total_limit=3,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    # One example per device per step, for both training and evaluation
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # compute_metrics decodes predictions as token ids, hence generation at eval time
    predict_with_generate=True,
)

# Wire the model, data, collator and metric function together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    compute_metrics=compute_metrics,
)


In [42]:
# GPU memory diagnostics to run before training.
# gc.collect() drops unreachable Python objects, and empty_cache() returns
# cached-but-unused allocator blocks. Tensors that are still referenced stay
# allocated, so if memory from earlier runs remains in use, restart the kernel.
if torch.cuda.is_available():
    gc.collect()
    torch.cuda.empty_cache()
    # print() renders the multi-line report as a table instead of an escaped repr.
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("CUDA is not available; no GPU memory to report.")



In [41]:
# Start fine-tuning with the trainer configured above.
# NOTE(review): the saved output of this cell is a CUDA out-of-memory error on a
# 6 GiB GPU. Memory held over from earlier runs in the same kernel is one
# possible cause - TODO confirm by re-running after a kernel restart.
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacty of 6.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 4.89 GiB is allocated by PyTorch, and 246.92 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF