In [1]:
from datasets import  load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
billsum = load_dataset("billsum", split="ca_test")

In [3]:
billsum = billsum.train_test_split(test_size=0.2)

In [4]:
billsum["train"][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\n(a) The Legislature finds and declares the following:\n(1)\nA new form of work has proliferated in which individuals work by the job through an electronic platform, such as the Internet or telephone. These individuals are hired through these hosting platforms to perform short-term work, usually of a day or less, for multiple customers.\n(2)\nThese individuals are not treated by the hosting platforms as employees and do not receive the benefit of state labor protection laws such as minimum wage, unemployment insurance, and workers’ compensation. The platforms treat these individuals as independent contractors and even though they perform work for multiple clients, usually individual people, the individuals securing work through a hosting platform are normally prohibited from negotiating the terms of their services. Instead, the hosting platforms dictate the terms and take a considerable portion of the 

In [36]:
from transformers import AutoTokenizer
checkpoint = "t5-small"
access_token = "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"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, token=access_token)

In [37]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [38]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

In [39]:
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 248
    })
})

In [40]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [41]:
import evaluate

rouge = evaluate.load("rouge")

In [42]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [43]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, token=access_token)

In [44]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

In [45]:
from huggingface_hub import notebook_login

In [48]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"], 
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


trainer.train()

ValueError: Token is required (write-access action) but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.

In [47]:
trainer.push_to_hub()

NameError: name 'trainer' is not defined