# Training T5 model for text summarization

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer




In [2]:
# Load public-free fine tuning dataset from Datasets library.
billsum = load_dataset('billsum', split='ca_test')

# Split the dataset to test and train set.
billsum = billsum.train_test_split(test_size=0.2, seed=1)

# Check the train test values.
billsum['train'][0]
billsum['test'][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 13651 of the Business and Professions Code is amended to read:\n13651.\n(a) (1) Every service station in this state shall provide during operating hours, and make available at no cost to customers who purchase motor vehicle fuel, water, compressed air, and a gauge for measuring air pressure to the public for use in servicing any passenger vehicle, as defined in Section 465 of the Vehicle Code, or any commercial vehicle, as defined in Section 260 of the Vehicle Code, with an unladen weight of 6,000 pounds or less.\n(2) Every service station in this state shall display, at a conspicuous place on, at, or near the dispensing apparatus at least one clearly visible sign that shall read as follows: “CALIFORNIA LAW REQUIRES THIS STATION TO PROVIDE FREE AIR AND WATER FOR AUTOMOTIVE PURPOSES TO ITS CUSTOMERS WHO PURCHASE MOTOR VEHICLE FUEL. IF YOU HAVE A COMPLAINT NOTIFY THE STATION ATTENDANT AND/OR CAL

In [5]:
type(billsum)

datasets.dataset_dict.DatasetDict

In [3]:
# Load t5 cokenizer to process text and summary.
checkpoint = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [4]:
tokenized_billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 248
    })
})

In [6]:
# Create a DataCollatorForSeq2Seq instance for sequence-to-sequence tasks.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [7]:
# Load the ROUGE evaluation metric using the `evaluate` module.
rouge = evaluate.load("rouge")

# Define a custom function that passes predictions and labels to compute to calculate the ROUGE metric.
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir="./Seq2SeqTraining",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/62 [00:00<?, ?it/s]

: 

In [None]:
trainer.save_model('fine_tuned_t5_model_by_temuulen')

In [None]:
results = trainer.evaluate()

In [None]:
metrics = results
print("Metrics:", metrics)

In [None]:
text = 'ttm healthcare are currently recruiting a clinical nurse manager 2 theatre on behalf of an east dublin based hospital.\nthis is a full time permanent role working 37.5 hours per week.\n\nthe role:\nthe role is responsible for the provision of a quality service in line with standards of theatre nursing practice. as a key member of the senior management team, the cnm2 in theatre (general) will demonstrate managerial and leadership skills and facilitate effective communication with colleagues in the hospital. the position requires a strategic approach to the development of services and structures, embracing continuous quality improvement and the management of changes necessary to achieve organisational objectives.\n\nessential criteria:\nbe registered in the general division of the register of nurses maintained by nmbi\nhave at least five years recent relevant post-registration nursing experience (full-time or equivalent hours part-time) in an acute hospital setting and a minimum of three years nursing experience in peri-operative theatre nursing\nhave a recognised post-registration nursing course - higher diploma or post grad course, in peri-operative theatre nursing or equivalent or be in pursuit of same\n\nbenefits:\ncompetitive salary\nup to 5% pension contribution\nsick pay\nmaternity benefit\nfree parking\nan education support programme\ndevelopment opportunities\nopportunities for career progression\nsubsidised restaurant\nemployee assistance programme\nlife assurance\nfree parking\n\na full job description is available upon request.\nfor more information call louise on 015136740 or click apply with your most recent cv today\n\nttm healthcare solutions is an equal opportunities employer'

In [1]:
from transformers import pipeline

summarizer = pipeline("summarization", model="fine_tuned_t5_model_by_temuulen")

summary = summarizer(text, max_length=500, min_length=400, length_penalty=2.0, num_beams=4)[0]['summary_text']

print(summary)




OSError: fine_tuned_t5_model_by_temuulen is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
summarizer = pipeline("summarization", model="t5-small")

summary = summarizer(text, max_length=500, min_length=400, length_penalty=2.0, num_beams=4)[0]['summary_text']
print(summary)