In [11]:
import warnings
warnings.filterwarnings('ignore') 

from datasets import load_dataset, DatasetDict
from datasets import Dataset
import pandas as pd
import os
import logging
import nltk
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# If the dataset is gated/private, make sure you have run huggingface-cli login
train_df = load_dataset("yale-nlp/QTSumm", token = huggingface_token, split='train')
test_df = load_dataset("yale-nlp/QTSumm", token = huggingface_token, split='test')
validate_df = load_dataset("yale-nlp/QTSumm", token = huggingface_token, split='validation')

In [12]:
model_path = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_path)

def tokenization(examples):
    inputs = [f"query:  {query} header: {' '.join(map(str, entry.get('header', [])))} rows: {' '.join(map(str, entry.get('rows', [])))} title: {' '.join(map(str, entry.get('title', [])))}"
    for query, entry in zip(examples['query'], examples['table'])]
    res = tokenizer(inputs, text_target=examples['summary'], truncation = True, padding = True)
    return res

tokenized_dataset_train = train_df.map(tokenization, batched=True)
tokenized_dataset_test = test_df.map(tokenization, batched=True)
tokenized_dataset_validate = validate_df.map(tokenization, batched=True)

processed_data_train = tokenized_dataset_train.remove_columns(['table','summary', 'row_ids', 'example_id', 'query'])
processed_data_test = tokenized_dataset_test.remove_columns(['table','summary', 'row_ids', 'example_id', 'query'])
processed_data_validate = tokenized_dataset_validate.remove_columns(['table','summary', 'row_ids', 'example_id', 'query'])

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

Map: 100%|██████████| 1052/1052 [00:01<00:00, 808.11 examples/s]


In [4]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    rouge = evaluate.load('rouge')
    results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return results

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model= model)

train_args = Seq2SeqTrainingArguments(
        output_dir="./train_weights",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        num_train_epochs=20,
        evaluation_strategy="epoch",
        predict_with_generate=True,
        overwrite_output_dir= True
    )

trainer = Seq2SeqTrainer(
        model,
        train_args,
        train_dataset=processed_data_train,
        eval_dataset=processed_data_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=metric_fn
    )
    
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,4.148242,0.021399,0.003809,0.019011,0.018955
2,No log,1.599507,0.118695,0.044411,0.089383,0.089368
3,No log,0.975691,0.269453,0.144385,0.228142,0.228104
4,No log,0.86285,0.300123,0.166756,0.249949,0.249814
5,No log,0.818485,0.301852,0.168822,0.252331,0.252311
6,No log,0.796278,0.306688,0.172871,0.256442,0.256324
7,No log,0.782572,0.307781,0.173716,0.258493,0.258276
8,No log,0.773315,0.307163,0.17457,0.25899,0.258691
9,No log,0.765791,0.309006,0.175789,0.260155,0.259896
10,2.867200,0.759892,0.307511,0.174005,0.259125,0.258862


TrainOutput(global_step=1040, training_loss=1.749329554117643, metrics={'train_runtime': 2022.1609, 'train_samples_per_second': 49.264, 'train_steps_per_second': 0.514, 'total_flos': 6.821552745086976e+16, 'train_loss': 1.749329554117643, 'epoch': 20.0})

In [17]:
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("./train_weights/checkpoint-1000", device_map='auto')
model = AutoModelForSeq2SeqLM.from_pretrained("./train_weights/checkpoint-1000")

pipe = pipeline("summarization", model=model, tokenizer=tokenizer)

In [18]:
def create_text(examples):
    inputs = [f"query:  {query} header: {' '.join(map(str, entry.get('header', [])))} rows: {' '.join(map(str, entry.get('rows', [])))} title: {' '.join(map(str, entry.get('title', [])))}"
    for query, entry in zip(examples['query'], examples['table'])]
    examples['text'] = inputs
    return examples

tester = test_df.map(create_text, batched=True)

In [19]:
tester

Dataset({
    features: ['row_ids', 'summary', 'query', 'table', 'example_id', 'text'],
    num_rows: 1078
})

In [20]:
from transformers.pipelines.pt_utils import KeyDataset
vals = []
for out in pipe(
        KeyDataset(tester, "text"),
        do_sample=False,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id, ):
    vals.append(out)
    break

Token indices sequence length is longer than the specified maximum sequence length for this model (958 > 512). Running this sequence through the model will result in indexing errors


In [28]:
i = 482
print(pipe(tester['text'][i]))
print(tester['text'][i])
print(tester['summary'][i])

[{'summary_text': 'The team that use KTM-VMC equipment is Danil Willemsen / Kenny Van Gaalen, who is driving the Zabel - Wsp team. He is a driver with a total of 452 points and a passenger with 385 points. Jan Hendrickx / Tim Smeuninx also use the KTM - VMC equipment, with 222 points and 369 points.'}]
query:  Summarize the team(s) that are using KTM-VMC equipment. header: Position Driver / Passenger Equipment Bike No Points rows: ['1', 'Daniãl Willemsen / Kenny Van Gaalen', 'Zabel - Wsp', '1', '452'] ['2', 'Etienne Bax / Kaspars Stupelis', 'Zabel - Wsp', '5', '447'] ['3', 'Ben Adriaenssen / Sven Verbrugge', 'Ktm - Wsp', '6', '385'] ['4', 'Joris Hendrickx / Kaspars Liepins', 'Ktm - Vmc', '222', '369'] ['5', 'Jan Hendrickx / Tim Smeuninx', 'Zabel - Vmc', '3', '369'] ['6', 'Valentin Giraud / Nicolas Musset', 'Ktm - Wht', '138', '334'] ['7', 'Vaclav Rozehnal / Marek Rozehnal', 'Zabel - Vmc', '11', '240'] ['8', 'Marcel Willemsen / Gertie Eggink', 'Zabel - Mefo', '21', '223'] ['9', 'Maris R

In [23]:
tester[0]['summary']

'John Roberts from Maryland and Samuel Alito from New Zersey   were appointed by a President Bush.'