In [1]:
!pip install tensorflow
!pip install rouge-score

Defaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[0m

In [15]:
import warnings
warnings.filterwarnings('ignore') 

from datasets import load_dataset, DatasetDict
from datasets import Dataset
import pandas as pd
import os
import logging
import nltk
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# os.environ["HF_TOKEN"] = "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"
huggingface_token = "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"

# If the dataset is gated/private, make sure you have run huggingface-cli login
train_df = load_dataset("yale-nlp/QTSumm", token = huggingface_token, split='train')
test_df = load_dataset("yale-nlp/QTSumm", token = huggingface_token, split='test')
validate_df = load_dataset("yale-nlp/QTSumm", token = huggingface_token, split='validation')

In [16]:
model_path = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_path)

def tokenization(examples):
    inputs = [f"query:  {query} header: {' '.join(map(str, entry.get('header', [])))} rows: {' '.join(map(str, entry.get('rows', [])))} title: {' '.join(map(str, entry.get('title', [])))}"
    for query, entry in zip(examples['query'], examples['table'])]
    res = tokenizer(inputs, text_target=examples['summary'], truncation = True, padding = True)
    return res

tokenized_dataset_train = train_df.map(tokenization, batched=True)
tokenized_dataset_test = test_df.map(tokenization, batched=True)
tokenized_dataset_validate = validate_df.map(tokenization, batched=True)

processed_data_train = tokenized_dataset_train.remove_columns(['table','summary', 'row_ids', 'example_id', 'query'])
processed_data_test = tokenized_dataset_test.remove_columns(['table','summary', 'row_ids', 'example_id', 'query'])
processed_data_validate = tokenized_dataset_validate.remove_columns(['table','summary', 'row_ids', 'example_id', 'query'])

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [4]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    rouge = evaluate.load('rouge')
    results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return results

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model= model)

train_args = Seq2SeqTrainingArguments(
        output_dir="./train_weights",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        num_train_epochs=20,
        evaluation_strategy="epoch",
        predict_with_generate=True,
        overwrite_output_dir= True
    )

trainer = Seq2SeqTrainer(
        model,
        train_args,
        train_dataset=processed_data_train,
        eval_dataset=processed_data_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=metric_fn
    )
    
trainer.train()

2024-03-15 22:34:10.640338: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,4.148242,0.021399,0.003809,0.019011,0.018955
2,No log,1.599507,0.118695,0.044411,0.089383,0.089368
3,No log,0.975691,0.269453,0.144385,0.228142,0.228104
4,No log,0.86285,0.300123,0.166756,0.249949,0.249814
5,No log,0.818485,0.301852,0.168822,0.252331,0.252311
6,No log,0.796278,0.306688,0.172871,0.256442,0.256324
7,No log,0.782572,0.307781,0.173716,0.258493,0.258276
8,No log,0.773315,0.307163,0.17457,0.25899,0.258691
9,No log,0.765791,0.309006,0.175789,0.260155,0.259896
10,2.867200,0.759892,0.307511,0.174005,0.259125,0.258862


Checkpoint destination directory ./train_weights/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./train_weights/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1040, training_loss=1.749329554117643, metrics={'train_runtime': 2076.1865, 'train_samples_per_second': 47.982, 'train_steps_per_second': 0.501, 'total_flos': 6.821552745086976e+16, 'train_loss': 1.749329554117643, 'epoch': 20.0})

In [5]:
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("./train_weights/checkpoint-1000", device_map='auto')
model = AutoModelForSeq2SeqLM.from_pretrained("./train_weights/checkpoint-1000")

pipe = pipeline("summarization", model=model, tokenizer=tokenizer)

In [6]:
def create_text(examples):
    inputs = [f"query:  {query} header: {' '.join(map(str, entry.get('header', [])))} rows: {' '.join(map(str, entry.get('rows', [])))} title: {' '.join(map(str, entry.get('title', [])))}"
    for query, entry in zip(examples['query'], examples['table'])]
    examples['text'] = inputs
    return examples

tester = test_df.map(create_text, batched=True)

In [7]:
tester

Dataset({
    features: ['summary', 'example_id', 'query', 'row_ids', 'table', 'text'],
    num_rows: 1078
})

In [8]:
from transformers.pipelines.pt_utils import KeyDataset
vals = []
for out in pipe(
        KeyDataset(tester, "text"),
        do_sample=False,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id, ):
    vals.append(out)
    break

Token indices sequence length is longer than the specified maximum sequence length for this model (958 > 512). Running this sequence through the model will result in indexing errors


In [9]:
i = 482
print(pipe(tester['text'][i]))
print(tester['text'][i])
print(tester['summary'][i])

[{'summary_text': 'The team that use KTM-VMC equipment is Danil Willemsen / Kenny Van Gaalen, who is driving the Zabel - Wsp team. He is a driver with a total of 452 points and a passenger with 385 points. Jan Hendrickx / Tim Smeuninx also use the KTM - VMC equipment, with 222 points and 369 points.'}]
query:  Summarize the team(s) that are using KTM-VMC equipment. header: Position Driver / Passenger Equipment Bike No Points rows: ['1', 'Daniãl Willemsen / Kenny Van Gaalen', 'Zabel - Wsp', '1', '452'] ['2', 'Etienne Bax / Kaspars Stupelis', 'Zabel - Wsp', '5', '447'] ['3', 'Ben Adriaenssen / Sven Verbrugge', 'Ktm - Wsp', '6', '385'] ['4', 'Joris Hendrickx / Kaspars Liepins', 'Ktm - Vmc', '222', '369'] ['5', 'Jan Hendrickx / Tim Smeuninx', 'Zabel - Vmc', '3', '369'] ['6', 'Valentin Giraud / Nicolas Musset', 'Ktm - Wht', '138', '334'] ['7', 'Vaclav Rozehnal / Marek Rozehnal', 'Zabel - Vmc', '11', '240'] ['8', 'Marcel Willemsen / Gertie Eggink', 'Zabel - Mefo', '21', '223'] ['9', 'Maris R

In [10]:
tester[0]['summary']

'John Roberts from Maryland and Samuel Alito from New Zersey   were appointed by a President Bush.'

In [17]:
import datasets

## Jairo changes with KFold
all_data = datasets.concatenate_datasets([train_df, test_df, validate_df]) # combine for k-fald, consider only combining train and validate
all_data = all_data.shuffle(seed=42)

In [18]:
from datasets import Dataset

def k_fold_split(dataset, num_folds=5):
    fold_size = len(dataset) // num_folds
    folds = []
    for i in range(num_folds):
        start = i * fold_size
        end = start + fold_size if i < num_folds - 1 else len(dataset)
        folds.append(dataset.select(range(start, end)))
    return folds

In [None]:
folds = k_fold_split(all_data, num_folds=5)

for i in range(len(folds)):
    val_fold = folds[i]
    train_folds = [folds[j] for j in range(len(folds)) if j != i]
    train_dataset = datasets.concatenate_datasets(train_folds)

    tokenized_train = train_dataset.map(tokenization, batched=True)
    tokenized_val = val_fold.map(tokenization, batched=True)

    # Remove unnecessary columns
    processed_train = tokenized_train.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query'])
    processed_val = tokenized_val.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query'])

    # Update your trainer's train_dataset and eval_dataset
    trainer.train_dataset = processed_train
    trainer.eval_dataset = processed_val

    # Train your model
    trainer.train()
    trainer.evaluate()


Map: 100%|██████████| 5689/5689 [00:07<00:00, 731.67 examples/s]
Map: 100%|██████████| 1422/1422 [00:01<00:00, 721.34 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.532276,0.315868,0.187394,0.272763,0.27294
2,No log,0.530985,0.31577,0.188232,0.272311,0.272526
3,No log,0.531563,0.315974,0.187663,0.27294,0.273156
4,No log,0.530249,0.314669,0.187952,0.271384,0.271585
5,No log,0.530288,0.316527,0.189157,0.273631,0.273853
6,No log,0.529823,0.315698,0.187911,0.272491,0.272746
7,No log,0.529345,0.316345,0.188757,0.273586,0.2738
8,No log,0.528345,0.316942,0.189323,0.274038,0.274292
9,0.626400,0.528366,0.316157,0.190025,0.27396,0.274204
10,0.626400,0.527838,0.316732,0.189278,0.2741,0.274474


Checkpoint destination directory ./train_weights/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./train_weights/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


Map: 100%|██████████| 5689/5689 [00:07<00:00, 772.14 examples/s]
Map: 100%|██████████| 1422/1422 [00:01<00:00, 790.59 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.57245,0.318869,0.193607,0.273363,0.273038
2,No log,0.57218,0.318907,0.193385,0.273033,0.272921
3,No log,0.572722,0.319371,0.194746,0.273847,0.273662
4,No log,0.573061,0.319074,0.193481,0.273596,0.27348
5,No log,0.572866,0.31789,0.193186,0.272311,0.272162
6,No log,0.572927,0.318614,0.193231,0.273,0.27278
7,No log,0.572826,0.318621,0.193432,0.273326,0.273168
8,No log,0.572529,0.318725,0.193517,0.273554,0.273301
9,0.580300,0.572627,0.318449,0.193905,0.273447,0.273314
10,0.580300,0.572231,0.3185,0.193778,0.273489,0.273376


Checkpoint destination directory ./train_weights/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./train_weights/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


Map: 100%|██████████| 5689/5689 [00:07<00:00, 719.37 examples/s]
Map: 100%|██████████| 1422/1422 [00:02<00:00, 703.77 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.492298,0.317293,0.196126,0.279054,0.278926
2,No log,0.493016,0.317078,0.195439,0.278579,0.278599
3,No log,0.493376,0.317407,0.195625,0.279375,0.279428
4,No log,0.493702,0.317086,0.195194,0.278686,0.278737
5,No log,0.49384,0.316595,0.194045,0.277695,0.277621
6,No log,0.494479,0.317811,0.195382,0.27914,0.279132
7,No log,0.49427,0.317508,0.195015,0.278487,0.278529
8,No log,0.494261,0.318622,0.195949,0.279652,0.279708
9,0.566300,0.49488,0.31773,0.195644,0.279126,0.279227
10,0.566300,0.494495,0.318251,0.195748,0.279498,0.279507


Checkpoint destination directory ./train_weights/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
