In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch
torch.cuda.empty_cache()

In [4]:
from sklearn.model_selection import KFold
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
import datasets
import pandas as pd
import os
import logging
import nltk
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from random import sample


train_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_train")
test_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_test")
validate_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")

In [5]:
model_path = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [6]:
from typing import List, Dict

def tokenization_with_answer(examples):
    inputs = []
    targets = []
    
    task_prefix = "Given a query and a table, generate a summary that answers the query based on the information in the table: "

    for i, (query, table, answer, coordinates, summary) in enumerate(zip(examples['query'], examples['table'], examples['answers'], examples['coordinates'], examples['summary'])):
        flattened_table = flatten_table(table, i)
        input_text = f"{task_prefix} Table {flattened_table}. Query: {query}"

        inputs.append(input_text)
        targets.append(summary)
        
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True,padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=512, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"] 

    res = tokenizer(inputs, text_target=targets, truncation=True, padding=True)
    return model_inputs

def flatten_table(table: Dict, row_index: int) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    title = table.get('title', [])

    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## "+row_text)

    flattened_table = f"Title: {' '.join(map(str, title))}" + " " + " ".join(flattened_rows)
    return flattened_table

tokenized_dataset_train = train_df.map(tokenization_with_answer, batched=True)
tokenized_dataset_test = test_df.map(tokenization_with_answer, batched=True)

processed_data_train = tokenized_dataset_train.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])
processed_data_test = tokenized_dataset_test.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])

In [7]:
def k_fold_split(dataset, num_folds=5):
    fold_size = len(dataset) // num_folds
    folds = []
    for i in range(num_folds):
        start = i * fold_size
        end = start + fold_size if i < num_folds - 1 else len(dataset)
        folds.append(dataset.select(range(start, end)))
    return folds

In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_predictions, decoded_labels = postprocess_text(decoded_predictions, decoded_labels)

    rouge = evaluate.load('rouge')

    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return rouge_results

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model= model)

train_args = Seq2SeqTrainingArguments(
    output_dir="./train_weights_bart",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    weight_decay=0.01,
    save_total_limit=5,
    warmup_ratio=0.03,
    load_best_model_at_end=True,
    predict_with_generate=True,
    overwrite_output_dir= True
)

trainer = Seq2SeqTrainer(
    model,
    train_args,
    train_dataset=processed_data_train,
    eval_dataset=processed_data_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metric_fn
)

2024-03-29 13:23:02.869744: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
folds = k_fold_split(train_df, num_folds=10)

for i in range(len(folds)):
    val_fold = folds[i]
    train_folds = [folds[j] for j in range(len(folds)) if j != i]
    train_dataset = concatenate_datasets(train_folds)

    tokenized_train = train_dataset.map(tokenization_with_answer, batched=True)
    tokenized_val = val_fold.map(tokenization_with_answer, batched=True)

    # Remove unnecessary columns
    processed_train = tokenized_train.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])
    processed_val = tokenized_val.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])

    # Update your trainer's train_dataset and eval_dataset
    trainer.train_dataset = processed_train
    trainer.eval_dataset = processed_val

    # Train your model
    trainer.train()
    trainer.evaluate()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,1.73693,0.310908,0.17974,0.263519,0.277832
2,No log,1.674099,0.30891,0.173679,0.261352,0.274203
3,No log,1.642674,0.307926,0.175376,0.261209,0.276965
4,1.834600,1.627459,0.312875,0.176647,0.266516,0.281615
5,1.834600,1.60956,0.313811,0.178007,0.266722,0.282186
6,1.834600,1.601541,0.314084,0.18058,0.270219,0.283313
7,1.590100,1.591774,0.314868,0.18145,0.273688,0.28531
8,1.590100,1.58976,0.31333,0.180013,0.271253,0.284016
9,1.590100,1.584818,0.312966,0.180743,0.270743,0.2828
10,1.463500,1.587157,0.31385,0.180796,0.2723,0.285048


Map: 100%|██████████| 1800/1800 [00:07<00:00, 227.05 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 424.40 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,1.114217,0.3326,0.208381,0.291012,0.305302
2,No log,1.118116,0.334085,0.210081,0.293453,0.306973
3,No log,1.118546,0.334477,0.211602,0.294502,0.308379
4,1.297400,1.120974,0.335667,0.214816,0.294608,0.310219
5,1.297400,1.122085,0.337022,0.214228,0.296231,0.310458
6,1.297400,1.124252,0.335772,0.214457,0.297113,0.31074
7,1.191400,1.128902,0.335991,0.212867,0.297776,0.311197
8,1.191400,1.130521,0.335766,0.213879,0.296524,0.309863
9,1.191400,1.13146,0.333117,0.210617,0.292998,0.307099
10,1.119400,1.136435,0.335629,0.21047,0.295753,0.310432


Map: 100%|██████████| 1800/1800 [00:07<00:00, 248.24 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 418.82 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.741855,0.356044,0.25456,0.325054,0.337029
2,No log,0.744739,0.350753,0.24835,0.320862,0.331651
3,No log,0.748812,0.349803,0.247885,0.318771,0.33127
4,1.013300,0.753681,0.350841,0.251319,0.320557,0.332555
5,1.013300,0.753704,0.347714,0.246035,0.316583,0.328965
6,1.013300,0.759392,0.347544,0.24599,0.316583,0.329383
7,0.935600,0.760708,0.344649,0.243347,0.313313,0.325964
8,0.935600,0.763665,0.344218,0.24189,0.312875,0.326514
9,0.935600,0.767649,0.342378,0.239609,0.311347,0.32506
10,0.879500,0.768537,0.347525,0.248661,0.316489,0.330389


Map: 100%|██████████| 1800/1800 [00:06<00:00, 267.77 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 399.74 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.508371,0.354669,0.266144,0.330871,0.341029
2,No log,0.50899,0.356684,0.272199,0.334627,0.345068
3,No log,0.514289,0.357115,0.269836,0.333715,0.34496
4,0.792400,0.5169,0.35819,0.270751,0.335121,0.345776
5,0.792400,0.520137,0.355598,0.265978,0.331804,0.342417
6,0.792400,0.522712,0.354043,0.266569,0.331453,0.341544
7,0.728700,0.524508,0.355427,0.265465,0.330176,0.342221
8,0.728700,0.527906,0.352531,0.263736,0.328863,0.339307
9,0.728700,0.528442,0.350817,0.260793,0.326855,0.338545
10,0.684400,0.530158,0.35212,0.260756,0.32648,0.338567


Map: 100%|██████████| 1800/1800 [00:08<00:00, 213.65 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 386.25 examples/s]


Epoch,Training Loss,Validation Loss


In [None]:
model.save_pretrained("Flan-descomposed")
tokenizer.save_pretrained("Flan-decomposed")