In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch
torch.cuda.empty_cache()

In [39]:
from sklearn.model_selection import KFold
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
import datasets
import pandas as pd
import os
import logging
import nltk
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from random import sample


train_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/question_answered/train_with_answer")
test_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_test")
validate_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")

In [52]:
ind = 1001
import pandas as pd
def to_pandas(item):
  return pd.DataFrame(item['table']["rows"],columns=item['table']["header"])

to_pandas(train_df[ind])

Unnamed: 0,Team,Head coach,Previous job,Year at school,Overall record,MAAC record,MAAC Tournament championships
0,Canisius,Reggie Witherspoon,Chattanooga (asst.),3,39–28,25–13,0
1,Fairfield,Sydney Johnson,Princeton,8,107–125,62–72,0
2,Iona,Tim Cluess,LIU Post,9,182–92,112–40,4
3,Manhattan,Steve Masiello,Louisville (asst.),8,116–110,72–62,2
4,Marist,John Dunne,Saint Peter's,1,0–0,0–0,1
5,Monmouth,King Rice,Vanderbilt (asst.),8,117–112,60–38,0
6,Niagara,Chris Casey,LIU Post,6,51–110,33–65,0
7,Quinnipiac,Baker Dunleavy,Villanova (asst.),2,12–21,7–11,0
8,Rider,Kevin Baggett,Rider (assoc. HC),7,107–89,69–47,0
9,Saint Peter's,Shaheen Holloway,Seton Hall (asst.),1,0–0,0–0,0


In [53]:
train_df[ind]['query'], train_df[ind]['answers']

('How many years of experience at their respective school do each of the coaches with MAAC Tournament championships have?',
 'COUNT > 9, 8, 1')

In [4]:
# model_path = "google/flan-t5-large"
model_path = "/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/models/saved_model/Flan-T5-Decomposed"
tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [12]:
from typing import List, Dict

def flatten_table(table: Dict) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    title = table.get('title', [])

    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## "+row_text)

    flattened_table = f"Title: {' '.join(map(str, title))}" + " " + " ".join(flattened_rows)
    return flattened_table

def generate_predictions(dataset):
    generated_texts = []
    task_prefix = "Given a query and a table, generate a summary that answers the query based on the information in the table: "
    for example in dataset:
        table = example['table']
        query = example['query']
        flattened_table = flatten_table(table)
        input_text = f"{task_prefix} Table {flattened_table}. Query: {query}"
        model_input = tokenizer(input_text, max_length=1024, truncation=True,padding='max_length') 
        output_sequences = model.generate(model_input)
        generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
        
        # Add to list of generated text
        generated_texts.append(generated_text)
    return generated_texts

In [24]:
from typing import List, Dict

def tokenization_with_answer(examples):
    inputs = []
    targets = []
    
    task_prefix = "Given a query and a table, generate a summary that answers the query based on the information in the table: "

    for i, (query, table, answer, coordinates, summary) in enumerate(zip(examples['query'], examples['table'], examples['answers'], examples['coordinates'], examples['summary'])):
        flattened_table = flatten_table(table, i)
        input_text = f"{task_prefix} Table {flattened_table}. Query: {query}"

        inputs.append(input_text)
        targets.append(summary)
        
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True,padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=512, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"] 

    res = tokenizer(inputs, text_target=targets, truncation=True, padding=True)
    return model_inputs

def flatten_table(table: Dict, row_index: int) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    title = table.get('title', [])

    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## "+row_text)

    flattened_table = f"Title: {' '.join(map(str, title))}" + " " + " ".join(flattened_rows)
    return flattened_table

tokenized_dataset_train = train_df.map(tokenization_with_answer, batched=True)
tokenized_dataset_test = test_df.map(tokenization_with_answer, batched=True)

processed_data_train = tokenized_dataset_train.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])
processed_data_test = tokenized_dataset_test.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])

In [25]:
def k_fold_split(dataset, num_folds=5):
    fold_size = len(dataset) // num_folds
    folds = []
    for i in range(num_folds):
        start = i * fold_size
        end = start + fold_size if i < num_folds - 1 else len(dataset)
        folds.append(dataset.select(range(start, end)))
    return folds

In [26]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_predictions, decoded_labels = postprocess_text(decoded_predictions, decoded_labels)

    rouge = evaluate.load('rouge')

    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return rouge_results

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model= model)

train_args = Seq2SeqTrainingArguments(
    output_dir="./train_weights_flan_decomposed",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    weight_decay=0.01,
    save_total_limit=5,
    warmup_ratio=0.05,
    load_best_model_at_end=True,
    predict_with_generate=True,
    overwrite_output_dir= True,
    gradient_accumulation_steps = 2
)

trainer = Seq2SeqTrainer(
    model,
    train_args,
    train_dataset=processed_data_train,
    eval_dataset=processed_data_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metric_fn
)

In [None]:
folds = k_fold_split(train_df, num_folds=10)

for i in range(len(folds)):
    val_fold = folds[i]
    train_folds = [folds[j] for j in range(len(folds)) if j != i]
    train_dataset = concatenate_datasets(train_folds)

    tokenized_train = train_dataset.map(tokenization_with_answer, batched=True)
    tokenized_val = val_fold.map(tokenization_with_answer, batched=True)

    # Remove unnecessary columns
    processed_train = tokenized_train.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])
    processed_val = tokenized_val.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])

    # Update your trainer's train_dataset and eval_dataset
    trainer.train_dataset = processed_train
    trainer.eval_dataset = processed_val

    # Train your model
    trainer.train()
    trainer.evaluate()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,1.73693,0.310908,0.17974,0.263519,0.277832
2,No log,1.674099,0.30891,0.173679,0.261352,0.274203
3,No log,1.642674,0.307926,0.175376,0.261209,0.276965
4,1.834600,1.627459,0.312875,0.176647,0.266516,0.281615
5,1.834600,1.60956,0.313811,0.178007,0.266722,0.282186
6,1.834600,1.601541,0.314084,0.18058,0.270219,0.283313
7,1.590100,1.591774,0.314868,0.18145,0.273688,0.28531
8,1.590100,1.58976,0.31333,0.180013,0.271253,0.284016
9,1.590100,1.584818,0.312966,0.180743,0.270743,0.2828
10,1.463500,1.587157,0.31385,0.180796,0.2723,0.285048


Map: 100%|██████████| 1800/1800 [00:07<00:00, 227.05 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 424.40 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,1.114217,0.3326,0.208381,0.291012,0.305302
2,No log,1.118116,0.334085,0.210081,0.293453,0.306973
3,No log,1.118546,0.334477,0.211602,0.294502,0.308379
4,1.297400,1.120974,0.335667,0.214816,0.294608,0.310219
5,1.297400,1.122085,0.337022,0.214228,0.296231,0.310458
6,1.297400,1.124252,0.335772,0.214457,0.297113,0.31074
7,1.191400,1.128902,0.335991,0.212867,0.297776,0.311197
8,1.191400,1.130521,0.335766,0.213879,0.296524,0.309863
9,1.191400,1.13146,0.333117,0.210617,0.292998,0.307099
10,1.119400,1.136435,0.335629,0.21047,0.295753,0.310432


Map: 100%|██████████| 1800/1800 [00:07<00:00, 248.24 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 418.82 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.741855,0.356044,0.25456,0.325054,0.337029
2,No log,0.744739,0.350753,0.24835,0.320862,0.331651
3,No log,0.748812,0.349803,0.247885,0.318771,0.33127
4,1.013300,0.753681,0.350841,0.251319,0.320557,0.332555
5,1.013300,0.753704,0.347714,0.246035,0.316583,0.328965
6,1.013300,0.759392,0.347544,0.24599,0.316583,0.329383
7,0.935600,0.760708,0.344649,0.243347,0.313313,0.325964
8,0.935600,0.763665,0.344218,0.24189,0.312875,0.326514
9,0.935600,0.767649,0.342378,0.239609,0.311347,0.32506
10,0.879500,0.768537,0.347525,0.248661,0.316489,0.330389


Map: 100%|██████████| 1800/1800 [00:06<00:00, 267.77 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 399.74 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.508371,0.354669,0.266144,0.330871,0.341029
2,No log,0.50899,0.356684,0.272199,0.334627,0.345068
3,No log,0.514289,0.357115,0.269836,0.333715,0.34496
4,0.792400,0.5169,0.35819,0.270751,0.335121,0.345776
5,0.792400,0.520137,0.355598,0.265978,0.331804,0.342417
6,0.792400,0.522712,0.354043,0.266569,0.331453,0.341544
7,0.728700,0.524508,0.355427,0.265465,0.330176,0.342221
8,0.728700,0.527906,0.352531,0.263736,0.328863,0.339307
9,0.728700,0.528442,0.350817,0.260793,0.326855,0.338545
10,0.684400,0.530158,0.35212,0.260756,0.32648,0.338567


Map: 100%|██████████| 1800/1800 [00:08<00:00, 213.65 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 386.25 examples/s]


Epoch,Training Loss,Validation Loss


In [None]:
model.save_pretrained("Flan-decomposed")
tokenizer.save_pretrained("Flan-decomposed")

In [None]:
#### Predictions

In [1]:
import warnings
import torch
warnings.filterwarnings('ignore')
torch.cuda.empty_cache()

In [2]:
from sklearn.model_selection import KFold
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
import datasets
import pandas as pd
import os
import logging
import nltk
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
from random import sample


train_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_train")
test_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_test")
validate_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")

2024-04-05 11:10:35.274337: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-05 11:11:05.976143: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/slurm/lib64:/opt/slurm/lib64:
2024-04-05 11:11:05.976185: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-04-05 11:11:09.368485: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-05 11:1

In [3]:
model_path = "/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/models/saved_model/Flan-T5-Decomposed"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [4]:
from typing import List, Dict

def tokenization_with_answer(examples):
    inputs = []
    targets = []
    
    task_prefix = "Given a query and a table, generate a summary that answers the query based on the information in the table: "

    for i, (query, table, answer, coordinates, summary) in enumerate(zip(examples['query'], examples['table'], examples['answers'], examples['coordinates'], examples['summary'])):
        flattened_table = flatten_table(table, i)
        input_text = f"{task_prefix} Table {flattened_table}. Query: {query}"

        inputs.append(input_text)
        targets.append(summary)
        
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True,padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=512, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"] 

    res = tokenizer(inputs, text_target=targets, truncation=True, padding=True)
    return model_inputs

def flatten_table(table: Dict, row_index: int) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    title = table.get('title', [])

    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## "+row_text)

    flattened_table = f"Title: {' '.join(map(str, title))}" + " " + " ".join(flattened_rows)
    return flattened_table

tokenized_dataset_train = train_df.map(tokenization_with_answer, batched=True)
tokenized_dataset_test = test_df.map(tokenization_with_answer, batched=True)

processed_data_train = tokenized_dataset_train.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])
processed_data_test = tokenized_dataset_test.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_predictions, decoded_labels = postprocess_text(decoded_predictions, decoded_labels)

    rouge = evaluate.load('rouge')

    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return rouge_results

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model= model)

train_args = Seq2SeqTrainingArguments(
    output_dir="./train_weights_flan_decomposed",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    weight_decay=0.01,
    save_total_limit=5,
    warmup_ratio=0.05,
    load_best_model_at_end=True,
    predict_with_generate=True,
    overwrite_output_dir= True,
    gradient_accumulation_steps = 2
)

trainer = Seq2SeqTrainer(
    model,
    train_args,
    train_dataset=processed_data_train,
    eval_dataset=processed_data_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metric_fn
)

In [5]:
validate_df_size = len(validate_df)
step_size = 3
num_batches = validate_df_size // step_size

valid = []

for i in range(num_batches):
    start_index = i * step_size
    end_index = (i + 1) * step_size
    valid.append(validate_df.select(range(start_index, end_index)))

# If there are remaining data points that don't fit into full batches of size 3
if validate_df_size % step_size != 0:
    remaining_data = validate_df_size % step_size
    valid.append(validate_df.select(range(validate_df_size - remaining_data, validate_df_size)))

In [8]:
import numpy as np
rougeL = []
bert = []
bertscore = evaluate.load("bertscore")

for i in range(len(valid)):
    validate_df = valid[i].map(tokenization_with_answer, batched=True)
    predict_results = trainer.predict(validate_df, max_length = 1024)
    metrics = predict_results.metrics
    predictions = tokenizer.batch_decode(predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    predictions = [pred.strip() for pred in predictions]
    
    bert_score = bertscore.compute(predictions=predictions, references=valid[i]['summary'], lang = "en")
    rougeL.append(metrics['test_rougeLsum'])
    bert.append(np.mean(bert_score['f1']))

sum(rougeL)/len(rougeL), sum(bert)/len(bert)

Map: 100%|██████████| 3/3 [00:00<00:00,  3.54 examples/s]
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(0.40658831946919716, 0.8971968287259189)

In [9]:
##### Example

In [19]:
validate_df = valid[16].map(tokenization_with_answer, batched=True)
predict_results = trainer.predict(validate_df, max_length = 1024)
metrics = predict_results.metrics
metrics['test_rougeLsum']

0.46537140837246876

In [20]:
predictions = tokenizer.batch_decode(predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True)
predictions = [pred.strip() for pred in predictions]
predictions

['Based on the table, the yearly trend in the total number of acres affected by the wildfires in California has been upwards. The earliest fire started in 1932, the San Andreas fire, burned a total of 220,000 acres in Sonoma County, while the newest fire started in 2017, destroyed 281,893 acres in Sonoma County. The smallest fire, the El Dorado fire, started in Kern County in December 2017, destroyed 257,314 acres in Sonoma County.',
 'The chronological order of episodes in Columbo\'s fourth season is as follows: Episode 1 is "An Exercise in Fatality," episode 2 is "Negative Reaction," episode 3 is "By Dawn\'s Early Light," episode 4 is "Troubled Waters," episode 5 is "Playback," episode 6 is "A Deadly State of Mind." The murderers and victims for these episodes are Robert Conrad, Dick Van Dyke, Patrick McGoohan, Robert Vaughn, Oskar Werner, and George Hamilton.',
 'The players who played for the Utah Jazz and attended Byu are Andy Toolson, who played for the Jazz from 1990-91 and 1995

In [18]:
validate_df['summary']

['There are not enough datas to make a sure conclusion about yearly trend of total acres affected by wildfires in California. As this table only has biggest wildfires and not all fires in state, it cannot show the full trend of every year. However, we can see that some more new fires, like Mendocino Complex in 2018 and Thomas Fire in 2017, have very big acreage. This may show that there is a going up trend in how bad wildfires are in recent years.',
 'In fourth season of Columbo, first episode is "An Exercise in Fatality" on September 15, 1974. Have murderer Robert Conrad and victim Philip Bruns. Second episode is "Negative Reaction" on October 6, 1974. Have Dick Van Dyke as murderer and Antoinette Bower and Don Gordon as victims. Other episodes in order are: "By Dawn\'s Early Light" with Patrick McGoohan as murderer and Tom Simcox as victim, "Troubled Waters" have Robert Vaughn as murderer and Poupée Bocar as victim, "Playback" with Oskar Werner as murderer and Martha Scott as victim,

In [9]:
valid[47][1]['example_id']

'41896ff2-9dbc-48a1-9992-7de0489a517c'