In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch
torch.cuda.empty_cache()

In [21]:
from sklearn.model_selection import KFold
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
import datasets
import pandas as pd
import os
import logging
import nltk
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
from random import sample


train_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_train")
test_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_test")
validate_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")

In [23]:
model_path = "google-t5/t5-large"
model_path = "/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/models/saved_model/T5-decomposed"
tokenizer = T5Tokenizer.from_pretrained(model_path)

model = T5ForConditionalGeneration.from_pretrained(model_path, max_length=1024)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
from typing import List, Dict

def tokenization_with_answer(examples):
    inputs = []
    targets = []
    
    task_prefix = "Given a query and a table, generate a summary that answers the query based on the information in the table: "

    for i, (query, table, answer, coordinates, summary) in enumerate(zip(examples['query'], examples['table'], examples['answers'], examples['coordinates'], examples['summary'])):
        flattened_table = flatten_table(table, i)
        input_text = f"{task_prefix} Table {flattened_table}. Query: {query}"

        inputs.append(input_text)
        targets.append(summary)
        
    model_inputs = tokenizer(inputs, truncation=True,padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], truncation=True)
    
    model_inputs["labels"] = labels["input_ids"] 

    return model_inputs

def flatten_table(table: Dict, row_index: int) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    title = table.get('title', [])

    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## "+row_text)

    flattened_table = f"Title: {' '.join(map(str, title))}" + " " + " ".join(flattened_rows)
    return flattened_table

tokenized_dataset_train = train_df.map(tokenization_with_answer, batched=True)
tokenized_dataset_test = test_df.map(tokenization_with_answer, batched=True)

processed_data_train = tokenized_dataset_train.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])
processed_data_test = tokenized_dataset_test.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 2000/2000 [00:02<00:00, 687.81 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 587.14 examples/s]


In [16]:
def k_fold_split(dataset, num_folds=5):
    fold_size = len(dataset) // num_folds
    folds = []
    for i in range(num_folds):
        start = i * fold_size
        end = start + fold_size if i < num_folds - 1 else len(dataset)
        folds.append(dataset.select(range(start, end)))
    return folds

In [17]:
def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_predictions, decoded_labels = postprocess_text(decoded_predictions, decoded_labels)

    rouge = evaluate.load('rouge')

    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return rouge_results

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model= model)

train_args = Seq2SeqTrainingArguments(
    output_dir="./train_weights_t5_decomposed",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    evaluation_strategy="steps",
    save_strategy = "steps",
    eval_steps=200,
    save_steps=200,
    weight_decay=0.01,
    save_total_limit=5,
    warmup_ratio=0.05,
    load_best_model_at_end=True,
    predict_with_generate=True,
    overwrite_output_dir= True,
    gradient_accumulation_steps = 2
)

trainer = Seq2SeqTrainer(
    model,
    train_args,
    train_dataset=processed_data_train,
    eval_dataset=processed_data_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metric_fn
)

In [None]:
folds = k_fold_split(train_df, num_folds=10)

for i in range(len(folds)):
    val_fold = folds[i]
    train_folds = [folds[j] for j in range(len(folds)) if j != i]
    train_dataset = concatenate_datasets(train_folds)

    tokenized_train = train_dataset.map(tokenization_with_answer, batched=True)
    tokenized_val = val_fold.map(tokenization_with_answer, batched=True)

    # Remove unnecessary columns
    processed_train = tokenized_train.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])
    processed_val = tokenized_val.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])

    # Update your trainer's train_dataset and eval_dataset
    trainer.train_dataset = processed_train
    trainer.eval_dataset = processed_val

    # Train your model
    trainer.train()
    trainer.evaluate()

In [None]:
model.save_pretrained("T5-decomposed")
tokenizer.save_pretrained("T5-decomposed")

In [None]:
### Predictions

In [2]:
import warnings
import torch
warnings.filterwarnings('ignore')
torch.cuda.empty_cache()

In [14]:
from sklearn.model_selection import KFold
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
import datasets
import pandas as pd
import os
import logging
import nltk
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
from random import sample


train_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_train")
test_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_test")
validate_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")

In [4]:
model_path = "/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/models/saved_model/T5-decomposed"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path, max_length=1024)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
from typing import List, Dict

def tokenization_with_answer(examples):
    inputs = []
    targets = []
    
    task_prefix = "Given a query and a table, generate a summary that answers the query based on the information in the table: "

    for i, (query, table, answer, coordinates, summary) in enumerate(zip(examples['query'], examples['table'], examples['answers'], examples['coordinates'], examples['summary'])):
        flattened_table = flatten_table(table, i)
        input_text = f"{task_prefix} Table {flattened_table}. Query: {query}"

        inputs.append(input_text)
        targets.append(summary)
        
    model_inputs = tokenizer(inputs, truncation=True,padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], truncation=True)
    
    model_inputs["labels"] = labels["input_ids"] 

    return model_inputs

def flatten_table(table: Dict, row_index: int) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    title = table.get('title', [])

    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## "+row_text)

    flattened_table = f"Title: {' '.join(map(str, title))}" + " " + " ".join(flattened_rows)
    return flattened_table

tokenized_dataset_train = train_df.map(tokenization_with_answer, batched=True)
tokenized_dataset_test = test_df.map(tokenization_with_answer, batched=True)

processed_data_train = tokenized_dataset_train.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])
processed_data_test = tokenized_dataset_test.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 2000/2000 [00:03<00:00, 549.12 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 556.19 examples/s]


In [6]:
def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_predictions, decoded_labels = postprocess_text(decoded_predictions, decoded_labels)

    rouge = evaluate.load('rouge')

    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return rouge_results

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model= model)

train_args = Seq2SeqTrainingArguments(
    output_dir="./train_weights_t5_decomposed",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    evaluation_strategy="steps",
    save_strategy = "steps",
    eval_steps=200,
    save_steps=200,
    weight_decay=0.01,
    save_total_limit=5,
    warmup_ratio=0.05,
    load_best_model_at_end=True,
    predict_with_generate=True,
    overwrite_output_dir= True,
    gradient_accumulation_steps = 2
)

trainer = Seq2SeqTrainer(
    model,
    train_args,
    train_dataset=processed_data_train,
    eval_dataset=processed_data_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metric_fn
)

In [7]:
validate_df = datasets.load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")
validate_df_size = len(validate_df)
step_size = 3
num_batches = validate_df_size // step_size

valid = []

for i in range(num_batches):
    start_index = i * step_size
    end_index = (i + 1) * step_size
    valid.append(validate_df.select(range(start_index, end_index)))

# If there are remaining data points that don't fit into full batches of size 3
if validate_df_size % step_size != 0:
    remaining_data = validate_df_size % step_size
    valid.append(validate_df.select(range(validate_df_size - remaining_data, validate_df_size)))

In [7]:
import numpy as np
rougeL = []
bert = []
bertscore = evaluate.load("bertscore")

for i in range(len(valid)):
    validate_df = valid[i].map(tokenization_with_answer, batched=True)
    predict_results = trainer.predict(validate_df, max_length = 1024)
    metrics = predict_results.metrics
    predictions = tokenizer.batch_decode(predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    predictions = [pred.strip() for pred in predictions]
    
    bert_score = bertscore.compute(predictions=predictions, references=valid[i]['summary'], lang = "en")
    rougeL.append(metrics['test_rougeLsum'])
    bert.append(np.mean(bert_score['f1']))

sum(rougeL)/len(rougeL), sum(bert)/len(bert)

Map: 100%|██████████| 3/3 [00:00<00:00, 150.19 examples/s]


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 3/3 [00:00<00:00, 23.34 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 245.53 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 241.01 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 288.09 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 152.65 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 200.31 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 243.61 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 208.03 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 193.22 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 276.67 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 239.04 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 186.00 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 206.68 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 245.91 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 263.84 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 212.82 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 202.14 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 179.31 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 170.95 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 241.02 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 247.34 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 191.84 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 213.20 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 208.43 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 203.00 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 181.43 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 240.04 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 245.42 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 210.65 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 192.87 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 191.92 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 162.68 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 184.31 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 209.84 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 141.40 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 237.00 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 231.39 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 221.25 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 159.63 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 274.78 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 179.61 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 201.02 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 246.00 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 152.17 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 211.12 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 214.21 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 156.81 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 229.41 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 110.34 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 248.58 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 200.21 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 108.70 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 251.02 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 182.64 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 222.10 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 273.33 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 205.04 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 219.86 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 273.38 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 251.40 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 161.18 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 248.65 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 226.38 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 31.77 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 166.77 examples/s]


Map: 100%|██████████| 2/2 [00:00<00:00, 157.04 examples/s]


(0.40201112321812066, 0.8960429207602543)

In [8]:
##### Example

In [8]:
validate_df = valid[16].map(tokenization_with_answer, batched=True)
predict_results = trainer.predict(validate_df, max_length = 1024)

predictions = tokenizer.batch_decode(predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True)
predictions = [pred.strip() for pred in predictions]
predictions

Map: 100%|██████████| 3/3 [00:00<00:00, 231.13 examples/s]


['The yearly trend in total number of acres affected by wildfires in California based on the start dates and the acreage of each listed fire is showing a decrease. The largest fire, Camp Fire, have affected the most, with 459,123 acres in July 2018 and 281,893 acres in December 2017. The second largest fire, Camp Fire, have affected the most, with 273,246, occurring in October 2003.',
 'The chronological sequence of episodes in Columbo\'s fourth season is as follows: Episode 1, "An Exercise in Fatality," has Robert Conrad as the murderer and Philip Bruns as the victim. The next episode, "Negative Reaction," has Dick Van Dyke as the murderer and Antoinette Bower and Don Gordon as the victims. The next episode, "By Dawn\'s Early Light," has Patrick McGoohan as the murderer and Tom Simcox as the victim. The last episode, "Troubled Waters," has Robert Vaughn as the murderer and Poupée Bocar as the victim.',
 'The players who played for the Utah Jazz and attended Byu are Andy Toolson, who p

In [9]:
validate_df['summary']

['There are not enough datas to make a sure conclusion about yearly trend of total acres affected by wildfires in California. As this table only has biggest wildfires and not all fires in state, it cannot show the full trend of every year. However, we can see that some more new fires, like Mendocino Complex in 2018 and Thomas Fire in 2017, have very big acreage. This may show that there is a going up trend in how bad wildfires are in recent years.',
 'In fourth season of Columbo, first episode is "An Exercise in Fatality" on September 15, 1974. Have murderer Robert Conrad and victim Philip Bruns. Second episode is "Negative Reaction" on October 6, 1974. Have Dick Van Dyke as murderer and Antoinette Bower and Don Gordon as victims. Other episodes in order are: "By Dawn\'s Early Light" with Patrick McGoohan as murderer and Tom Simcox as victim, "Troubled Waters" have Robert Vaughn as murderer and Poupée Bocar as victim, "Playback" with Oskar Werner as murderer and Martha Scott as victim,

In [17]:
import pandas as pd
def to_pandas(item):
  return pd.DataFrame(item['table']["rows"],columns=item['table']["header"])

to_pandas(train_df[120])

Unnamed: 0,Score,Player,Team,Opposing team,Result
0,122,Shane Watson,Australia,West Indies,Won
1,109,Aaron Finch (1/2),Australia,South Africa,Won
2,102,Hashim Amla (1/2),South Africa,Australia,Lost
3,215,Chris Gayle,West Indies,Zimbabwe,Won
4,133*,Marlon Samuels,West Indies,Zimbabwe,Won
5,159,Hashim Amla (2/2),South Africa,Ireland,Won
6,109,Faf du Plessis,South Africa,Ireland,Won
7,106,Aaron Finch (2/2),Australia,India,Won
8,126,Shikhar Dhawan,India,Australia,Lost
9,106,Virat Kohli,India,Australia,Lost


In [21]:
train_df[120]

{'row_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 'table': {'header': ['Score', 'Player', 'Team', 'Opposing team', 'Result'],
  'rows': [['122', 'Shane Watson', 'Australia', 'West Indies', 'Won'],
   ['109', 'Aaron Finch (1/2)', 'Australia', 'South Africa', 'Won'],
   ['102', 'Hashim Amla (1/2)', 'South Africa', 'Australia', 'Lost'],
   ['215', 'Chris Gayle', 'West Indies', 'Zimbabwe', 'Won'],
   ['133*', 'Marlon Samuels', 'West Indies', 'Zimbabwe', 'Won'],
   ['159', 'Hashim Amla (2/2)', 'South Africa', 'Ireland', 'Won'],
   ['109', 'Faf du Plessis', 'South Africa', 'Ireland', 'Won'],
   ['106', 'Aaron Finch (2/2)', 'Australia', 'India', 'Won'],
   ['126', 'Shikhar Dhawan', 'India', 'Australia', 'Lost'],
   ['106', 'Virat Kohli', 'India', 'Australia', 'Lost'],
   ['119', 'David Warner', 'Australia', 'New Zealand', 'Won']],
  'table_id': '24cfb068-a416-4fb3-b6a1-af1ceb705e81',
  'title': 'List of international cricket centuries at Manuka Oval - One Day International centuries'},
 'summa