In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/y.khan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
import torch
torch.cuda.empty_cache()

In [6]:
from sklearn.model_selection import KFold
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
import datasets
import pandas as pd
import os
import logging
import nltk
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from random import sample


train_df = datasets.load_from_disk("/home/y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_train")
test_df = datasets.load_from_disk("/home/y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_test")
validate_df = datasets.load_from_disk("/home/y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")

In [7]:
model_path = "google-t5/t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [9]:
from typing import List, Dict

def tokenization_with_answer(examples):
    inputs = []
    targets = []
    
    task_prefix = "Summarize the table based on the query: "

    for i, (query, table, answer, coordinates, summary) in enumerate(zip(examples['query'], examples['table'], examples['answers'], examples['coordinates'], examples['summary'])):
        flattened_table = flatten_table(table, i)
        input_text = f"{task_prefix} Table {flattened_table}. Query: {query}"

        inputs.append(input_text)
        targets.append(summary)
        
    model_inputs = tokenizer(inputs, max_length=512, truncation=True,padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=256, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"] 

    res = tokenizer(inputs, text_target=targets, truncation=True, padding=True)
    return model_inputs

def flatten_table(table: Dict, row_index: int) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    title = table.get('title', [])

    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## "+row_text)

    flattened_table = f"Title: {' '.join(map(str, title))}" + " " + " ".join(flattened_rows)
    return flattened_table

tokenized_dataset_train = train_df.map(tokenization_with_answer, batched=True)
tokenized_dataset_test = test_df.map(tokenization_with_answer, batched=True)

processed_data_train = tokenized_dataset_train.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])
processed_data_test = tokenized_dataset_test.remove_columns(['table','summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])

Map: 100%|██████████| 2000/2000 [00:04<00:00, 462.73 examples/s]
Map: 100%|██████████| 500/500 [00:01<00:00, 341.71 examples/s]


In [11]:
def k_fold_split(dataset, num_folds=5):
    fold_size = len(dataset) // num_folds
    folds = []
    for i in range(num_folds):
        start = i * fold_size
        end = start + fold_size if i < num_folds - 1 else len(dataset)
        folds.append(dataset.select(range(start, end)))
    return folds

In [12]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_predictions, decoded_labels = postprocess_text(decoded_predictions, decoded_labels)

    rouge = evaluate.load('rouge')

    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return rouge_results

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model= model)

train_args = Seq2SeqTrainingArguments(
    output_dir="./train_weights",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    evaluation_strategy="epoch",
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    overwrite_output_dir= True,
    fp16=True
)

trainer = Seq2SeqTrainer(
    model,
    train_args,
    train_dataset=processed_data_train,
    eval_dataset=processed_data_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metric_fn
)

2024-03-27 17:03:33.483390: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
folds = k_fold_split(train_df, num_folds=10)

for i in range(len(folds)):
    val_fold = folds[i]
    train_folds = [folds[j] for j in range(len(folds)) if j != i]
    train_dataset = concatenate_datasets(train_folds)

    tokenized_train = train_dataset.map(tokenization_with_answer, batched=True)
    tokenized_val = val_fold.map(tokenization_with_answer, batched=True)

    # Remove unnecessary columns
    processed_train = tokenized_train.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])
    processed_val = tokenized_val.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query', 'answers', 'coordinates'])

    # Update your trainer's train_dataset and eval_dataset
    trainer.train_dataset = processed_train
    trainer.eval_dataset = processed_val

    # Train your model
    trainer.train()
    trainer.evaluate()

Map: 100%|██████████| 1800/1800 [00:04<00:00, 366.96 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 521.78 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,3.470807,0.090623,0.037867,0.08219,0.083153
2,No log,2.80721,0.167339,0.069255,0.14193,0.149478
3,No log,2.672187,0.211538,0.093243,0.176523,0.186869
4,No log,2.594445,0.23508,0.104441,0.195455,0.208912
5,No log,2.543628,0.236238,0.106482,0.195903,0.210327
6,No log,2.515201,0.232623,0.104587,0.194746,0.20705
7,No log,2.486205,0.237209,0.104608,0.198788,0.212192
8,No log,2.466686,0.239346,0.104905,0.199139,0.212536
9,No log,2.448865,0.243551,0.109226,0.20306,0.217794
10,No log,2.438308,0.242191,0.108038,0.201109,0.21505


Map: 100%|██████████| 1800/1800 [00:04<00:00, 438.06 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 445.40 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,2.405347,0.251307,0.124543,0.211957,0.228256
2,No log,2.395933,0.255187,0.128773,0.214458,0.230545
3,No log,2.387444,0.252399,0.128286,0.213388,0.226792
4,No log,2.38167,0.25354,0.126783,0.215289,0.229166


In [None]:
model.save_pretrained("T5-decomposed")
tokenizer.save_pretrained("T5-decomposed")

In [None]:
validate_df = validate_df.map(tokenization_with_answer, batched=True)
predict_results = trainer.predict(validate_df, max_length=256, num_beams=3)