# Initialize Packages and Load Dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')
import torch
torch.cuda.empty_cache()

In [2]:
from sklearn.model_selection import KFold
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from bert_score import score as bert_score
import datasets
import pandas as pd
import os
import logging
import nltk
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
from random import sample
import random

2024-04-21 23:34:24.127208: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from datasets import load_from_disk

dataset = load_from_disk('data/decomposed/decomposed_test')

# Check Test Dataset + Add Tokenizer Function

In [4]:
from typing import List, Dict

def tokenization_with_answer(examples, tokenizer):
    inputs = []
    
    task_prefix = "Given a query and a table, generate a summary that answers the query based on the information in the table: "

    for i, (query, table, summary) in enumerate(zip(examples['query'], examples['table'], examples['summary'])):
        flattened_table = flatten_table(table, i)
        input_text = f"{task_prefix} Table {flattened_table}. Query: {query}"

        inputs.append(input_text)
        
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True,padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=512, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"] 

    return model_inputs


def flatten_table(table: Dict, row_index: int) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    title = table.get('title', [])

    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## "+row_text)

    flattened_table = f"Title: {' '.join(map(str, title))}" + " " + " ".join(flattened_rows)
    return flattened_table

In [5]:
def generate_predictions(examples, tokenizer, model):
    generated_texts = []
    for example in examples:
        
        # Intial tokenization
        input_text = f"query:  {example['query']} answer: {example['answers']} header: {' '.join(map(str, example['table'].get('header', [])))} rows: {' '.join(map(str, example['table'].get('rows', [])))} title: {' '.join(map(str, example['table'].get('title', [])))}"
        input_ids = tokenizer.encode(input_text, return_tensors="pt")
        
        # Generate text and decode
        output_sequences = model.generate(input_ids)
        generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
        
        # Add to list of generated text
        generated_texts.append(generated_text)
    
    return generated_texts

In [6]:
dataset

# Reduce it for testing
# random_indices = random.sample(range(len(dataset)), 20)
# dataset = dataset.select(random_indices)
# print(dataset)

Dataset({
    features: ['row_ids', 'table', 'summary', 'query', 'example_id', 'coordinates', 'answers'],
    num_rows: 500
})

## Trainer Creation Function

In [7]:
def createTrainer(model, tokenzier):
    data_collator = DataCollatorForSeq2Seq(tokenzier, model)
    
    # Not needed, but trainer requires it even if not used
    train_args = Seq2SeqTrainingArguments(
        output_dir="./train_weights_t5",
        learning_rate=3e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        num_train_epochs=1,
        evaluation_strategy="steps",
        save_strategy = "steps",
        eval_steps=200,
        save_steps=200,
        weight_decay=0.01,
        save_total_limit=5,
        warmup_ratio=0.05,
        load_best_model_at_end=True,
        predict_with_generate=True,
        overwrite_output_dir= True,
        gradient_accumulation_steps = 2
    )

    return Seq2SeqTrainer(
        model=model,
        args=train_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

# Load in models

In [8]:
from transformers import GPT2Tokenizer, GPT2Model
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import LlamaForCausalLM, LlamaTokenizer

# # gpt2
# tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("gpt2")
# model_gpt2 = GPT2Model.from_pretrained("gpt2")

# t5 small
tokenizer_t5 = T5Tokenizer.from_pretrained("models/T5/T5-decomposed")
model_t5 = T5ForConditionalGeneration.from_pretrained("models/T5/T5-decomposed")

# flan t5
tokenizer_flant5 = T5Tokenizer.from_pretrained("Flan-T5")
model_flant5 = T5ForConditionalGeneration.from_pretrained("Flan-T5")

# Bart
tokenizer_bart = BartTokenizer.from_pretrained("BART-Decomposed")
model_bart = BartForConditionalGeneration.from_pretrained("BART-Decomposed")


# LLaMA
# tokenizer_llama = AutoTokenizer.from_pretrained("Llama")
# model_llama = AutoModelForMaskedLM.from_pretrained("Llama")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# base_model = "Llama"
# llama = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     token="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",
#     device_map="auto",
# )
# llama.config.use_cache = False
# llama.config.pretraining_tp = 1
# tokenizer_llama = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, 
#                                           token="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",
#                                          )
# tokenizer_llama.pad_token = tokenizer_llama.eos_token
# tokenizer_llama.padding_side = "right"

In [10]:
models_and_tokenizers_with_names = [
   #  ("GPT2", tokenizer_gpt2, model_gpt2),
    ("T5", tokenizer_t5, model_t5),
    ("FLAN-T5", tokenizer_flant5, model_flant5),
    ("BART Base", tokenizer_bart, model_bart),
    # ("LLaMA", tokenizer_llama, model_llama)
]

# Make predictions using each Model on Test Data

In [None]:
model_predictions = {}

for name, tokenizer, model in models_and_tokenizers_with_names:
    print(f"Model: {name}")
    predictions = generate_predictions(dataset, tokenizer, model)
    model_predictions[name] = predictions

Model: T5
Model: FLAN-T5


Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors


Model: BART Base


### Choosing Best Answer

In [None]:
from rouge_score import rouge_scorer
from bert_score import score
import numpy as np

def select_best_guess(models_and_tokenizers_with_names, dataset, model_predictions, weights=(0.5, 0.5)):
    weight_for_rouge, weight_for_bert = weights
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    
    best_guesses = []

    for i, example in enumerate(dataset):
        best_score = -np.inf
        best_guess_info = {}
        target_answer = example['summary']
        
        for name, _, _ in models_and_tokenizers_with_names:
            prediction = model_predictions[name][i]
            rouge_scores = scorer.score(target_answer, prediction)
            rouge_score_avg = np.mean([rouge_scores['rouge1'].fmeasure, rouge_scores['rougeL'].fmeasure])

            _, _, bert_scores = score([prediction], [target_answer], lang="en", verbose=False)
            bert_score = bert_scores.mean().item()

            # Calculate combined score based on specified weights
            combined_score = (weight_for_rouge * rouge_score_avg) + (weight_for_bert * bert_score)

            if combined_score > best_score:
                best_score = combined_score
                best_guess_info = {
                    'model': name,
                    'best_guess': prediction,
                    'answer': target_answer,
                    'rouge': rouge_score_avg
                }

        best_guesses.append(best_guess_info)
    
    return best_guesses

In [None]:
from transformers import logging
logging.set_verbosity_error()

best_guesses = select_best_guess(models_and_tokenizers_with_names, dataset, model_predictions, (1, 0))

In [None]:
final_rogue = np.mean([guess['rouge'] for guess in best_guesses])

In [None]:
print(final_rogue)

In [None]:
print(best_guesses)