In [1]:
import warnings
import torch 
warnings.filterwarnings('ignore')
torch.cuda.empty_cache()
torch.cuda.device_count()

3

In [2]:
import os
import torch
from datasets import load_dataset, load_from_disk
from typing import List, Dict
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    LlamaForCausalLM,
    LlamaTokenizer,
)
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
from typing import List, Dict

train_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_train")
test_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_test")
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")

In [3]:
test_example_ids = set(test_df['example_id'])
validate_example_ids = set(validate_df['example_id'])
common_example_ids = test_example_ids.intersection(validate_example_ids)

test_df = test_df.filter(lambda example: example['example_id'] not in common_example_ids)
test_df

Filter: 100%|██████████| 500/500 [00:00<00:00, 847.97 examples/s]


Dataset({
    features: ['row_ids', 'table', 'summary', 'query', 'example_id', 'coordinates', 'answers'],
    num_rows: 300
})

In [4]:
def flatten_table(table: Dict) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    
    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## " + row_text)

    flattened_table = " ".join(flattened_rows)
    return flattened_table

def generate_validate_prompt(examples):
    table = examples['table']
    query = examples['query']
    summary = examples['summary']
    table_title = table['title']
    system_prompt = "You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query."
    
    task = "Using the information from the table, generate a paragraph-long summary to response to the following user query:"

    
    flattened_table = flatten_table(table)
    input_text = f"Table Title: {table_title}\n{flattened_table}\n{task}\nQuery: {query}\n\nSummary:\n"
    prompt = f"""<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>
{input_text} [/INST]"""
    prompt = input_text
    return prompt

In [5]:
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)
cache_dir='./llama3-70B_cache'
model_dir = "meta-llama/Meta-Llama-3-70B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    quantization_config=nf4_config,
    token="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",
    device_map="auto",
    cache_dir=cache_dir
)
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True, 
                                          token="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",
                                          cache_dir=cache_dir
                                         )

Loading checkpoint shards: 100%|██████████| 30/30 [05:05<00:00, 10.20s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [7]:
output_summary = []

In [8]:
from tqdm import tqdm
for i in tqdm(range(200)):
    prompt = generate_validate_prompt(validate_df[i])
    messages = [
        {"role": "system", "content": "You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query."},
        {"role": "user", "content": prompt},
    ]

    prompt = pipeline.tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
    )

    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=400,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.0001,
        top_k=10,
        num_return_sequences=1,
    )
    output_summary.append(outputs[0]["generated_text"][len(prompt):])

  0%|          | 0/200 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 1/200 [00:21<1:11:01, 21.42s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 2/200 [00:34<53:40, 16.26s/it]  Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 3/200 [01:02<1:11:32, 21.79s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 4/200 [01:15<1:00:00, 18.37s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▎         | 5/200 [01:36<1:03:11, 19.45s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 6/200 [01:45<51:09, 15.82s/it]  Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▎         | 7/200 [01:51<40:51, 12.70s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 8/200 [02:04<40:52,

In [9]:
predicted_summary = []
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/data/validate")
from tqdm import tqdm
for i in tqdm(range(200)):
    prompt = generate_validate_prompt(validate_df[i])
    messages = [
        {"role": "system", "content": "You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query."},
        {"role": "user", "content": prompt},
    ]

    prompt = pipeline.tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
    )

    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=400,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.0001,
        top_k=10,
        num_return_sequences=1,
    )
    predicted_summary.append(outputs[0]["generated_text"][len(prompt):])

  0%|          | 0/200 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 1/200 [00:23<1:18:27, 23.66s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 2/200 [00:45<1:15:08, 22.77s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 3/200 [01:24<1:38:05, 29.88s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 4/200 [01:33<1:11:16, 21.82s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▎         | 5/200 [01:56<1:12:14, 22.23s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 6/200 [02:05<57:08, 17.67s/it]  Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▎         | 7/200 [02:20<53:52, 16.75s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 8/200 [02:34<51:16,

In [10]:
import numpy as np
rougeL = []
bert = []
bertscore = evaluate.load("bertscore")
rougescore = evaluate.load("rouge")

bert_score = bertscore.compute(predictions=predicted_summary, references=validate_df['summary'], lang = "en")
rouge_score = rougescore.compute(predictions=predicted_summary, references=validate_df['summary'])
print(rouge_score, bert_score)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'rouge1': 0.5327328822667501, 'rouge2': 0.28698656295789293, 'rougeL': 0.40900271734110927, 'rougeLsum': 0.41055376634347895} {'precision': [0.9402215480804443, 0.915837824344635, 0.876336395740509, 0.8757199048995972, 0.7956719398498535, 0.9354560971260071, 0.9085547924041748, 0.9254266619682312, 0.8772714734077454, 0.8406026363372803, 0.9012601375579834, 0.9503621459007263, 0.9015098810195923, 0.9214420318603516, 0.8783793449401855, 0.9584165215492249, 0.925640881061554, 0.884534478187561, 0.9064540266990662, 0.9348171949386597, 0.9447590112686157, 0.9173172116279602, 0.851612389087677, 0.9525593519210815, 0.8926726579666138, 0.9342412948608398, 0.9489313364028931, 0.924338698387146, 0.9350916743278503, 0.9201141595840454, 0.9047365784645081, 0.8776346445083618, 0.90773606300354, 0.8664284348487854, 0.9240831136703491, 0.8677371740341187, 0.8962600231170654, 0.9402443766593933, 0.9106146097183228, 0.9406284689903259, 0.9041059017181396, 0.8792629241943359, 0.8726783394813538, 0.9162

In [11]:
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")
rougeL = []
bert = []
bertscore = evaluate.load("bertscore")
rougescore = evaluate.load("rouge")

bert_score_decomposed = bertscore.compute(predictions=output_summary, references=validate_df['summary'], lang = "en")
rouge_score_decomposed = rougescore.compute(predictions=output_summary, references=validate_df['summary'])
print(rouge_score_decomposed, bert_score_decomposed)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'rouge1': 0.48683836592049323, 'rouge2': 0.2522190155897729, 'rougeL': 0.37481998967451835, 'rougeLsum': 0.37606168174107324} {'precision': [0.9167724251747131, 0.9270361661911011, 0.8956968188285828, 0.848545491695404, 0.82440185546875, 0.9354560971260071, 0.9142233729362488, 0.9254266619682312, 0.8670310974121094, 0.8963916301727295, 0.8541078567504883, 0.9503620862960815, 0.8854222297668457, 0.9258536100387573, 0.91858971118927, 0.9584165215492249, 0.9249820709228516, 0.854059636592865, 0.9045165777206421, 0.9401041269302368, 0.9447590112686157, 0.915420413017273, 0.851612389087677, 0.9525593519210815, 0.840628445148468, 0.9362905025482178, 0.9312019944190979, 0.9319135546684265, 0.8806910514831543, 0.9154309630393982, 0.8417859077453613, 0.8804208040237427, 0.8558652997016907, 0.872653603553772, 0.9168321490287781, 0.8645719289779663, 0.9224622845649719, 0.939936637878418, 0.8853536248207092, 0.8750008344650269, 0.9022603034973145, 0.8792628645896912, 0.8671689033508301, 0.9154121