In [1]:
import warnings
import torch 
warnings.filterwarnings('ignore')
torch.cuda.empty_cache()
torch.cuda.device_count()

3

In [2]:
import os
import torch
from datasets import load_dataset, load_from_disk
from typing import List, Dict
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    LlamaForCausalLM,
    LlamaTokenizer,
)
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
from typing import List, Dict

train_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_train")
test_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_test")
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/data/validate")

In [3]:
test_example_ids = set(test_df['example_id'])
validate_example_ids = set(validate_df['example_id'])
common_example_ids = test_example_ids.intersection(validate_example_ids)

test_df = test_df.filter(lambda example: example['example_id'] not in common_example_ids)
test_df

Filter: 100%|██████████| 500/500 [00:03<00:00, 146.79 examples/s]


Dataset({
    features: ['row_ids', 'table', 'summary', 'query', 'example_id', 'coordinates', 'answers'],
    num_rows: 300
})

In [4]:
def flatten_table(table: Dict) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    
    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## " + row_text)

    flattened_table = " ".join(flattened_rows)
    return flattened_table

def generate_validate_prompt(examples):
    table = examples['table']
    query = examples['query']
    summary = examples['summary']
    table_title = table['title']
    system_prompt = "You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query."
    
    task = "Using the information from the table, generate a paragraph-long summary to response to the following user query:"

    
    flattened_table = flatten_table(table)
    input_text = f"Table Title: {table_title}\n{flattened_table}\n{task}\nQuery: {query}\n\nSummary:\n"
    prompt = f"""<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>
{input_text} [/INST]"""
    prompt = f"{system_prompt}\n{input_text}"
    return prompt

In [5]:
prompt = generate_validate_prompt(validate_df[1])
print(prompt)

You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query.
Table Title: Swiss Locomotive And Machine Works
## Row 0, Built:1895,Number:1,Type:Mountain Railway Rack Steam Locomotive,Slm Number:923,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway,Notes:Ladas ## Row 1, Built:1895,Number:2,Type:Mountain Railway Rack Steam Locomotive,Slm Number:924,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway,Notes:Enid ## Row 2, Built:1895,Number:3,Type:Mountain Railway Rack Steam Locomotive,Slm Number:925,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway,Notes:Wyddfa ## Row 3, Built:1896,Number:4,Type:Mountain Railway Rack Steam Locomotive,Slm Number:988,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway,Notes:Snowdon ## Row 4, Built:1896,Number:5,Type:Mountain Railway Rack Steam Locomotive,Slm Number:989,Wheel Arran

In [6]:
model_dir = "daryl149/llama-2-70b-chat-hf"
cache_dir='./llama2-70B_cache'

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

model = LlamaForCausalLM.from_pretrained(model_dir,
                                        token="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",
                                         quantization_config=nf4_config,
                                        device_map="auto",
                                        cache_dir=cache_dir
                                        )
tokenizer = LlamaTokenizer.from_pretrained(model_dir, 
                                           token="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",
                                           trust_remote_code=True, 
                                           cache_dir=cache_dir
                                          )

Loading checkpoint shards: 100%|██████████| 15/15 [06:11<00:00, 24.76s/it]


In [7]:
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [9]:
output_summary = []

In [10]:
from tqdm import tqdm
for i in tqdm(range(200)):
    prompt = generate_validate_prompt(validate_df[i])
    messages = [
        {"role": "system", "content": "You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query."},
        {"role": "user", "content": prompt},
    ]

    prompt = pipeline.tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
    )

    terminators = [
        pipeline.tokenizer.eos_token_id,
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=400,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.0001,
        top_k=10,
        num_return_sequences=1,
    )
    output_summary.append(outputs[0]["generated_text"][len(prompt):])

  0%|          | 0/200 [00:00<?, ?it/s]
No chat template is defined for this tokenizer - using the default template for the LlamaTokenizer class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

 66%|██████▌   | 132/200 [1:09:28<31:47, 28.05s/it]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
100%|██████████| 200/200 [1:46:38<00:00, 31.99s/it]


In [8]:
def flatten_table(table: Dict) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    
    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## " + row_text)

    flattened_table = " ".join(flattened_rows)
    return flattened_table

def generate_validate_prompt(examples):
    table = examples['table']
    query = examples['query']
    summary = examples['summary']
    table_title = table['title']
    system_prompt = "You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query."
    
    task = "Using the information from the table, generate a paragraph-long summary to response to the following user query:"

    
    flattened_table = flatten_table(table)
    input_text = f"Table Title: {table_title}\n{flattened_table}\n{task}\nQuery: {query}\n\nSummary:\n"
    prompt = f"""<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>
{input_text} [/INST]"""
    #prompt = f"{system_prompt}\n{input_text}"
    return prompt

In [9]:
final_summary = []

In [10]:
print(generate_validate_prompt(validate_df[10]))

<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query.
<</SYS>>
Table Title: List of colleges and universities in Maine - Open institutions
## Row 0, School:Bates College,Location(s):Lewiston,Control:Private,Type:Baccalaureate college,Enrollment (2016):1,903,Founded:1855 ## Row 1, School:Beal College,Location(s):Bangor,Control:Private (for-profit),Type:Associates college,Enrollment (2016):600,Founded:1891 ## Row 2, School:Bowdoin College,Location(s):Brunswick,Control:Private,Type:Baccalaureate college,Enrollment (2016):1,952,Founded:1794 ## Row 3, School:Central Maine Community College,Location(s):Auburn,Control:Public,Type:Associates college,Enrollment (2016):3,978,Founded:1963 ## Row 4, School:Colby College,Location(s):Waterville,Control:Private,Type:Baccalaureate college,Enrollment (2016):2,055,Founded:1813 ## Row 5, School:College 

In [11]:
from tqdm import tqdm
for i in tqdm(range(200)):
    prompt = generate_validate_prompt(validate_df[i])
    messages = [
        {"role": "system", "content": "You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query."},
        {"role": "user", "content": prompt},
    ]

    # prompt = pipeline.tokenizer.apply_chat_template(
    #         messages, 
    #         tokenize=False, 
    #         add_generation_prompt=True
    # )

    terminators = [
        pipeline.tokenizer.eos_token_id,
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=400,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.0001,
        top_k=10,
        num_return_sequences=1,
    )
    final_summary.append(outputs[0]["generated_text"][len(prompt):])

 38%|███▊      | 77/200 [36:19<1:01:18, 29.91s/it]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
100%|██████████| 200/200 [1:34:02<00:00, 28.21s/it]


In [14]:
import numpy as np
rougeL = []
bert = []
bertscore = evaluate.load("bertscore")
rougescore = evaluate.load("rouge")

bert_score = bertscore.compute(predictions=output_summary, references=validate_df['summary'], lang = "en")
rouge_score = rougescore.compute(predictions=output_summary, references=validate_df['summary'])
print(rouge_score, bert_score)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'rouge1': 0.43802416271277406, 'rouge2': 0.22315055717637833, 'rougeL': 0.3339567177992043, 'rougeLsum': 0.3351979311861989} {'precision': [0.9254542589187622, 0.903687059879303, 0.892741858959198, 0.8382596969604492, 0.8795133233070374, 0.86527419090271, 0.8924967050552368, 0.9122763872146606, 0.873224139213562, 0.8648399114608765, 0.8779414892196655, 0.8244060277938843, 0.8873380422592163, 0.890691876411438, 0.8413833975791931, 0.9511193037033081, 0.8795099854469299, 0.8976823091506958, 0.8666635751724243, 0.904883861541748, 0.8972246646881104, 0.8855209350585938, 0.8773843050003052, 0.9489731192588806, 0.8841674327850342, 0.9214867353439331, 0.8996047973632812, 0.9254379272460938, 0.9023730754852295, 0.8697454929351807, 0.8328706622123718, 0.8654690980911255, 0.8901695013046265, 0.8708077073097229, 0.8705869317054749, 0.8608335256576538, 0.9153760671615601, 0.9522333741188049, 0.8936750888824463, 0.8720929622650146, 0.9089547395706177, 0.8584100008010864, 0.8707201480865479, 0.9244

In [12]:
import numpy as np
rougeL = []
bert = []
bertscore = evaluate.load("bertscore")
rougescore = evaluate.load("rouge")
bert_score = bertscore.compute(predictions=final_summary, references=validate_df['summary'], lang = "en")
rouge_score = rougescore.compute(predictions=final_summary, references=validate_df['summary'])
print(rouge_score, bert_score)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'rouge1': 0.4694828438261428, 'rouge2': 0.24351902314368945, 'rougeL': 0.35012452007387795, 'rougeLsum': 0.35430039249642475} {'precision': [0.8953565359115601, 0.9415045380592346, 0.8889484405517578, 0.8771141171455383, 0.8301807641983032, 0.86527419090271, 0.8873572945594788, 0.8892030715942383, 0.8705712556838989, 0.8747484683990479, 0.8889113068580627, 0.9283400177955627, 0.8683232069015503, 0.8367645144462585, 0.8608748912811279, 0.8794569373130798, 0.8784754872322083, 0.9002668261528015, 0.883944034576416, 0.8790221214294434, 0.9294285774230957, 0.8510318994522095, 0.8952310681343079, 0.9245123863220215, 0.8624144792556763, 0.9232460260391235, 0.903176486492157, 0.9083459377288818, 0.8843892216682434, 0.8669808506965637, 0.862229585647583, 0.8774843215942383, 0.9186300039291382, 0.8728663921356201, 0.8645857572555542, 0.8689563274383545, 0.8672002553939819, 0.9004122018814087, 0.8707717657089233, 0.9491473436355591, 0.9213534593582153, 0.8747454285621643, 0.8796473741531372, 0.8