In [1]:
import warnings
import torch 
# Silence all Python warnings for the rest of the session (keeps cell output compact,
# but also hides deprecation notices from the libraries imported later).
warnings.filterwarnings('ignore')
# Release memory held in PyTorch's CUDA caching allocator, e.g. left over from an
# earlier run in the same kernel.
torch.cuda.empty_cache()
# Last expression of the cell: displays how many CUDA devices PyTorch can see.
torch.cuda.device_count()

3

In [2]:
import os
import torch
from datasets import load_dataset, load_from_disk
from typing import List, Dict
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    LlamaForCausalLM,
    LlamaTokenizer,
)
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
from typing import List, Dict

# Root folder that holds the three decomposed splits saved with `Dataset.save_to_disk`.
# Set the QFTS_DATA_DIR environment variable to run the notebook on another machine;
# the fallback is the original location, so existing setups keep working unchanged.
DATA_DIR = os.environ.get(
    "QFTS_DATA_DIR",
    "/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed",
)

# NOTE: despite the `_df` suffix these are objects returned by `datasets.load_from_disk`,
# not pandas DataFrames.
train_df = load_from_disk(os.path.join(DATA_DIR, "decomposed_train"))
test_df = load_from_disk(os.path.join(DATA_DIR, "decomposed_test"))
validate_df = load_from_disk(os.path.join(DATA_DIR, "decomposed_validate"))

In [3]:
def flatten_table(table: Dict) -> str:
    """Serialise a table dict into a single line of text.

    Every row is rendered as ``## Row <i>, <col>:<val>,<col>:<val>...`` where header
    names and cell values are paired positionally (``zip`` silently drops whichever
    side is longer). The rendered rows are joined with a single space.

    Args:
        table: Mapping that may contain ``'header'`` (column names) and ``'rows'``
            (a sequence of rows). A missing key is treated as an empty list.

    Returns:
        The flattened table, or an empty string when there are no rows.
    """
    columns = table.get('header', [])

    def render(index, cells) -> str:
        # "col:val" pairs are comma-separated with no space, matching the prompt format.
        pairs = ",".join(f"{name}:{value}" for name, value in zip(columns, cells))
        return f"## Row {index}, {pairs}"

    return " ".join(
        render(index, cells) for index, cells in enumerate(table.get('rows', []))
    )

def generate_validate_prompt(examples):
    """Build the inference prompt for a single dataset example.

    The prompt wraps the system instruction, table title, flattened table and task
    description in ``[INST] ... [/INST]``, then appends the user query and an
    ``Assistant: "Summary": `` cue so the model's continuation is the summary.

    Args:
        examples: Mapping with a ``'table'`` entry (a dict holding ``'title'`` plus
            whatever ``flatten_table`` reads) and a ``'query'`` string. A gold
            ``'summary'`` is not required: it is never part of an inference prompt,
            so examples without one are accepted.

    Returns:
        The fully formatted prompt string.

    Raises:
        KeyError: If ``'table'``, ``'query'`` or the table's ``'title'`` is missing.
    """
    table = examples['table']
    query = examples['query']
    table_title = table['title']
    system_prompt = "You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused table summarization task. Write a summary that appropriately response to the user query."

    task = "Using the information from the table, generate a paragraph-long summary to response to the following user query:"

    flattened_table = flatten_table(table)
    # NOTE(review): the prompt starts with a literal "<s>"; if the tokenizer also
    # prepends its BOS token the model sees it twice. Confirm before changing, since
    # any edit to this string changes the generated summaries.
    prompt = f'<s> [INST] {system_prompt}\nTable Title: {table_title}\n{flattened_table}\n{task} [/INST]\nUser: {query}\nAssistant: "Summary": '
    return prompt

In [4]:
# Sanity check: build the prompt for the validation example at index 1 and print it
# so the exact text sent to the model can be inspected.
prompt = generate_validate_prompt(validate_df[1])
print(prompt)

<s> [INST] You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused table summarization task. Write a summary that appropriately response to the user query.
Table Title: Swiss Locomotive And Machine Works
## Row 0, Built:1895,Number:1,Type:Mountain Railway Rack Steam Locomotive,Slm Number:923,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway ## Row 1, Built:1895,Number:2,Type:Mountain Railway Rack Steam Locomotive,Slm Number:924,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway ## Row 2, Built:1895,Number:3,Type:Mountain Railway Rack Steam Locomotive,Slm Number:925,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway ## Row 3, Built:1896,Number:4,Type:Mountain Railway Rack Steam Locomotive,Slm Number:988,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway ## Row 4, Built:1896,Number:5,Type:Mountain Railway Rack Steam Locomotive,Slm Number:989,Wheel Arrangement:0 - 4 - 2 T,Location:Snowd

In [5]:
model_dir = "mistralai/Mixtral-8x22B-Instruct-v0.1"
cache_dir='mixtral-cache'

# Never commit access tokens to a notebook: the token that used to be hard-coded here
# must be treated as leaked and revoked. Read it from the environment instead; when
# HF_TOKEN is unset, `token=None` lets the Hugging Face libraries fall back to the
# credentials stored by `huggingface-cli login`.
hf_token = os.environ.get("HF_TOKEN")

# 4-bit NF4 quantisation with double quantisation; matmuls are computed in bfloat16.
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# device_map="auto" lets accelerate place the checkpoint shards on the available devices.
model = AutoModelForCausalLM.from_pretrained(model_dir,
                                        token=hf_token,
                                        quantization_config=nf4_config,
                                        device_map="auto",
                                        cache_dir=cache_dir
                                        )
# NOTE(review): trust_remote_code=True permits executing code shipped with the repo;
# it is kept for backward compatibility but is probably unnecessary here - confirm.
tokenizer = AutoTokenizer.from_pretrained(model_dir,
                                           token=hf_token,
                                           trust_remote_code=True,
                                           cache_dir=cache_dir
                                          )

Loading checkpoint shards: 100%|██████████| 59/59 [10:57<00:00, 11.15s/it]
generation_config.json: 100%|██████████| 116/116 [00:00<00:00, 5.89kB/s]
tokenizer_config.json: 100%|██████████| 2.30k/2.30k [00:00<00:00, 176kB/s]
tokenizer.json: 100%|██████████| 1.82M/1.82M [00:00<00:00, 16.6MB/s]
special_tokens_map.json: 100%|██████████| 117/117 [00:00<00:00, 607kB/s]


In [8]:
# Text-generation pipeline used to produce one summary per prompt.
generate_text = pipeline(
    task="text-generation",
    model=model, tokenizer=tokenizer,
    return_full_text=False,  # return only the generated continuation, not the prompt
    # Sampling is enabled, but a near-zero temperature concentrates almost all of the
    # probability mass on the top token, so decoding is close to greedy. Temperature
    # must be strictly positive when sampling and is not capped at 1.0.
    do_sample=True,
    temperature=0.001,
    top_k=20,  # sample only among the 20 most likely tokens
    max_new_tokens=400,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # if output begins repeating increase
    # The tokenizer defines no pad token, so generate() falls back to EOS and logs
    # "Setting `pad_token_id` to `eos_token_id`" on every call. Passing it explicitly
    # keeps the same behaviour without the per-example log noise.
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# Collects the generated summary strings, one per processed validation example, in
# dataset order (the generation loop appends to this list).
generated_summary = []

In [10]:
from tqdm import tqdm

# Number of validation examples to summarise, capped at the dataset size so the loop
# cannot index past the end of a smaller validation split.
NUM_EVAL_EXAMPLES = min(200, len(validate_df))

# Start from an empty list inside this cell: the loop always begins at index 0, so
# re-running the cell (for example after an interrupted run) would otherwise append
# duplicates and misalign predictions with their references.
generated_summary = []
for i in tqdm(range(NUM_EVAL_EXAMPLES)):
    prompt = generate_validate_prompt(validate_df[i])
    res = generate_text(prompt)
    # The pipeline returns a list of candidates; keep the text of the first one.
    generated_summary.append(res[0]["generated_text"])

  0%|          | 0/200 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/200 [01:30<4:59:08, 90.19s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 In the 2008 - 09 season, three players on the Connecticut Huskies Women's Basketball Team managed to score over 600 points each. The top scorer was Maya Moore with an impressive total of 754 points. Following closely behind were Renee Montgomery and Tina Charles who both contributed significantly to the team's success by scoring 644 and 642 points respectively."


  1%|          | 2/200 [01:58<2:57:08, 53.68s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 The Swiss Locomotive and Machine Works constructed a mountain railway rack steam locomotive in 1896 with SLM number 988. This particular locomotive was assigned the number 4 and shares its type, wheel arrangement (0 - 4 - 2 T), and location (Snowdon Mountain Railway) with several other locomotives built between 1895 and 1923.


  1%|          | 2/200 [02:33<4:12:27, 76.50s/it]

 The data shows a general upward trend in the number of turbines installed per wind project in Maine between 2006 and 2017. In 2006, there were only 28 turbines installed for one project, but by 2016, several projects had more than 50 turbines each. For instance, two projects completed in 2016 had 56 and 17 turbines respectively. However, it's worth noting that the number of turbines varied significantly among different projects within the same year, indicating potential differences in scale or capacity.





In [None]:
import numpy as np
rougeL = []
bert = []
bertscore = evaluate.load("bertscore")
rougescore = evaluate.load("rouge")

# Summaries exist only for the first len(generated_summary) validation examples, so
# score against exactly those gold summaries. Passing the whole column would make
# predictions and references differ in length whenever the two sizes are not equal.
references = validate_df[:len(generated_summary)]['summary']

bert_score = bertscore.compute(predictions=generated_summary, references=references, lang = "en")
rouge_score = rougescore.compute(predictions=generated_summary, references=references)
print(rouge_score, bert_score)