In [1]:
#%%capture
#%pip install accelerate peft bitsandbytes trl

In [1]:
import warnings
warnings.filterwarnings('ignore')
import torch 
torch.cuda.empty_cache()
torch.cuda.device_count()

3

In [2]:
import os
import torch
from datasets import load_dataset, load_from_disk
from typing import List, Dict
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
from typing import List, Dict

In [3]:
train_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_train")
test_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_test")
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")
test_example_ids = set(test_df['example_id'])
validate_example_ids = set(validate_df['example_id'])
common_example_ids = test_example_ids.intersection(validate_example_ids)

test_df = test_df.filter(lambda example: example['example_id'] not in common_example_ids)
test_df

Filter: 100%|██████████| 500/500 [00:01<00:00, 257.22 examples/s]


Dataset({
    features: ['row_ids', 'table', 'summary', 'query', 'example_id', 'coordinates', 'answers'],
    num_rows: 300
})

In [4]:
def generate_training_prompt(query: str, table: str, summary: str, system_prompt: str):
    
    return f"""### Instruction: {system_prompt}
    \n\n### Input: 
Table: {table.strip()}
Query: {query.strip()} \n\n
### Summary: {summary}""".strip()

def flatten_table(table: Dict) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    title = table.get('title', [])

    flattened_rows = []

    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## " + row_text)

    flattened_table = f"Title: {' '.join(map(str, title))}" + " " + " ".join(flattened_rows)

    return flattened_table


def generate_instruction_dataset(data_point):
    
    task_prefix = "Given a table and a query, generate a summary that answers the query based on the information in the table: "

    return {
        "query": data_point["query"],
        "table": flatten_table(data_point["table"]),
        "summary": data_point["summary"],
        "text": generate_training_prompt(data_point["query"], flatten_table(data_point["table"]),  data_point['summary'], task_prefix)
    }

def process_dataset(data: Dataset):
    return (
        data.shuffle(seed=42)
        .map(generate_instruction_dataset).remove_columns(['row_ids', 'example_id', 'coordinates', 'answers']
    ))

In [5]:
## APPLYING PREPROCESSING ON WHOLE DATASET
train_df = process_dataset(train_df)
validate_df = process_dataset(validate_df)
test_df = process_dataset(test_df)

# Select 1000 rows from the training split
train_data = train_df.shuffle(seed=42).select([i for i in range(1800)])


test_data = test_df.shuffle(seed=42).select([i for i in range(300)])
validation_data = validate_df

train_data,test_data,validation_data

Map: 100%|██████████| 300/300 [00:00<00:00, 677.16 examples/s]


(Dataset({
     features: ['table', 'summary', 'query', 'text'],
     num_rows: 1800
 }),
 Dataset({
     features: ['table', 'summary', 'query', 'text'],
     num_rows: 300
 }),
 Dataset({
     features: ['table', 'summary', 'query', 'text'],
     num_rows: 200
 }))

In [6]:
cache_dir = "./llama3-cache"

compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

base_model = "meta-llama/Meta-Llama-3-8B"
#base_model = "llama-2-7b-QTsumm"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    #quantization_config=quant_config,
    token="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",
    device_map="auto",
    cache_dir=cache_dir
)
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, 
                                          token="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",
                                          cache_dir=cache_dir
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 4/4 [00:32<00:00,  8.00s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
index = 21

query = test_df['query'][index]
table = test_df['table'][index]
summary = test_df['summary'][index]
task_prefix = "Given a query and a table, generate a summary that answers the query based on the information in the table: "

prompt = f"""### Instruction: {task_prefix}
    \n\n### Input: 
Table: {table.strip()}
Query: {query.strip()} \n\n
### Summary: """

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=100)[0], skip_special_tokens=True)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [8]:
lora_r = 16
lora_alpha = 64
lora_dropout = 0.05
lora_target_modules = ["q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj","v_proj", "ln", "fc"]


peft_params = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

In [9]:
def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_predictions, decoded_labels = postprocess_text(decoded_predictions, decoded_labels)

    rouge = evaluate.load('rouge')

    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return rouge_results

training_params = TrainingArguments(
    output_dir="./train_weights_8b",
    save_strategy = "no",
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=-1,
    logging_steps=50,
    learning_rate=2e-4,
    weight_decay=0.003,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.05,
    group_by_length=True,
    load_best_model_at_end=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=900,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
    compute_metrics=metric_fn,
)

Map: 100%|██████████| 300/300 [00:00<00:00, 756.67 examples/s]


In [10]:
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,0.6894
100,0.5233
150,0.4043
200,0.3073
250,0.2268
300,0.1699
350,0.1205
400,0.0954
450,0.0753
500,0.0624


TrainOutput(global_step=1120, training_loss=0.1414074673716511, metrics={'train_runtime': 37958.0762, 'train_samples_per_second': 0.948, 'train_steps_per_second': 0.03, 'total_flos': 8.894065708512707e+17, 'train_loss': 0.1414074673716511, 'epoch': 19.91})

In [11]:
new_model = "llama-3-8b"
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('llama-3-8b/tokenizer_config.json',
 'llama-3-8b/special_tokens_map.json',
 'llama-3-8b/tokenizer.json')

In [13]:
validate_df_size = len(validation_data)
step_size = 3
num_batches = validate_df_size // step_size

valid = []

for i in range(num_batches):
    start_index = i * step_size
    end_index = (i + 1) * step_size
    valid.append(validation_data.select(range(start_index, end_index)))

# If there are remaining data points that don't fit into full batches of size 3
if validate_df_size % step_size != 0:
    remaining_data = validate_df_size % step_size
    valid.append(validation_data.select(range(validate_df_size - remaining_data, validate_df_size)))

In [15]:
valid[1]

Dataset({
    features: ['table', 'summary', 'query', 'text'],
    num_rows: 3
})

In [22]:
output_summary = []

In [17]:
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")

def generate_validation_prompt(query: str, table: str, system_prompt: str):
    
    return f"""### Instruction: {system_prompt}
    \n\n### Input: 
Table: {table.strip()}
Query: {query.strip()} \n\n
### Response: """.strip()

def generate_instruction_dataset(data_point):
    
    task_prefix = "Given a table and a query, generate a summary that answers the query based on the information in the table: "

    return {
        "query": data_point["query"],
        "table": flatten_table(data_point["table"]),
        "summary": data_point["summary"],
        "text": generate_validation_prompt(data_point["query"], flatten_table(data_point["table"]), task_prefix)
    }

def process_dataset(data: Dataset):
    return (
        data.shuffle(seed=42)
        .map(generate_instruction_dataset).remove_columns(['row_ids', 'example_id', 'coordinates', 'answers']
    ))

validate_df = process_dataset(validate_df)

Map: 100%|██████████| 200/200 [00:00<00:00, 982.41 examples/s] 


In [None]:
def summarize(text: str):
    inputs = tokenizer(text, return_tensors="pt")
    inputs_length = len(inputs["input_ids"][0])
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.0001)
    return tokenizer.decode(outputs[0][inputs_length:], skip_special_tokens=True)


for i in range(200):
    summary = summarize(validate_df[i]['text'])
    output_summary.append(summary)
    print(i)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


1


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


2


In [20]:
for i in range (200):    
    prompt = validate_df[i]['text']
    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=256)[0], skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [21]:
output

'### Instruction: Given a table and a query, generate a summary that answers the query based on the information in the table: \n    \n\n### Input: \nTable: Title: L a k s h m i   N a r a s i m h a   E n t e r t a i n m e n t s   -   F i l m   P r o d u c t i o n ## Row 0, Year:2011,Film:Veedu Theda,Actors:Nikhil Siddharth,Director:Chinni Krishna ## Row 1, Year:2013,Film:Swamy Ra Ra,Actors:Nikhil Siddharth, Swathi Reddy,Director:Sudheer Varma ## Row 2, Year:2015,Film:Mosagallaku Mosagadu (2015 film),Actors:Sudheer Babu,Director:Nellore Bose ## Row 3, Year:2017,Film:C/o Surya,Actors:Sundeep Kishan,Mehreen,Director:Suseenthiran ## Row 4, Year:2017,Film:Okka Kshanam,Actors:Allu Sirish, Seerat Kapoor, Surbhi,Director:Vi Anand\nQuery: What year was the film "Veedu Theda" directed by Chinni Krishna produced and who was the actor in it? \n\n\n### Response: In 2011, Chinni Krishna direct movie "Veedu Theda" produce by Lakshmi Narasimha Entertainment. The actor in this film is Nikhil Siddharth. 

In [None]:
for index in range(200):
    query = validate_df['query'][index]
    table = validate_df['table'][index]
    summary = validate_df['summary'][index]
    task_prefix = "Given a table and a query, generate a summary that answers the query based on the information in the table: "

    prompt = f"""{task_prefix}

    ### Input:
    Table: {table.strip()}
    Query: {query.strip()}

    ### Summary:
    """.strip()

    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=512, temperature=0.0001)[0], skip_special_tokens=True)
    output_summary.append(output)
    print(index)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
