In [1]:
%%capture
%pip install accelerate peft bitsandbytes trl

In [2]:
import warnings
import torch 
warnings.filterwarnings('ignore')
torch.cuda.empty_cache()
torch.cuda.device_count()

3

In [3]:
import os
import torch
from datasets import load_dataset, load_from_disk
from typing import List, Dict
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
import evaluate
from typing import List, Dict

2024-04-19 23:35:53.458122: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-19 23:36:39.546325: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/slurm/lib64:/opt/slurm/lib64:
2024-04-19 23:36:39.546362: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-04-19 23:36:43.767939: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-19 23:3

In [4]:
train_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_train")
test_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_test")
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")

In [5]:
test_example_ids = set(test_df['example_id'])
validate_example_ids = set(validate_df['example_id'])
common_example_ids = test_example_ids.intersection(validate_example_ids)

test_df = test_df.filter(lambda example: example['example_id'] not in common_example_ids)
test_df

Filter: 100%|██████████| 500/500 [00:01<00:00, 319.47 examples/s]


Dataset({
    features: ['row_ids', 'table', 'summary', 'query', 'example_id', 'coordinates', 'answers'],
    num_rows: 300
})

In [6]:
def generate_training_prompt(query: str, table: str, summary: str, system_prompt: str):
    
    return f"""{system_prompt}
    \nTable: {table.strip()}
Query: {query.strip()} \n
Summary: {summary}""".strip()

def flatten_table(table: Dict) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    title = table.get('title', [])

    flattened_rows = []

    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## " + row_text)

    flattened_table = f"Title: {' '.join(map(str, title))}" + " " + " ".join(flattened_rows)

    return flattened_table


def generate_instruction_dataset(data_point):
    
    task_prefix = "Given a table and a query, generate a summary that answers the query based on the information in the table:"

    return {
        "query": data_point["query"],
        "table": flatten_table(data_point["table"]),
        "summary": data_point["summary"],
        "text": generate_training_prompt(data_point["query"], flatten_table(data_point["table"]),  data_point['summary'], task_prefix)
    }

def process_dataset(data: Dataset):
    return (
        data.shuffle(seed=42)
        .map(generate_instruction_dataset).remove_columns(['row_ids', 'example_id', 'coordinates', 'answers']
    ))

In [7]:
## APPLYING PREPROCESSING ON WHOLE DATASET
train_df = process_dataset(train_df)
validate_df = process_dataset(validate_df)
test_df = process_dataset(test_df)

# Select 512 rows from the training split
train_data = train_df.shuffle(seed=42).select([i for i in range(2000)])


test_data = test_df.shuffle(seed=42).select([i for i in range(300)])
validation_data = validate_df

train_data,test_data,validation_data

Map: 100%|██████████| 300/300 [00:00<00:00, 2679.90 examples/s]


(Dataset({
     features: ['table', 'summary', 'query', 'text'],
     num_rows: 2000
 }),
 Dataset({
     features: ['table', 'summary', 'query', 'text'],
     num_rows: 300
 }),
 Dataset({
     features: ['table', 'summary', 'query', 'text'],
     num_rows: 200
 }))

In [8]:
print(train_data[1]['text'])

Given a table and a query, generate a summary that answers the query based on the information in the table:
    
Table: Title: H u l l   F . C ## Row 0, Competition:Super League Iii,Played:23,Drawn:0,Lost:15,Position:9Th ## Row 1, Competition:Super League Iv,Played:30,Drawn:0,Lost:25,Position:13Th ## Row 2, Competition:Super League V,Played:28,Drawn:1,Lost:15,Position:7Th ## Row 3, Competition:Super League Vi,Played:28,Drawn:2,Lost:6,Position:3Rd ## Row 4, Competition:Super League Vii,Played:28,Drawn:0,Lost:12,Position:5Th ## Row 5, Competition:Super League Viii,Played:28,Drawn:3,Lost:12,Position:7Th ## Row 6, Competition:Super League Ix,Played:28,Drawn:2,Lost:12,Position:3Rd ## Row 7, Competition:Super League X,Played:28,Drawn:2,Lost:11,Position:5Th ## Row 8, Competition:Super League Xi,Played:28,Drawn:0,Lost:8,Position:2Nd ## Row 9, Competition:Super League Xii,Played:27,Drawn:2,Lost:11,Position:5Th ## Row 10, Competition:Super League Xiii,Played:27,Drawn:1,Lost:18,Position:11Th ## R

In [9]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [None]:
cache_dir = "./falcon-cache"
base_model = "tiiuae/falcon-40b"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    #quantization_config=quant_config,
    token="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",
    device_map="auto",
    cache_dir=cache_dir
)
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, 
                                          token="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",
                                          cache_dir=cache_dir,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
index = 27

query = test_df['query'][index]
table = test_df['table'][index]
summary = test_df['summary'][index]
task_prefix = "Given a table and a query, generate a summary that answers the query based on the information in the table: "

prompt = f"""{task_prefix}
    \nTable: {table.strip()}
Query: {query.strip()} \n
Summary: """.strip()

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=100)[0], skip_special_tokens=True)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

In [None]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

lora_r = 16
lora_alpha = 64
lora_dropout = 0.05
lora_target_modules = ["q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj","v_proj", "ln", "fc", "query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]


peft_params = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_predictions, decoded_labels = postprocess_text(decoded_predictions, decoded_labels)

    rouge = evaluate.load('rouge')

    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return rouge_results

training_params = TrainingArguments(
    output_dir="./train_weights",
    save_strategy = "no",
    num_train_epochs=20,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=6,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=-1,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.003,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.05,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=900,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
    compute_metrics=metric_fn,
)

In [None]:
trainer.train()

In [None]:
new_model = "falcon"
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

In [21]:
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")
validate_df_size = len(validate_df)
step_size = 3
num_batches = validate_df_size // step_size

valid = []

for i in range(num_batches):
    start_index = i * step_size
    end_index = (i + 1) * step_size
    valid.append(validate_df.select(range(start_index, end_index)))

# If there are remaining data points that don't fit into full batches of size 3
if validate_df_size % step_size != 0:
    remaining_data = validate_df_size % step_size
    valid.append(validate_df.select(range(validate_df_size - remaining_data, validate_df_size)))

In [22]:
import numpy as np
rougeL = []
bert = []
bertscore = evaluate.load("bertscore")

for i in range(len(valid)):
    validate_df = process_dataset(valid[i])
    #validate_df = valid[i].map(tokenization_with_answer, batched=True)
    predict_results = trainer.predict(validate_df)
    metrics = predict_results.metrics
    predictions = tokenizer.batch_decode(predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    predictions = [pred.strip() for pred in predictions]
    
    bert_score = bertscore.compute(predictions=predictions, references=valid[i]['summary'], lang = "en")
    rougeL.append(metrics['test_rougeLsum'])
    bert.append(np.mean(bert_score['f1']))

sum(rougeL)/len(rougeL), sum(bert)/len(bert)

AttributeError: 'NoneType' object has no attribute 'get'

In [None]:
generated = []
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")
validate_df = process_dataset(validate_df)
for ind in range(200):
    
    query = validate_df['query'][ind]
    table = validate_df['table'][ind]
    summary = validate_df['summary'][ind]
    task_prefix = "Given a table and a query, generate a summary that answers the query based on the information in the table: "

    prompt = f"""{task_prefix}
        \nTable: {table.strip()}
    Query: {query.strip()} \n
    Summary: """.strip()

    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=100)[0], skip_special_tokens=True)

    lst = output.split("\n\n")
    generated.append(lst[1])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The atte

In [42]:
lst = output.split("\n\n")
print(lst[1])

Summary: From table information, it not seem there is strong connection between director (Nobuo Mizuta vs Jun Aizawa) and how many people watch first ten episodes of Dr. Rintarō. Both directors have episodes with high and low rating, so it be hard to say one director better than other. But, episodes directed by Nobuo Mizuta have little more viewers, with highest rating of 17.4%, but Jun Aizawa's episodes also have good ratings, starting


In [26]:
DEVICE = "cuda:0"
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")
sample = validate_df[101]
def summarize(model, text: str):
    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
    inputs_length = len(inputs["input_ids"][0])
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.0001)
    return tokenizer.decode(outputs[0][inputs_length:], skip_special_tokens=True)


table = sample['table']
summary = sample['summary']
task_prefix = "Given a table and a query, generate a summary that answers the query based on the information in the table: "

prompt = generate_training_prompt(table, summar, )
summary = summarize(model, prompt['text'])



TypeError: generate_training_prompt() missing 3 required positional arguments: 'table', 'summary', and 'system_prompt'

In [32]:
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")
validate_df = process_dataset(validate_df)
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_validate_df = validate_df.map(tokenize_function, batched=True)

In [33]:
#validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")
validate_df_size = len(validate_df)
step_size = 3
num_batches = validate_df_size // step_size

valid = []

for i in range(num_batches):
    start_index = i * step_size
    end_index = (i + 1) * step_size
    valid.append(validate_df.select(range(start_index, end_index)))

# If there are remaining data points that don't fit into full batches of size 3
if validate_df_size % step_size != 0:
    remaining_data = validate_df_size % step_size
    valid.append(validate_df.select(range(validate_df_size - remaining_data, validate_df_size)))

In [34]:
rougeL = []
for i in range(2):
    va = valid[i+3].map(tokenize_function, batched=True)
    eval_result = trainer.evaluate(va)
    decoded_summaries = tokenizer.batch_decode(eval_results.predictions, skip_special_tokens=True)


    # rougeL.append(metrics['test_rougeLsum'])

Map: 100%|██████████| 3/3 [00:00<00:00, 220.59 examples/s]


TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

In [None]:
generated_texts = predict_results.predictions
