In [40]:
import warnings
import torch 
warnings.filterwarnings('ignore')
torch.cuda.empty_cache()
torch.cuda.device_count()

3

In [52]:
import os
import torch
from datasets import load_dataset, load_from_disk
from typing import List, Dict
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
from typing import List, Dict

train_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_train")
test_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_test")
validate_df = load_from_disk("/home/y.khan/cai6307-y.khan/Query-Focused-Tabular-Summarization/data/decomposed/decomposed_validate")

In [53]:
test_example_ids = set(test_df['example_id'])
validate_example_ids = set(validate_df['example_id'])
common_example_ids = test_example_ids.intersection(validate_example_ids)

test_df = test_df.filter(lambda example: example['example_id'] not in common_example_ids)
test_df

Dataset({
    features: ['row_ids', 'table', 'summary', 'query', 'example_id', 'coordinates', 'answers'],
    num_rows: 300
})

In [54]:
def flatten_table(table: Dict) -> str:
    header = table.get('header', [])
    rows = table.get('rows', [])
    
    flattened_rows = []
    for i, row in enumerate(rows):
        row_text = f"Row {i}, " + ",".join([f"{col}:{val}" for col, val in zip(header, row)])
        flattened_rows.append("## " + row_text)

    flattened_table = " ".join(flattened_rows)
    return flattened_table

def generate_prompt(examples):
    table = examples['table']
    query = examples['query']
    summary = examples['summary']
    task_prefix = "Given a table and a query, generate a summary that answers the query based on the information in the table:"

    
    flattened_table = flatten_table(table)
    input_text = f"### Instruction: {task_prefix}\nTable ## {flattened_table}.\nQuery: {query}\n### Response: {summary}"
    return {'text' : input_text}
        
def process_dataset(data: Dataset):
    return (
        data.shuffle(seed=42)
        .map(generate_prompt).remove_columns(['row_ids', 'example_id', 'coordinates', 'answers']
    ))

## APPLYING PREPROCESSING ON WHOLE DATASET
train_df = process_dataset(train_df)
test_df = process_dataset(test_df)

Map: 100%|██████████| 2000/2000 [00:01<00:00, 1386.29 examples/s]
Map: 100%|██████████| 300/300 [00:01<00:00, 275.67 examples/s]


In [58]:
def generate_validate_prompt(examples):
    table = examples['table']
    query = examples['query']
    summary = examples['summary']
    table_title = table['title']
    system_prompt = "You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query."
    
    task = "Using the information from the table, generate a paragraph-long summary to response to the following user query:"

    
    flattened_table = flatten_table(table)
    input_text = f"Table Title: {table_title}\n{flattened_table}\n{task}\nQuery: {query}\n\nSummary:\n"
    prompt = f"""<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>
{input_text} [/INST]"""
    return prompt
        
def process_validate_dataset(data: Dataset):
    return (
        data.shuffle(seed=42)
        .map(generate_validate_prompt).remove_columns(['row_ids', 'example_id', 'coordinates', 'answers']
    ))

pr = generate_validate_prompt(validate_df[1])

In [59]:
print(pr)

<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query.
<</SYS>>
Table Title: Swiss Locomotive And Machine Works
## Row 0, Built:1895,Number:1,Type:Mountain Railway Rack Steam Locomotive,Slm Number:923,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway ## Row 1, Built:1895,Number:2,Type:Mountain Railway Rack Steam Locomotive,Slm Number:924,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway ## Row 2, Built:1895,Number:3,Type:Mountain Railway Rack Steam Locomotive,Slm Number:925,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway ## Row 3, Built:1896,Number:4,Type:Mountain Railway Rack Steam Locomotive,Slm Number:988,Wheel Arrangement:0 - 4 - 2 T,Location:Snowdon Mountain Railway ## Row 4, Built:1896,Number:5,Type:Mountain Railway Rack Steam Locomotive,Slm Number:989,Wheel Arrangement:0 - 4 - 2 T,Loca

In [51]:
def process_table_data(table):
    header_str = " | ".join(table["header"])
    rows_str = "\n".join([" | ".join(row) for row in table["rows"]])
    return f"{header_str}\n{rows_str}\n"

print(process_table_data(validate_df[1]['table']))

Built | Number | Type | Slm Number | Wheel Arrangement | Location
1895 | 1 | Mountain Railway Rack Steam Locomotive | 923 | 0 - 4 - 2 T | Snowdon Mountain Railway
1895 | 2 | Mountain Railway Rack Steam Locomotive | 924 | 0 - 4 - 2 T | Snowdon Mountain Railway
1895 | 3 | Mountain Railway Rack Steam Locomotive | 925 | 0 - 4 - 2 T | Snowdon Mountain Railway
1896 | 4 | Mountain Railway Rack Steam Locomotive | 988 | 0 - 4 - 2 T | Snowdon Mountain Railway
1896 | 5 | Mountain Railway Rack Steam Locomotive | 989 | 0 - 4 - 2 T | Snowdon Mountain Railway
1922 | 6 | Mountain Railway Rack Steam Locomotive | 2838 | 0 - 4 - 2 T | Snowdon Mountain Railway
1923 | 7 | Mountain Railway Rack Steam Locomotive | 2869 | 0 - 4 - 2 T | Snowdon Mountain Railway
1923 | 8 | Mountain Railway Rack Steam Locomotive | 2870 | 0 - 4 - 2 T | Snowdon Mountain Railway



In [6]:
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = "meta-llama/Meta-Llama-3-70B"
cache_dir='./llama3-70B_cache'
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=nf4_config,
    token="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",
    device_map="auto",
    cache_dir=cache_dir
)
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, 
                                          token="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",
                                          cache_dir=cache_dir
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 30/30 [04:57<00:00,  9.91s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [62]:
from tqdm import tqdm
for i in tqdm(range(100)):
    prompt = generate_validate_prompt(validate_df[i])
    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=256, temperature=0.0001)[0], skip_special_tokens=True)
    break
output

  0%|          | 0/100 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 0/100 [04:51<?, ?it/s]


"<s>[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query.\n<</SYS>>\nTable Title: 2008 - 09 Connecticut Huskies Women 'S Basketball Team\n## Row 0, Player:Moore Maya Moore,Points:754 ## Row 1, Player:Montgomery Renee Montgomery,Points:644 ## Row 2, Player:Charles Tina Charles,Points:642 ## Row 3, Player:Greene Kalana Greene,Points:338 ## Row 4, Player:Doty Caroline Doty,Points:147 ## Row 5, Player:Hayes Tiffany Hayes,Points:327 ## Row 6, Player:Mclaren Kaili Mclaren,Points:159 ## Row 7, Player:Williams Tahirah Williams,Points:88 ## Row 8, Player:Dixon Lorin Dixon,Points:89 ## Row 9, Player:Gardler Meghan Gardler,Points:58 ## Row 10, Player:Fernandes Jacquie Fernandes,Points:23\nUsing the information from the table, generate a paragraph-long summary to response to the following user query:\nQuery: Which player(s) scored more than 600 poi

In [64]:
prompt = generate_validate_prompt(validate_df[2])
inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=256, temperature=0.0001)[0], skip_special_tokens=True)
print(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Below is an instruction that describes a query-focused summarization task. Write a summary that appropriately response to the user query.
<</SYS>>
Table Title: Wind power in Maine - Wind projects
## Row 0, Number of Turbines:28,Completed:2006 ## Row 1, Number of Turbines:3,Completed:2008 ## Row 2, Number of Turbines:38,Completed:2009 ## Row 3, Number of Turbines:3,Completed:2009 ## Row 4, Number of Turbines:1,Completed:2009 ## Row 5, Number of Turbines:44,Completed:2010 ## Row 6, Number of Turbines:17,Completed:2010 ## Row 7, Number of Turbines:2,Completed:2010 ## Row 8, Number of Turbines:10,Completed:2011 ## Row 9, Number of Turbines:40,Completed:2011 ## Row 10, Number of Turbines:22,Completed:2012 ## Row 11, Number of Turbines:19,Completed:2012 ## Row 12, Number of Turbines:48,Completed:2015 ## Row 13, Number of Turbines:12,Completed:2015 ## Row 14, Number of Turbines:13,Completed:2016 ## Row 15, Number of Turbine

In [10]:
output_summary = []

In [8]:
prompt = validate_df[7]['text']
inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=256, temperature=0.0001).to('cuda')[0], skip_special_tokens=True)
output_summary.append(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [11]:
for i in range (100):
    if (i%5 == 0):
        torch.cuda.empty_cache()
    prompt = validate_df[i]['text']
    inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
    output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=256, temperature=0.0001).to('cuda')[0], skip_special_tokens=True)
    output_summary.append(output)
    print(i)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


0


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


1
2


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


3


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


4
5


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


6


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


7


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


8
9


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


10


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


11
12


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


13


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


14
15


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


16


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


17


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


18


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


19
20


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


21


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


22
23


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


24


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


25


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


26


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


27


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


28
29


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


30


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


31


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


32


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


33
34


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


35


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


36
37


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


38


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


39


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


40


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


41
42


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


43


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


44


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


45


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


46


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


47


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


48


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


49


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


50


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


51


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


52
53


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


54
55


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


56


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


57


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


58
59


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


60


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


61


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


62


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


63


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


64


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


65


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


66
67


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


68


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


69


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


70


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


71


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


72


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


73
74


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


75


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


76
77


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


78


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


79
80


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


81


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


82


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


83


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


84


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


85


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


86


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


87


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


88


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


89


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


90


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


91
92


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


93
94


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


95
96


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


97
98


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


99


In [33]:
final_summary = []
for ind in range(100):
    lst = output_summary[ind].split("\n")
    final_summary.append(lst[3])
    
final = [s.split(': ', 1)[1] for s in final_summary]

print(final[8])

 The size of the electoral population does not seem to have any connection with the number of ethnic representatives in the different constituencies. The number of ethnic representatives is the same in all constituencies, regardless of the size of the electoral population.


In [37]:
validate = validate_df.select(range(100))
len(validate)

100

In [38]:
import numpy as np
rougeL = []
bert = []
bertscore = evaluate.load("bertscore")
rougescore = evaluate.load("rouge")

bert_score = bertscore.compute(predictions=final, references=validate['summary'], lang = "en")
rouge_score = rougescore.compute(predictions=final, references=validate['summary'])
rouge_score

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'rouge1': 0.347787620244061,
 'rouge2': 0.17487224138507423,
 'rougeL': 0.27863835702164574,
 'rougeLsum': 0.2771994004199221}

In [None]:
for i in range (100, 200):
    prompt = validate_df[i]['text']
    inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
    output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=256, temperature=0.0001).to('cuda')[0], skip_special_tokens=True)
    output_summary.append(output)
    print(i)

In [68]:
lora_r = 16
lora_alpha = 64
lora_dropout = 0.05
lora_target_modules = ["q_proj", "lm_head", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj","v_proj", "ln", "fc"]


peft_params = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

In [69]:
def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_predictions, decoded_labels = postprocess_text(decoded_predictions, decoded_labels)

    rouge = evaluate.load('rouge')

    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return rouge_results

response_template = "### Response: "
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

training_params = Seq2SeqTrainingArguments(
    output_dir="./train_weights_70b",
    save_strategy = "no",
    num_train_epochs=20,
    per_device_train_batch_size=6,
     per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    save_steps=-1,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.003,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.05,
    group_by_length=True,
    predict_with_generate=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_df,
    eval_dataset=test_df,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_params,
    data_collator=collator,
    packing=False,
    compute_metrics=metric_fn,
)

In [70]:
validate_df_size = len(validate_df)
step_size = 3
num_batches = validate_df_size // step_size

valid = []

for i in range(num_batches):
    start_index = i * step_size
    end_index = (i + 1) * step_size
    valid.append(validate_df.select(range(start_index, end_index)))

# If there are remaining data points that don't fit into full batches of size 3
if validate_df_size % step_size != 0:
    remaining_data = validate_df_size % step_size
    valid.append(validate_df.select(range(validate_df_size - remaining_data, validate_df_size)))

In [71]:
valid[1]

Dataset({
    features: ['table', 'summary', 'query', 'text'],
    num_rows: 3
})