#### This notebook fine-tunes the LLaMA 2 LLM on the FinQA dataset for a question-answering task. The model is evaluated on the test dataset and the results are reported. Please note that this code is for the experiment to select the LoRA rank parameter.

### Install Required Libraries and Packages

### Login into Hugging Face Hub

In [None]:
# Authenticate this notebook session with the Hugging Face Hub; the fine-tuned
# model and tokenizer are pushed to the Hub later in this notebook.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load the data

- Train and test data have been loaded
- FinQA has separate splits for train and test data, so each split is loaded from the Hugging Face Hub
- The data has been converted into a format that can be used to fine-tune the LLMs

In [None]:

# Show full cell contents when displaying DataFrames. `None` is the supported
# "no limit" value; the legacy `-1` triggers a deprecation warning and is
# rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
dataset = load_dataset("dreamerdeo/finqa",split="train") ### Load train data from hugging face hub

dataset_pd = dataset.to_pandas() ### convert to pandas

def table_format(x_input):
    """Render the table of a financial report as pipe-delimited text.

    Args:
        x_input: Table object exposing ``.tolist()`` that yields a sequence
            of rows, each row being an iterable of cells.

    Returns:
        One line per row, cells joined with ``|`` and rows joined with a
        newline. An empty table gives an empty string.
    """
    rows = x_input.tolist()
    rendered_rows = ["|".join(map(str, row)) for row in rows]
    return "\n".join(rendered_rows)


def conv_list_string(x):
    """Concatenate a sequence of sentences into one string.

    The items are joined with no separator, so any spacing between sentences
    has to be present in the items themselves.

    Args:
        x: Any iterable of sentences (list, tuple, NumPy array, ...). Each
            item is converted with ``str`` before joining.

    Returns:
        The concatenated string; an empty iterable gives ``""``.
    """
    # The previous version called ``x.tolist()`` into an unused local, which
    # only served to reject plain lists/tuples; iterate the input directly.
    return "".join(str(element) for element in x)

##apply the functions
dataset_pd['pre_text'] = dataset_pd['pre_text'].map(conv_list_string)
dataset_pd['table'] = dataset_pd['table'].apply(table_format)
dataset_pd['post_text'] = dataset_pd['post_text'].map(conv_list_string)
dataset_pd['gold_evidence'] = dataset_pd['gold_evidence'].map(conv_list_string)


### Instructions to be overlaid to the train dataset
instructions = [
    "Financial data and expert analysis has been provided as context, use the provided context to answer the question.",
    "Please provide the required answer to this expert-authored finance question based on the context given.",
    "A deep financial question based on a financial report has been given along with required context, could you help answer the financial question.",
    "Analysis based on financial documents has been given as context, please answer the question using the context.",
    "Financial report data and analysis has been given as context , please respond to the question using the context given."
]

## Add the instructions: one randomly chosen instruction per training row.
## The list is sized from the DataFrame rather than a hard-coded row count, so
## the column assignment below cannot fail if the split size ever changes.
ins_list = [random.choice(instructions) for _ in range(len(dataset_pd))]


dataset_pd['instructions'] = ins_list

### Finally combine all the formatted columns together to form the final prompt
ins_str = 'context has financial text followed by pipe delimited table and some more financial text is given below the pipe delimited table. Use multi-step numerical reasoning and logical reasoning wherever required to answer the given question'
# Prompt layout: instruction header, context (pre-text, table, post-text),
# question, then the gold answer after the '###Answer:' marker.
dataset_pd['final_instruction'] = (
    '###Instruction:' + dataset_pd['instructions'] + ins_str
    + '\n\n' + '####context:' + '\n'
    + dataset_pd['pre_text'] + '\n' + dataset_pd['table'] + '\n' + dataset_pd['post_text']
    + '\n\n' + '####question:' + dataset_pd['question']
    + '\n\n###Answer:' + dataset_pd['answer']
)

from datasets import Dataset
train_df = Dataset.from_pandas(dataset_pd)
dataset_dct = train_df.train_test_split(test_size=0.25)  ### Create validation dataset from train dataset

### Get the Train and Test Dataset from Hugging Face Dataset Dict object
df_train = dataset_dct["train"]
df_eval = dataset_dct["test"]

  pd.set_option('display.max_colwidth', -1)


Downloading builder script:   0%|          | 0.00/3.56k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/515 [00:00<?, ?B/s]



Downloading data: 0.00B [00:00, ?B/s]

/root/.cache/huggingface/datasets/downloads/extracted/ca045c27488b49db24426c67e0b35e06b0188746c40ffaf8d01a7b51e52924dc


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

### Fine-Tuning Framework Parameters

In [None]:
# Base model to fine-tune, identified by its Hugging Face Hub repo id
model_name = "meta-llama/Llama-2-7b-hf"



# Name under which the fine-tuned adapter/model is saved (the LoRA rank is encoded in the name)
new_model = "llama-2-7b-finqa_rank_128-vj"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 128 ### RANK PARAMETER

# Alpha parameter for LoRA scaling (set here to 2 * lora_r)
lora_alpha = 256

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (name of a torch dtype attribute)
# NOTE(review): bf16 training is enabled below (bf16 = True) while the 4-bit
# compute dtype here is float16 — confirm that this mismatch is intended.
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = True

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = 2000

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Device placement passed to `from_pretrained`
# (presumably maps the whole model onto GPU 0 — confirm against the Accelerate docs)
device_map = {"": 0}

In [None]:
# Build the bitsandbytes quantization config and load the base model with it.
# (The tokenizer is not loaded in this cell.)

# Resolve the dtype name from the parameters cell to the actual torch dtype
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings, all taken from the bitsandbytes parameters defined above
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)


# Load base model with the quantization config and the configured device map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

### Perform Fine-tuning

In [None]:
# Disable the generation KV cache while training
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters.
# `per_device_eval_batch_size` and `gradient_checkpointing` are defined in the
# parameters cell but were never forwarded, so the library defaults were
# silently used instead; pass them explicitly so the configuration takes effect.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    evaluation_strategy="epoch"
)

# Set supervised fine-tuning parameters; the trainer reads the full prompt
# (including the gold answer) from the "final_instruction" column
trainer = SFTTrainer(
    model=model,
    train_dataset=df_train,
    eval_dataset=df_eval,
    peft_config=peft_config,
    dataset_text_field="final_instruction",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



Map:   0%|          | 0/4688 [00:00<?, ? examples/s]

Map:   0%|          | 0/1563 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.7753,0.842145


In [None]:
# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-2-7b-hf"

# Fine-tuned model name
new_model = "llama-2-7b-finqa_rank_128-vj"


# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
# Attach the saved adapter, then fold its weights into the base model
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it.
# No extra '[PAD]' token is registered: adding one grows the tokenizer
# vocabulary without resizing the model embeddings, and it was overridden by
# the EOS pad token on the next line anyway. This also keeps the tokenizer
# setup identical to the training and evaluation cells.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Push the fine-tuned model to Hugging Face hub and save it

In [None]:
# Log in from the shell (IPython `!` escape) so the upload below is authorised
!huggingface-cli login

# Upload the merged model and its tokenizer to the Hub repo named by `new_model`
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
Your token has been saved in your con

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/vijayshankar245/llama-2-7b-finqa_rank_128-vj/commit/2cb96b73830badfbe0ece21941aacf921b58f423', commit_message='Upload tokenizer', commit_description='', oid='2cb96b73830badfbe0ece21941aacf921b58f423', pr_url=None, pr_revision=None, pr_num=None)

### Evaluation

- Apply the necessary formatting on test data (Same steps which were followed for train dataset)

In [None]:
# Load the FinQA test split from the Hugging Face Hub.
# Note: this rebinds `dataset` / `dataset_pd`, which held the train split earlier.
dataset = load_dataset("dreamerdeo/finqa",split="test")

# Convert to pandas so the text and table columns can be reformatted below
dataset_pd = dataset.to_pandas()

def table_format(x_input):
    """Render the table of a financial report as pipe-delimited text.

    Args:
        x_input: Table object exposing ``.tolist()`` that yields a sequence
            of rows, each row being an iterable of cells.

    Returns:
        One line per row, cells joined with ``|`` and rows joined with a
        newline. An empty table gives an empty string.
    """
    rows = x_input.tolist()
    rendered_rows = ["|".join(map(str, row)) for row in rows]
    return "\n".join(rendered_rows)


def conv_list_string(x):
    """Concatenate a sequence of sentences into one string.

    The items are joined with no separator, so any spacing between sentences
    has to be present in the items themselves.

    Args:
        x: Any iterable of sentences (list, tuple, NumPy array, ...). Each
            item is converted with ``str`` before joining.

    Returns:
        The concatenated string; an empty iterable gives ``""``.
    """
    # The previous version called ``x.tolist()`` into an unused local, which
    # only served to reject plain lists/tuples; iterate the input directly.
    return "".join(str(element) for element in x)

# Flatten the list-valued text columns and the table into plain strings
dataset_pd['pre_text'] = dataset_pd['pre_text'].map(conv_list_string)
dataset_pd['table'] = dataset_pd['table'].apply(table_format)
dataset_pd['post_text'] = dataset_pd['post_text'].map(conv_list_string)
dataset_pd['gold_evidence'] = dataset_pd['gold_evidence'].map(conv_list_string)




# Build the evaluation prompt: fixed instruction text, context (pre-text, table,
# post-text) and the question. The gold answer is deliberately left out.
# NOTE(review): the training prompt built earlier in this notebook starts with
# '###Instruction:' plus a sampled instruction sentence and places
# '\n\n###Answer:' right before the answer; neither is present here, so the
# model sees a different prompt shape at evaluation time than during
# fine-tuning. Confirm whether this difference is intended.
ins_str = 'context has financial text followed by pipe delimited table and some more financial text is given below the pipe delimited table. Use multi-step numerical reasoning and logical reasoning wherever required to answer the given question'
dataset_pd['final_instruction'] = ins_str + f'\n\n' + f'####context:' + f'\n' + dataset_pd['pre_text'] + f'\n'+ dataset_pd['table'] + f'\n' + dataset_pd['post_text'] + f'\n\n' +f'####question:' + dataset_pd['question']

# Wrap the DataFrame as a Hugging Face Dataset so it can be fed to the pipeline
from datasets import Dataset
test_df = Dataset.from_pandas(dataset_pd)

Downloading builder script:   0%|          | 0.00/3.56k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/515 [00:00<?, ?B/s]



Downloading data: 0.00B [00:00, ?B/s]

/root/.cache/huggingface/datasets/downloads/extracted/ca045c27488b49db24426c67e0b35e06b0188746c40ffaf8d01a7b51e52924dc


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

### Load the fine-tuned model for evaluation

In [None]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)
# Hub repo id of the pushed fine-tuned model. Note that `model` is now a string,
# no longer the in-memory model object from the earlier cells.
model = "vijayshankar245/llama-2-7b-finqa_rank_128-vj"
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# Run text generation pipeline with our new model.
# return_full_text=False keeps only the generated continuation; generation is
# capped at 15 new tokens.
# NOTE(review): temperature/top_p presumably only take effect when sampling is
# enabled (do_sample) — confirm against the model's generation config.
# NOTE(review): no device / device_map is passed here — confirm that the
# pipeline actually runs on the GPU.

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer,return_full_text=False,temperature=0.1,top_p =0.9,max_new_tokens = 15)



tokenizer_config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

### Perform the evaluation and extract the results

In [None]:
import time
from transformers.pipelines.pt_utils import KeyDataset

start_time = time.time()
output_list = []
j = 0  # running count of processed prompts; stays 0 if the pipeline yields nothing
# Stream the test prompts through the pipeline, collecting the generated
# continuation for each one and printing it together with its 1-based position.
for j, i in enumerate(pipe(KeyDataset(test_df, "final_instruction")), start=1):
  generated_text = i[0]['generated_text']
  print(generated_text)
  output_list.append(generated_text)
  print(j)
# Build the results frame once after the loop; it was previously rebuilt from
# the whole list on every iteration, which is quadratic in the number of prompts.
df = pd.DataFrame({'output': output_list})
print("--- %s seconds ---" % (time.time() - start_time))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

###Answer:24.4%\\n\\n
13


###Answer:yes\\n

###Yes|
14
?

###Answer:10.4%\\n\\
15


###Answer:2967.3333
16


###answer:100000000
17


###Answer:704.45\\n\\
18


###Answer:11.2%\\n\\n
19


###Answer:19.914257
20
?

###answer:15.9% increase in the
21


###Answer:11.81%\\n

22


###Answer:14.4%\\n\\n
23


###answer:18%\\n\\ncapital
24


###answer:1.67:1.00
25


###answer:1.25\\n\\n

26


###Answer:37.5%\\n"

27


###Answer:11.4% increase in the the
28
?

###Answer:10.85667
29


###Answer:10.5%\\n\\n
30


###Answer:17.2%\\n


31


###Answer:5.4%\\n\\nin
32


###Answer:1040.5656
33


###answer:6.1%\\n

##
34


###Answer:48609 million dollars


35


###answer:168.5\\n


36


###Answer:23.5%\\n\\n
37


###answer:10.5%\\n\\n
38


###answer:2.6698333
39


###Answer:4777553 shares.
40


###Answer:131.6 million dollars..
41


###Answer:148.36%\\n
42
 in millions

###Answer:160.215
43


###answer:45.5%\\n\\n
44


###answer:882.8 mi

In [None]:
# Attach the generated outputs to the test DataFrame for scoring.
# Assumes output_list holds exactly one entry per row of dataset_pd, in row
# order — verify that the evaluation loop ran to completion before this cell.
dataset_pd['prediction'] = output_list ### Add the output to test dataset for evaluation

### Perform Post Processing

In [None]:
def map_yes_no(x):
    """Map a yes/no answer to ``'1'`` / ``'0'``.

    Matching ignores case and surrounding whitespace.

    Args:
        x: Answer value; converted with ``str`` first.

    Returns:
        ``'1'`` for "yes", ``'0'`` for "no", otherwise ``str(x)`` unchanged.
    """
    x = str(x)
    normalized = x.strip().lower()
    # Compare for equality. The former `in 'yes'` / `in 'no'` was a substring
    # test, so '', 'e', 's', 'es' became '1' and 'n', 'o' became '0'.
    if normalized == 'yes':
        return '1'
    if normalized == 'no':
        return '0'
    return x
    
def remove_non_numeric(x):
    """Strip every character that is neither a digit nor a dot.

    Note that signs, separators and units are removed as well, so ``"-5.2%"``
    becomes ``"5.2"``.

    Args:
        x: Value to clean; converted with ``str`` first.

    Returns:
        The remaining digits and dots, possibly an empty string.
    """
    return re.sub(r"[^\d.]", "", str(x))


def other_formatting(x):
  """Clean up stray dots so the value has at most one decimal point.

  Runs of three dots and then of two dots are removed first (e.g. a trailing
  ".." left over from sentence punctuation). Of the remaining dots, only the
  first is kept as the decimal point.

  Args:
    x: Value to clean; converted with ``str`` first.

  Returns:
    The cleaned string containing at most one ``.``.
  """
  x = str(x)
  x = x.replace('...', '')
  x = x.replace('..', '')
  # Keep the first dot and drop all later ones. The previous version removed
  # only the second dot, so input with three or more dots (e.g. "1.2.3.4")
  # still could not be converted to float afterwards.
  integer_part, decimal_point, remainder = x.partition('.')
  return integer_part + decimal_point + remainder.replace('.', '')


def replace_blanks(x):
  """Substitute ``'0'`` for empty or whitespace-only values.

  Args:
    x: Value to check; converted with ``str`` first.

  Returns:
    ``'0'`` when the string is blank, otherwise the string unchanged
    (surrounding whitespace is preserved).
  """
  text = str(x)
  return text if text.strip() else '0'

### Apply post-processing functions

In [None]:
# Normalise the gold answers: run the clean-up steps in order, then convert to
# float and round to the nearest integer for the exact-match comparison.
answer_cleanup_steps = (map_yes_no, remove_non_numeric, other_formatting, replace_blanks)
dataset_pd['formatted_answer'] = dataset_pd['answer']
for cleanup_step in answer_cleanup_steps:
    dataset_pd['formatted_answer'] = dataset_pd['formatted_answer'].apply(cleanup_step)
dataset_pd['formatted_answer'] = dataset_pd['formatted_answer'].astype(float).round()

In [None]:
# Normalise the model predictions with the same steps used for the gold
# answers: clean up, convert to float, round to the nearest integer.
prediction_cleanup_steps = (map_yes_no, remove_non_numeric, other_formatting, replace_blanks)
dataset_pd['formatted_prediction'] = dataset_pd['prediction']
for cleanup_step in prediction_cleanup_steps:
    dataset_pd['formatted_prediction'] = dataset_pd['formatted_prediction'].apply(cleanup_step)
dataset_pd['formatted_prediction'] = dataset_pd['formatted_prediction'].astype(float).round()

## Extract the final result

In [None]:
import numpy as np
# 1 where the rounded prediction equals the rounded gold answer, else 0
is_exact_match = dataset_pd['formatted_answer'] == dataset_pd['formatted_prediction']
dataset_pd['match_ind'] = np.where(is_exact_match, 1, 0)
# The mean of the indicator column is the exact-match accuracy
dataset_pd['match_ind'].mean()

0.16041848299912817

### The exact match accuracy score for LLaMA 2 model with Rank 128 is 0.16