# Testing flare

### Load Model
Load the base model and finetuned model

In [1]:
from sklearn.metrics import accuracy_score,f1_score
from datasets import load_dataset
from peft import PeftModel
from tqdm import tqdm
import pandas as pd
import datasets
import torch
from transformers import (
    AutoModel,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    AutoModelForCausalLM
)

base_model = "meta-llama/Llama-3.1-8B-Instruct"
peft_model = "../finetuned_models/test1_202410181442"

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, padding_side='left')
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # load_in_8bit=True,
    # load_in_4bit=True,
    device_map={
    "": 0
    },
    # torch_dtype=torch.float16,
)

model = PeftModel.from_pretrained(model, peft_model)
model = model.eval()

tokenizer.pad_token = tokenizer.eos_token


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

RuntimeError: Error(s) in loading state_dict for PeftModelForCausalLM:
	size mismatch for base_model.model.model.embed_tokens.weight: copying a param with shape torch.Size([128257, 4096]) from checkpoint, the shape in current model is torch.Size([128256, 4096]).
	size mismatch for base_model.model.lm_head.weight: copying a param with shape torch.Size([128257, 4096]) from checkpoint, the shape in current model is torch.Size([128256, 4096]).

In [2]:
model.generation_config.pad_token_id = tokenizer.pad_token_id

from FinNLP.finnlp.benchmarks.fpb import test_fpb
from FinNLP.finnlp.benchmarks.fiqa import test_fiqa , add_instructions
from FinNLP.finnlp.benchmarks.tfns import test_tfns
from FinNLP.finnlp.benchmarks.nwgi import test_nwgi
batch_size = 8

test_fpb(model, tokenizer, batch_size=batch_size)
test_fiqa(model, tokenizer, batch_size=batch_size)
test_tfns(model, tokenizer, batch_size=batch_size)
test_nwgi(model, tokenizer, batch_size=batch_size)



Prompt example:
Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.
Input: L&T has also made a commitment to redeem the remaining shares by the end of 2011 .
Answer: 


Total len: 1212. Batchsize: 8. Total steps: 152


  0%|                                                                                                                                                                            | 0/152 [00:00<?, ?it/s]


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

### Testing code

You can follow one of the files here: https://github.com/AI4Finance-Foundation/FinNLP/tree/main/finnlp/benchmarks

In [3]:
model.generation_config.pad_token_id = tokenizer.pad_token_id

def parse_answer(text):
  """
  Parses the answer from the model's output text without using regex.

  Args:
    text: The output text from the language model.

  Returns:
    The first letter of the answer, or None if no answer is found.
  """
  try:
    # Find the position of "Answer:" in the text
    answer_pos = text.index("Answer:")

    # Extract the first letter after "Answer:"
    first_letter = text[answer_pos + len("Answer:"):].strip()[0]

    # Convert the letter to uppercase and return it
    return first_letter

  except ValueError:
    # "Answer:" is not found in the text
    return ""

  except IndexError:
    # No letter found after "Answer:"
    return ""
    
    
def test(model, tokenizer, batch_size = 14, prompt_fun = None ):
    dataset = load_dataset('TheFinAI/flare-cfa')
    dataset = dataset['test']
    dataset = dataset.to_pandas()
    # dataset = dataset.head(100)

    # print example
    print(f"\n\nPrompt example:\n{dataset['text'][0]}\n\n")

    context = dataset['text'].tolist()
    context = ["Instruction: What is the correct answer to this question? Please choose an answer from  {A/B/C}. \nInput: " + x + "\nAnswer:"
               for x in context]
    total_steps = dataset.shape[0] // batch_size + 1
    print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}")


    out_text_list = []
    for i in tqdm(range(total_steps)):
        tmp_context = context[i* batch_size: min((i+1)* batch_size, len(context))]
        tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, padding_side="left")
        for k in tokens.keys():
            tokens[k] = tokens[k].cuda()
        res = model.generate(**tokens, max_length=300)
        res_sentences = [tokenizer.decode(i, skip_special_tokens=True) for i in res]
        # print(res_sentences[0])
        out_text = [parse_answer(o) for o in res_sentences]
        # print(out_text)
        out_text_list += out_text
        torch.cuda.empty_cache()

    dataset["out_text"] = out_text_list
    # dataset["new_target"] = dataset["target"].apply(change_target)
    # dataset["new_out"] = dataset["out_text"].apply(change_target)
    

    acc = accuracy_score(dataset["answer"], dataset["out_text"])
    f1_macro = f1_score(dataset["answer"], dataset["out_text"], average = "macro")
    f1_micro = f1_score(dataset["answer"], dataset["out_text"], average = "micro")
    f1_weighted = f1_score(dataset["answer"], dataset["out_text"], average = "weighted")

    print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ")
    print(f"Acc: {acc}. ")

    return dataset

dataset = test(model, tokenizer)



Prompt example:
Q:The nominal risk-free rate is best described as the sum of the real risk-free rate and a premium for:,CHOICES: A: maturity.,B: liquidity.,C: expected inflation.


Total len: 1032. Batchsize: 14. Total steps: 74


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:41<00:00,  1.77it/s]

Acc: 0.4903100775193798. F1 macro: 0.10296047869200162. F1 micro: 0.4903100775193798. F1 weighted (BloombergGPT): 0.4507126041643837. 
Acc: 0.4903100775193798. 



