# Testing flare

### Load Model
Load the base model, then apply the fine-tuned PEFT adapter on top of it.

In [1]:
from sklearn.metrics import accuracy_score,f1_score
from datasets import load_dataset
from peft import PeftModel
from tqdm import tqdm
import pandas as pd
import datasets
import torch
from transformers import (
    AutoModel,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    LlamaForCausalLM
)

# Hub id of the base checkpoint and local path of the fine-tuned PEFT adapter.
base_model = "meta-llama/Llama-3.1-8B-Instruct"
peft_model = "../l-8b-r8-out"

# The tokenizer pads on the left and reuses the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

# `trust_remote_code` is intentionally not passed here: it only applies to the
# Auto classes, and LlamaForCausalLM warns that it is ignored.
model = LlamaForCausalLM.from_pretrained(
    base_model,
    # load_in_8bit=True,
    # load_in_4bit=True,
    device_map={"": 0},  # place the whole model on device 0
    torch_dtype=torch.float16,
)

# Wrap the base model with the adapter weights and switch to inference mode.
model = PeftModel.from_pretrained(model, peft_model)
model = model.eval()


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
from FinNLP.finnlp.benchmarks.fpb import test_fpb
from FinNLP.finnlp.benchmarks.fiqa import test_fiqa , add_instructions
from FinNLP.finnlp.benchmarks.tfns import test_tfns
from FinNLP.finnlp.benchmarks.nwgi import test_nwgi

# Generation pads with the same token id the tokenizer uses for padding.
model.generation_config.pad_token_id = tokenizer.pad_token_id
batch_size = 8

# Run the FPB, FiQA and TFNS benchmarks in order; each prints its own metrics.
for run_benchmark in (test_fpb, test_fiqa, test_tfns):
    run_benchmark(model, tokenizer, batch_size=batch_size)

# NWGI runs last and stays a bare trailing expression; the notebook appears to
# display its returned DataFrame as the cell result.
test_nwgi(model, tokenizer, batch_size=batch_size)



Prompt example:
Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.
Input: L&T has also made a commitment to redeem the remaining shares by the end of 2011 .
Answer: 


Total len: 1212. Batchsize: 8. Total steps: 152


  0%|                                                                                                                                                                                                                                                  | 0/152 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 152/152 [00:27<00:00,  5.56it/s]


Acc: 0.8597359735973598. F1 macro: 0.8521758625664301. F1 micro: 0.8597359735973598. F1 weighted (BloombergGPT): 0.8582934906818401. 


Prompt example:
Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.
Input: This $BBBY stock options trade would have more than doubled your money https://t.co/Oa0loiRIJL via @TheStreet
Answer: 


Total len: 275. Batchsize: 8. Total steps: 35


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:05<00:00,  6.14it/s]


Acc: 0.8654545454545455. F1 macro: 0.7658162488735493. F1 micro: 0.8654545454545455. F1 weighted (BloombergGPT): 0.879037623309444. 


Prompt example:
Instruction: What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.
Input: $ALLY - Ally Financial pulls outlook https://t.co/G9Zdi1boy5
Answer: 


Total len: 2388. Batchsize: 8. Total steps: 299


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 299/299 [00:49<00:00,  6.10it/s]


Acc: 0.8965661641541038. F1 macro: 0.873412785752509. F1 micro: 0.8965661641541038. F1 weighted (BloombergGPT): 0.8974006335116431. 


Prompt example:
Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.
Input: In the latest trading session, Adobe Systems (ADBE) closed at $535.98, marking a +0.31% move from the previous day.
Answer: 


Total len: 4047. Batchsize: 8. Total steps: 506


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 506/506 [01:40<00:00,  5.01it/s]


Acc: 0.6330615270570793. F1 macro: 0.6430639552603411. F1 micro: 0.6330615270570793. F1 weighted (BloombergGPT): 0.6311732986569045. 


Unnamed: 0,input,output,instruction,context,target,out_text,new_target,new_out
0,"In the latest trading session, Adobe Systems (...",neutral,What is the sentiment of this news? Please cho...,Instruction: What is the sentiment of this new...,neutral,positive<|end_of_text|>,neutral,positive
1,Tech stocks are down today after an antitrust ...,negative,What is the sentiment of this news? Please cho...,Instruction: What is the sentiment of this new...,negative,negative<|end_of_text|>,negative,negative
2,Intel Corp is committing $20 billion to build ...,positive,What is the sentiment of this news? Please cho...,Instruction: What is the sentiment of this new...,positive,positive<|end_of_text|>,positive,positive
3,High costs and supply chain disruptions are li...,negative,What is the sentiment of this news? Please cho...,Instruction: What is the sentiment of this new...,negative,negative<|end_of_text|>,negative,negative
4,AMD still seems set to generate significant gr...,positive,What is the sentiment of this news? Please cho...,Instruction: What is the sentiment of this new...,positive,positive<|end_of_text|>,positive,positive
...,...,...,...,...,...,...,...,...
4042,"Amazon.com Inc. AMZN, +1.08% has proposed on T...",negative,What is the sentiment of this news? Please cho...,Instruction: What is the sentiment of this new...,negative,negative<|end_of_text|>,negative,negative
4043,Not everyone has thousands of dollars on hand ...,neutral,What is the sentiment of this news? Please cho...,Instruction: What is the sentiment of this new...,neutral,neutral<|end_of_text|>,neutral,neutral
4044,Amazon has delivered strong advertising growth...,positive,What is the sentiment of this news? Please cho...,Instruction: What is the sentiment of this new...,positive,positive<|end_of_text|>,positive,positive
4045,U.S. chip manufacturer SkyWater Technology Inc...,positive,What is the sentiment of this news? Please cho...,Instruction: What is the sentiment of this new...,positive,positive<|end_of_text|>,positive,positive


### Testing code

To write the evaluation code for a new benchmark, you can use one of the existing FinNLP benchmark files as a template: https://github.com/AI4Finance-Foundation/FinNLP/tree/main/finnlp/benchmarks

In [3]:
# Make generation pad with the tokenizer's pad token id.
model.generation_config.pad_token_id = tokenizer.pad_token_id

def parse_answer(text):
    """Extract the answer letter from a decoded model generation.

    Finds the first "Answer:" marker in ``text`` and takes the first
    non-whitespace character that follows it. No regex is used.

    Args:
        text: The decoded output text from the language model.

    Returns:
        The first non-whitespace character after "Answer:", upper-cased,
        or an empty string if the marker is missing or nothing follows it.
    """
    # partition() returns an empty separator when the marker is absent,
    # so no exception handling is needed for the "not found" case.
    _, marker, tail = text.partition("Answer:")
    if not marker:
        return ""

    tail = tail.strip()
    if not tail:
        # Marker present but only whitespace (or nothing) after it.
        return ""

    # Normalise case so a lowercase prediction matches an uppercase label.
    return tail[0].upper()
    
    
def test(model, tokenizer, batch_size = 14, prompt_fun = None ):
    """Evaluate ``model`` on the test split of ``TheFinAI/flare-cfa``.

    Every question is wrapped in an instruction prompt asking for one of
    {A/B/C}. The prompts are generated in batches, ``parse_answer`` extracts
    the prediction from each decoded generation, and accuracy / F1 scores
    against the dataset's ``answer`` column are printed.

    Args:
        model: Model exposing ``generate`` and ``parameters``.
        tokenizer: Tokenizer used to encode the prompts and decode outputs.
        batch_size: Number of prompts per ``generate`` call.
        prompt_fun: Currently unused; kept so the signature stays compatible
            with existing callers.

    Returns:
        The test split as a pandas DataFrame with an added ``out_text``
        column holding the parsed predictions.
    """
    dataset = load_dataset('TheFinAI/flare-cfa')
    dataset = dataset['test']
    dataset = dataset.to_pandas()

    # print example
    print(f"\n\nPrompt example:\n{dataset['text'][0]}\n\n")

    context = [
        "Instruction: What is the correct answer to this question? Please choose an answer from  {A/B/C}. \nInput: " + x + "\nAnswer:"
        for x in dataset['text'].tolist()
    ]

    # Ceiling division: `n // batch_size + 1` yields an extra, empty batch
    # whenever n is an exact multiple of batch_size.
    total_steps = (len(context) + batch_size - 1) // batch_size
    print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}")

    # Send inputs to wherever the model's weights live instead of assuming
    # the default CUDA device.
    device = next(model.parameters()).device

    out_text_list = []
    for step in tqdm(range(total_steps)):
        # Slicing past the end of the list is safe, so no min() is needed.
        tmp_context = context[step * batch_size:(step + 1) * batch_size]
        tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, padding_side="left")
        for k in tokens.keys():
            tokens[k] = tokens[k].to(device)
        # NOTE(review): max_length counts the prompt tokens as well, so long
        # questions leave little or no room for the answer. Consider
        # max_new_tokens after confirming it does not change reported scores.
        res = model.generate(**tokens, max_length=300)
        res_sentences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in res]
        out_text_list += [parse_answer(sentence) for sentence in res_sentences]
        torch.cuda.empty_cache()

    dataset["out_text"] = out_text_list

    acc = accuracy_score(dataset["answer"], dataset["out_text"])
    f1_macro = f1_score(dataset["answer"], dataset["out_text"], average = "macro")
    f1_micro = f1_score(dataset["answer"], dataset["out_text"], average = "micro")
    f1_weighted = f1_score(dataset["answer"], dataset["out_text"], average = "weighted")

    print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ")
    print(f"Acc: {acc}. ")

    return dataset

# Run the flare-cfa evaluation with the default batch size and keep the returned DataFrame.
dataset = test(model, tokenizer)



Prompt example:
Q:The nominal risk-free rate is best described as the sum of the real risk-free rate and a premium for:,CHOICES: A: maturity.,B: liquidity.,C: expected inflation.


Total len: 1032. Batchsize: 14. Total steps: 74


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:41<00:00,  1.77it/s]

Acc: 0.4903100775193798. F1 macro: 0.10296047869200162. F1 micro: 0.4903100775193798. F1 weighted (BloombergGPT): 0.4507126041643837. 
Acc: 0.4903100775193798. 



