# FinGPT Test: FiQA Sentiment Analysis

This notebook demonstrates how to evaluate a FinGPT LoRA adapter (loaded on top of a Llama-2 base model) on the FiQA-2018 sentiment analysis dataset.

## 1. Install Dependencies

In [None]:
# Install the libraries used by the cells below. transformers and peft are pinned.
# NOTE(review): the remaining packages are unpinned, so a later run may resolve different versions — consider pinning them.
!pip install transformers==4.32.0 peft==0.5.0 datasets accelerate bitsandbytes sentencepiece tqdm scikit-learn pandas matplotlib seaborn

## 2. Clone the FinGPT Repository

In [None]:
# Fetch the FinGPT sources and make the checkout the working directory;
# the following cells use paths relative to it.
# NOTE(review): re-running this cell from inside the checkout clones a nested copy and cd's into it.
!git clone https://github.com/AI4Finance-Foundation/FinGPT.git
%cd FinGPT

## 3. Create Sentiment Templates File

In [3]:
!mkdir -p fingpt/FinGPT_Benchmark/benchmarks

templates = """What is the sentiment of this {type}?
Determine the sentiment of this {type}.
How would you describe the sentiment of this {type}?
Is the sentiment of this {type} positive or negative?
Analyze the sentiment of this {type}.
What's the sentiment of this {type}?"""

with open('fingpt/FinGPT_Benchmark/benchmarks/sentiment_templates.txt', 'w') as f:
    f.write(templates)

## 4. Download the FiQA Dataset

In [4]:
import datasets
from pathlib import Path

# Directory the benchmark module later reads the dataset from.
data_dir = Path('./fingpt/FinGPT_Benchmark/data')
data_dir.mkdir(parents=True, exist_ok=True)

# Fetch every split of the FiQA-2018 dataset.
print("Downloading FiQA dataset...")
dataset = datasets.load_dataset('pauri32/fiqa-2018')

# Persist all splits locally so fiqa.py can open them with load_from_disk.
save_path = str(data_dir.joinpath("fiqa-2018"))
print(f"Saving dataset to {save_path}")
dataset.save_to_disk(save_path)
print("Dataset download complete!")

Downloading FiQA dataset...


train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/961 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/102 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/150 [00:00<?, ? examples/s]

Saving dataset to fingpt/FinGPT_Benchmark/data/fiqa-2018


Saving the dataset (0/1 shards):   0%|          | 0/961 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/102 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/150 [00:00<?, ? examples/s]

Dataset download complete!


## 5. Create the FiQA Testing Module

In [5]:
%%writefile fingpt/FinGPT_Benchmark/benchmarks/fiqa.py
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset, load_from_disk, Dataset
from tqdm import tqdm
import datasets
import torch

from torch.utils.data import DataLoader
from functools import partial
from pathlib import Path

# Load the prompt templates stored next to this module, one template per line.
# Blank lines (for example a trailing newline at the end of the file) are
# skipped so they cannot become an empty prompt template, which would add a
# bogus generation round and a bogus vote in the multi-template test.
with open(Path(__file__).parent / 'sentiment_templates.txt') as f:
    templates = [line.strip() for line in f if line.strip()]

def format_example(example: dict) -> dict:
    """Build the prompt ("context") and expected answer ("target") for one example.

    The prompt always starts with the instruction and ends with "Answer: ".
    An "Input: ..." line is inserted between them only when ``example`` has a
    truthy "input" value.

    Returns:
        A dict with the keys "context" (the prompt) and "target" (the value of
        ``example["output"]``).
    """
    pieces = [f"Instruction: {example['instruction']}\n"]
    if example.get("input"):
        pieces.append(f"Input: {example['input']}\n")
    pieces.append("Answer: ")
    return {"context": "".join(pieces), "target": example["output"]}

def add_instructions(x):
    """Return the classification instruction for one dataset row.

    Rows whose ``format`` attribute equals "post" are described as a tweet;
    every other row is described as news.
    """
    if x.format != "post":
        return "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."
    return "What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}."

def make_label(x):
    """Bucket a numeric sentiment score into a text label.

    Scores below -0.1 are "negative", scores in [-0.1, 0.1) are "neutral" and
    scores of 0.1 or more are "positive". A value for which none of the
    comparisons holds (such as NaN) falls through and yields ``None``.
    """
    if x >= 0.1:
        return "positive"
    if x < -0.1:
        return "negative"
    if -0.1 <= x < 0.1:
        return "neutral"

def change_target(x):
    """Reduce free-form text to 'positive', 'negative' or 'neutral'.

    The text is searched for "positive"/"Positive" first and then for
    "negative"/"Negative"; the first match wins. Text containing neither
    spelling is treated as 'neutral'.
    """
    for label in ('positive', 'negative'):
        if label in x or label.capitalize() in x:
            return label
    return 'neutral'

def vote_output(x):
    """Combine the per-template predictions of one row into a single label.

    Reads ``x['out_text_<i>']`` for every loaded template, normalizes each
    value with ``change_target`` and counts the labels. The result is
    'positive' or 'negative' when that label has strictly more votes than the
    other one, and 'neutral' when the two counts are equal.
    """
    tally = dict.fromkeys(('positive', 'negative', 'neutral'), 0)
    for idx, _ in enumerate(templates):
        tally[change_target(x[f'out_text_{idx}'].lower())] += 1

    positives, negatives = tally['positive'], tally['negative']
    if positives == negatives:
        return 'neutral'
    return 'positive' if positives > negatives else 'negative'

def test_fiqa(args, model, tokenizer, prompt_fun=add_instructions):
    """Evaluate ``model`` on a held-out slice of FiQA-2018 with one instruction per row.

    The train/validation/test splits stored under ``../data/fiqa-2018/``
    (relative to this module) are concatenated and re-split with
    ``train_test_split(0.226, seed=42)``; only the resulting "test" part is
    evaluated. Gold labels come from ``make_label`` applied to the sentiment
    score, and predictions are the generated text after "Answer: ", normalized
    with ``change_target``.

    Args:
        args: Namespace providing ``batch_size`` and, optionally,
            ``max_length`` (passed to both the tokenizer and ``generate``;
            512 is used when the attribute is absent).
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        prompt_fun: Row-wise callable producing the instruction text. When
            ``None`` the fixed "news" instruction is used for every row.

    Returns:
        The evaluated pandas DataFrame, including the ``out_text``,
        ``new_target`` and ``new_out`` columns.

    Side effects:
        Prints a prompt example, the first generation of every batch and the
        metrics, and writes ``fiqa_results.csv`` to the current working
        directory.
    """
    batch_size = args.batch_size
    # Honour --max_length the same way test_fiqa_mlt does instead of a
    # hard-coded 512; keep 512 as the fallback for namespaces without it.
    max_length = getattr(args, "max_length", 512)

    # dataset = load_dataset('pauri32/fiqa-2018')
    dataset = load_from_disk(Path(__file__).parent.parent / 'data/fiqa-2018/')
    dataset = datasets.concatenate_datasets([dataset["train"], dataset["validation"], dataset["test"]])
    dataset = dataset.train_test_split(0.226, seed=42)['test']
    dataset = dataset.to_pandas()
    dataset["output"] = dataset.sentiment_score.apply(make_label)
    if prompt_fun is None:
        dataset["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."
    else:
        dataset["instruction"] = dataset.apply(prompt_fun, axis=1)

    dataset = dataset[['sentence', 'output', "instruction"]]
    dataset.columns = ["input", "output", "instruction"]
    dataset[["context", "target"]] = dataset.apply(format_example, axis=1, result_type="expand")

    print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n")

    context = dataset['context'].tolist()
    # Ceiling division: the previous "// batch_size + 1" reported (and looped
    # over) one extra, empty step whenever the size was an exact multiple.
    total_steps = (len(context) + batch_size - 1) // batch_size
    print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}")

    out_text_list = []

    for i in tqdm(range(total_steps)):
        tmp_context = context[i * batch_size:(i + 1) * batch_size]
        # NOTE(review): no truncation=True is passed — confirm whether
        # max_length is meant to truncate long inputs here.
        tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=max_length, return_token_type_ids=False)
        # Move the inputs to wherever the model lives rather than assuming CUDA.
        tokens = {key: value.to(model.device) for key, value in tokens.items()}

        # NOTE(review): unlike test_fiqa_mlt, do_sample=False is not passed here.
        res = model.generate(**tokens, max_length=max_length, eos_token_id=tokenizer.eos_token_id)
        res_sentences = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in res]
        tqdm.write(f'{i}: {res_sentences[0]}')
        # Keep only the text after the prompt's "Answer: " marker when present.
        out_text = [o.split("Answer: ")[1] if "Answer: " in o else o for o in res_sentences]
        out_text_list += out_text
        torch.cuda.empty_cache()

    dataset["out_text"] = out_text_list
    dataset["new_target"] = dataset["target"].apply(change_target)
    dataset["new_out"] = dataset["out_text"].apply(change_target)

    acc = accuracy_score(dataset["new_target"], dataset["new_out"])
    f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average="macro")
    f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average="micro")
    f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average="weighted")

    print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ")

    dataset.to_csv('fiqa_results.csv', index=False)
    print("Results saved to fiqa_results.csv")

    return dataset


def test_fiqa_mlt(args, model, tokenizer):
    """Evaluate ``model`` on FiQA-2018 with every prompt template plus a vote.

    The same held-out slice as in ``test_fiqa`` is built (concatenated splits,
    ``train_test_split(0.226, seed=42)``, "test" part). Rows whose gold label
    is 'neutral' are dropped. For each loaded template the rows are prompted
    with the template text followed by "\\nOptions: positive, negative", and
    the generations are normalized with ``change_target``. ``vote_output``
    then combines the per-template predictions into ``new_out``.

    Args:
        args: Namespace providing ``batch_size`` and ``max_length``.
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode generations.

    Returns:
        A pandas DataFrame with one ``out_text_<i>`` column per template and
        the voted ``new_out`` column.

    Side effects:
        Prints progress and per-template/ensemble metrics and writes
        ``fiqa_mlt_results.csv`` to the current working directory.
    """
    batch_size = args.batch_size
    # dataset = load_dataset('pauri32/fiqa-2018')
    splits = load_from_disk(Path(__file__).parent.parent / 'data/fiqa-2018/')
    merged = datasets.concatenate_datasets([splits["train"], splits["validation"], splits["test"]])
    frame = merged.train_test_split(0.226, seed=42)['test'].to_pandas()
    frame["output"] = frame.sentiment_score.apply(make_label)
    frame["text_type"] = frame.apply(lambda row: 'tweet' if row.format == "post" else 'news', axis=1)
    frame = frame[['sentence', 'output', "text_type"]]
    frame.columns = ["input", "output", "text_type"]

    # Only rows with a positive/negative gold label take part in this test.
    frame["output"] = frame["output"].apply(change_target)
    frame = frame[frame["output"] != 'neutral']

    n_templates = len(templates)
    out_texts_list = [[] for _ in range(n_templates)]

    def collate_fn(batch):
        # Tokenize the prompts of one batch into padded tensors.
        prompts = [item["context"] for item in batch]
        return tokenizer(
            prompts, return_tensors='pt',
            padding=True, max_length=args.max_length,
            return_token_type_ids=False
        )

    for t_idx, template in enumerate(templates):
        print(f"\nTesting with template {t_idx+1}/{len(templates)}: '{template}'")
        prompt_frame = frame[['input', 'output', "text_type"]].copy()
        prompt_frame["instruction"] = prompt_frame['text_type'].apply(
            lambda kind: template.format(type=kind) + "\nOptions: positive, negative"
        )
        prompt_frame[["context", "target"]] = prompt_frame.apply(format_example, axis=1, result_type="expand")

        loader = DataLoader(
            Dataset.from_pandas(prompt_frame),
            batch_size=batch_size,
            collate_fn=collate_fn,
            shuffle=False,
        )

        # Log roughly five sample generations per template.
        log_interval = max(1, len(loader) // 5)

        for batch_idx, inputs in enumerate(tqdm(loader)):
            inputs = {key: value.to(model.device) for key, value in inputs.items()}
            generated = model.generate(
                **inputs, do_sample=False, max_length=args.max_length, eos_token_id=tokenizer.eos_token_id
            )
            decoded = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated]
            if batch_idx % log_interval == 0:
                tqdm.write(f'Template {t_idx+1}, batch {batch_idx}: {decoded[0]}')
            # Keep only the text after the prompt's "Answer: " marker when present.
            out_texts_list[t_idx].extend(
                text.split("Answer: ")[1] if "Answer: " in text else text for text in decoded
            )
            torch.cuda.empty_cache()

    original_dataset = frame.copy()
    for t_idx in range(n_templates):
        column = f"out_text_{t_idx}"
        original_dataset[column] = out_texts_list[t_idx]
        original_dataset[column] = original_dataset[column].apply(change_target)

    original_dataset["new_out"] = original_dataset.apply(vote_output, axis=1, result_type="expand")
    original_dataset.to_csv('fiqa_mlt_results.csv', index=False)
    print("Results saved to fiqa_mlt_results.csv")

    scored_columns = [f"out_text_{t_idx}" for t_idx in range(n_templates)]
    scored_columns.append("new_out")
    for column in scored_columns:
        if column == "new_out":
            template_name = "Ensemble (Voting)"
        else:
            template_name = f"Template {column.split('_')[-1]}"
        gold = original_dataset["output"]
        predicted = original_dataset[column]
        acc = accuracy_score(gold, predicted)
        f1_macro = f1_score(gold, predicted, average="macro")
        f1_micro = f1_score(gold, predicted, average="micro")
        f1_weighted = f1_score(gold, predicted, average="weighted")

        print(f"{template_name}: Acc: {acc:.4f}. F1 macro: {f1_macro:.4f}. F1 micro: {f1_micro:.4f}. F1 weighted: {f1_weighted:.4f}")

    return original_dataset

Overwriting fingpt/FinGPT_Benchmark/benchmarks/fiqa.py


## 6. Create Benchmarks Runner Script

In [6]:
%%writefile fingpt/FinGPT_Benchmark/benchmarks/benchmarks.py
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, get_peft_model, LoraConfig, TaskType
import torch
import argparse

from fiqa import test_fiqa, test_fiqa_mlt

import sys
sys.path.append('../')
from utils import *

def main(args):
    """Load the base model with its PEFT adapter and run the requested benchmarks.

    The base model name comes from ``parse_model_name``; when
    ``args.from_remote`` is falsy it is prefixed with '../'. The tokenizer is
    set to left padding, and a '[PAD]' token is added (with the embedding
    matrix resized) when no pad token exists or when it coincides with the EOS
    token. ``args.dataset`` is a comma-separated list; each entry must be
    'fiqa' or 'fiqa_mlt'.

    Raises:
        ValueError: when an entry of ``args.dataset`` is not a known benchmark.
    """
    model_name = (
        parse_model_name(args.base_model, args.from_remote)
        if args.from_remote
        else '../' + parse_model_name(args.base_model)
    )

    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
    model.model_parallel = True

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.padding_side = "left"

    if args.base_model == 'qwen':
        tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids('<|endoftext|>')
        tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids('<|extra_0|>')

    # Padding needs its own token, distinct from EOS.
    needs_pad_token = not tokenizer.pad_token or tokenizer.pad_token_id == tokenizer.eos_token_id
    if needs_pad_token:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))

    print(f'pad: {tokenizer.pad_token_id}, eos: {tokenizer.eos_token_id}')

    model = PeftModel.from_pretrained(model, args.peft_model).eval()

    # Benchmarks run in the order given; an unknown name aborts at that point.
    runners = {'fiqa': test_fiqa, 'fiqa_mlt': test_fiqa_mlt}
    with torch.no_grad():
        for data in args.dataset.split(','):
            runner = runners.get(data)
            if runner is None:
                raise ValueError('undefined dataset.')
            runner(args, model, tokenizer)

    print('Evaluation Ends.')


def str2bool(value):
    """Parse a command-line boolean such as "True", "false", "1" or "no".

    Used instead of ``type=bool`` because ``bool("False")`` is ``True``: with
    ``type=bool`` every non-empty value, including "False", enabled the flag.
    The empty string is still treated as False.

    Args:
        value: The raw command-line string (an actual bool is returned as is).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    # Comma-separated list of benchmarks to run (see main()).
    parser.add_argument("--dataset", required=True, type=str)
    parser.add_argument("--base_model", required=True, type=str, choices=['chatglm2', 'llama2', 'llama2-13b', 'llama2-13b-nr', 'baichuan', 'falcon', 'internlm', 'qwen', 'mpt', 'bloom'])
    parser.add_argument("--peft_model", required=True, type=str)
    parser.add_argument("--max_length", default=512, type=int)
    parser.add_argument("--batch_size", default=4, type=int, help="The train batch size per device")
    parser.add_argument("--instruct_template", default='default')
    # str2bool makes "--from_remote False" actually evaluate to False.
    parser.add_argument("--from_remote", default=False, type=str2bool)

    args = parser.parse_args()

    print(args.base_model)
    print(args.peft_model)

    main(args)

Overwriting fingpt/FinGPT_Benchmark/benchmarks/benchmarks.py


## 7. Create Utils Module

In [7]:
%%writefile fingpt/FinGPT_Benchmark/utils.py
def parse_model_name(base_model, from_remote=False):
    """Translate a short base-model alias into its model repository id.

    Args:
        base_model: One of the supported aliases (for example 'llama2').
        from_remote: Accepted for call compatibility; it is not consulted.

    Returns:
        The repository id string registered for ``base_model``.

    Raises:
        ValueError: if ``base_model`` is not a known alias.
    """
    known_models = dict([
        ('chatglm2', 'THUDM/chatglm2-6b'),
        ('llama2', 'meta-llama/Llama-2-7b-hf'),
        ('llama2-13b', 'meta-llama/Llama-2-13b-hf'),
        ('llama2-13b-nr', 'NousResearch/Llama-2-13b-hf'),
        ('baichuan', 'baichuan-inc/Baichuan-7B'),
        ('falcon', 'tiiuae/falcon-7b'),
        ('internlm', 'internlm/internlm-7b'),
        ('qwen', 'Qwen/Qwen-7B'),
        ('mpt', 'mosaicml/mpt-7b'),
        ('bloom', 'bigscience/bloom-7b1'),
    ])
    repo_id = known_models.get(base_model)
    if repo_id is None:
        raise ValueError(f"Unknown base model: {base_model}")
    return repo_id

Overwriting fingpt/FinGPT_Benchmark/utils.py


In [9]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook itself: read it from the
# HF_TOKEN environment variable, or prompt for it without echoing.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

## 8. Run the FiQA Benchmark Test

In [11]:
# Run from the benchmarks directory: benchmarks.py imports `fiqa` as a sibling
# module and appends '../' to sys.path to find `utils`.
# NOTE(review): this relative %cd only resolves from the repository root, so it
# errors if the cell is re-run without changing back first.
%cd fingpt/FinGPT_Benchmark/benchmarks

base_model = 'llama2'
# The FinGPT adapter model
peft_model = 'FinGPT/fingpt-mt_llama2-7b_lora'
batch_size = 4
max_length = 512

# The {name} placeholders are filled in from the Python variables above.
!python benchmarks.py --dataset fiqa --base_model {base_model} --peft_model {peft_model} --batch_size {batch_size} --max_length {max_length} --from_remote True

/workspace/FinLoRA/test/fingpt_tests/FinGPT/fingpt/FinGPT_Benchmark/benchmarks
  torch.utils._pytree._register_pytree_node(
llama2
FinGPT/fingpt-mt_llama2-7b_lora
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:09<00:00,  4.77s/it]
Using pad_token, but it is not set yet.
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
pad: 32000, eos: 2


Prompt example:
Instruction: What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.
Input: This $BBBY stock options trade would have more than doubled your money https://t.co/Oa0loiRIJL via @TheStreet
Answe