# FinGPT Test: Financial Headline Analysis

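This notebook evaluates a FinGPT LoRA adapter on the Financial Headline Analysis dataset (`FinGPT/fingpt-headline`): it installs the dependencies, downloads the data, writes the benchmark modules, and runs the evaluation.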

## 1. Install Dependencies

In [None]:
!pip install transformers==4.32.0 peft==0.5.0 datasets accelerate bitsandbytes sentencepiece tqdm scikit-learn pandas matplotlib seaborn

## 2. Clone the FinGPT Repository

In [None]:
# Fetch the FinGPT sources, then make the repository root the working
# directory: the following cells use paths relative to it
# (e.g. fingpt/FinGPT_Benchmark/...).
!git clone https://github.com/AI4Finance-Foundation/FinGPT.git
%cd FinGPT

## 3. Download the Financial Headline Dataset

In [None]:
import datasets
from pathlib import Path

data_dir = Path('./fingpt/FinGPT_Benchmark/data')
data_dir.mkdir(parents=True, exist_ok=True)

# Location the benchmark module reads the dataset from.
save_path = str(data_dir / "fingpt-headline-instruct")

# Reuse an existing on-disk copy (from a previous run or a manual download) so
# that re-running this cell neither hits the network nor rewrites the data.
if Path(save_path).is_dir() and any(Path(save_path).iterdir()):
    print(f"Dataset already present at {save_path}; skipping download.")
else:
    print("Downloading Financial Headline dataset...")
    try:
        dataset = datasets.load_dataset("FinGPT/fingpt-headline")

        # Save the dataset to disk
        print(f"Saving dataset to {save_path}")
        dataset.save_to_disk(save_path)
        print("Dataset download complete!")
    except Exception as e:
        # Best-effort: report the failure and tell the user how to proceed
        # manually instead of aborting the notebook here.
        print(f"Error loading dataset: {e}")
        print("You may need to manually download the Headline dataset and place it in the fingpt/FinGPT_Benchmark/data directory.")

## 4. Testing Module for Headline

In [None]:
%%writefile fingpt/FinGPT_Benchmark/benchmarks/headline.py
from sklearn.metrics import accuracy_score, f1_score, classification_report
from datasets import load_dataset, load_from_disk
from tqdm import tqdm
import datasets
import torch
from torch.utils.data import DataLoader
from functools import partial
from pathlib import Path

import sys
sys.path.append('../')

def binary2multi(dataset):
    """Regroup per-question binary results into 9-way multi-label rows.

    Rows are consumed in order and every 9 consecutive rows are collected into
    one list of predictions and one list of labels.

    Args:
        dataset: pandas DataFrame with ``pred`` and ``label`` columns.

    Returns:
        tuple[list[list], list[list]]: ``(pred, label)``, each a list of
        9-element lists. A trailing group of fewer than 9 rows is dropped.
    """
    group_size = 9
    pred, label = [], []
    tmp_pred, tmp_label = [], []
    # Count rows by position, not by index label: the label only equals the
    # position for a default RangeIndex, and any other index (filtered,
    # shuffled, offset) would misalign the groups.
    for position, (_, row) in enumerate(dataset.iterrows(), start=1):
        tmp_pred.append(row['pred'])
        tmp_label.append(row['label'])
        if position % group_size == 0:
            pred.append(tmp_pred)
            label.append(tmp_label)
            tmp_pred, tmp_label = [], []
    return pred, label


def map_output(feature):
    """Convert generated and reference answers into binary yes/no flags.

    Args:
        feature: Mapping with the generated text under ``out_text`` and the
            reference answer under ``output``.

    Returns:
        dict: ``{'label': 0 or 1, 'pred': 0 or 1}``, where 1 means the
        corresponding text contains "yes" (case-insensitive).
    """
    def _says_yes(text):
        # Substring match, case-insensitive.
        return int('yes' in text.lower())

    predicted = _says_yes(feature['out_text'])
    expected = _says_yes(feature['output'])
    return {'label': expected, 'pred': predicted}


def test_mapping(args, example):
    prompt = f"Instruction: {example['instruction']}\nInput: {example['input']}\nAnswer: "
    return {"prompt": prompt}


def test_headline(args, model, tokenizer):
    """Evaluate ``model`` on the test split of the saved Financial Headline dataset.

    Builds one prompt per example, generates completions batch by batch, maps
    each completion and each reference answer to a binary yes/no flag, writes
    the per-example results to ``headline_results.csv`` in the current working
    directory, and prints accuracy, binary F1 and a per-category report.

    Args:
        args: Namespace providing ``batch_size`` and ``max_length``.
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.

    Returns:
        pandas.DataFrame: The test split with the added ``prompt``,
        ``out_text``, ``label`` and ``pred`` columns.
    """
    print("Loading Financial Headline dataset...")
    dataset = load_from_disk(Path(__file__).parent.parent / 'data/fingpt-headline-instruct')['test']
    dataset = dataset.map(partial(test_mapping, args), load_from_cache_file=False)

    def collate_fn(batch):
        # NOTE(review): max_length is passed without truncation=True; confirm
        # whether over-long prompts are expected to be truncated here.
        inputs = tokenizer(
            [f["prompt"] for f in batch], return_tensors='pt',
            padding=True, max_length=args.max_length,
            return_token_type_ids=False
        )
        return inputs

    # shuffle=False keeps generations aligned with dataset row order, which the
    # add_column call below relies on.
    dataloader = DataLoader(dataset, batch_size=args.batch_size, collate_fn=collate_fn, shuffle=False)

    print(f"Running inference on {len(dataset)} examples with batch size {args.batch_size}...")
    out_text_list = []
    # Print roughly five sample generations over the whole run.
    log_interval = max(1, len(dataloader) // 5)

    for idx, inputs in enumerate(tqdm(dataloader)):
        inputs = {key: value.to(model.device) for key, value in inputs.items()}
        res = model.generate(**inputs, max_length=args.max_length, eos_token_id=tokenizer.eos_token_id)
        res_sentences = [tokenizer.decode(i, skip_special_tokens=True) for i in res]
        if (idx + 1) % log_interval == 0:
            tqdm.write(f'Example {idx}: {res_sentences[0]}')
        # Keep the text that follows the first "Answer: " marker (up to the next
        # marker, if any); fall back to the whole decoded text when it is absent.
        out_text = [o.split("Answer: ")[1] if "Answer: " in o else o for o in res_sentences]
        out_text_list += out_text
        torch.cuda.empty_cache()

    print("Processing results...")
    dataset = dataset.add_column("out_text", out_text_list)
    dataset = dataset.map(map_output, load_from_cache_file=False)
    # Separate name for the pandas view so `dataset` keeps meaning the HF dataset.
    results_df = dataset.to_pandas()

    results_df.to_csv('headline_results.csv', index=False)
    print("Results saved to headline_results.csv")

    acc = accuracy_score(results_df["label"], results_df["pred"])
    f1 = f1_score(results_df["label"], results_df["pred"], average="binary")

    # Regroup every 9 consecutive rows into one multi-label sample.
    # Assumes the rows come as 9 questions per headline, ordered like
    # category_names below — TODO confirm against the dataset.
    pred, label = binary2multi(results_df)

    print(f"\n|| Accuracy: {acc:.4f} || F1 (binary): {f1:.4f} ||\n")

    category_names = [
        'price or not', 'price up', 'price stable',
        'price down', 'price past', 'price future',
        'event past', 'event future', 'asset comp'
    ]
    # zero_division=0 reports 0.0 for categories without predicted or true
    # samples (same numbers as the default) without the undefined-metric warning.
    print(classification_report(label, pred, digits=4, target_names=category_names, zero_division=0))

    return results_df

## 5. Update Benchmarking Script

In [None]:
%%writefile fingpt/FinGPT_Benchmark/benchmarks/benchmarks.py
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, get_peft_model, LoraConfig, TaskType
import torch
import argparse

from headline import test_headline

import sys
sys.path.append('../')
from utils import *

def main(args):
    """Load the base model and the FinGPT adapter, then run the requested benchmarks.

    Args:
        args: Parsed command-line namespace. Reads ``from_remote``,
            ``base_model``, ``peft_model`` and ``dataset`` here; the whole
            namespace is forwarded to the benchmark function.

    Raises:
        ValueError: If ``args.dataset`` contains a name other than ``headline``.
    """
    # With from_remote set, the name from parse_model_name is used as is;
    # otherwise it is treated as a path relative to the parent directory.
    if args.from_remote:
        model_name = parse_model_name(args.base_model, args.from_remote)
    else:
        model_name = '../' + parse_model_name(args.base_model)

    print(f"Loading base model: {model_name}")
    model = AutoModelForCausalLM.from_pretrained(
        model_name, trust_remote_code=True,
        # load_in_8bit=True
        device_map="auto",
        # fp16=True
    )
    model.model_parallel = True

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # tokenizer.pad_token_id = tokenizer.eos_token_id

    # Pad on the left; presumably so that, in a padded batch, the generated
    # tokens directly follow each prompt — TODO confirm.
    tokenizer.padding_side = "left"
    # Qwen: set eos/pad ids explicitly from named tokens.
    if args.base_model == 'qwen':
        tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids('<|endoftext|>')
        tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids('<|extra_0|>')
    # Add a dedicated [PAD] token when none is set or when padding shares its
    # id with eos, then resize the embeddings to the new vocabulary size.
    if not tokenizer.pad_token or tokenizer.pad_token_id == tokenizer.eos_token_id:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))

    print(f'pad: {tokenizer.pad_token_id}, eos: {tokenizer.eos_token_id}')

    # Attach the adapter after the tokenizer/embedding setup above.
    print(f"Loading FinGPT adapter: {args.peft_model}")
    model = PeftModel.from_pretrained(model, args.peft_model)
    model = model.eval()

    # Inference only: gradients are disabled for every benchmark run.
    with torch.no_grad():
        # --dataset is a comma-separated list; only 'headline' is handled.
        for data in args.dataset.split(','):
            if data == 'headline':
                test_headline(args, model, tokenizer)
            else:
                raise ValueError(f'Undefined dataset: {data}')

    print('Evaluation Ends.')


if __name__ == "__main__":

    def _str2bool(value):
        """Convert a command-line string such as 'True' or 'false' to a bool.

        ``type=bool`` cannot be used for this: ``bool("False")`` is ``True``
        because any non-empty string is truthy.
        """
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        # The empty string stays falsy, as it was under the former type=bool.
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")

    # Command-line interface of the benchmark script.
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", required=True, type=str)
    parser.add_argument("--base_model", required=True, type=str, choices=['chatglm2', 'llama2', 'llama2-13b', 'llama2-13b-nr', 'baichuan', 'falcon', 'internlm', 'qwen', 'mpt', 'bloom'])
    parser.add_argument("--peft_model", required=True, type=str)
    parser.add_argument("--max_length", default=512, type=int)
    parser.add_argument("--batch_size", default=4, type=int, help="The train batch size per device")
    parser.add_argument("--instruct_template", default='default')
    # Parsed explicitly so that `--from_remote False` really yields False.
    parser.add_argument("--from_remote", default=False, type=_str2bool)

    args = parser.parse_args()

    print(args.base_model)
    print(args.peft_model)

    main(args)

## 6. Create Utils Module


In [None]:
%%writefile fingpt/FinGPT_Benchmark/utils.py
def parse_model_name(base_model, from_remote=False):
    """Translate a short base-model key into its full model identifier.

    Args:
        base_model: Short model key, e.g. ``'llama2'`` or ``'qwen'``.
        from_remote: Accepted for call compatibility; not used by the lookup.

    Returns:
        str: The model identifier registered for ``base_model``.

    Raises:
        ValueError: If ``base_model`` is not one of the known keys.
    """
    known_models = dict([
        ('chatglm2', 'THUDM/chatglm2-6b'),
        ('llama2', 'meta-llama/Llama-2-7b-hf'),
        ('llama2-13b', 'meta-llama/Llama-2-13b-hf'),
        ('llama2-13b-nr', 'NousResearch/Llama-2-13b-hf'),
        ('baichuan', 'baichuan-inc/Baichuan-7B'),
        ('falcon', 'tiiuae/falcon-7b'),
        ('internlm', 'internlm/internlm-7b'),
        ('qwen', 'Qwen/Qwen-7B'),
        ('mpt', 'mosaicml/mpt-7b'),
        ('bloom', 'bigscience/bloom-7b1'),
    ])
    resolved = known_models.get(base_model)
    if resolved is None:
        raise ValueError(f"Unknown base model: {base_model}")
    return resolved

In [None]:
import os

from huggingface_hub import login

# Do not hardcode the access token in the notebook: read it from the HF_TOKEN
# environment variable. When it is not set, token=None makes login() fall back
# to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

## 7. Run the Financial Headline Benchmark Test

Now that we have set up all the necessary files, let's run the benchmark test.

In [None]:
# benchmarks.py imports `headline` as a sibling module, so run it from the
# benchmarks directory.
# NOTE(review): the absolute path assumes the repository was cloned under
# /content — adjust it when running elsewhere.
%cd /content/FinGPT/fingpt/FinGPT_Benchmark/benchmarks

# Short key of the base model; must be one of the --base_model choices
# accepted by benchmarks.py.
base_model = 'llama2'
# The FinGPT adapter model
peft_model = 'FinGPT/fingpt-mt_llama2-7b_lora'
# Number of prompts per generation batch.
batch_size = 4
# Passed to both the tokenizer call and model.generate in headline.py.
max_length = 512

# {name} placeholders are filled in from the Python variables above.
!python benchmarks.py --dataset headline --base_model {base_model} --peft_model {peft_model} --batch_size {batch_size} --max_length {max_length} --from_remote True

Example 5134: Instruction: Does the news headline talk about a general event (apart from prices) in the future? Please choose an answer from {Yes/No}.
Input: august gold up $7.60 at $878.80 an ounce on nymex
Answer:  No
100% 5137/5137 [22:43<00:00,  3.77it/s]
Processing results...
Map: 100% 20547/20547 [00:01<00:00, 16280.23 examples/s]
Results saved to headline_results.csv

|| Accuracy: 0.9701 || F1 (binary): 0.9344 ||

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
              precision    recall  f1-score   support

price or not     0.8765    0.6893    0.7717       103
    price up     0.9411    0.9360    0.9385       938
price stable     0.9036    0.7282    0.8065       103
  price down     0.9386    0.9231    0.9308       845
  price past     0.9712    0.9628    0.9670      1857
price future     0.9219    0.6705    0.7763        88
  event past     0.7821    0.8696    0.8235       322
event future     0.0000    0.0000    0.0000        16
  asset comp   