# Collect Data

- [Financial Phrasebank](https://huggingface.co/datasets/takala/financial_phrasebank) consists of 4840 sentences from English language financial news categorised by sentiment.

In [1]:
import os
import sys

import pandas as pd
from tqdm import tqdm

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# print(notebook_dir)
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

from data_processing import DataProcessing
from text_generation_models import TextGenerationModelFactory

In [2]:
pd.set_option('max_colwidth', 800)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

## Load Data

In [3]:
base_path = os.path.join(notebook_dir, '../data/', 'financial_phrase_bank/')
full_path = os.path.join(base_path, 'all_data-adjusted_header.csv')

# df = DataProcessing.load_from_file(full_path, 'csv')
# df

In [4]:
df = pd.read_csv(full_path, encoding_errors = 'ignore')
df

Unnamed: 0,sentiment,sentence
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said ."
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported ."
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales ."
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower in London Monday as a rebound in bank stocks failed to offset broader weakness for the FTSE 100 .
4842,neutral,"Rinkuskiai 's beer sales fell by 6.5 per cent to 4.16 million litres , while Kauno Alus ' beer sales jumped by 6.9 per cent to 2.48 million litres ."
4843,negative,"Operating profit fell to EUR 35.4 mn from EUR 68.8 mn in 2007 , including vessel sales gain of EUR 12.3 mn ."
4844,negative,"Net sales of the Paper segment decreased to EUR 221.6 mn in the second quarter of 2009 from EUR 241.1 mn in the second quarter of 2008 , while operating profit excluding non-recurring items rose to EUR 8.0 mn from EUR 7.6 mn ."


In [5]:
sentences = DataProcessing.df_to_list(df, 'sentence')
sentences[: 3]

['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
 'Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .',
 'The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .']

In [6]:
prediction_properties = """A prediction <p> = (<p_s>, <p_t>, <p_d>, <p_o>), where it consists of the following four properties:

    1. <p_s>, any source entity in the domain.
        - Can be a person (with a name) or a  person such as a  reporter,  analyst,  expert,  top executive,  senior level person, etc), civilian.
        - Can only be an organization that is associated with the  prediction.
    2. <p_t>, any target entity in the domain.
	    - Can be a person (with a name) or a  person such as a  reporter,  analyst,  expert,  top executive,  senior level person, etc).
        - Can only be an organization that is associated with the  prediction.
    3. <p_d>, date or time range when <p> is expected to come to fruition or when one should observe the <p>.
        - Forecast can range from a second to anytime in the future.
        - Answers the questions: "How far to go out from today?" or "Where to stop?".
    4. <p_o>,  prediction outcome.
        - Details relevant details such as outcome, a quantifiable metric, or slope.

Additionally, two main requirements of a prediction are (1) for it to be future tense (simple tense with simple future, continuous tense with future continuous, perfect tense with future perfect, and perfect continuous tense with future perfect continuous) and (2) it can NOT be past tense.   
"""

In [7]:
tgmf = TextGenerationModelFactory()

# Groq Cloud (https://console.groq.com/docs/overview)
gemma_29b_generation_model = tgmf.create_instance('gemma2-9b-it') 
llama_318b_instant_generation_model = tgmf.create_instance('llama-3.1-8b-instant') 
llama_3370b_versatile_generation_model = tgmf.create_instance('llama-3.3-70b-versatile')  
llama_guard_4_12b_generation_model = tgmf.create_instance('meta-llama/llama-guard-4-12b')  

models = [gemma_29b_generation_model, llama_318b_instant_generation_model, llama_3370b_versatile_generation_model, llama_guard_4_12b_generation_model]

In [8]:
def llm_as_a_judge(sentences: list, prediction_properties: str):
    
    labels = []
    for sentences_idx in tqdm(range(len(sentences))):
        sentence = sentences[sentences_idx]

        prompt = f"Classify the sentence: '{sentence}' as a prediction or non-prediction. I define a prediction with the following properties: {prediction_properties}. Only state a binary label of prediction or non-prediction and nothing else. Do NOT use the word 'safe." 
        if sentences_idx < 3:
            print(f"Prompt: {prompt}")
            # print()

        for model in models:  
            input_prompt = model.user(prompt)
            raw_text_llm_generation = model.chat_completion([input_prompt])
    
            for label in raw_text_llm_generation.split("\n"):
                if label.strip():
                    sentence_to_label = (sentence, label, model.__name__())
            labels.append(sentence_to_label)
    return labels

In [9]:
sentence_label = llm_as_a_judge(sentences[:33], prediction_properties)

  0%|          | 0/33 [00:00<?, ?it/s]

Prompt: Classify the sentence: 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .' as a prediction or non-prediction. I define a prediction with the following properties: A prediction <p> = (<p_s>, <p_t>, <p_d>, <p_o>), where it consists of the following four properties:

    1. <p_s>, any source entity in the domain.
        - Can be a person (with a name) or a  person such as a  reporter,  analyst,  expert,  top executive,  senior level person, etc), civilian.
        - Can only be an organization that is associated with the  prediction.
    2. <p_t>, any target entity in the domain.
	    - Can be a person (with a name) or a  person such as a  reporter,  analyst,  expert,  top executive,  senior level person, etc).
        - Can only be an organization that is associated with the  prediction.
    3. <p_d>, date or time range when <p> is expected to come to fruition or when one should observe the <p>.
      

  3%|▎         | 1/33 [00:02<01:10,  2.21s/it]

Prompt: Classify the sentence: 'Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .' as a prediction or non-prediction. I define a prediction with the following properties: A prediction <p> = (<p_s>, <p_t>, <p_d>, <p_o>), where it consists of the following four properties:

    1. <p_s>, any source entity in the domain.
        - Can be a person (with a name) or a  person such as a  reporter,  analyst,  expert,  top executive,  senior level person, etc), civilian.
        - Can only be an organization that is associated with the  prediction.
    2. <p_t>, any target entity in the domain.
	    - Can be a person (with a name) or a  person such as a  reporter,  analyst,  expert,  top executive,  senior level person, etc).
        - Can only be an organization that is associated with the  prediction.
    3. <p_d>, date or time range when <p> is expected

  6%|▌         | 2/33 [00:03<00:55,  1.80s/it]

Prompt: Classify the sentence: 'The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .' as a prediction or non-prediction. I define a prediction with the following properties: A prediction <p> = (<p_s>, <p_t>, <p_d>, <p_o>), where it consists of the following four properties:

    1. <p_s>, any source entity in the domain.
        - Can be a person (with a name) or a  person such as a  reporter,  analyst,  expert,  top executive,  senior level person, etc), civilian.
        - Can only be an organization that is associated with the  prediction.
    2. <p_t>, any target entity in the domain.
	    - Can be a person (with a name) or a  person such as a  reporter,  analyst,  expert,  top executive,  senior level person, etc).
        - Can only be an organization that is associated with the  prediction.
    3. <p_d>, d

100%|██████████| 33/33 [01:33<00:00,  2.83s/it]


In [10]:
sentence_label

[('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
  'non-prediction ',
  'gemma2-9b-it'),
 ('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
  'prediction',
  'llama-3.1-8b-instant'),
 ('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
  'Non-prediction',
  'llama-3.3-70b-versatile'),
 ('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
  'safe',
  'meta-llama/llama-guard-4-12b'),
 ('Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .',
  'prediction ',
  'gemma2-9b-it'),
 ('Technopolis plans to develop in stages an area of no le

In [11]:
sentence_label_df = pd.DataFrame(sentence_label, columns=['Sentence', 'Label', 'Model'])
sentence_label_df


Unnamed: 0,Sentence,Label,Model
0,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",non-prediction,gemma2-9b-it
1,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",prediction,llama-3.1-8b-instant
2,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",Non-prediction,llama-3.3-70b-versatile
3,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",safe,meta-llama/llama-guard-4-12b
4,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .",prediction,gemma2-9b-it
...,...,...,...
127,"The company 's net profit rose 11.4 % on the year to 82.2 million euros in 2005 on sales of 686.5 million euros , 13.8 % up on the year , the company said earlier .",safe,meta-llama/llama-guard-4-12b
128,"The Lithuanian beer market made up 14.41 million liters in January , a rise of 0.8 percent from the year-earlier figure , the Lithuanian Brewers ' Association reporting citing the results from its members .",non-prediction,gemma2-9b-it
129,"The Lithuanian beer market made up 14.41 million liters in January , a rise of 0.8 percent from the year-earlier figure , the Lithuanian Brewers ' Association reporting citing the results from its members .",non-prediction,llama-3.1-8b-instant
130,"The Lithuanian beer market made up 14.41 million liters in January , a rise of 0.8 percent from the year-earlier figure , the Lithuanian Brewers ' Association reporting citing the results from its members .",Non-prediction,llama-3.3-70b-versatile


In [12]:
filt_prediction = (sentence_label_df.Label == 'prediction')
sentence_label_df[filt_prediction]

Unnamed: 0,Sentence,Label,Model
1,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",prediction,llama-3.1-8b-instant
5,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .",prediction,llama-3.1-8b-instant
13,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,prediction,llama-3.1-8b-instant
17,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",prediction,llama-3.1-8b-instant
25,"For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",prediction,llama-3.1-8b-instant
61,"Consolidated net sales increased 16 % to reach EUR74 .8 m , while operating profit amounted to EUR0 .9 m compared to a loss of EUR0 .7 m in the prior year period .",prediction,llama-3.1-8b-instant
73,"Incap Contract Manufacturing Services Pvt Ltd , a subsidiary of Incap Corporation of Finland , plans to double its revenues by 2007-2008 .",prediction,llama-3.1-8b-instant
77,"Its board of directors will propose a dividend of EUR0 .12 per share for 2010 , up from the EUR0 .08 per share paid in 2009 .",prediction,llama-3.1-8b-instant
