# Create Dataset: Combine Generated Synthetic Data with Financial PhraseBank Data

In [1]:
import os
import sys

import pandas as pd

from tqdm import tqdm

# Working directory of the notebook; other paths in this notebook are built relative to it.
notebook_dir = os.getcwd()
# Make the parent directory importable so project modules can be imported.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
parent_dir = os.path.join(notebook_dir, '../')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# import log_files
from data_processing import DataProcessing

In [2]:
# Notebook-wide pandas display settings: text columns up to 800 characters wide,
# and no cap (None) on the number of columns or rows shown.
display_options = {
    'max_colwidth': 800,
    'display.max_columns': None,
    'display.max_rows': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# Toggle checked by the save cell at the end of the notebook: the combined
# dataset is only written to disk when this is True.
save_data_bool = True

## Load Data

In [4]:
# Root of the project's data directory, resolved relative to the notebook.
base_data_path = os.path.join(notebook_dir, '../data/')

# Input CSVs, in order: generated prediction sentences, generated
# non-prediction (observation) sentences, and the labelled Financial PhraseBank subset.
generated_data, non_p_generated_data, finacial_phrase_bank_path = (
    os.path.join(base_data_path, relative_path)
    for relative_path in (
        'prediction_logs/batch_1-prediction/batch_1-from_df.csv',
        'observation_logs/batch_19-observation/batch_19-from_df.csv',
        'financial_phrase_bank/subset_data_with_labels.csv',
    )
)

In [5]:
# Read both generated CSVs (prediction batch first, then the non-prediction batch).
generated_data_df, non_p_generated_data_df = (
    DataProcessing.load_from_file(csv_path, 'csv')
    for csv_path in (generated_data, non_p_generated_data)
)
# Display the non-prediction frame for inspection.
non_p_generated_data_df

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,Analyst Jessica Chen observed that the turnover rate at the Miami Heat remained stable in September 2022.,0,sport,gemma2-9b-it,GROQ_CLOUD,0,1
1,"On 08/21/2024, Coach David Wilson monitored the free throw percentage at the Chicago Bulls changed.",0,sport,gemma2-9b-it,GROQ_CLOUD,0,2
2,"Analyst Michael Lee noted on 10/15/2023, the field goal percentage at the Golden State Warriors fell.",0,sport,gemma2-9b-it,GROQ_CLOUD,0,3
3,"According to Coach Emily Carter, the points per game at the Atlanta Hawks rose in December 2021.",0,sport,gemma2-9b-it,GROQ_CLOUD,0,4
4,"In 03/2025, Analyst Kevin Rodriguez envisioned that the save percentage at the Toronto Maple Leafs decreased.",0,sport,gemma2-9b-it,GROQ_CLOUD,0,5
5,"The rebounds per game at the Los Angeles Lakers increased in July 2024, according to Coach Carlos Garcia.",0,sport,gemma2-9b-it,GROQ_CLOUD,0,6
6,Analyst Emma Taylor noted that the home run count at the Chicago Cubs remained stable in 2024.,0,sport,llama-3.1-8b-instant,GROQ_CLOUD,0,1
7,"On Q2 of 2025, Coach Ryan Thompson observed that the pass completion rate at the Green Bay Packers increased.",0,sport,llama-3.1-8b-instant,GROQ_CLOUD,0,2
8,"George noted that on August 28, 2024, the save percentage he had in hockey stayed consistent.",0,sport,llama-3.1-8b-instant,GROQ_CLOUD,0,3
9,"According to the staff of the Los Angeles Lakers, the three-point percentage at the Los Angeles Lakers improved in November 2023.",0,sport,llama-3.1-8b-instant,GROQ_CLOUD,0,4


In [6]:
# Load the Financial PhraseBank subset and normalise it to the schema shared
# with the generated data ('Base Sentence' / 'Sentence Label').
# Built as a single chain (no in-place mutation) so the cell always derives the
# frame from the file.
finacial_phrase_bank_df = (
    DataProcessing.load_from_file(finacial_phrase_bank_path, 'csv')
    # Drop only rows missing the sentence or its label. A bare dropna() would
    # also discard labelled rows that merely have a NaN in an unrelated column.
    .dropna(subset=['sentence', 'label_value'])
    # Labels must be integers; safe to cast once the unlabeled rows are gone.
    .astype({'label_value': int})
    .rename(columns={"sentence": "Base Sentence", "label_value": "Sentence Label"})
)
finacial_phrase_bank_df

Unnamed: 0,sentiment,Base Sentence,label_name,Sentence Label
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",PREDICTION,1
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .",PREDICTION,1
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .",NON-PREDICTION,0
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,PREDICTION,1
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",PREDICTION,1
5,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing its growth strategy by increasingly focusing on technologically more demanding HDI printed circuit boards PCBs .,PREDICTION,1
6,positive,"For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",NON-PREDICTION,0
7,positive,"In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .",NON-PREDICTION,0
8,positive,Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing 7.7 % of net sales .,NON-PREDICTION,0
9,positive,"Operating profit totalled EUR 21.1 mn , up from EUR 18.6 mn in 2007 , representing 9.7 % of net sales .",NON-PREDICTION,0


## Combine Datasets

In [7]:
# Columns selected from each source frame in the cells below, so that the
# frames being combined all carry the same two columns.
joint_cols = ['Base Sentence', 'Sentence Label']

In [8]:
# Keep only the sentence and label columns of the generated prediction data.
generated_sent_with_label_df = generated_data_df[joint_cols]
generated_sent_with_label_df

Unnamed: 0,Base Sentence,Sentence Label
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1
3,"According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.",1
4,"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",1
5,"The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.",1
6,JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.,1
7,"On August 25, 2024, to September 25, 2025, Citigroup speculates the net profit at Johnson & Johnson will likely increase.",1
8,"Bank of America predicts on 2024-08-21, the operating income at Visa may rise.",1
9,"According to Goldman Sachs, the research and development expenses at Alphabet would fall in 2029 Q2.",1


In [9]:
# NOTE(review): re-assigns joint_cols to the same value it was given earlier in
# the "Combine Datasets" section; this cell looks redundant.
joint_cols = ['Base Sentence', 'Sentence Label']

In [10]:
# Keep only the sentence and label columns of the generated non-prediction data.
non_p_generated_sent_with_label_df = non_p_generated_data_df[joint_cols]
non_p_generated_sent_with_label_df

Unnamed: 0,Base Sentence,Sentence Label
0,Analyst Jessica Chen observed that the turnover rate at the Miami Heat remained stable in September 2022.,0
1,"On 08/21/2024, Coach David Wilson monitored the free throw percentage at the Chicago Bulls changed.",0
2,"Analyst Michael Lee noted on 10/15/2023, the field goal percentage at the Golden State Warriors fell.",0
3,"According to Coach Emily Carter, the points per game at the Atlanta Hawks rose in December 2021.",0
4,"In 03/2025, Analyst Kevin Rodriguez envisioned that the save percentage at the Toronto Maple Leafs decreased.",0
5,"The rebounds per game at the Los Angeles Lakers increased in July 2024, according to Coach Carlos Garcia.",0
6,Analyst Emma Taylor noted that the home run count at the Chicago Cubs remained stable in 2024.,0
7,"On Q2 of 2025, Coach Ryan Thompson observed that the pass completion rate at the Green Bay Packers increased.",0
8,"George noted that on August 28, 2024, the save percentage he had in hockey stayed consistent.",0
9,"According to the staff of the Los Angeles Lakers, the three-point percentage at the Los Angeles Lakers improved in November 2023.",0


In [11]:
# Keep only the sentence and label columns of the Financial PhraseBank data.
fpb_sent_with_label_df = finacial_phrase_bank_df[joint_cols]
fpb_sent_with_label_df

Unnamed: 0,Base Sentence,Sentence Label
0,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",1
1,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .",1
2,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .",0
3,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,1
4,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",1
5,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing its growth strategy by increasingly focusing on technologically more demanding HDI printed circuit boards PCBs .,1
6,"For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",0
7,"In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .",0
8,Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing 7.7 % of net sales .,0
9,"Operating profit totalled EUR 21.1 mn , up from EUR 18.6 mn in 2007 , representing 9.7 % of net sales .",0


In [15]:
# Combine the three sources: generated predictions + Financial PhraseBank first,
# then append the generated non-prediction sentences.
combined_df = DataProcessing.concat_dfs([
    DataProcessing.concat_dfs([generated_sent_with_label_df, fpb_sent_with_label_df]),
    non_p_generated_sent_with_label_df,
])
combined_df

Unnamed: 0,Base Sentence,Sentence Label
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1
3,"According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.",1
4,"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",1
5,"The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.",1
6,JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.,1
7,"On August 25, 2024, to September 25, 2025, Citigroup speculates the net profit at Johnson & Johnson will likely increase.",1
8,"Bank of America predicts on 2024-08-21, the operating income at Visa may rise.",1
9,"According to Goldman Sachs, the research and development expenses at Alphabet would fall in 2029 Q2.",1


## Save Datasets

In [16]:
# Write the combined dataset to disk only when saving is enabled.
# Test the flag's truthiness directly instead of comparing it to True.
if save_data_bool:
    # Target directory for the combined dataset, under the shared data root.
    save_path = os.path.join(base_data_path, 'combined_generated_fin_phrase_bank/')
    DataProcessing.save_to_file(combined_df, save_path, 'combined_generated_fin_phrase_bank', 'csv')

Saved to: 
	/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_classification_experiments-v2/../data/combined_generated_fin_phrase_bank/combined_generated_fin_phrase_bank-v1.csv
