# Extract Features

1. Read the CSV files and load them as DataFrames
2. Combine the DataFrames
3. Extract part-of-speech (POS) tags and named entities (NER)
4. Save the results in a new `features_df`

In [1]:
import os, sys

import pandas as pd

# Directory the notebook kernel was started in; later cells pass it to
# `log_files.read_data` as the base for locating the log directories.
notebook_dir = os.getcwd()

# Make the project modules that live one level above the notebook importable.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = os.path.join(notebook_dir, '../')
if project_root not in sys.path:
    sys.path.append(project_root)

import log_files
from log_files import LogData
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
# Pandas display settings for this notebook: wide text cells (up to 800
# characters) and no limit on the number of displayed columns or rows.
for option_name, option_value in (
    ('max_colwidth', 800),
    ('display.max_columns', None),
    ('display.max_rows', None),
):
    pd.set_option(option_name, option_value)

In [3]:
# Load the logged prediction batches into a single DataFrame.
# `predictions` is forwarded to `log_files.read_data`; presumably it selects
# the "*-predictions" batch folders -- confirm against log_files.
log_file_path, predictions = "data/prediction_logs", True
df = log_files.read_data(notebook_dir, log_file_path, predictions)

# Preview the first seven rows.
df.head(n=7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness/../data/prediction_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness/../data/prediction_logs/batch_1-predictions
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness/../data/prediction_logs/batch_1-predictions/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness/../data/prediction_logs/batch_2-predictions
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness/../data/prediction_logs/batch_2-predictions/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"On August 21, 2024, as a financial analyst, I predict that the revenue at Amazon will potentially decrease in Q3 of 2027.",1,finance,gpt-3.5-turbo,NAVI_GATOR,0,1
1,"On 2024-08-21, Morgan Stanley speculates the operating income at Amazon will likely increase.",1,finance,gpt-4o,NAVI_GATOR,0,2
2,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,1
3,"On August 21, 2023, Goldman Sachs speculates that the stock price at Amazon will likely increase.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,2
4,"George, a financial analyst, predicts that on 01/15/2024, the Google revenue may rise.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,3
5,"According to BlackRock, the operating cash flow at ExxonMobil would fall in Q3 2023.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,4
6,"In Q2 2023, Wells Fargo envisions that the stock price at Tesla has some probability to remain stable.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,5


- Issue: `obesity rate` is not being captured by POS or NER.

In [5]:
# Word-level POS / NER features for the prediction sentences.
# `df_to_list` is given the frame and the 'Base Sentence' column name
# (presumably it returns that column as a list of sentences -- confirm).
only_predictions = DataProcessing.df_to_list(df, 'Base Sentence')
sfe_class = SpacyFeatureExtraction(df, "Base Sentence")

# Passed to `extract_features` as `disable_components`
# (presumably spaCy pipeline components to skip -- confirm).
word_level_disable_components = ["lemmatizer"]
# Backward-compatible alias: the original identifier was spelled with a
# capital "L" ("leveL") and is still referenced by a later cell.
word_leveL_disable_components = word_level_disable_components

(
    word_level_pos_tags,
    word_level_pos_mappings,
    word_level_ner_entities,
    word_level_ner_mappings,
) = sfe_class.extract_features(
    data=only_predictions,
    disable_components=word_level_disable_components,
)

# One frame per feature family, then joined column-wise (axis=1).
all_word_level_pos_df = DataProcessing.convert_to_df(word_level_pos_tags, mapping=word_level_pos_mappings)
all_word_level_ner_df = DataProcessing.convert_to_df(word_level_ner_entities, word_level_ner_mappings)
word_level_tags_entities = [all_word_level_pos_df, all_word_level_ner_df]
word_level_tags_entities_df = DataProcessing.concat_dfs(word_level_tags_entities, axis=1, ignore_index=False)

# The generic `word_level_*` names are reassigned by the observations cell
# further down; keep a prediction-specific handle so this result stays
# reachable after that cell runs.
prediction_tags_entities_df = word_level_tags_entities_df
word_level_tags_entities_df

Spacy Doc (0):  On August 21, 2024, as a financial analyst, I predict that the revenue at Amazon will potentially decrease in Q3 of 2027.
Spacy Doc (1):  On 2024-08-21, Morgan Stanley speculates the operating income at Amazon will likely increase.
Spacy Doc (2):  According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.
Spacy Doc (3):  On August 21, 2023, Goldman Sachs speculates that the stock price at Amazon will likely increase.


Unnamed: 0,NOUN_4,NOUN_8,NOUN_2,DET_5,NOUN_7,PROPN_4,PART_1,PUNCT_1,PROPN_2,NOUN_1,PART_2,VERB_3,ADP_4,NOUN_5,PUNCT_5,DET_3,AUX_1,PUNCT_4,PUNCT_6,VERB_4,CCONJ_1,DET_2,SYM_2,PROPN_6,VERB_2,NOUN_3,SYM_1,NUM_1,PRON_1,ADJ_1,PROPN_5,ADJ_2,DET_4,ADP_2,DET_1,ADP_3,NOUN_6,VERB_5,PROPN_3,AUX_2,ADP_1,PUNCT_3,NUM_3,NUM_2,ADP_5,PROPN_1,ADV_1,PROPN_7,PUNCT_2,SCONJ_1,ADJ_4,ADJ_3,VERB_1,EVENT_1,LOC_1,TIME_1,DATE_1,ORG_1,NORP_1,PERSON_1,ORG_3,PERCENT_1,ORG_2,DATE_2,CARDINAL_1,GPE_1,GPE_2
0,,,revenue,,,,,",",Amazon,analyst,,,in,,,,will,.,,,,the,,,decrease,,,21,I,financial,,,,as,a,at,,,Q3,,On,",",2027.0,2024,of,August,potentially,,",",that,,,predict,,,,"August 21, 2024",Amazon,,,,,,2027,,Q3,
1,,,income,,,,,",",Stanley,operating,,,,,,,will,,,,,,-,,increase,,-,2024,,,,,,at,the,,,,Amazon,,On,,21.0,08,,Morgan,likely,,.,,,,speculates,,,,2024-08-21,Morgan Stanley,,,,,Amazon,,,,
2,,,,,,Q2,to,",",Chase,profit,,decrease,of,,,,is,,,,,,,,expected,,,2024,,net,,,,at,the,in,,,Microsoft,,to,,,,,JPMorgan,,,.,,,,According,,,,2024,JPMorgan Chase,,,Q2,,Microsoft,,,,
3,,,price,,,Amazon,,",",Goldman,stock,,,,,,,will,,,,,,,,increase,,,21,,,,,,at,the,,,,Sachs,,On,.,,2023,,August,likely,,",",that,,,speculates,,,,"August 21, 2023",Goldman Sachs,,,,,Amazon,,,,
4,,,revenue,,,,,",",Google,analyst,,,,,,,may,.,,,,the,,,rise,,,01/15/2024,,financial,,,,,a,,,,,,on,",",,,,George,,,",",that,,,predicts,,,,,Google,,George,,,,,,,
5,,,flow,,,,,",",ExxonMobil,cash,,fall,,,,,would,,,,,,,,operating,,,2023,,,,,,at,the,in,,,Q3,,to,,,,,BlackRock,,,.,,,,According,,,,Q3 2023,BlackRock,,,,,ExxonMobil,,,,
6,,,price,,,Tesla,to,",",Wells,stock,,remain,,,,,,,,,,some,,,has,probability,,2023,,stable,,,,at,the,,,,Fargo,,In,,,,,Q2,,,.,that,,,envisions,,,,Q2 2023,Wells Fargo,,,,,Tesla,,,,
7,,,price,,,Stanley,,",",August,stock,,,,,,,should,,,,,the,,,according,,,2023,,same,,,,in,The,to,,,Morgan,,at,,,,,Apple,,,.,,,,stay,,,,August 2023,Apple,,,,,Morgan Stanley,,,,
8,clinics,,illnesses,,,,,.,Smith,prevalence,,,of,,,,will,,,,,,,,decrease,health,,2026,,chronic,,rural,,at,the,in,,,Q4,,of,,,,,Dr.,potentially,,,that,,,predicts,,,,2026,,,Smith,,,,,,Q4,
9,,,illnesses,,,,,.,,prevalence,,,of,,,,,,,,,,,,decrease,centers,,2024,WHO,chronic,,global,,at,the,in,,,,,of,,,,,Q4,potentially,,,that,,urban,forecasts,,,,Q4 of 2024,,,,,,,,,,


## Observations

In [8]:
# Load the logged observation batches into a single DataFrame.
# `predictions` is forwarded to `log_files.read_data`; presumably False
# selects the "*-observations" batch folders -- confirm against log_files.
log_file_path, predictions = "data/observation_logs", False
observations_df = log_files.read_data(notebook_dir, log_file_path, predictions)

# Preview the first seven rows.
observations_df.head(n=7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness/../data/observation_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness/../data/observation_logs/batch_1-observations
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness/../data/observation_logs/batch_1-observations/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness/../data/observation_logs/batch_2-observations
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness/../data/observation_logs/batch_2-observations/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"The financial reporter, Sarah, witnessed the stock price at Tesla plummeted in Q2 of 2023.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,1
1,"From May 15, 2025, the financial analyst speculated the net profit at Amazon was drastically different.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,2
2,"On July 1, 2024, the financial expert heard that the revenue at Google remained stable.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,5
3,"Apple's operating income generally rose in August 21, 2027, according to the financial top executive.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,6
4,Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.,0,finance,gpt-4o,NAVI_GATOR,0,1
5,"From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.",0,finance,gpt-4o,NAVI_GATOR,0,2
6,"2026/12/01, a financial analyst heard that the operating income at Amazon remained stable.",0,finance,gpt-4o,NAVI_GATOR,0,5


In [7]:
# Word-level POS / NER features for the observation sentences
# (same steps as the prediction feature cell above).
only_observations = DataProcessing.df_to_list(observations_df, 'Base Sentence')

# NOTE(review): `sfe_class` was constructed with the predictions frame (`df`),
# not `observations_df`. The sentences are supplied via `data=` here, but
# confirm that `extract_features` does not also rely on the frame given to
# the constructor.
(
    word_level_pos_tags,
    word_level_pos_mappings,
    word_level_ner_entities,
    word_level_ner_mappings,
) = sfe_class.extract_features(
    data=only_observations,
    disable_components=word_leveL_disable_components,
)

# One frame per feature family, then joined column-wise (axis=1).
all_word_level_pos_df = DataProcessing.convert_to_df(word_level_pos_tags, mapping=word_level_pos_mappings)
all_word_level_ner_df = DataProcessing.convert_to_df(word_level_ner_entities, word_level_ner_mappings)
word_level_tags_entities = [all_word_level_pos_df, all_word_level_ner_df]
word_level_tags_entities_df = DataProcessing.concat_dfs(word_level_tags_entities, axis=1, ignore_index=False)

# The generic `word_level_*` names are shared with the prediction cell, so
# they only ever hold whichever cell ran last. Keep an observation-specific
# handle so this result can be told apart when the frames are combined.
observation_tags_entities_df = word_level_tags_entities_df
word_level_tags_entities_df

Spacy Doc (0):  The financial reporter, Sarah, witnessed the stock price at Tesla plummeted in Q2 of 2023.
Spacy Doc (1):  From May 15, 2025, the financial analyst speculated the net profit at Amazon was drastically different.
Spacy Doc (2):  On July 1, 2024, the financial expert heard that the revenue at Google remained stable.
Spacy Doc (3):  Apple's operating income generally rose in August 21, 2027, according to the financial top executive.


Unnamed: 0,NOUN_4,NOUN_8,NOUN_2,DET_5,ADP_8,NOUN_7,SCONJ_2,PROPN_4,PART_1,PUNCT_1,PROPN_2,NOUN_1,VERB_3,ADP_4,NOUN_5,PUNCT_5,DET_3,AUX_1,PUNCT_4,CCONJ_1,VERB_4,DET_2,SYM_2,PROPN_6,VERB_2,NOUN_3,SYM_1,NUM_1,ADJ_1,PROPN_5,PRON_1,ADJ_2,DET_4,ADP_2,DET_1,ADP_3,NOUN_6,PROPN_3,AUX_2,ADP_1,PUNCT_3,ADJ_6,NUM_3,NUM_2,ADP_5,ADJ_5,PROPN_1,ADV_1,PROPN_7,PUNCT_2,ADP_6,SCONJ_1,ADP_7,ADJ_4,ADJ_3,VERB_1,FAC_1,EVENT_1,LOC_1,TIME_1,DATE_1,ORG_1,NORP_1,PERSON_1,ORG_2,DATE_2,CARDINAL_1,GPE_1
0,,,stock,,,,,,,",",Tesla,reporter,,,,,,,,,,the,,,plummeted,price,,2023,financial,,,,,in,The,of,,Q2,,at,.,,,,,,Sarah,,,",",,,,,,witnessed,,,,,,Tesla,,Sarah,Q2,,2023,
1,,,profit,,,,,,,",",Amazon,analyst,,,,,,was,,,,the,,,,,,15,financial,,,net,,at,the,,,,,From,.,,,2025,,,May,drastically,,",",,,,,different,speculated,,,,,"May 15, 2025",Amazon,,,,,,
2,,,revenue,,,,,,,",",Google,expert,,,,,,,,,,the,,,remained,,,1,financial,,,stable,,at,the,,,,,On,.,,,2024,,,July,,,",",,that,,,,heard,,,,,"July 1, 2024",Google,,,,,,
3,,,income,,,,,,'s,",",August,operating,,,,,,,,,,,,,according,executive,,21,financial,,,top,,to,the,,,,,in,.,,,2027,,,Apple,generally,,",",,,,,,rose,,,,,"August 21, 2027",Apple,,,,,,
4,,,,,,,,,,.,Sachs,profit,,,,,,,,,,,-,,plummeted,,-,2023,net,,,,,in,the,,,Tesla,,at,,,15.0,11,,,Goldman,,,,,,,,,saw,,,,,2023-11-15,Goldman Sachs,,,Tesla,,,
5,,,,,,,,,,",",Stanley,profit,,,,,,was,,,,,-,,,,-,2025,gross,,,different,,at,the,,,Apple,,From,,,30.0,03,,,Morgan,drastically,,.,,,,,,speculated,,,,,2025-03-30,Morgan Stanley,,,Apple,,,
6,,,operating,,,,,,,",",,analyst,,,,,,,,,,the,,,remained,income,,2026/12/01,financial,,,stable,,,a,,,,,at,,,,,,,Amazon,,,.,,that,,,,heard,,,,,,Amazon,,,,,2026/12/01,
7,,,expert,,,,,,,",",,revenue,,,,,,,,,,,,,according,,,2024/07/22,financial,,,,,to,a,,,,,in,,,,,,,Microsoft,generally,,.,,,,,,rose,,,,,,Microsoft,,,,,2024/07/22,
8,,,analyst,,,,,,,",",Tesla,date,plummeted,,,,the,had,,,,a,,,observed,profit,,2027,seasoned,,,financial,,at,the,,,,,of,,,,,,,Q3,significantly,,.,,that,,,net,Noting,,Q3 2027,,,,Tesla,,,,,,
9,,,stock,,,,,,,",",Apple,reporter,,,,,,was,,,,the,,,,price,,2024,financial,,,different,,at,a,,,,,From,,,,,,,Q2,drastically,,.,,that,,,,speculated,,,,,Q2 2024,Apple,,,,,,
