# Extract Features

1. Read csv files and load as dfs
2. Combine dfs
3. Get POS, NER
4. Save in new features_df
5. Get semantic cosine similarity

In [1]:
import os, sys

import pandas as pd

# Resolve the notebook's working directory, then expose its parent directory
# on sys.path so the project-local modules imported below can be found.
notebook_dir = os.getcwd()
parent_dir = os.path.join(notebook_dir, '../')
sys.path.append(parent_dir)

import log_files
from log_files import LogData
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
# Widen pandas' display limits: long cell text (up to 800 characters) and no cap
# on the number of rows or columns rendered.
display_options = {
    'max_colwidth': 800,
    'display.max_columns': None,
    'display.max_rows': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# Read the saved prediction batches into one frame and preview the first rows.
# NOTE(review): judging by the logged paths, the boolean flag selects the
# `-prediction` batch folders -- confirm against log_files.read_data.
predictions, log_file_path = True, "data/prediction_logs"
df = log_files.read_data(notebook_dir, log_file_path, predictions)
df.head(n=7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_1-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_1-prediction/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_2-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_2-prediction/batch_2-from_df.csv


Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.,1,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


In [4]:
# df

In [5]:
# Show only the third row (position 2) of the predictions frame.
df.iloc[2:3]

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3


- Issue: `obesity rate` is not being captured by POS or NER.

In [6]:
# Pull the raw prediction sentences out of the frame and echo them for inspection.
only_predictions = DataProcessing.df_to_list(df, 'Base Sentence')
print(only_predictions)

# Extract word-level POS tags and NER entities (plus their mappings), passing the
# lemmatizer in `disable_components`. The extractor and the component list are
# reused by the observations cell further down, so their names are kept as-is.
sfe_class = SpacyFeatureExtraction(df, "Base Sentence")
word_leveL_disable_components = ["lemmatizer"]
(
    word_level_pos_tags,
    word_level_pos_mappings,
    word_level_ner_entities,
    word_level_ner_mappings,
) = sfe_class.extract_features(
    data=only_predictions,
    disable_components=word_leveL_disable_components,
)

# One frame per feature family, then join them side by side (axis=1).
all_word_level_pos_df = DataProcessing.convert_to_df(
    word_level_pos_tags,
    mapping=word_level_pos_mappings,
)
all_word_level_ner_df = DataProcessing.convert_to_df(
    word_level_ner_entities,
    word_level_ner_mappings,
)
pred_pos_ner_features = [all_word_level_pos_df, all_word_level_ner_df]
pred_pos_ner_features_df = DataProcessing.concat_dfs(
    pred_pos_ner_features,
    axis=1,
    ignore_index=False,
)
pred_pos_ner_features_df.head()

['JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.', 'On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.', 'Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.', 'According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.', 'In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.', 'The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.', 'JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.', 'On August 25, 2024, to September 25, 2025, Citigroup speculates the net profit at Johnson & Johnson will likely increase.', 'Bank of America predicts on 2024-08-21, the operating income at Visa may rise.', 'According to Goldman Sachs, the research and development expenses at Alphabet would fall in 2029 Q2.', 'In 21 August 

Unnamed: 0,ADJ_3,ADV_1,NOUN_10,PRON_1,SYM_2,DET_2,ADJ_4,NOUN_3,CCONJ_2,DET_5,NOUN_2,AUX_1,PUNCT_1,ADP_5,VERB_2,DET_1,ADJ_2,PUNCT_4,PROPN_1,NUM_4,DET_4,PUNCT_5,DET_3,VERB_4,PUNCT_2,NUM_2,ADP_2,ADP_4,SCONJ_1,PROPN_2,SYM_1,PROPN_3,PROPN_5,ADP_6,ADP_1,PRON_2,PROPN_6,PUNCT_3,CCONJ_1,NOUN_5,NOUN_8,NOUN_11,NUM_1,VERB_3,NUM_3,ADJ_1,PROPN_7,ADP_3,PART_1,VERB_1,NOUN_6,NOUN_9,NOUN_1,NOUN_7,NOUN_4,PROPN_4,ORG_2,MONEY_1,CARDINAL_1,NORP_1,ORG_3,GPE_1,ORG_1,DATE_1,GPE_2,PERSON_2,FAC_1,PERSON_1,DATE_2,PRODUCT_1,LOC_1,EVENT_1,TIME_1
0,,potentially,,,,,,,,,decrease,,.,,,the,,,JPMorgan,,,,,,,,in,,that,Chase,,Amazon,,,at,,,,,,,,2027,,,net,,of,,forecasts,,,profit,,,Q3,Amazon,,,,,Q3,JPMorgan Chase,2027,,,,,,,,,
1,,likely,,,,,,,,,,will,",",,increase,the,,,August,,,,,,",",2024.0,of,,,Bank,,America,,,On,,,.,,,,,21,,,,,at,,speculates,,,revenue,,,Microsoft,Microsoft,,,,,,Bank of America,"August 21, 2024",,,,,,,,,
2,,,,,-,,,,,,income,may,",",,rise,the,,,Citigroup,,,,,,.,8.0,at,,,Alphabet,-,,,,on,,,,,,,,2024,,21.0,,,,,predicts,,,operating,,,,,,,,,,Citigroup,2024-08-21,,,,,,,,,
3,,,,,,,,expenses,,,development,would,",",,fall,the,,,Goldman,,,,,,.,,at,,,Sachs,,Facebook,,,to,,,,and,,,,2025,,,,,in,,According,,,research,,,,Facebook,,,,,,Goldman Sachs,2025,,,,,,,,,
4,,,,,,some,,,,,probability,,",",,has,the,stable,,August,,,,,,.,2024.0,at,,that,Morgan,,Stanley,Johnson,,In,,,,&,,,,21,remain,,gross,,,to,envisions,,,profit,,,Johnson,Johnson & Johnson,,,,,,Morgan Stanley,21 August 2024,,,,,,,,,


## Observations

In [7]:
# Read the saved observation batches into their own frame and preview the first rows.
# NOTE(review): `log_file_path` and `predictions` are rebound here, replacing the
# values set for the predictions load earlier in the notebook.
predictions, log_file_path = False, "data/observation_logs"
observations_df = log_files.read_data(notebook_dir, log_file_path, predictions)
observations_df.head(n=7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_1-observation
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_1-observation/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_2-observation
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_2-observation/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/m

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,The financial analyst at Goldman Sachs observed that the operating income at Tesla had increased in the first quarter of 2024.,0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On 2024-08-20 to 2025-08-20, Morgan Stanley speculates the stock price at Amazon will likely rise.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"A young investor predicts on 2025-03-15, the S&P 500 index may rise.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 2027-01-01 to 2027-12-31, Wells Fargo envisions that the interest rates at the Federal Reserve have some probability to remain stable.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The trading volume at Apple should stay same in the fourth quarter of 2025, according to a financial expert at JPMorgan Chase.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan observed that the net profit at Microsoft had risen in September 2023.,0,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


In [8]:
# Same POS/NER extraction as for the predictions, now over the observation sentences.
# Relies on `sfe_class` and `word_leveL_disable_components` from the predictions cell.
only_observations = DataProcessing.df_to_list(observations_df, 'Base Sentence')
(
    word_level_pos_tags,
    word_level_pos_mappings,
    word_level_ner_entities,
    word_level_ner_mappings,
) = sfe_class.extract_features(
    data=only_observations,
    disable_components=word_leveL_disable_components,
)

# Build the POS and NER frames, then join them side by side (axis=1).
all_word_level_pos_df = DataProcessing.convert_to_df(
    word_level_pos_tags,
    mapping=word_level_pos_mappings,
)
all_word_level_ner_df = DataProcessing.convert_to_df(
    word_level_ner_entities,
    word_level_ner_mappings,
)
obs_pos_ner_features = [all_word_level_pos_df, all_word_level_ner_df]
obs_pos_ner_features_df = DataProcessing.concat_dfs(
    obs_pos_ner_features,
    axis=1,
    ignore_index=False,
)
obs_pos_ner_features_df.head(n=3)

Spacy Doc (0):  The financial analyst at Goldman Sachs observed that the operating income at Tesla had increased in the first quarter of 2024.
Spacy Doc (1):  On 2024-08-20 to 2025-08-20, Morgan Stanley speculates the stock price at Amazon will likely rise.
Spacy Doc (2):  A young investor predicts on 2025-03-15, the S&P 500 index may rise.
Spacy Doc (3):  According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.


Unnamed: 0,ADJ_3,PROPN_8,PRON_1,SYM_2,DET_6,DET_2,ADJ_4,NOUN_3,AUX_2,SYM_3,PROPN_10,DET_5,NOUN_2,AUX_1,SCONJ_2,AUX_3,PUNCT_1,PUNCT_6,ADP_5,VERB_2,NUM_5,DET_1,ADJ_2,PUNCT_4,PROPN_1,NUM_4,PROPN_9,DET_4,PUNCT_5,DET_3,VERB_4,PUNCT_2,NUM_2,ADP_2,ADP_7,ADP_4,PROPN_4,SCONJ_1,ADP_9,PROPN_2,SYM_1,PROPN_3,ADJ_5,PROPN_5,ADP_6,ADP_1,PROPN_6,PUNCT_3,NUM_6,CCONJ_1,SYM_4,NOUN_5,NOUN_8,ADV_2,NUM_1,VERB_3,NUM_3,ADJ_1,PROPN_7,ADP_3,PART_2,PART_1,ADP_8,VERB_1,NOUN_6,NOUN_9,NOUN_1,NOUN_7,NOUN_4,ADV_1,ORG_2,ORG_1,DATE_1,ORDINAL_1,CARDINAL_2,TIME_1,CARDINAL_1,ORG_4,LOC_1,NORP_1,GPE_1,DATE_2,LAW_1,FAC_1,EVENT_1,ORG_3,GPE_2,PERSON_1,PERCENT_1,QUANTITY_1,DATE_3
0,,,,,,the,,income,,,,,operating,had,,,.,,,increased,,The,first,,Goldman,,,,,the,,,,at,,of,,that,,Sachs,,Tesla,,,,at,,,,,,,,,2024,,,financial,,in,,,,observed,,,analyst,,quarter,,Tesla,Goldman Sachs,the first quarter of 2024,,,,,,,,,,,,,,,,,,
1,,,,-,,,,,,-,,,price,will,,,",",,,rise,8.0,the,,,Morgan,2025.0,,,,,,.,8.0,at,,,,,,Stanley,-,Amazon,,,,On,,,20.0,,-,,,,2024,,20.0,,,,,to,,speculates,,,stock,,,likely,Amazon,Morgan Stanley,2024-08-20 to,,,,,,,,,,,,,,,,,,
2,,,,-,,the,,,,,,,index,may,,,",",,,rise,,A,,,S&P,500.0,,,,,,.,3.0,,,,,,,,-,,,,,on,,,,,,,,,2025,,15.0,young,,,,,,predicts,,,investor,,,,,,2025-03-15,,,,,,,,,,,,,,,,,,


## Both

- Create a knowledge graph
    - Nodes: words
    - Edges: connection to other words (same/diff sentence)
- Look at code from Graphbreeding project on 2019 Mac

In [9]:
# Attach each row's source sentence and label to the POS/NER features.
# NOTE(review): the filtered frame displayed below carries 'Base Sentence' and
# 'Sentence Label', so this call appears to add those columns to
# `pred_pos_ner_features_df` itself -- confirm in DataProcessing.include_sentence_and_label.
pred_sentence_features_df = DataProcessing.include_sentence_and_label(pred_pos_ner_features_df, df)

# Keep only the predictions whose first noun is "profit". Comparing the Series
# directly yields an index-aligned boolean mask (missing values compare as False),
# so no `.values` round-trip or `== True` is needed.
pred_profit_filt = pred_pos_ner_features_df["NOUN_1"] == "profit"
pred_profit_df = pred_pos_ner_features_df.loc[pred_profit_filt]
pred_profit_df

Unnamed: 0,Base Sentence,Sentence Label,ADJ_3,ADV_1,NOUN_10,PRON_1,SYM_2,DET_2,ADJ_4,NOUN_3,CCONJ_2,DET_5,NOUN_2,AUX_1,PUNCT_1,ADP_5,VERB_2,DET_1,ADJ_2,PUNCT_4,PROPN_1,NUM_4,DET_4,PUNCT_5,DET_3,VERB_4,PUNCT_2,NUM_2,ADP_2,ADP_4,SCONJ_1,PROPN_2,SYM_1,PROPN_3,PROPN_5,ADP_6,ADP_1,PRON_2,PROPN_6,PUNCT_3,CCONJ_1,NOUN_5,NOUN_8,NOUN_11,NUM_1,VERB_3,NUM_3,ADJ_1,PROPN_7,ADP_3,PART_1,VERB_1,NOUN_6,NOUN_9,NOUN_1,NOUN_7,NOUN_4,PROPN_4,ORG_2,MONEY_1,CARDINAL_1,NORP_1,ORG_3,GPE_1,ORG_1,DATE_1,GPE_2,PERSON_2,FAC_1,PERSON_1,DATE_2,PRODUCT_1,LOC_1,EVENT_1,TIME_1
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1,,potentially,,,,,,,,,decrease,,.,,,the,,,JPMorgan,,,,,,,,in,,that,Chase,,Amazon,,,at,,,,,,,,2027,,,net,,of,,forecasts,,,profit,,,Q3,Amazon,,,,,Q3,JPMorgan Chase,2027,,,,,,,,,
4,"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",1,,,,,,some,,,,,probability,,",",,has,the,stable,,August,,,,,,.,2024.0,at,,that,Morgan,,Stanley,Johnson,,In,,,,&,,,,21,remain,,gross,,,to,envisions,,,profit,,,Johnson,Johnson & Johnson,,,,,,Morgan Stanley,21 August 2024,,,,,,,,,
7,"On August 25, 2024, to September 25, 2025, Citigroup speculates the net profit at Johnson & Johnson will likely increase.",1,,likely,,,,,,,,,,will,",",,increase,the,,",",August,2025.0,,.,,,",",2024.0,to,,,September,,Citigroup,Johnson,,On,,,",",&,,,,25,,25.0,net,,at,,speculates,,,profit,,,Johnson,Johnson & Johnson,,,,,,Citigroup,"August 25, 2024",,,,,"September 25, 2025",,,,
10,"In 21 August 2024, Wells Fargo envisions that the gross profit at Procter & Gamble has some probability to remain stable.",1,,,,,,some,,,,,probability,,",",,has,the,stable,,August,,,,,,.,2024.0,at,,that,Wells,,Fargo,Gamble,,In,,,,&,,,,21,remain,,gross,,,to,envisions,,,profit,,,Procter,Procter & Gamble,,,,,,Wells Fargo,21 August 2024,,,,,,,,,
73,"On August 21, 2024, Goldman Sachs speculates that the net profit at Microsoft will likely increase in the following fiscal year.",1,,likely,,,,the,,,,,year,will,",",,increase,the,fiscal,,August,,,,,,",",2024.0,at,,that,Goldman,,Sachs,,,On,,,.,,,,,21,following,,net,,in,,speculates,,,profit,,,Microsoft,Microsoft,,,,,,Goldman Sachs,"August 21, 2024",,,,,the following fiscal year,,,,
83,"The net profit at Amazon should stay same in 2024/08/21, according to a financial reporter.",1,financial,,,,,a,,,,,reporter,should,",",,according,The,same,,Amazon,,,,,,.,,in,,,,,,,,at,,,,,,,,2024/08/21,,,net,,to,,stay,,,profit,,,,,,2024/08/21,,,,Amazon,,,,,,,,,,


In [10]:
# Collect the 'Base Sentence' values of the profit-related predictions.
pred_profit_sentences = DataProcessing.df_to_list(
    pred_profit_df,
    'Base Sentence',
)


In [11]:
# Attach each row's source sentence and label to the observation POS/NER features.
# NOTE(review): the filtered frame displayed below carries 'Base Sentence' and
# 'Sentence Label', so this call appears to add those columns to
# `obs_pos_ner_features_df` itself -- confirm in DataProcessing.include_sentence_and_label.
obs_profit_sentence_df = DataProcessing.include_sentence_and_label(obs_pos_ner_features_df, observations_df)

# Keep only the observations whose first noun is "profit". Comparing the Series
# directly yields an index-aligned boolean mask (missing values compare as False),
# so no `.values` round-trip or `== True` is needed.
obs_profit_filt = obs_pos_ner_features_df["NOUN_1"] == "profit"
obs_profit_df = obs_pos_ner_features_df.loc[obs_profit_filt]
obs_profit_df

Unnamed: 0,Base Sentence,Sentence Label,ADJ_3,PROPN_8,PRON_1,SYM_2,DET_6,DET_2,ADJ_4,NOUN_3,AUX_2,SYM_3,PROPN_10,DET_5,NOUN_2,AUX_1,SCONJ_2,AUX_3,PUNCT_1,PUNCT_6,ADP_5,VERB_2,NUM_5,DET_1,ADJ_2,PUNCT_4,PROPN_1,NUM_4,PROPN_9,DET_4,PUNCT_5,DET_3,VERB_4,PUNCT_2,NUM_2,ADP_2,ADP_7,ADP_4,PROPN_4,SCONJ_1,ADP_9,PROPN_2,SYM_1,PROPN_3,ADJ_5,PROPN_5,ADP_6,ADP_1,PROPN_6,PUNCT_3,NUM_6,CCONJ_1,SYM_4,NOUN_5,NOUN_8,ADV_2,NUM_1,VERB_3,NUM_3,ADJ_1,PROPN_7,ADP_3,PART_2,PART_1,ADP_8,VERB_1,NOUN_6,NOUN_9,NOUN_1,NOUN_7,NOUN_4,ADV_1,ORG_2,ORG_1,DATE_1,ORDINAL_1,CARDINAL_2,TIME_1,CARDINAL_1,ORG_4,LOC_1,NORP_1,GPE_1,DATE_2,LAW_1,FAC_1,EVENT_1,ORG_3,GPE_2,PERSON_1,PERCENT_1,QUANTITY_1,DATE_3
3,"According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",0,,,,,,the,,,,,,,quarter,would,,,",",,of,fall,,the,second,,Bank,,,,,,,.,,of,,in,,,,America,,Microsoft,,,,to,,,,,,,,,2026,,,net,,at,,,,According,,,profit,,,,Microsoft,Bank of America,the second quarter of 2026,,,,,,,,,,,,,,,,,,
6,JPMorgan observed that the net profit at Microsoft had risen in September 2023.,0,,,,,,,,,,,,,,had,,,.,,,risen,,the,,,JPMorgan,,,,,,,,,in,,,,that,,Microsoft,,September,,,,at,,,,,,,,,2023,,,net,,,,,,observed,,,profit,,,,Microsoft,JPMorgan,September 2023,,,,,,,,,,,,,,,,,,
9,"According to Google, the gross profit at Facebook would fall in the second quarter of 2026.",0,,,,,,the,,,,,,,quarter,would,,,",",,,fall,,the,second,,Google,,,,,,,.,,at,,of,,,,Facebook,,,,,,to,,,,,,,,,2026,,,gross,,in,,,,According,,,profit,,,,Facebook,Google,the second quarter of 2026,,,,,,,,,,,,,,,,,,
210,"According to Microsoft, the net profit at Facebook would fall in the second quarter of 2026.",0,,,,,,the,,,,,,,quarter,would,,,",",,,fall,,the,second,,Microsoft,,,,,,,.,,at,,of,,,,Facebook,,,,,,to,,,,,,,,,2026,,,net,,in,,,,According,,,profit,,,,Facebook,Microsoft,the second quarter of 2026,,,,,,,,,,,,,,,,,,
232,"On 11/15/2022, the Wall Street Journal speculated the gross profit at Microsoft would likely increase.",0,,,,,,the,,,,,,,,would,,,",",,,increase,,the,,,Wall,,,,,,,.,,at,,,Microsoft,,,Street,,Journal,,,,On,,,,,,,,,11/15/2022,,,gross,,,,,,speculated,,,profit,,,likely,Microsoft,the Wall Street Journal,11/15/2022,,,,,,,,,,,,,,,,,,
234,"According to JPMorgan Chase, the net profit at Tesla would fall in 06/2022.",0,,,,,,,,,,,,,,would,,,",",,,fall,,the,,,JPMorgan,,,,,,,.,,at,,,,,,Chase,,Tesla,,,,to,,,,,,,,,06/2022,,,net,,in,,,,According,,,profit,,,,Tesla,JPMorgan Chase,,,,,06/2022,,,,,,,,,,,,,,
413,"Citigroup noted that the net profit at Microsoft had risen significantly from September 10, 2023, to September 10, 2024.",0,,,,,,,,,,,,,,had,,,",",,,risen,,the,,.,Citigroup,2024.0,,,,,,",",2023.0,from,,,September,that,,Microsoft,,September,,,,at,,",",,,,,,,10,,10.0,net,,to,,,,noted,,,profit,,,significantly,Microsoft,Citigroup,"September 10, 2023, to",,,,,,,,,"September 10, 2024",,,,,,,,,
612,"JPMorgan Chase noted that the net profit at Amazon had risen significantly from September 10, 2023, to September 10, 2024.",0,,,,,,,,,,,,,,had,,,",",,,risen,,the,,.,JPMorgan,2024.0,,,,,,",",2023.0,from,,,September,that,,Chase,,Amazon,,September,,at,,",",,,,,,,10,,10.0,net,,to,,,,noted,,,profit,,,significantly,Amazon,JPMorgan Chase,"September 10, 2023, to",,,,,,,,,"September 10, 2024",,,,,,,,,
623,"Google's net profit should stay the same in Q2 2027, according to financial records.",0,financial,,,,,,,,,,,,records,should,,,",",,,according,,the,same,,Google,,,,,,,.,,to,,,,,,Q2,,,,,,in,,,,,,,,,2027,,,net,,,,'s,,stay,,,profit,,,,Q2 2027,Google,,,,,,,,,,,,,,,,,,,
803,JPMorgan Chase observed that the net profit at Amazon had remained stable in Q2 2026.,0,,,,,,,,,,,,,,had,,,.,,,remained,,the,stable,,JPMorgan,,,,,,,,,in,,,Q2,that,,Chase,,Amazon,,,,at,,,,,,,,,2026,,,net,,,,,,observed,,,profit,,,,Amazon,JPMorgan Chase,Q2 2026,,,,,,,,,,,,,,,,,,


In [12]:
# Collect the 'Base Sentence' values of the profit-related observations and show them.
obs_profit_sentences = DataProcessing.df_to_list(
    obs_profit_df,
    'Base Sentence',
)
obs_profit_sentences

['According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.',
 'JPMorgan observed that the net profit at Microsoft had risen in September 2023.',
 'According to Google, the gross profit at Facebook would fall in the second quarter of 2026.',
 'According to Microsoft, the net profit at Facebook would fall in the second quarter of 2026.',
 'On 11/15/2022, the Wall Street Journal speculated the gross profit at Microsoft would likely increase.',
 'According to JPMorgan Chase, the net profit at Tesla would fall in 06/2022.',
 'Citigroup noted that the net profit at Microsoft had risen significantly from September 10, 2023, to September 10, 2024.',
 'JPMorgan Chase noted that the net profit at Amazon had risen significantly from September 10, 2023, to September 10, 2024.',
 "Google's net profit should stay the same in Q2 2027, according to financial records.",
 'JPMorgan Chase observed that the net profit at Amazon had remained stable in Q2 2026.',
 

In [13]:
# Pair every profit prediction with the full list of profit observations.
# Every key points at the same `obs_profit_sentences` list object (no copies).
pred_obs_dict = {
    pred_sentence: obs_profit_sentences
    for pred_sentence in pred_profit_sentences
}
pred_obs_dict

{'JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.': ['According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.',
  'JPMorgan observed that the net profit at Microsoft had risen in September 2023.',
  'According to Google, the gross profit at Facebook would fall in the second quarter of 2026.',
  'According to Microsoft, the net profit at Facebook would fall in the second quarter of 2026.',
  'On 11/15/2022, the Wall Street Journal speculated the gross profit at Microsoft would likely increase.',
  'According to JPMorgan Chase, the net profit at Tesla would fall in 06/2022.',
  'Citigroup noted that the net profit at Microsoft had risen significantly from September 10, 2023, to September 10, 2024.',
  'JPMorgan Chase noted that the net profit at Amazon had risen significantly from September 10, 2023, to September 10, 2024.',
  "Google's net profit should stay the same in Q2 2027, according to financial

In [14]:
import spacy

# NOTE(review): `en_core_web_sm` ships without word vectors, so `Doc.similarity`
# is based on the tagger/parser/NER output rather than real embeddings (spaCy warns
# about this at runtime). Treat the scores as a rough signal; a vectors model such
# as `en_core_web_md` is presumably a better fit -- confirm it is installed first.
load_nlp_model = spacy.load("en_core_web_sm")

# Cache of sentence text -> parsed Doc, so each distinct sentence goes through the
# pipeline once instead of once per (prediction, observation) pair.
parsed_docs = {}


def _parse_once(sentence):
    """Return the spaCy Doc for `sentence`, running the pipeline only on first use."""
    if sentence not in parsed_docs:
        parsed_docs[sentence] = load_nlp_model(sentence)
    return parsed_docs[sentence]


# prediction -> {observation -> [similarity]}
pom_mappings = {}

for prediction, candidate_observations in pred_obs_dict.items():
    print(f"Prediction: {prediction}")
    prediction_doc = _parse_once(prediction)

    om_mappings = {}  # observation -> [similarity] for this prediction
    for observation in candidate_observations:
        print(f"    Observation: {observation}")

        # Calculate the similarity score
        observation_doc = _parse_once(observation)
        similarity = prediction_doc.similarity(observation_doc)
        print(f"    Similarity: {similarity}")
        om_mappings[observation] = [similarity]
        print()
    pom_mappings[prediction] = om_mappings

# Flatten the nested mapping into [prediction, observation, similarity] rows.
data = [
    [prediction, observation, scores[0]]
    for prediction, observations in pom_mappings.items()
    for observation, scores in observations.items()
]

# Long-format result: one row per (prediction, observation) pair. The constant
# 'Scores' column labels the metric so the frame can be pivoted on it.
similarity_df = pd.DataFrame(data, columns=['Prediction', 'Observation', 'Spacy Similarity'])
similarity_df['Scores'] = "Similarity"

# NOTE(review): `df` is rebound here and no longer refers to the predictions frame
# loaded at the top of the notebook. The alias is kept only because the pivot cell
# below reads `df`; prefer `similarity_df` in new code.
df = similarity_df
df

Prediction: JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.
    Observation: According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.
    Similarity: 0.706486701965332

    Observation: JPMorgan observed that the net profit at Microsoft had risen in September 2023.
    Similarity: 0.7725691795349121

    Observation: According to Google, the gross profit at Facebook would fall in the second quarter of 2026.
    Similarity: 0.6635857224464417

    Observation: According to Microsoft, the net profit at Facebook would fall in the second quarter of 2026.
    Similarity: 0.6858891844749451

    Observation: On 11/15/2022, the Wall Street Journal speculated the gross profit at Microsoft would likely increase.
    Similarity: 0.6503056287765503

    Observation: According to JPMorgan Chase, the net profit at Tesla would fall in 06/2022.
    Similarity: 0.6568577289581299

    Observation: Citigroup noted tha

  similarity = prediction_doc.similarity(observation_doc)


    Similarity: 0.8545955419540405

    Observation: Google's net profit should stay the same in Q2 2027, according to financial records.
    Similarity: 0.5586764812469482

    Observation: JPMorgan Chase observed that the net profit at Amazon had remained stable in Q2 2026.
    Similarity: 0.5882915258407593

    Observation: Bank of America observed that on October 15, 2024, the net profit at General Motors fell sharply.
    Similarity: 0.8109949231147766

Prediction: In 21 August 2024, Wells Fargo envisions that the gross profit at Procter & Gamble has some probability to remain stable.
    Observation: According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.
    Similarity: 0.5780841112136841

    Observation: JPMorgan observed that the net profit at Microsoft had risen in September 2023.
    Similarity: 0.6776936054229736

    Observation: According to Google, the gross profit at Facebook would fall in the second quarter of 2026.
    Sim

Unnamed: 0,Prediction,Observation,Spacy Similarity,Scores
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",0.706487,Similarity
1,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,JPMorgan observed that the net profit at Microsoft had risen in September 2023.,0.772569,Similarity
2,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"According to Google, the gross profit at Facebook would fall in the second quarter of 2026.",0.663586,Similarity
3,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"According to Microsoft, the net profit at Facebook would fall in the second quarter of 2026.",0.685889,Similarity
4,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"On 11/15/2022, the Wall Street Journal speculated the gross profit at Microsoft would likely increase.",0.650306,Similarity
5,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"According to JPMorgan Chase, the net profit at Tesla would fall in 06/2022.",0.656858,Similarity
6,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"Citigroup noted that the net profit at Microsoft had risen significantly from September 10, 2023, to September 10, 2024.",0.586532,Similarity
7,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"JPMorgan Chase noted that the net profit at Amazon had risen significantly from September 10, 2023, to September 10, 2024.",0.606574,Similarity
8,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"Google's net profit should stay the same in Q2 2027, according to financial records.",0.513798,Similarity
9,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,JPMorgan Chase observed that the net profit at Amazon had remained stable in Q2 2026.,0.779024,Similarity


In [15]:
# Display the flattened [prediction, observation, similarity] rows.
data

[['JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.',
  'According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.',
  0.706486701965332],
 ['JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.',
  'JPMorgan observed that the net profit at Microsoft had risen in September 2023.',
  0.7725691795349121],
 ['JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.',
  'According to Google, the gross profit at Facebook would fall in the second quarter of 2026.',
  0.6635857224464417],
 ['JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.',
  'According to Microsoft, the net profit at Facebook would fall in the second quarter of 2026.',
  0.6858891844749451],
 ['JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.',
  'On 11/15/2022, the Wall Street Journal sp

In [16]:
# Pivot the long similarity frame: one row per (prediction, observation) pair and
# one column per value of 'Scores'. Note that `df` here is the similarity frame
# built in the spaCy cell, not the predictions frame loaded at the top.
table = df.pivot_table(
    values='Spacy Similarity',
    index=['Prediction', 'Observation'],
    columns=['Scores'],
)

- Only grouping by `p_a = profit`. 
- What if we also group by sentiment?
    - fall/decrease ---  drastically different --- neutral bc dd doesn't indicate slope/how
    - fall --- plummeted, so similar

- What set of observations certify prediction x?
1. Collect data
    1. Real data
        1. Classify as prediction or observation
    2. Generate data
        1. Verify prediction or observation
2. Certify 
    1. Match observations with predictions
        1. Partial
        2. Full
    2. State which observations verify prediction as true or false

In [17]:
# Display the pivoted similarity table (indexed by prediction/observation pair).
table

Unnamed: 0_level_0,Scores,Similarity
Prediction,Observation,Unnamed: 2_level_1
"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.","According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",0.598909
"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.","According to Google, the gross profit at Facebook would fall in the second quarter of 2026.",0.602676
"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.","According to JPMorgan Chase, the net profit at Tesla would fall in 06/2022.",0.600226
"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.","According to Microsoft, the net profit at Facebook would fall in the second quarter of 2026.",0.598431
"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.","Bank of America observed that on October 15, 2024, the net profit at General Motors fell sharply.",0.656063
"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.","Citigroup noted that the net profit at Microsoft had risen significantly from September 10, 2023, to September 10, 2024.",0.573934
"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.","Google's net profit should stay the same in Q2 2027, according to financial records.",0.551721
"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.","JPMorgan Chase noted that the net profit at Amazon had risen significantly from September 10, 2023, to September 10, 2024.",0.598704
"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",JPMorgan Chase observed that the net profit at Amazon had remained stable in Q2 2026.,0.759091
"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",JPMorgan observed that the net profit at Microsoft had risen in September 2023.,0.718996


In [18]:
# Methods to adapt precision and recall to probabilistic (rather than binary) predictions
    
#     https://en.wikipedia.org/wiki/Fuzzy_classification

# Support or deny classes

In [19]:
from sentence_transformers import SentenceTransformer

# Step 1: load the pretrained Sentence Transformer checkpoint.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Toy input: three short sentences to compare against each other.
sentences = ["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."]

# Step 2: encode the sentences and report the shape of the embedding matrix.
embeddings = model.encode(sentences)
print(embeddings.shape)
# Recorded output: (3, 384)

# Step 3: score every embedding against every other one.
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# Recorded output:
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(3, 384)
tensor([[1.0000, 0.6660, 0.1046],
        [0.6660, 1.0000, 0.1411],
        [0.1046, 0.1411, 1.0000]])


In [20]:
# Display the sentences of the profit-filtered predictions.
pred_profit_sentences

['JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.',
 'In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.',
 'On August 25, 2024, to September 25, 2025, Citigroup speculates the net profit at Johnson & Johnson will likely increase.',
 'In 21 August 2024, Wells Fargo envisions that the gross profit at Procter & Gamble has some probability to remain stable.',
 'On August 21, 2024, Goldman Sachs speculates that the net profit at Microsoft will likely increase in the following fiscal year.',
 'The net profit at Amazon should stay same in 2024/08/21, according to a financial reporter.']

In [21]:
# Display the sentences of the profit-filtered observations.
obs_profit_sentences

['According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.',
 'JPMorgan observed that the net profit at Microsoft had risen in September 2023.',
 'According to Google, the gross profit at Facebook would fall in the second quarter of 2026.',
 'According to Microsoft, the net profit at Facebook would fall in the second quarter of 2026.',
 'On 11/15/2022, the Wall Street Journal speculated the gross profit at Microsoft would likely increase.',
 'According to JPMorgan Chase, the net profit at Tesla would fall in 06/2022.',
 'Citigroup noted that the net profit at Microsoft had risen significantly from September 10, 2023, to September 10, 2024.',
 'JPMorgan Chase noted that the net profit at Amazon had risen significantly from September 10, 2023, to September 10, 2024.',
 "Google's net profit should stay the same in Q2 2027, according to financial records.",
 'JPMorgan Chase observed that the net profit at Amazon had remained stable in Q2 2026.',
 

In [22]:
# Concatenate into one flat list: predictions first, then observations.
my_sentences = [*pred_profit_sentences, *obs_profit_sentences]
my_sentences

['JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.',
 'In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.',
 'On August 25, 2024, to September 25, 2025, Citigroup speculates the net profit at Johnson & Johnson will likely increase.',
 'In 21 August 2024, Wells Fargo envisions that the gross profit at Procter & Gamble has some probability to remain stable.',
 'On August 21, 2024, Goldman Sachs speculates that the net profit at Microsoft will likely increase in the following fiscal year.',
 'The net profit at Amazon should stay same in 2024/08/21, according to a financial reporter.',
 'According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.',
 'JPMorgan observed that the net profit at Microsoft had risen in September 2023.',
 'According to Google, the gross profit at Facebook would fall in the second quarter of 2026.',
 'Accordin

In [23]:
# Numeric-looking strings used as a small probe input.
# NOTE(review): this rebinds `my_sentences`, replacing the combined
# prediction/observation list built in the previous cell.
my_sentences = [
    "10%",
    "10.5",
    "11",
]

In [24]:
from sentence_transformers import SentenceTransformer

# Reuse the `model` loaded by the earlier Sentence Transformers demo cell rather
# than loading the same checkpoint a second time; load it only if that cell has
# not been run in this kernel.
if "model" not in globals():
    model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode whatever `my_sentences` currently holds and report the embedding shape
# (one row per sentence).
embeddings = model.encode(my_sentences)
print(embeddings.shape)

# Pairwise similarity of every embedding against every other one.
similarities = model.similarity(embeddings, embeddings)
print(similarities)

KeyboardInterrupt: 