# Extract Features

1. Read csv files and load as dfs
2. Combine dfs
3. Get POS, NER
4. Save in new features_df
5. Get semantic cosine similarity

In [1]:
import os, sys

import pandas as pd

# Directory the kernel was started in; later cells pass it to log_files.read_data.
notebook_dir = os.getcwd()
# Put the parent directory on sys.path; the project imports that follow
# (log_files, data_processing, feature_extraction) are presumably resolved from there.
# The membership check keeps a re-run of this cell from appending the same entry again.
_parent_dir = os.path.join(notebook_dir, '../')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

import log_files
from log_files import LogData
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
# Lift pandas' display limits so the long sentences and the very wide POS/NER
# feature frames shown below are not truncated.
for option_name, option_value in (
    ('max_colwidth', 800),
    ('display.max_columns', None),
    ('display.max_rows', None),
):
    pd.set_option(option_name, option_value)

In [3]:
# Location of the prediction logs; the read_data log output shows it being
# resolved under "<notebook_dir>/../".
log_file_path = "data/prediction_logs"
# Third argument of read_data. The batch directories in the log output end in
# "-predictions" when this is True. NOTE(review): confirm the flag's exact meaning in log_files.
predictions = True
df = log_files.read_data(notebook_dir, log_file_path, predictions)
# Preview the first rows of the loaded frame.
df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness_experiments/../data/prediction_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness_experiments/../data/prediction_logs/batch_1-predictions
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness_experiments/../data/prediction_logs/batch_1-predictions/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness_experiments/../data/prediction_logs/batch_2-predictions
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness_experiments/../data/prediction_logs/batch_2-predictions/batch_2-from_df.csv
save_batch_directory: /U

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"On August 21, 2024, as a financial analyst, I predict that the revenue at Amazon will potentially decrease in Q3 of 2027.",1,finance,gpt-3.5-turbo,NAVI_GATOR,0,1
1,"On 2024-08-21, Morgan Stanley speculates the operating income at Amazon will likely increase.",1,finance,gpt-4o,NAVI_GATOR,0,2
2,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,1
3,"On August 21, 2023, Goldman Sachs speculates that the stock price at Amazon will likely increase.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,2
4,"George, a financial analyst, predicts that on 01/15/2024, the Google revenue may rise.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,3
5,"According to BlackRock, the operating cash flow at ExxonMobil would fall in Q3 2023.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,4
6,"In Q2 2023, Wells Fargo envisions that the stock price at Tesla has some probability to remain stable.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,5


In [4]:
# Row 66 holds the same sentence as row 2 (compare with the next cell).
df.iloc[66:67]

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
66,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,1


In [5]:
# Row 2 holds the same sentence as row 66 (compare with the previous cell).
df.iloc[2:3]

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
2,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,1


- Issue: `obesity rate` is not being captured by POS or NER.

In [6]:
# Pull the raw prediction sentences out of the loaded frame and echo them.
only_predictions = DataProcessing.df_to_list(df, 'Base Sentence')
print(only_predictions)

# Extractor and the list of spaCy components to disable; both are reused by the
# observations cell further down.
sfe_class = SpacyFeatureExtraction(df, "Base Sentence")
word_leveL_disable_components = ["lemmatizer"]

# extract_features yields four values: POS tags, POS mappings, NER entities, NER mappings.
(
    word_level_pos_tags,
    word_level_pos_mappings,
    word_level_ner_entities,
    word_level_ner_mappings,
) = sfe_class.extract_features(
    data=only_predictions,
    disable_components=word_leveL_disable_components,
)

# One frame for the POS features and one for the NER features ...
all_word_level_pos_df = DataProcessing.convert_to_df(
    word_level_pos_tags,
    mapping=word_level_pos_mappings,
)
all_word_level_ner_df = DataProcessing.convert_to_df(
    word_level_ner_entities,
    word_level_ner_mappings,
)

# ... joined column-wise (axis=1) into a single features frame.
pred_pos_ner_features = [all_word_level_pos_df, all_word_level_ner_df]
pred_pos_ner_features_df = DataProcessing.concat_dfs(
    pred_pos_ner_features,
    axis=1,
    ignore_index=False,
)
pred_pos_ner_features_df.head()

['On August 21, 2024, as a financial analyst, I predict that the revenue at Amazon will potentially decrease in Q3 of 2027.', 'On 2024-08-21, Morgan Stanley speculates the operating income at Amazon will likely increase.', 'According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.', 'On August 21, 2023, Goldman Sachs speculates that the stock price at Amazon will likely increase.', 'George, a financial analyst, predicts that on 01/15/2024, the Google revenue may rise.', 'According to BlackRock, the operating cash flow at ExxonMobil would fall in Q3 2023.', 'In Q2 2023, Wells Fargo envisions that the stock price at Tesla has some probability to remain stable.', 'The stock price at Apple should stay the same in August 2023, according to Morgan Stanley.', 'Dr. Smith predicts that the prevalence of chronic illnesses at rural health clinics will potentially decrease in Q4 of 2026.', 'WHO forecasts that the prevalence of chronic illnesses at global urban

Unnamed: 0,ADP_3,ADP_5,PUNCT_5,NOUN_7,PROPN_4,PUNCT_3,VERB_2,PROPN_5,NOUN_3,DET_4,SYM_1,NOUN_1,ADP_4,NOUN_4,VERB_3,NOUN_5,NUM_1,PROPN_2,VERB_1,PART_2,NUM_2,DET_3,VERB_5,NOUN_8,ADJ_4,PROPN_1,SCONJ_1,ADP_1,PUNCT_1,DET_2,ADJ_3,DET_5,CCONJ_1,PUNCT_4,PART_1,ADV_1,VERB_4,PRON_1,ADJ_1,PUNCT_6,PUNCT_2,ADP_2,NUM_3,DET_1,SYM_2,NOUN_6,NOUN_2,AUX_1,PROPN_6,ADJ_2,PROPN_7,AUX_2,PROPN_3,ORG_1,ORG_3,GPE_2,LOC_1,TIME_1,GPE_1,NORP_1,PERSON_1,DATE_1,ORG_2,DATE_2,CARDINAL_1,PERCENT_1,EVENT_1
0,at,of,,,,",",decrease,,,,,analyst,in,,,,21,Amazon,predict,,2024.0,,,,,August,that,On,",",the,,,,.,,potentially,,I,financial,,",",as,2027.0,a,,,revenue,will,,,,,Q3,Amazon,,,,,Q3,,,"August 21, 2024",,2027.0,,,
1,,,,,,,increase,,,,-,operating,,,,,2024,Stanley,speculates,,8.0,,,,,Morgan,,On,",",,,,,,,likely,,,,,.,at,21.0,the,-,,income,will,,,,,Amazon,Morgan Stanley,,,,,,,,2024-08-21,Amazon,,,,
2,in,,,,Q2,,expected,,,,,profit,of,,decrease,,2024,Chase,According,,,,,,,JPMorgan,,to,",",,,,,,to,,,,net,,.,at,,the,,,,is,,,,,Microsoft,JPMorgan Chase,Q2,,,,,,,2024,Microsoft,,,,
3,,,,,Amazon,.,increase,,,,,stock,,,,,21,Goldman,speculates,,2023.0,,,,,August,that,On,",",,,,,,,likely,,,,,",",at,,the,,,price,will,,,,,Sachs,Goldman Sachs,,,,,,,,"August 21, 2023",Amazon,,,,
4,,,,,,",",rise,,,,,analyst,,,,,01/15/2024,Google,predicts,,,,,,,George,that,on,",",the,,,,.,,,,,financial,,",",,,a,,,revenue,may,,,,,,Google,,,,,,,George,,,,,,


## Observations

In [7]:
# Location of the observation logs; this rebinds the `log_file_path` used
# earlier for the prediction logs.
log_file_path = "data/observation_logs"
# Third argument of read_data. The batch directories in the log output end in
# "-observations" when this is False. NOTE(review): confirm the flag's exact meaning in log_files.
predictions = False
observations_df = log_files.read_data(notebook_dir, log_file_path, predictions)
# Preview the first rows of the loaded frame.
observations_df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness_experiments/../data/observation_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness_experiments/../data/observation_logs/batch_1-observations
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness_experiments/../data/observation_logs/batch_1-observations/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness_experiments/../data/observation_logs/batch_2-observations
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_correctness_experiments/../data/observation_logs/batch_2-observations/batch_2-from_df.csv
save_batch_dire

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"The financial reporter, Sarah, witnessed the stock price at Tesla plummeted in Q2 of 2023.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,1
1,"From May 15, 2025, the financial analyst speculated the net profit at Amazon was drastically different.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,2
2,"On July 1, 2024, the financial expert heard that the revenue at Google remained stable.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,5
3,"Apple's operating income generally rose in August 21, 2027, according to the financial top executive.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,6
4,Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.,0,finance,gpt-4o,NAVI_GATOR,0,1
5,"From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.",0,finance,gpt-4o,NAVI_GATOR,0,2
6,"2026/12/01, a financial analyst heard that the operating income at Amazon remained stable.",0,finance,gpt-4o,NAVI_GATOR,0,5


In [8]:
# Same POS/NER extraction as for the predictions, now over the observation sentences.
# NOTE(review): `sfe_class` was constructed from the predictions frame; the
# observation sentences are supplied explicitly through `data=` — confirm the
# extractor does not also depend on the frame given at construction.
only_observations = DataProcessing.df_to_list(observations_df, 'Base Sentence')

# extract_features yields four values: POS tags, POS mappings, NER entities, NER mappings.
# These names overwrite the ones produced for the predictions.
(
    word_level_pos_tags,
    word_level_pos_mappings,
    word_level_ner_entities,
    word_level_ner_mappings,
) = sfe_class.extract_features(
    data=only_observations,
    disable_components=word_leveL_disable_components,
)

all_word_level_pos_df = DataProcessing.convert_to_df(
    word_level_pos_tags,
    mapping=word_level_pos_mappings,
)
all_word_level_ner_df = DataProcessing.convert_to_df(
    word_level_ner_entities,
    word_level_ner_mappings,
)

# Join the POS and NER frames column-wise (axis=1) into the observation features frame.
obs_pos_ner_features = [all_word_level_pos_df, all_word_level_ner_df]
obs_pos_ner_features_df = DataProcessing.concat_dfs(
    obs_pos_ner_features,
    axis=1,
    ignore_index=False,
)
obs_pos_ner_features_df.head(3)

Spacy Doc (0):  The financial reporter, Sarah, witnessed the stock price at Tesla plummeted in Q2 of 2023.
Spacy Doc (1):  From May 15, 2025, the financial analyst speculated the net profit at Amazon was drastically different.
Spacy Doc (2):  On July 1, 2024, the financial expert heard that the revenue at Google remained stable.
Spacy Doc (3):  Apple's operating income generally rose in August 21, 2027, according to the financial top executive.


Unnamed: 0,ADP_3,ADP_5,PUNCT_5,NOUN_7,PROPN_4,VERB_2,PUNCT_3,PROPN_5,NOUN_3,DET_4,SYM_1,NOUN_1,ADP_4,NOUN_4,SCONJ_2,VERB_3,NOUN_5,NUM_1,PROPN_2,VERB_1,ADJ_5,NUM_2,DET_3,NOUN_8,ADP_8,ADJ_4,PROPN_1,DET_2,ADP_1,PUNCT_1,ADJ_3,SCONJ_1,CCONJ_1,DET_5,PUNCT_4,PART_1,ADV_1,VERB_4,PRON_1,ADP_7,ADP_6,ADJ_1,PUNCT_2,ADP_2,NUM_3,DET_1,SYM_2,NOUN_6,NOUN_2,ADJ_6,AUX_1,PROPN_6,ADJ_2,PROPN_7,AUX_2,PROPN_3,ORG_1,LOC_1,TIME_1,GPE_1,NORP_1,PERSON_1,DATE_1,ORG_2,FAC_1,DATE_2,CARDINAL_1,EVENT_1
0,of,,,,,plummeted,.,,price,,,reporter,,,,,,2023,Tesla,witnessed,,,,,,,Sarah,the,at,",",,,,,,,,,,,,financial,",",in,,The,,,stock,,,,,,,Q2,Tesla,,,,,Sarah,,Q2,,,2023.0,
1,,,,,,,.,,,,,analyst,,,,,,15,Amazon,speculated,,2025.0,,,,,May,the,From,",",different,,,,,,drastically,,,,,financial,",",at,,the,,,profit,,was,,net,,,,Amazon,,,,,,"May 15, 2025",,,,,
2,,,,,,remained,.,,,,,expert,,,,,,1,Google,heard,,2024.0,,,,,July,the,On,",",,that,,,,,,,,,,financial,",",at,,the,,,revenue,,,,stable,,,,Google,,,,,,"July 1, 2024",,,,,


## Both

- Create a knowledge graph
    - Nodes: words
    - Edges: connection to other words (same/diff sentence)
- Look at code from Graphbreeding project on 2019 Mac

In [9]:
# Attach each prediction's sentence and label to its POS/NER feature columns.
pred_sentence_features_df = DataProcessing.include_sentence_and_label(pred_pos_ner_features_df, df)

# Keep the predictions whose first noun (NOUN_1) is "profit".
# The mask and the selection read the frame returned by the helper rather than
# its input. The recorded output shows the sentence columns on the input frame
# too, so the helper appears to modify it in place — TODO confirm; using the
# returned frame makes the next cell's 'Base Sentence' lookup independent of that.
pred_profit_filt = (pred_sentence_features_df.NOUN_1.values == 'profit')
pred_profit_df = pred_sentence_features_df.loc[pred_profit_filt]
pred_profit_df

Unnamed: 0,Base Sentence,Sentence Label,ADP_3,ADP_5,PUNCT_5,NOUN_7,PROPN_4,PUNCT_3,VERB_2,PROPN_5,NOUN_3,DET_4,SYM_1,NOUN_1,ADP_4,NOUN_4,VERB_3,NOUN_5,NUM_1,PROPN_2,VERB_1,PART_2,NUM_2,DET_3,VERB_5,NOUN_8,ADJ_4,PROPN_1,SCONJ_1,ADP_1,PUNCT_1,DET_2,ADJ_3,DET_5,CCONJ_1,PUNCT_4,PART_1,ADV_1,VERB_4,PRON_1,ADJ_1,PUNCT_6,PUNCT_2,ADP_2,NUM_3,DET_1,SYM_2,NOUN_6,NOUN_2,AUX_1,PROPN_6,ADJ_2,PROPN_7,AUX_2,PROPN_3,ORG_1,ORG_3,GPE_2,LOC_1,TIME_1,GPE_1,NORP_1,PERSON_1,DATE_1,ORG_2,DATE_2,CARDINAL_1,PERCENT_1,EVENT_1
2,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",1,in,,,,Q2,,expected,,,,,profit,of,,decrease,,2024,Chase,According,,,,,,,JPMorgan,,to,",",,,,,,to,,,,net,,.,at,,the,,,,is,,,,,Microsoft,JPMorgan Chase,Q2,,,,,,,2024,Microsoft,,,,
34,"According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.",1,in,,,,Sachs,,fall,Q3,,,,profit,of,,,,2027,Chase,According,,,,,,,JPMorgan,,to,",",,,,,,,,,,net,,.,at,,the,,,,would,,,,,Goldman,JPMorgan Chase,,,,,Q3,,,2027,Goldman Sachs,,,,
65,Goldman Sachs forecasts that the net profit at Amazon potentially decrease in Q4 of 2025.,1,of,,,,Q4,,,,,,,profit,,,,,2025,Sachs,forecasts,,,,,,,Goldman,that,at,.,,,,,,,potentially,,,net,,,in,,the,,,decrease,,,,,,Amazon,Goldman Sachs,,,,,Q4,,,2025,Amazon,,,,
66,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",1,in,,,,Q2,,expected,,,,,profit,of,,decrease,,2024,Chase,According,,,,,,,JPMorgan,,to,",",,,,,,to,,,,net,,.,at,,the,,,,is,,,,,Microsoft,JPMorgan Chase,Q2,,,,,,,2024,Microsoft,,,,


In [10]:
# Sentences of the "profit" predictions. The list can contain repeats: rows 2 and
# 66 of the filtered frame hold the same sentence.
pred_profit_sentences = DataProcessing.df_to_list(pred_profit_df, 'Base Sentence')


In [11]:
# Attach each observation's sentence and label to its POS/NER feature columns.
# (Despite its name, this frame is not yet filtered to "profit" rows.)
obs_profit_sentence_df = DataProcessing.include_sentence_and_label(obs_pos_ner_features_df, observations_df)

# Keep the observations whose first noun (NOUN_1) is "profit".
# The mask and the selection read the frame returned by the helper rather than
# its input. The recorded output shows the sentence columns on the input frame
# too, so the helper appears to modify it in place — TODO confirm.
obs_profit_filt = (obs_profit_sentence_df.NOUN_1.values == 'profit')
obs_profit_df = obs_profit_sentence_df.loc[obs_profit_filt]
obs_profit_df

Unnamed: 0,Base Sentence,Sentence Label,ADP_3,ADP_5,PUNCT_5,NOUN_7,PROPN_4,VERB_2,PUNCT_3,PROPN_5,NOUN_3,DET_4,SYM_1,NOUN_1,ADP_4,NOUN_4,SCONJ_2,VERB_3,NOUN_5,NUM_1,PROPN_2,VERB_1,ADJ_5,NUM_2,DET_3,NOUN_8,ADP_8,ADJ_4,PROPN_1,DET_2,ADP_1,PUNCT_1,ADJ_3,SCONJ_1,CCONJ_1,DET_5,PUNCT_4,PART_1,ADV_1,VERB_4,PRON_1,ADP_7,ADP_6,ADJ_1,PUNCT_2,ADP_2,NUM_3,DET_1,SYM_2,NOUN_6,NOUN_2,ADJ_6,AUX_1,PROPN_6,ADJ_2,PROPN_7,AUX_2,PROPN_3,ORG_1,LOC_1,TIME_1,GPE_1,NORP_1,PERSON_1,DATE_1,ORG_2,FAC_1,DATE_2,CARDINAL_1,EVENT_1
4,Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.,0,,,,,,plummeted,,,,,-,profit,,,,,,2023,Sachs,saw,,11,,,,,Goldman,,at,.,,,,,,,,,,,,net,,in,15,the,-,,,,,,,,,Tesla,Goldman Sachs,,,,,,2023-11-15,Tesla,,,,
5,"From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.",0,,,,,,,,,,,-,profit,,,,,,2025,Stanley,speculated,,3,,,,,Morgan,,From,",",,,,,,,drastically,,,,,gross,.,at,30,the,-,,,,was,,different,,,Apple,Morgan Stanley,,,,,,2025-03-30,Apple,,,,
49,Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.,0,,,,,Chase,plummeted,,,,,-,profit,,,,,,2023,Sachs,saw,,11,,,,,Goldman,,at,.,,,,,,,,,,,,net,,in,15,the,-,,,,,,,,,JPMorgan,Goldman Sachs,,,,,,2023-11-15,JPMorgan Chase,,,,
94,Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.,0,,,,,,plummeted,,,,,-,profit,,,,,,2023,Sachs,saw,,11,,,,,Goldman,,at,.,,,,,,,,,,,,net,,in,15,the,-,,,,,,,,,Apple,Goldman Sachs,,,,,,2023-11-15,Apple,,,,


In [12]:
# Sentences of the "profit" observations, in the row order of the filtered frame.
obs_profit_sentences = DataProcessing.df_to_list(obs_profit_df, 'Base Sentence')
obs_profit_sentences

['Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.',
 'From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.',
 'Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.',
 'Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.']

In [13]:
# Pair every "profit" prediction with the complete list of "profit" observations.
# Every key maps to the same list object, and a prediction sentence that occurs
# more than once collapses into a single key.
pred_obs_dict = {
    pred_sentence: obs_profit_sentences
    for pred_sentence in pred_profit_sentences
}
pred_obs_dict

{'According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.': ['Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.',
  'From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.',
  'Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.',
  'Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.'],
 'According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.': ['Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.',
  'From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.',
  'Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.',
  'Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.'],
 'Goldman Sachs forecasts that the net profit at Amazon potentially decrease in Q4 of 2025.': ['Goldman Sachs saw the net profit at Tesla plummeted in 2023-1

In [14]:
import spacy

# NOTE(review): the recorded run shows a warning raised on the `similarity` call
# below. The small "en_core_web_sm" pipeline presumably ships without static word
# vectors, so treat these scores as a rough signal — confirm against the spaCy
# model documentation before relying on them.
load_nlp_model = spacy.load("en_core_web_sm")

all_mappings = {} # Mapping of predictions to observations (not filled in this cell)
pom_mappings = {} # Mapping of predictions to observations to metrics/scores; POS can be misleading

oms = []  # not filled in this cell

# Parsed observations, keyed by sentence text. Each observation is paired with
# several predictions, so it is parsed once and reused instead of once per pair.
observation_docs = {}

for prediction, observations in pred_obs_dict.items():
    print(f"Prediction: {prediction}")

    # Parse the prediction once per prediction rather than once per observation.
    prediction_doc = load_nlp_model(prediction)

    om_mappings = {} # Mapping of observations to metrics/scores

    for observation in observations:
        print(f"    Observation: {observation}")

        if observation not in observation_docs:
            observation_docs[observation] = load_nlp_model(observation)
        observation_doc = observation_docs[observation]

        # Calculate the similarity score
        similarity = prediction_doc.similarity(observation_doc)
        print(f"    Similarity: {similarity}")
        # Stored as a one-element list; the flattening step below reads scores[0].
        om_mappings[observation] = [similarity]
        print()
    pom_mappings[prediction] = om_mappings

# Flatten the nested dictionary into [prediction, observation, similarity] rows
data = [
    [prediction, observation, scores[0]]
    for prediction, observations in pom_mappings.items()
    for observation, scores in observations.items()
]

# Create the DataFrame
# NOTE(review): this rebinds `df`, which until this cell held the predictions
# frame; cells above that read `df` need the loading cell re-run first.
df = pd.DataFrame(data, columns=['Prediction', 'Observation', 'Spacy Similarity'])

# Constant label column; the pivot-table cell uses it as its `columns` key
df['Scores'] = "Similarity"
df

Prediction: According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.
    Observation: Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.
    Similarity: 0.625748872756958

    Observation: From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.
    Similarity: 0.6004109978675842

    Observation: Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.
    Similarity: 0.6323454976081848

    Observation: Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.
    Similarity: 0.6263935565948486

Prediction: According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.
    Observation: Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.
    Similarity: 0.6256189346313477

    Observation: From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.
    Similarity: 0.596201479434967

    Obse

  similarity = prediction_doc.similarity(observation_doc)


Unnamed: 0,Prediction,Observation,Spacy Similarity,Scores
0,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.,0.625749,Similarity
1,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.","From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.",0.600411,Similarity
2,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.,0.632345,Similarity
3,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.,0.626394,Similarity
4,"According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.",Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.,0.625619,Similarity
5,"According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.","From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.",0.596201,Similarity
6,"According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.",Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.,0.653105,Similarity
7,"According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.",Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.,0.617825,Similarity
8,Goldman Sachs forecasts that the net profit at Amazon potentially decrease in Q4 of 2025.,Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.,0.534696,Similarity
9,Goldman Sachs forecasts that the net profit at Amazon potentially decrease in Q4 of 2025.,"From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.",0.476514,Similarity


In [17]:
# Inspect the flattened [prediction, observation, similarity] rows built in the similarity cell.
data

[['According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.',
  'Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.',
  0.625748872756958],
 ['According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.',
  'From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.',
  0.6004109978675842],
 ['According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.',
  'Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.',
  0.6323454976081848],
 ['According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.',
  'Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.',
  0.6263935565948486],
 ['According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.',
  'Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.',
  0.62561893463

In [15]:
# Pivot the similarity frame (`df` as rebound by the similarity cell): one row per
# (Prediction, Observation) pair and one column per value of 'Scores'.
table = df.pivot_table(
    values='Spacy Similarity',
    index=['Prediction', 'Observation'],
    columns=['Scores'],
)

- Currently only grouping by `p_a = profit`.
- What if we also group by sentiment?
    - fall/decrease vs. drastically different: neutral, because "drastically different" doesn't indicate the slope (direction) or how the value changed
    - fall vs. plummeted: similar

In [16]:
# Show the pivoted similarity scores, indexed by (Prediction, Observation).
table

Unnamed: 0_level_0,Scores,Similarity
Prediction,Observation,Unnamed: 2_level_1
"According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.","From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.",0.596201
"According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.",Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.,0.617825
"According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.",Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.,0.653105
"According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.",Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.,0.625619
"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.","From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.",0.600411
"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.,0.626394
"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.,0.632345
"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.,0.625749
Goldman Sachs forecasts that the net profit at Amazon potentially decrease in Q4 of 2025.,"From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.",0.476514
Goldman Sachs forecasts that the net profit at Amazon potentially decrease in Q4 of 2025.,Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.,0.531176


In [None]:
# Methods to adapt Precision and Recall to probabilities rather than binary labels
    
#     https://en.wikipedia.org/wiki/Fuzzy_classification

# Support or deny classes

In [None]:
# Stand-alone example with three hard-coded sentences. The cell further down
# repeats the same steps on the notebook's own sentences.
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# (3, 384) — one 384-dimensional row per sentence

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# Expected pairwise similarity matrix for the three example sentences:
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])

In [19]:
# Show the "profit" prediction sentences (the first and last entries are the same sentence).
pred_profit_sentences

['According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.',
 'According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.',
 'Goldman Sachs forecasts that the net profit at Amazon potentially decrease in Q4 of 2025.',
 'According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.']

In [18]:
# Show the "profit" observation sentences.
obs_profit_sentences

['Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.',
 'From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.',
 'Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.',
 'Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.']

In [22]:
# Predictions first, then observations, in one list for the embedding model.
my_sentences = [*pred_profit_sentences, *obs_profit_sentences]
my_sentences

['According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.',
 'According to JPMorgan Chase, the net profit at Goldman Sachs would fall in Q3 of 2027.',
 'Goldman Sachs forecasts that the net profit at Amazon potentially decrease in Q4 of 2025.',
 'According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.',
 'Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.',
 'From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.',
 'Goldman Sachs saw the net profit at JPMorgan Chase plummeted in 2023-11-15.',
 'Goldman Sachs saw the net profit at Apple plummeted in 2023-11-15.']

In [None]:
# Scratch inputs for probing how the embedding model treats numeric strings.
# Bound to its own name: this cell was never executed in the recorded run, and
# assigning to `my_sentences` here would replace the eight profit sentences that
# the next cell encodes (its recorded output has shape (8, 384)).
numeric_probe_sentences = ["10%", "10.5", "11"]

In [23]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(my_sentences)
print(embeddings.shape)
# One row per sentence; the recorded run printed (8, 384) for the eight profit sentences.

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# Pairwise similarity matrix (8 x 8 in the recorded run). Rows/columns follow the
# order of my_sentences: 0-3 are the predictions, 4-7 the observations.
# Entries [0, 3] and [3, 0] are 1.0000 because sentences 0 and 3 are identical.

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(8, 384)
tensor([[1.0000, 0.7502, 0.6276, 1.0000, 0.4409, 0.5184, 0.6046, 0.4759],
        [0.7502, 1.0000, 0.6490, 0.7502, 0.6481, 0.4828, 0.7861, 0.6294],
        [0.6276, 0.6490, 1.0000, 0.6276, 0.5264, 0.4574, 0.4515, 0.5608],
        [1.0000, 0.7502, 0.6276, 1.0000, 0.4409, 0.5184, 0.6046, 0.4759],
        [0.4409, 0.6481, 0.5264, 0.4409, 1.0000, 0.5015, 0.7743, 0.8444],
        [0.5184, 0.4828, 0.4574, 0.5184, 0.5015, 1.0000, 0.4835, 0.6708],
        [0.6046, 0.7861, 0.4515, 0.6046, 0.7743, 0.4835, 1.0000, 0.7902],
        [0.4759, 0.6294, 0.5608, 0.4759, 0.8444, 0.6708, 0.7902, 1.0000]])
