# Embeddings with Cosine Similarity

- Get the embeddings of predictions and observations with an entailment label of either neutral or entail.
- Embedding models: Spacy, SentenceTransformer, BERT
- Pass each model's prediction and observation embeddings to sklearn cosine similarity to get one similarity score per pair

In [1]:
import os, sys

import numpy as np
import pandas as pd

from tqdm import tqdm

# Make the project's local modules importable: they live one directory above
# the notebook's working directory, so that parent directory is added to
# sys.path.
notebook_dir = os.getcwd()
parent_dir = os.path.join(notebook_dir, '../')
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import log_files
from metrics import Metrics
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction, SentenceTransformerFeatureExtraction, BertFeatureExtraction

In [2]:
# Notebook-wide pandas display settings: allow long sentences to be shown
# (up to 800 characters per cell) and lift the column and row display limits.
for option_name, option_value in (
    ('max_colwidth', 800),
    ('display.max_columns', None),
    ('display.max_rows', None),
):
    pd.set_option(option_name, option_value)

## Load Entailment Data

In [3]:
# Read the entailment dataset (path is relative to the notebook's working
# directory) and display it.
entailment_csv_path = '../data/entailment/entailment-v1.csv'
entailment_df = pd.read_csv(entailment_csv_path)
entailment_df

Unnamed: 0,Prediction,Observation,Entailment Label
0,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.","In Q2 2025, a financial research advisor envisions that the research and development expenses at Google has some probability to remain stable.",NEUTRAL
1,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"According to the financial top executive at BlackRock, the stock price at Amazon may rise in Q4 2028.",NEUTRAL
2,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"In the fourth quarter of 2027, Wells Fargo envisions that the operating cash flow at Intel has some probability to remain stable.",NEUTRAL
3,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"Intel stock price should stay the same in January 2028, according to a financial expert at Harvard University.",NEUTRAL
4,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"On March 1, 2029, the financial advisor at Wells Fargo envisions that the inflation rate at the Federal Reserve has some probability to remain stable.",NEUTRAL
5,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"On November 10, 2022, to November 10, 2023, Citigroup speculates the research and development expenses at Amazon will likely increase.",NEUTRAL
6,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"The trading volume at Apple should stay same in the fourth quarter of 2025, according to a financial expert at JPMorgan Chase.",NEUTRAL
7,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",NEUTRAL
8,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","Apple stock price decreased in August 2024, according to Roger.",NEUTRAL
9,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","Intel stock price should stay the same in January 2028, according to a financial expert at Harvard University.",NEUTRAL


## Extract Embeddings with Spacy

In [4]:
# Extract spaCy features for the 'Prediction' column and for the 'Observation'
# column, one extractor per column.
spacy_fe_pred = SpacyFeatureExtraction(entailment_df, 'Prediction')
spacy_embeddings_pred = spacy_fe_pred.sentence_feature_extraction()

spacy_fe_obser = SpacyFeatureExtraction(entailment_df, 'Observation')
# Named `spacy_embeddings_obser` to match the `<model>_embeddings_obser`
# pattern used by the SentenceTransformer and BERT cells.
spacy_embeddings_obser = spacy_fe_obser.sentence_feature_extraction()

# Compare the two embedding sets with the project's cosine-similarity helper
# and store the result in a new 'Spacy' column (the displayed output shows one
# score per row).
spacy_cs_metrics = Metrics.get_cosine_similarity(spacy_embeddings_pred, spacy_embeddings_obser)
entailment_df['Spacy'] = spacy_cs_metrics
entailment_df

100%|██████████| 12/12 [00:00<00:00, 315.66it/s]
100%|██████████| 12/12 [00:00<00:00, 294.25it/s]
100%|██████████| 12/12 [00:00<00:00, 7139.24it/s]


Unnamed: 0,Prediction,Observation,Entailment Label,Spacy
0,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.","In Q2 2025, a financial research advisor envisions that the research and development expenses at Google has some probability to remain stable.",NEUTRAL,0.773352
1,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"According to the financial top executive at BlackRock, the stock price at Amazon may rise in Q4 2028.",NEUTRAL,0.866826
2,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"In the fourth quarter of 2027, Wells Fargo envisions that the operating cash flow at Intel has some probability to remain stable.",NEUTRAL,0.865429
3,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"Intel stock price should stay the same in January 2028, according to a financial expert at Harvard University.",NEUTRAL,0.812578
4,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"On March 1, 2029, the financial advisor at Wells Fargo envisions that the inflation rate at the Federal Reserve has some probability to remain stable.",NEUTRAL,0.845318
5,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"On November 10, 2022, to November 10, 2023, Citigroup speculates the research and development expenses at Amazon will likely increase.",NEUTRAL,0.815297
6,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"The trading volume at Apple should stay same in the fourth quarter of 2025, according to a financial expert at JPMorgan Chase.",NEUTRAL,0.859226
7,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",NEUTRAL,0.927874
8,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","Apple stock price decreased in August 2024, according to Roger.",NEUTRAL,0.876008
9,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","Intel stock price should stay the same in January 2028, according to a financial expert at Harvard University.",NEUTRAL,0.900117


## Extract Embeddings with Sentence Transformer

In [5]:
# Run the Sentence-Transformer extractor over both text columns, one column
# at a time ('Prediction' first, then 'Observation'), keeping each extractor
# and its embeddings addressable by column name.
st_extractors, st_embeddings = {}, {}
for text_column in ('Prediction', 'Observation'):
    st_extractors[text_column] = SentenceTransformerFeatureExtraction(entailment_df, text_column)
    st_embeddings[text_column] = st_extractors[text_column].sentence_feature_extraction()

# Bind the per-column results to the individual names used in this notebook.
st_fe_pred, st_fe_obser = st_extractors['Prediction'], st_extractors['Observation']
st_embeddings_pred, st_embeddings_obser = st_embeddings['Prediction'], st_embeddings['Observation']

# Compare the two embedding sets with the project's cosine-similarity helper
# and store the result in a new 'SentenceTransformer' column.
st_cs_metrics = Metrics.get_cosine_similarity(st_embeddings_pred, st_embeddings_obser)
entailment_df['SentenceTransformer'] = st_cs_metrics
entailment_df

100%|██████████| 12/12 [00:00<00:00, 85.52it/s]
100%|██████████| 12/12 [00:00<00:00, 56.57it/s]
100%|██████████| 12/12 [00:00<00:00, 8184.01it/s]


Unnamed: 0,Prediction,Observation,Entailment Label,Spacy,SentenceTransformer
0,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.","In Q2 2025, a financial research advisor envisions that the research and development expenses at Google has some probability to remain stable.",NEUTRAL,0.773352,0.328824
1,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"According to the financial top executive at BlackRock, the stock price at Amazon may rise in Q4 2028.",NEUTRAL,0.866826,0.64474
2,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"In the fourth quarter of 2027, Wells Fargo envisions that the operating cash flow at Intel has some probability to remain stable.",NEUTRAL,0.865429,0.311582
3,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"Intel stock price should stay the same in January 2028, according to a financial expert at Harvard University.",NEUTRAL,0.812578,0.321734
4,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"On March 1, 2029, the financial advisor at Wells Fargo envisions that the inflation rate at the Federal Reserve has some probability to remain stable.",NEUTRAL,0.845318,0.282765
5,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"On November 10, 2022, to November 10, 2023, Citigroup speculates the research and development expenses at Amazon will likely increase.",NEUTRAL,0.815297,0.490033
6,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"The trading volume at Apple should stay same in the fourth quarter of 2025, according to a financial expert at JPMorgan Chase.",NEUTRAL,0.859226,0.489393
7,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",NEUTRAL,0.927874,0.771568
8,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","Apple stock price decreased in August 2024, according to Roger.",NEUTRAL,0.876008,0.375553
9,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","Intel stock price should stay the same in January 2028, according to a financial expert at Harvard University.",NEUTRAL,0.900117,0.394057


## Extract Embeddings with BERT

In [6]:
def _bert_extract(text_column):
    """Build a BERT extractor for one column of ``entailment_df`` and run it.

    Returns the extractor together with the result of its
    ``sentence_feature_extraction()`` call, as an ``(extractor, embeddings)``
    tuple.
    """
    extractor = BertFeatureExtraction(entailment_df, text_column)
    return extractor, extractor.sentence_feature_extraction()


# Predictions first, then observations.
bert_fe_pred, bert_embeddings_pred = _bert_extract('Prediction')
bert_fe_obser, bert_embeddings_obser = _bert_extract('Observation')

# Compare the two embedding sets with the project's cosine-similarity helper
# and store the result in a new 'BERT' column.
bert_cs_metrics = Metrics.get_cosine_similarity(bert_embeddings_pred, bert_embeddings_obser)
entailment_df['BERT'] = bert_cs_metrics
entailment_df

100%|██████████| 12/12 [00:00<00:00, 36.57it/s]
100%|██████████| 12/12 [00:00<00:00, 35.03it/s]
100%|██████████| 12/12 [00:00<00:00, 7067.07it/s]


Unnamed: 0,Prediction,Observation,Entailment Label,Spacy,SentenceTransformer,BERT
0,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.","In Q2 2025, a financial research advisor envisions that the research and development expenses at Google has some probability to remain stable.",NEUTRAL,0.773352,0.328824,0.899819
1,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"According to the financial top executive at BlackRock, the stock price at Amazon may rise in Q4 2028.",NEUTRAL,0.866826,0.64474,0.941268
2,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"In the fourth quarter of 2027, Wells Fargo envisions that the operating cash flow at Intel has some probability to remain stable.",NEUTRAL,0.865429,0.311582,0.896069
3,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"Intel stock price should stay the same in January 2028, according to a financial expert at Harvard University.",NEUTRAL,0.812578,0.321734,0.903572
4,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"On March 1, 2029, the financial advisor at Wells Fargo envisions that the inflation rate at the Federal Reserve has some probability to remain stable.",NEUTRAL,0.845318,0.282765,0.819832
5,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"On November 10, 2022, to November 10, 2023, Citigroup speculates the research and development expenses at Amazon will likely increase.",NEUTRAL,0.815297,0.490033,0.913402
6,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"The trading volume at Apple should stay same in the fourth quarter of 2025, according to a financial expert at JPMorgan Chase.",NEUTRAL,0.859226,0.489393,0.918903
7,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",NEUTRAL,0.927874,0.771568,0.91856
8,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","Apple stock price decreased in August 2024, according to Roger.",NEUTRAL,0.876008,0.375553,0.888524
9,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","Intel stock price should stay the same in January 2028, according to a financial expert at Harvard University.",NEUTRAL,0.900117,0.394057,0.874122


## Extract Embeddings with TF x IDF

In [7]:
# NOTE(review): this TF-IDF comparison is disabled -- every line below is
# commented out, so the cell does nothing when run.
# Before re-enabling it:
#   * TfidfFeatureExtraction is not among the names imported from
#     feature_extraction in this notebook's import cell; it would have to be
#     imported first.
#   * The `.iloc[:1, 2:]` slices keep only the first row of each score table,
#     which presumably does not match the full-length column assigned to
#     entailment_df['TF x IDF'] -- confirm the intended row range.
# tfidf_fe_pred = TfidfFeatureExtraction(entailment_df, 'Prediction')
# tfidf_pred_df = tfidf_fe_pred.feature_scores(max_features=300)
# tfidf_pred_df[:1]
# tfidf_embedding_pred = tfidf_pred_df.iloc[:1 , 2:].to_numpy()

# tfidf_fe_obser = TfidfFeatureExtraction(entailment_df, 'Observation')
# tfidf_obser_df = tfidf_fe_obser.feature_scores(max_features=300)
# tfidf_obser_df[:1]
# tfidf_embedding_obser = tfidf_obser_df.iloc[:1 , 2:].to_numpy()

# tfidf_cs_metrics = Metrics.get_cosine_similarity(tfidf_embedding_pred, tfidf_embedding_obser)
# entailment_df['TF x IDF'] = tfidf_cs_metrics
# entailment_df

In [8]:
# Summary table: one row per (Prediction, Observation, Entailment Label)
# combination, with each model's cosine-similarity score as a numeric column.
# The text/label columns form the row index so repeated predictions are
# grouped together in the display.
#
# `values` must name the numeric score columns. A column that is also an index
# key (such as 'Entailment Label') leaves nothing to aggregate, which produces
# a table with no value columns and pushes the scores into the index.
pair_columns = ['Prediction', 'Observation', 'Entailment Label']
score_columns = ['Spacy', 'SentenceTransformer', 'BERT']

pred_obser_features_df = pd.pivot_table(
    entailment_df,
    values=score_columns,
    index=pair_columns,
    aggfunc='mean',
)[score_columns]  # re-select to keep the Spacy / SentenceTransformer / BERT column order
pred_obser_features_df

Prediction,Observation,Entailment Label,Spacy,SentenceTransformer,BERT
"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.","In Q2 2025, a financial research advisor envisions that the research and development expenses at Google has some probability to remain stable.",NEUTRAL,0.773352,0.328824,0.899819
JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"According to the financial top executive at BlackRock, the stock price at Amazon may rise in Q4 2028.",NEUTRAL,0.866826,0.64474,0.941268
JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"In the fourth quarter of 2027, Wells Fargo envisions that the operating cash flow at Intel has some probability to remain stable.",NEUTRAL,0.865429,0.311582,0.896069
JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"Intel stock price should stay the same in January 2028, according to a financial expert at Harvard University.",NEUTRAL,0.812578,0.321734,0.903572
JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"On March 1, 2029, the financial advisor at Wells Fargo envisions that the inflation rate at the Federal Reserve has some probability to remain stable.",NEUTRAL,0.845318,0.282765,0.819832
JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"On November 10, 2022, to November 10, 2023, Citigroup speculates the research and development expenses at Amazon will likely increase.",NEUTRAL,0.815297,0.490033,0.913402
JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,"The trading volume at Apple should stay same in the fourth quarter of 2025, according to a financial expert at JPMorgan Chase.",NEUTRAL,0.859226,0.489393,0.918903
"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",NEUTRAL,0.927874,0.771568,0.91856
"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","Apple stock price decreased in August 2024, according to Roger.",NEUTRAL,0.876008,0.375553,0.888524
"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","Intel stock price should stay the same in January 2028, according to a financial expert at Harvard University.",NEUTRAL,0.900117,0.394057,0.874122
