# Embeddings

In [2]:
import os, sys

import numpy as np
import pandas as pd

from tqdm import tqdm

# Working directory of the kernel; the relative paths used in this notebook
# assume it is the folder that contains the notebook.
notebook_dir = os.getcwd()
# Make the project modules under ../predictions/ importable. The entry is only
# appended when it is missing, so re-running this cell does not pile up
# duplicate sys.path entries.
predictions_src_dir = os.path.join(notebook_dir, '../predictions/')
if predictions_src_dir not in sys.path:
    sys.path.append(predictions_src_dir)

import log_files
from log_files import LogData
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction, TfidfFeatureExtraction

In [3]:
# Notebook-wide pandas display settings. Fully-qualified option names are used
# because pandas resolves abbreviated keys by pattern matching, which can become
# ambiguous if a later release adds a similarly named option.
pd.set_option('display.max_colwidth', 800)  # show up to 800 characters per cell
pd.set_option('display.max_columns', None)  # no limit on displayed columns
pd.set_option('display.max_rows', None)     # no limit on displayed rows: slice (e.g. .head()) before displaying

## Read CSV files and load them as DataFrames

In [4]:
# Read the prediction logs into a DataFrame and preview the first rows.
log_file_path = "data/prediction_logs"
# Flag handed to read_data as its third positional argument. It has its own
# name because `predictions` is bound further down to the list of prediction
# sentences; sharing the name meant the value's type depended on which cell
# had run last.
is_prediction_log = True
predictions_df = log_files.read_data(notebook_dir, log_file_path, is_prediction_log)
predictions_df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/notebook_experiments/../data/prediction_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/notebook_experiments/../data/prediction_logs/batch_1-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/notebook_experiments/../data/prediction_logs/batch_1-prediction/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/notebook_experiments/../data/prediction_logs/batch_2-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/notebook_experiments/../data/prediction_logs/batch_2-prediction/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Docu

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.,1,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


In [None]:
# Read the observation logs into a DataFrame and preview the first rows.
log_file_path = "data/observation_logs"
# Flag handed to read_data as its third positional argument. It has its own
# name because `predictions` is bound in the next cell to the list of
# prediction sentences; storing a boolean under that name made the value's
# type depend on which cell had run last.
is_prediction_log = False
observations_df = log_files.read_data(notebook_dir, log_file_path, is_prediction_log)
observations_df.head(7)

In [None]:
# Pull the sentence column out of both frames via DataProcessing.df_to_list,
# then show how many sentences each side contributes.
col_name = 'Base Sentence'
predictions, observations = [
    DataProcessing.df_to_list(frame, col_name)
    for frame in (predictions_df, observations_df)
]
len(predictions), len(observations)

## Extract spaCy embeddings

In [None]:
# NOTE(review): `disable_components` is assigned here but never read in the
# visible notebook — presumably it was meant to be passed to
# SpacyFeatureExtraction; confirm and either wire it in or remove it.
disable_components = [""]
# Run sentence-level feature extraction over the "Base Sentence" column of the
# prediction frame ...
pred_spacy_fe = SpacyFeatureExtraction(predictions_df, "Base Sentence")
pred_sentence_features = pred_spacy_fe.sentence_feature_extraction()

# ... and the same for the observation frame.
obser_spacy_fe = SpacyFeatureExtraction(observations_df, "Base Sentence")
obser_sentence_features = obser_spacy_fe.sentence_feature_extraction()

In [None]:
# Keep only the first entry of each feature collection; these two values are
# what the cosine-similarity comparison at the end of the notebook uses.
# assumes entry i is the sentence vector for row i of the frame — TODO confirm
# against SpacyFeatureExtraction.sentence_feature_extraction
spacy_pred_sent_embedding = pred_sentence_features[0]
spacy_obser_sent_embedding = obser_sentence_features[0]
# Debug aid: uncomment to inspect the vector size.
# spacy_pred_sent_embedding.shape

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-Transformers baseline using the all-MiniLM-L6-v2 checkpoint.
model = SentenceTransformer("all-MiniLM-L6-v2")

# The pair under comparison: first prediction and first observation.
sentences = [predictions[0], observations[0]]
print(sentences)

# Encode both sentences as one batch.
embeddings = model.encode(sentences)
print(embeddings.shape)
# => (2, 384): one row per sentence

# Similarity of the batch with itself: a 2 x 2 matrix whose off-diagonal
# entries are the prediction/observation similarity.
similarities = model.similarity(embeddings, embeddings)
print(similarities)

# Encode each sentence on its own; these two vectors are the ones passed to
# the cosine-similarity comparison later in the notebook.
sent_transformer_pred_sent_embedding = model.encode(predictions[0])
sent_transformer_obser_sent_embedding = model.encode(observations[0])
similarities = model.similarity(sent_transformer_pred_sent_embedding, sent_transformer_obser_sent_embedding)
print(similarities)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

from transformers import BertTokenizer, BertModel
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_bert(model_name: str):
    """Load the tokenizer/model pair for ``model_name`` once and reuse it on later calls."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    model.eval()  # inference only: keep dropout disabled
    return tokenizer, model


def get_bert_embeddings(sentence: str, model_name: str = 'bert-base-uncased'):
    """Return the BERT [CLS] embedding of ``sentence``.

    The tokenizer and model are loaded through a cached helper, so calling this
    function repeatedly does not reload the checkpoint on every call.

    Args:
        sentence: Text to embed.
        model_name: Checkpoint passed to ``from_pretrained``; defaults to
            ``'bert-base-uncased'``, the checkpoint this function always used.

    Returns:
        A torch tensor holding the last-layer hidden state of the first token
        ([CLS]), with size-1 dimensions squeezed away.
    """
    tokenizer, model = _load_bert(model_name)

    # truncation=True clips over-long text to the model's maximum input length
    # instead of letting the forward pass fail on it.
    encoded_input = tokenizer(sentence, return_tensors='pt', truncation=True)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        output = model(**encoded_input)

    # Index 0 on the token axis is the [CLS] token.
    sentence_embedding = output.last_hidden_state[:, 0, :].squeeze()

    return sentence_embedding

In [None]:
# BERT [CLS] embeddings for the same pair: first prediction and first observation.
bert_pred_sent_embedding = get_bert_embeddings(predictions[0])
bert_obser_sent_embedding = get_bert_embeddings(observations[0])

In [None]:
# Build one TF-IDF extractor per corpus over the 'Base Sentence' column and
# request feature scores with max_features=300 from each.
# NOTE(review): the two extractors are created independently, so presumably each
# learns its own vocabulary. If so, column j of pred_tfidf_df and column j of
# obser_tfidf_df need not describe the same term (and the column counts may
# differ), which would make a row-to-row comparison meaningless — verify in
# TfidfFeatureExtraction and fit on a shared vocabulary if needed.
pred_tfidf_fe = TfidfFeatureExtraction(predictions_df, 'Base Sentence')
obser_tfidf_fe = TfidfFeatureExtraction(observations_df, 'Base Sentence')

pred_tfidf_df = pred_tfidf_fe.feature_scores(max_features=300)
obser_tfidf_df = obser_tfidf_fe.feature_scores(max_features=300)

In [None]:
# Preview the first row of the prediction TF-IDF table.
pred_tfidf_df.iloc[:1]

In [None]:
# First row only, columns from position 2 onward, as a 2-D NumPy array (1 row).
# assumes the first two columns are not TF-IDF features — TODO confirm against
# the frame returned by feature_scores
tfidf_pred_embedding = pred_tfidf_df.iloc[:1 , 2:].to_numpy()

In [None]:
# Preview the first row of the observation TF-IDF table.
obser_tfidf_df.iloc[:1]

In [None]:
# First row only, columns from position 2 onward, as a 2-D NumPy array (1 row).
# assumes the first two columns are not TF-IDF features — TODO confirm against
# the frame returned by feature_scores
tfidf_obser_embedding = obser_tfidf_df.iloc[:1 , 2:].to_numpy()
# Display the extracted vector.
tfidf_obser_embedding

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_similarity(prediction_embeddings: np.ndarray, observation_embeddings: np.ndarray):
    """Cosine similarity between one prediction vector and one observation vector.

    Args:
        prediction_embeddings: A single embedding, given as anything
            ``np.asarray`` can convert (NumPy array, nested list, ...). It is
            flattened into one row, so a (1, dim) array and a (dim,) array are
            treated the same.
        observation_embeddings: The embedding to compare against; must contain
            the same number of elements.

    Returns:
        The similarity as a NumPy scalar.

    Raises:
        ValueError: If the two embeddings do not have the same number of
            elements.
    """
    # np.asarray lets callers hand in array-likes, not only ndarrays;
    # reshape(1, -1) produces the (1 x vector_dim) layout sklearn expects.
    pred_row = np.asarray(prediction_embeddings).reshape(1, -1)
    obser_row = np.asarray(observation_embeddings).reshape(1, -1)

    # Fail early with a message that names both sizes.
    if pred_row.shape[1] != obser_row.shape[1]:
        raise ValueError(
            "Embedding sizes differ: prediction has "
            f"{pred_row.shape[1]} elements, observation has {obser_row.shape[1]}"
        )

    # The result is a 1 x 1 matrix; return its only entry.
    sim = cosine_similarity(pred_row, obser_row)[0, 0]

    return sim

In [None]:
# Score the same sentence pair (first prediction vs. first observation) with
# each embedding method.
spacy_cs_metric = get_cosine_similarity(spacy_pred_sent_embedding, spacy_obser_sent_embedding)
# NOTE(review): "tranformer" in the next variable name is a typo for
# "transformer"; it is left unchanged here so the name stays stable.
sent_tranformer_cs_metric = get_cosine_similarity(sent_transformer_pred_sent_embedding, sent_transformer_obser_sent_embedding)
bert_cs_metric = get_cosine_similarity(bert_pred_sent_embedding, bert_obser_sent_embedding)
# NOTE(review): the two TF-IDF vectors come from separately built extractors —
# confirm they share a vocabulary before trusting this score.
tfidf_cs_metric = get_cosine_similarity(tfidf_pred_embedding, tfidf_obser_embedding)

# Display the four scores in order: spaCy, Sentence-Transformers, BERT, TF-IDF.
spacy_cs_metric, sent_tranformer_cs_metric, bert_cs_metric, tfidf_cs_metric