In [1]:
import os
import sys

import pandas as pd

from scipy.spatial import distance

notebook_dir = os.getcwd()

sys.path.append(os.path.join(notebook_dir, ".."))

from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
pd.set_option('max_colwidth', 800)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Load Data

In [3]:
base_data_path = os.path.join(notebook_dir, "../data")
subset_size = 3

In [4]:
prediction_file_path = os.path.join(base_data_path, "prediction_logs/batch_1-prediction/batch_1-from_df.csv")
prediction_df = DataProcessing.load_from_file(prediction_file_path, 'csv')
prediction_df = prediction_df.loc[:subset_size, :]
prediction_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3


In [5]:
observation_file_path = os.path.join(base_data_path, "observation_logs/batch_13-observation/batch_13-from_df.csv")
observation_df = DataProcessing.load_from_file(observation_file_path, 'csv')
observation_df = observation_df.loc[:subset_size, :]
observation_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,Morgan Stanley observed that the revenue at Tesla remained stable in Q4 2023.,0,finance,gemma2-9b-it,GROQ_CLOUD,0,1
1,"On 08/21/2024, Bloomberg analysts monitored the net profit at Amazon changed.",0,finance,gemma2-9b-it,GROQ_CLOUD,0,2
2,"JP Morgan noted on 12/31/2022, the market capitalization at Alphabet fell.",0,finance,gemma2-9b-it,GROQ_CLOUD,0,3


## Embed Sentences

In [6]:
prediction_fe = SpacyFeatureExtraction(prediction_df, 'Base Sentence')
prediction_sentence_to_embeddings_df = prediction_fe.sentence_feature_extraction(attach_to_df=True)
prediction_sentence_to_embeddings_df.head(3)

100%|██████████| 4/4 [00:00<00:00, 205.06it/s]


Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number,Embedding
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1,"[-0.3040466, 0.22634023, -0.044873744, 0.05321863, -0.021814935, 0.050506078, -0.050781496, 0.17705332, 0.16600926, 1.4690967, -0.40100077, 0.07522851, 0.092218935, 0.035675135, 0.043386526, 0.030550629, -0.04794823, 0.92671317, -0.08738538, -0.10214056, -0.0046942458, 0.23118651, 0.05690287, -0.09401375, 0.023469876, 0.15920831, -0.024384828, 0.045344595, 0.0046730023, 0.0573052, 0.035406068, -0.013383806, -0.04864075, 0.13582823, -0.0032525612, -0.00046420842, 0.05715994, 0.20158848, 0.026289606, -0.021469561, 0.08178532, 0.043315817, 0.17112774, -0.020757563, 0.09502707, -0.07977632, -0.00072006695, -0.022497945, -0.07319131, 0.015171076, 0.015923813, -0.005278694, -0.112299636, -0.08400263, 0.040624812, -0.11581402, -0.021309063, -0.14613399, -0.095131546, -0.12737882, -0.05559337,..."
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2,"[-0.18284982, 0.31618637, 0.05471527, -0.06443716, -0.12651742, 0.04094796, -0.050175864, 0.012213166, 0.06476061, 1.8643172, -0.3114558, 0.04549353, 0.08643741, 0.058468692, -0.13359468, 0.02161961, -0.11321817, 1.1386179, -0.071944274, -0.0092364615, 0.008749722, 0.14753342, 0.03712756, -0.11733479, 0.04739451, 0.069236614, -0.16650662, -0.046210695, 0.0134748295, 0.1481042, 0.0036202346, -0.054114997, 0.011698042, 0.16874154, -0.0103031555, 0.027015723, -0.05061591, 0.09802757, -0.007672908, -0.005642483, -0.03455541, 0.02005733, 0.1546997, -0.04231457, 0.063842505, -0.035977997, -0.010835616, -0.10927093, 0.041564394, 0.076044075, 0.037202165, 0.03659551, -0.0971661, -0.08038428, 0.00661144, -0.02415889, 0.033524778, -0.10093203, -0.054347552, -0.07694652, -0.02421616, -0.07161913,..."
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3,"[-0.16487946, 0.39756328, 0.054461822, 0.094671406, 0.00819997, 0.023589306, -0.05402224, 0.04678935, -0.029905295, 1.5209506, -0.27982494, 0.023466066, 0.0056946985, 0.033199526, -0.15232277, 0.03827806, -0.12154387, 1.1366441, -0.087331764, -0.0059556076, -0.027337998, 0.06343441, 0.011176354, 0.018617705, 0.22321375, 0.18261969, -0.11046365, -0.08711927, 0.029319417, 0.15519166, -0.005161861, 0.07109976, 0.020871151, -0.009692762, 0.04298247, 0.12779824, -0.0033220947, 0.16457641, 0.008928422, 0.012633492, -0.03465824, -0.0031197632, 0.07736342, -0.00086362223, 0.043974623, 0.0016469404, -0.014479402, -0.06107889, 0.0966277, 0.12048379, 0.038715117, 0.097983934, -0.1381956, -0.020974701, -0.0050487113, 0.009455884, 0.058299344, -0.030679941, -0.095401876, -0.104209825, 0.020901423, ..."


In [7]:
observation_fe = SpacyFeatureExtraction(observation_df, 'Base Sentence')
observation_sentence_to_embeddings_df = observation_fe.sentence_feature_extraction(attach_to_df=True)
observation_sentence_to_embeddings_df.head(3)

100%|██████████| 4/4 [00:00<00:00, 307.15it/s]


Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number,Embedding
0,Morgan Stanley observed that the revenue at Tesla remained stable in Q4 2023.,0,finance,gemma2-9b-it,GROQ_CLOUD,0,1,"[-0.1316162, 0.2799326, 0.014293938, -0.010601596, -0.052468587, -0.031385913, -0.01743913, 0.00787945, 0.18814914, 1.3222173, -0.07643226, 0.06745286, 0.104227334, 0.040666774, 0.0064706625, 0.034329336, 0.025694935, 0.82014126, -0.089338824, -0.016382465, 0.13008274, 0.06921094, 0.083439805, -0.032673933, -0.020209273, 0.15608875, -0.17188089, 0.07992408, 0.020131003, 0.19941163, -0.05865679, -0.09097073, 0.017988674, 0.09900647, -0.092653394, -0.012442573, 0.10437608, 0.13059562, -0.16318339, -0.028446939, -0.010134664, 0.079758324, 0.14247426, 0.083422, 0.13712993, -0.054064002, 0.016065933, -0.17071342, -0.054295667, -0.11801446, 0.10039752, 0.04016822, -0.16423514, -0.08418427, 0.09913173, -0.13340446, 0.02725059, -0.10365815, 0.0412934, -0.055259, -0.12495499, -0.015519348, 0.01..."
1,"On 08/21/2024, Bloomberg analysts monitored the net profit at Amazon changed.",0,finance,gemma2-9b-it,GROQ_CLOUD,0,2,"[-0.25720593, 0.22906184, 0.01313231, 0.10379808, -0.09960501, 0.08714679, -0.08546818, -0.10745308, 0.03502116, 1.6454042, -0.27729586, 0.016878154, 0.111134544, -0.0861568, -0.036773838, 0.00694631, -0.09029221, 0.95719117, -0.13386115, 0.018098846, -0.016745878, 0.18134768, -0.053090155, 0.004529695, 0.003277077, 0.067548305, -0.21297279, 0.009631998, 0.027344607, 0.05089432, 0.074267395, -0.064367004, 0.016522845, 0.20069632, -0.085794464, 0.10959423, -0.07732474, 0.24975634, -0.068905994, -0.07547615, -0.02732014, -0.10759524, 0.118316226, 0.023133108, 0.039753698, 0.020240692, -0.09821439, -0.105103776, 0.004498928, 0.010155388, 0.02861061, 0.059838388, -0.23280254, 0.023359763, -0.022957077, -0.19820715, -0.06420855, -0.13613507, 0.07689747, -0.13237162, -0.06374891, -0.07543639..."
2,"JP Morgan noted on 12/31/2022, the market capitalization at Alphabet fell.",0,finance,gemma2-9b-it,GROQ_CLOUD,0,3,"[-0.11430392, 0.22474276, -0.09229383, -0.016676463, 0.049170457, -0.04921122, -0.049676795, -0.04188707, -0.055777695, 1.5504638, -0.25899917, -0.02484892, 0.16953184, -0.09150458, -0.05117438, 0.008394304, 0.0077573857, 0.88561535, -0.08691209, 0.027567608, 0.04576216, 0.15499592, 0.029860388, -0.06043917, 0.08349931, 0.1111407, -0.12770452, -0.020157997, -0.040339787, 0.07163678, 0.17477639, 0.03681408, 0.10708122, 0.09253777, -0.027724545, 0.041797843, 0.03290626, 0.16698934, -0.101132154, 0.038469836, 0.018272692, -0.034920227, -0.062111314, -0.017082917, 0.25162053, 0.022562865, -0.01889662, -0.05030362, 0.06188784, -0.023369847, 0.008708846, 0.15923938, -0.107831, 0.1687106, 0.008106685, -0.09502069, -0.016046384, -0.1652213, -0.015465991, -0.19346625, -0.017864218, -0.08498016,..."


In [21]:
def get_common_words(p_sentence, o_sentence):
    split_p = p_sentence.lower().split()
    split_o = o_sentence.lower().split()

    p_words = set(split_p)
    o_words = set(split_o)

    common_words = p_words & o_words  # {'the', 'brown', 'jumps'}

    return common_words

In [14]:
def get_similarites(p_embeddng, o_embedding):
    distances = {}
    dice_dissimilarity = distance.dice(p_embeddng, o_embedding)
    distances['Dice'] = 1 - dice_dissimilarity

    return distances

In [25]:
similarity_measures = []
for p_idx, p_row in prediction_sentence_to_embeddings_df.iterrows():
    per_prediction = []
    p_sentence = p_row['Base Sentence']
    print(f'Prediction: {p_sentence}')
    p_embedding = p_row['Embedding']

    for o_idx, o_row in observation_sentence_to_embeddings_df.iterrows():
        o_sentence = o_row['Base Sentence']
        print(f' Observation: {o_sentence}')
        o_embedding = o_row['Embedding']

        common_words = get_common_words(p_sentence, o_sentence)
        print(f"\tCommon words: {common_words}")
        # measures = get_similarites(p_embedding, o_embedding)

        # pair = {
        #     'Prediction Sentence': p_sentence,
        #     'Observation Sentence': o_sentence,
        #     **measures  # Unpacks the distance measures into this dictionary
        # }
        
        # similarity_measures.append(pair)    

Prediction: JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.
 Observation:  Morgan Stanley observed that the revenue at Tesla remained stable in Q4 2023.
	Common words: {'the', 'at', 'that', 'in'}
 Observation: On 08/21/2024, Bloomberg analysts monitored the net profit at Amazon changed.
	Common words: {'the', 'net', 'profit', 'amazon', 'at'}
 Observation: JP Morgan noted on 12/31/2022, the market capitalization at Alphabet fell.
	Common words: {'the', 'at'}
 Observation: According to Goldman Sachs, the operating income at Microsoft rose in 2024.
	Common words: {'the', 'at', 'in'}
Prediction: On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.
 Observation:  Morgan Stanley observed that the revenue at Tesla remained stable in Q4 2023.
	Common words: {'the', 'at', 'revenue'}
 Observation: On 08/21/2024, Bloomberg analysts monitored the net profit at Amazon changed.
	Common words: {'the', 'at', 'on'}
 Obs

In [23]:
similarity_df = pd.DataFrame(similarity_measures)
similarity_df

In [20]:
s1 = "Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.	"
s2 = "the lazy brown dog jumps"

lower = s1.lower()
split = lower.split()
print(split)

words_s1 = set(split)
words_s2 = set(s2.lower().split())

common_words = words_s1 & words_s2  # {'the', 'brown', 'jumps'}
common_words

['citigroup', 'predicts', 'on', '2024-08-21,', 'the', 'operating', 'income', 'at', 'alphabet', 'may', 'rise.']


{'the'}

- Remove stop words -> common words -> LLM(is common word a prediction tag) 
    - yes -> store
    - no -> next word