# Extract Features

1. Read the CSV files and load them as DataFrames
2. Combine the DataFrames
3. Get semantic cosine similarity

In [1]:
import os, sys

import pandas as pd

# Directory the notebook kernel is running in; later cells pass it to log_files.read_data.
notebook_dir = os.getcwd()
# Make the parent directory importable so the project-local imports below resolve.
# The path is normalised and appended only when it is not already present, so
# re-running this cell does not pile up duplicate sys.path entries.
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import log_files
from log_files import LogData
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
# Notebook-wide pandas display settings: allow up to 800 characters per cell
# and lift the limits on how many columns and rows are rendered.
display_options = {
    'max_colwidth': 800,
    'display.max_columns': None,
    'display.max_rows': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# Read the prediction logs into a DataFrame via log_files.read_data.
log_file_path = "data/prediction_logs"
# Flag handed to read_data as its third argument (presumably selects the
# "-predictions" batch folders; confirm in log_files). It has its own name so
# that re-running this cell cannot overwrite the `predictions` list of
# sentences built by a later cell with a bool.
is_prediction_log = True
predictions_df = log_files.read_data(notebook_dir, log_file_path, is_prediction_log)
predictions_df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_1-predictions
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_1-predictions/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_2-predictions
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_2-predictions/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_e

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"On August 21, 2024, as a financial analyst, I predict that the revenue at Amazon will potentially decrease in Q3 of 2027.",1,finance,gpt-3.5-turbo,NAVI_GATOR,0,1
1,"On 2024-08-21, Morgan Stanley speculates the operating income at Amazon will likely increase.",1,finance,gpt-4o,NAVI_GATOR,0,2
2,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,1
3,"On August 21, 2023, Goldman Sachs speculates that the stock price at Amazon will likely increase.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,2
4,"George, a financial analyst, predicts that on 01/15/2024, the Google revenue may rise.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,3
5,"According to BlackRock, the operating cash flow at ExxonMobil would fall in Q3 2023.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,4
6,"In Q2 2023, Wells Fargo envisions that the stock price at Tesla has some probability to remain stable.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,5


## Observations

In [4]:
# Read the observation logs into a DataFrame via log_files.read_data.
log_file_path = "data/observation_logs"
# Flag handed to read_data as its third argument (presumably False selects the
# "-observations" batch folders; confirm in log_files). It has its own name so
# that re-running this cell cannot overwrite the `predictions` list of
# sentences built by a later cell with a bool.
is_prediction_log = False
observations_df = log_files.read_data(notebook_dir, log_file_path, is_prediction_log)
observations_df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_1-observations
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_1-observations/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_2-observations
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_2-observations/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictio

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"The financial reporter, Sarah, witnessed the stock price at Tesla plummeted in Q2 of 2023.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,1
1,"From May 15, 2025, the financial analyst speculated the net profit at Amazon was drastically different.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,2
2,"On July 1, 2024, the financial expert heard that the revenue at Google remained stable.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,5
3,"Apple's operating income generally rose in August 21, 2027, according to the financial top executive.",0,finance,gpt-3.5-turbo,NAVI_GATOR,0,6
4,Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.,0,finance,gpt-4o,NAVI_GATOR,0,1
5,"From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.",0,finance,gpt-4o,NAVI_GATOR,0,2
6,"2026/12/01, a financial analyst heard that the operating income at Amazon remained stable.",0,finance,gpt-4o,NAVI_GATOR,0,5


## Both

- Create a knowledge graph
    - Nodes: words
    - Edges: connections to other words (in the same or a different sentence)
- Look at code from Graphbreeding project on 2019 Mac

In [5]:
# Hand both frames (predictions first, then observations) to
# DataProcessing.concat_dfs and preview the first three rows of the result.
frames_to_combine = [predictions_df, observations_df]
df = DataProcessing.concat_dfs(frames_to_combine)
df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"On August 21, 2024, as a financial analyst, I predict that the revenue at Amazon will potentially decrease in Q3 of 2027.",1,finance,gpt-3.5-turbo,NAVI_GATOR,0,1
1,"On 2024-08-21, Morgan Stanley speculates the operating income at Amazon will likely increase.",1,finance,gpt-4o,NAVI_GATOR,0,2
2,"According to JPMorgan Chase, the net profit at Microsoft is expected to decrease in Q2 of 2024.",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,1


In [6]:
# Extract the sentence column from each frame with DataProcessing.df_to_list.
sentence_column = "Base Sentence"
predictions = DataProcessing.df_to_list(predictions_df, sentence_column)
observations = DataProcessing.df_to_list(observations_df, sentence_column)

In [27]:
# NOTE(review): consider moving this import into the notebook's first (imports) cell.
from text_generation_models import LlamaVersatileTextGenerationModel

# Ask the model which observations back up the first prediction.
llama_versatile_generation_model = LlamaVersatileTextGenerationModel()

# The full observations list and the first prediction are interpolated into the prompt.
prompt = f"Return a list of the observations ({observations}) that certify this prediction ({predictions[0]})?"
input_prompt = llama_versatile_generation_model.user(prompt)
print(input_prompt)

raw_text = llama_versatile_generation_model.chat_completion([input_prompt])

# Keep one entry per non-blank line of the reply, with surrounding whitespace removed.
output = [line.strip() for line in raw_text.split("\n") if line.strip()]

# NOTE(review): this rebinds `df`, which an earlier cell uses for the combined frame.
df = pd.DataFrame(output, columns=["Certified Matches"])
df

{'role': 'user', 'content': 'Return a list of the observations ([\'The financial reporter, Sarah, witnessed the stock price at Tesla plummeted in Q2 of 2023.\', \'From May 15, 2025, the financial analyst speculated the net profit at Amazon was drastically different.\', \'On July 1, 2024, the financial expert heard that the revenue at Google remained stable.\', "Apple\'s operating income generally rose in August 21, 2027, according to the financial top executive.", \'Goldman Sachs saw the net profit at Tesla plummeted in 2023-11-15.\', \'From 2025-03-30, Morgan Stanley speculated the gross profit at Apple was drastically different.\', \'2026/12/01, a financial analyst heard that the operating income at Amazon remained stable.\', \'Microsoft revenue generally rose in 2024/07/22, according to a financial expert.\', \'Noting the date of Q3 2027, a seasoned financial analyst observed that the net profit at Tesla had significantly plummeted.\', \'From Q2 2024, a financial reporter speculated

Unnamed: 0,Certified Matches
0,"Based on the provided observations, the following list of observations may certify the prediction that the revenue at Amazon will potentially decrease in Q3 of 2027:"
1,"1. 'From May 15, 2025, the financial analyst speculated the net profit at Amazon was drastically different.'"
2,"2. 'From July 15, 2024, the financial analyst, Mark, speculated the net profit at Amazon was drastically different.'"
3,"3. 'Noting the date of Q3 2027, a seasoned financial analyst observed that the net profit at Tesla had significantly plummeted.' (This observation is related to Tesla, but it mentions Q3 2027, which is relevant to the prediction.)"
4,"4. 'From Q2 2024, Goldman Sachs' research department speculated that the stock price at Microsoft was drastically different.' (This observation is related to Microsoft, but it shows that financial analysts are speculating about drastic changes in stock prices and revenue.)"
5,"5. 'The financial reporter, Sarah, witnessed the stock price at Tesla plummeted in Q2 of 2023.' (This observation shows that stock prices can plummet, which may be related to revenue decreases.)"
6,"These observations do not directly certify the prediction, but they provide some context and insights into the financial market and the speculations of financial analysts. However, it is essential to note that these observations are not conclusive evidence and should be considered in conjunction with other factors and data to make a more informed prediction."


In [None]:
# TODO: disabled draft. Sends the same prompt to several Llama models created
# through TextGenerationModelFactory. As written here, the loop calls
# chat_completion but never stores the responses in `df`; finish or delete
# this cell before sharing the notebook.
# from text_generation_models import TextGenerationModelFactory
# tgmf = TextGenerationModelFactory()

# llama_versatile_generation_model = tgmf.create_instance(model_name='llama-3.3-70b-versatile')
# llama_instant_generation_model = tgmf.create_instance('llama-3.1-8b-instant')
# llama_70b_8192_generation_model = tgmf.create_instance('llama3-70b-8192')
# llama_8b_8192_generation_model = tgmf.create_instance('llama3-8b-8192')

# models = [llama_versatile_generation_model, llama_instant_generation_model, llama_70b_8192_generation_model, llama_8b_8192_generation_model]
# # Prompt for the model

# prompt = f"Return a list of the observations ({observations}) that certify this prediction ({predictions[0]})?"
# input_prompt = llama_versatile_generation_model.user(prompt)
# # print(input_prompt)

# df = pd.DataFrame(columns=["Model", "Prompt", "Response"])
# for model in models:
#     raw_text = model.chat_completion([input_prompt])
    