# Extract Features

1. Read CSV files and load them as DataFrames
2. Combine the DataFrames
3. Get semantic cosine similarity

In [1]:
import os, sys

import pandas as pd

# Remember the directory the notebook was launched from; later cells pass it
# on to log_files.read_data.
notebook_dir = os.getcwd()
# Make the parent directory importable (presumably where the project-local
# modules imported right after this block live). The path is normalized and
# appended only when missing, so re-running this cell does not keep adding
# duplicate '.../../' entries to sys.path.
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import log_files
from log_files import LogData
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
# Display configuration for the whole notebook.
# Fully qualified option names are used on purpose: shorthand keys such as
# 'max_colwidth' are resolved by pattern matching and can stop working if
# pandas adds another option with a similar name.
pd.set_option('display.max_colwidth', 800)   # show long sentences without truncation
pd.set_option('display.max_columns', None)   # None removes the column limit
pd.set_option('display.max_rows', None)      # None removes the row limit

## Predictions

- Use the structure from `1-generate_predictions-all_domains.ipynb`

In [3]:
# Read the saved prediction batches into one DataFrame and preview it.
predictions = True  # forwarded as the third argument of read_data
log_file_path = "data/prediction_logs"
predictions_df = log_files.read_data(
    notebook_dir,
    log_file_path,
    predictions,
)
predictions_df.head(n=7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_1-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_1-prediction/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_2-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/prediction_logs/batch_2-prediction/batch_2-from_df.csv


Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.,1,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


## Observations

In [4]:
# Read the saved observation batches into one DataFrame and preview it.
predictions = False  # forwarded as the third argument of read_data
log_file_path = "data/observation_logs"
observations_df = log_files.read_data(
    notebook_dir,
    log_file_path,
    predictions,
)
observations_df.head(n=7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_1-observation
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_1-observation/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_2-observation
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/misc_experiments/../data/observation_logs/batch_2-observation/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/m

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,The financial analyst at Goldman Sachs observed that the operating income at Tesla had increased in the first quarter of 2024.,0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On 2024-08-20 to 2025-08-20, Morgan Stanley speculates the stock price at Amazon will likely rise.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"A young investor predicts on 2025-03-15, the S&P 500 index may rise.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 2027-01-01 to 2027-12-31, Wells Fargo envisions that the interest rates at the Federal Reserve have some probability to remain stable.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The trading volume at Apple should stay same in the fourth quarter of 2025, according to a financial expert at JPMorgan Chase.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan observed that the net profit at Microsoft had risen in September 2023.,0,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


## Both

- Create a knowledge graph
    - Nodes: words
    - Edges: connection to other words (same/diff sentence)
- Look at code from Graphbreeding project on 2019 Mac

In [5]:
# Combine the prediction and observation frames (predictions first) and
# preview the first rows of the result.
df = DataProcessing.concat_dfs(
    [
        predictions_df,
        observations_df,
    ]
)
df.head(n=3)

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3


In [6]:
# Pull the "Base Sentence" column out of each frame via df_to_list:
# predictions first, then observations (same call order as before).
predictions, observations = (
    DataProcessing.df_to_list(frame, "Base Sentence")
    for frame in (predictions_df, observations_df)
)

In [7]:
# from text_generation_models import LlamaVersatileTextGenerationModel
# llama_versatile_generation_model = LlamaVersatileTextGenerationModel()

# from text_generation_models import TextGenerationModelFactory
# tgmf = TextGenerationModelFactory()

# # llama_versatile_generation_model = tgmf.create_instance(model_name='llama-3.3-70b-versatile')
# # llama_instant_generation_model = tgmf.create_instance('llama-3.1-8b-instant')
# llama_8b_8192_generation_model = tgmf.create_instance('llama3-8b-8192')


# prompt = f"Can you return the observations ({observations}) that certify this prediction ({predictions[0]})? Only write the observations that certify the prediction."
# # prompt = f"Can you return a list of the observations ({observations}) that certify this prediction ({predictions[0]}) and why? Only write the observations that certify the prediction and why. Do not write any other text. "
# input_prompt = llama_8b_8192_generation_model.user(prompt)
# # print(input_prompt)
# # raw_text = self.chat_completion([self.user(prompt_template)])
# raw_text = llama_8b_8192_generation_model.chat_completion([input_prompt])
# print(raw_text)

In [8]:
# raw_text

In [9]:
# # Extract observations from the string
# observations = raw_text.split('\n\n')[1].split('\n')[1:-1]
# # Remove numbering and quotes
# observations = [obs.split('. ', 1)[1].strip('"') for obs in observations]
# # Print the list
# print(observations)


In [10]:
# type(observations)

In [11]:
# observations[1]

In [12]:
# from text_generation_models import TextGenerationModelFactory
# tgmf = TextGenerationModelFactory()

# llama_versatile_generation_model = tgmf.create_instance(model_name='llama-3.3-70b-versatile')
# llama_instant_generation_model = tgmf.create_instance('llama-3.1-8b-instant')
# llama_70b_8192_generation_model = tgmf.create_instance('llama3-70b-8192')
# llama_8b_8192_generation_model = tgmf.create_instance('llama3-8b-8192')

# models = [llama_70b_8192_generation_model]
# # models = [llama_versatile_generation_model, llama_instant_generation_model, llama_70b_8192_generation_model, llama_8b_8192_generation_model]
# # Prompt for the model

# prompt = f"Can you return the observations ({observations}) that certify this prediction ({predictions[0]})? Write in the format of ({predictions[0]}, {observations})"
# input_prompt = llama_versatile_generation_model.user(prompt)
# # print(input_prompt)

# model_certify = {}
# for model in models:    
    
#     raw_text = model.chat_completion([input_prompt])
#     print(raw_text)

In [13]:
# df

In [14]:
# data = []
# for model, output in model_certify.items():
#     for o in output:
#         data.append([model, o])

# # Create the DataFrame
# df = pd.DataFrame(data, columns=['Model', 'Output'])
# df

In [15]:
# import pandas as pd
# from text_generation_models import TextGenerationModelFactory

# # Initialize the TextGenerationModelFactory
# tgmf = TextGenerationModelFactory()

# # Create instances of the models
# llama_versatile_generation_model = tgmf.create_instance(model_name='llama-3.3-70b-versatile')
# llama_instant_generation_model = tgmf.create_instance('llama-3.1-8b-instant')
# llama_70b_8192_generation_model = tgmf.create_instance('llama3-70b-8192')
# llama_8b_8192_generation_model = tgmf.create_instance('llama3-8b-8192')

# # List of models
# models = [llama_instant_generation_model, llama_70b_8192_generation_model, llama_8b_8192_generation_model]

# # Prompt for the model
# prompt = f"Can you return a list of the observations ({observations}) that certify this prediction ({predictions[0]})? Only write the observations that certify the prediction. Do not write any other text."
# input_prompt = llama_versatile_generation_model.user(prompt)

# # Dictionary to store model outputs
# model_certify = {}
# for model in models:
#     raw_text = model.chat_completion([input_prompt])
#     output = [line.strip().replace("*", "") for line in raw_text.split("\n") if line.strip()]
#     model_certify[model.model_name] = output

# # Prepare data for DataFrame
# data = []
# for model, output in model_certify.items():
#     for o in output:
#         if isinstance(o, list):
#             o = ', '.join(o)
#         data.append([predictions[0], model, o])

# # Create the DataFrame
# df = pd.DataFrame(data, columns=['Prediction', 'Model', 'Observations'])

# # Display the DataFrame
# df


In [16]:
# df

- Don't use only `predictions[0]`; use the structure from `1-generate_prediction-all_domains.ipynb` and the spaCy tokens

In [17]:
# Display the first prediction sentence.
# NOTE(review): outside an f-string the braces build a one-element set, so the
# cell shows {'...'} rather than the bare string — confirm this is intended.
{predictions[0]}

{'JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.'}

In [18]:
from text_generation_models import TextGenerationModelFactory

# Build one generation-model instance per model name through the factory.
tgmf = TextGenerationModelFactory()

llama_versatile_generation_model = tgmf.create_instance(model_name='llama-3.3-70b-versatile')
llama_instant_generation_model = tgmf.create_instance('llama-3.1-8b-instant')
llama_70b_8192_generation_model = tgmf.create_instance('llama3-70b-8192')
llama_8b_8192_generation_model = tgmf.create_instance('llama3-8b-8192')

# Only the instant model is queried for now; add the other instances to this
# list to compare their answers.
models = [llama_instant_generation_model]

# Ask which of the observations certify the first prediction.
prompt = f"Return a list of the observations ({observations}) that certify this prediction ({predictions[0]})?"
# NOTE(review): presumably .user() wraps the prompt as a user-role chat
# message; the same message is reused for every model in `models`.
input_prompt = llama_versatile_generation_model.user(prompt)

# Map each model name to the non-empty, stripped lines of its response.
# chat_completion needs a valid API key: the recorded run failed with a 401
# AuthenticationError ("Invalid API Key").
model_certify = {}
for model in models:
    raw_text = model.chat_completion([input_prompt])
    model_certify[model.model_name] = [
        line.strip() for line in raw_text.split("\n") if line.strip()
    ]

print(model_certify)

# Flatten to one [model name, output line] row per response line. Distinct
# loop names avoid re-binding `model` (still a model object here) and avoid
# iterating a variable over itself.
data = [
    [model_name, output_line]
    for model_name, output_lines in model_certify.items()
    for output_line in output_lines
]

# Create the DataFrame.
# NOTE(review): this re-binds `df`, replacing the combined
# predictions/observations frame built earlier in the notebook.
df = pd.DataFrame(data, columns=['Model', 'Output'])
df

AuthenticationError: Error code: 401 - {'error': {'message': 'Invalid API Key', 'type': 'invalid_request_error', 'code': 'invalid_api_key'}}