# Extract Features

1. Read csv files and load as dfs
2. Combine dfs
3. Get semantic cosine similarity

In [21]:
import os, sys

import pandas as pd
import numpy as np

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

import log_files
from log_files import LogData
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
pd.set_option('max_colwidth', 800)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Predictions

- Use the structure from `1-generate_predictions-all_domains.ipynb`

In [3]:
log_file_path = "data/prediction_logs"
predictions = True
predictions_df = log_files.read_data(notebook_dir, log_file_path, predictions)
predictions_df.head(7)

Start logging batch
log_directory: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/prediction_logs
save_batch_directory: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/prediction_logs/batch_1-prediction
CSV to DF
Load saved csv: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/prediction_logs/batch_1-prediction/batch_1-from_df.csv
save_batch_directory: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/prediction_logs/batch_2-prediction
CSV to DF
Load saved csv: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/prediction_logs/batch_2-prediction/batch_2-from_df.csv
save_batch_directory: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/prediction_logs/batch_3-prediction
CSV to DF
Load saved csv: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/prediction_logs/batch_3-prediction/batch_3-from_df.csv
save_batch_directory: /orange/ufdatast

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"Detravious, a financial analyst forecasts that the stock price at Johnson & Johnson will likely decrease in 2027 Q2.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,1
1,"On August 21, 2024, Goldman Sachs speculates that the operating cash flow at Microsoft will likely increase.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,2
2,"Morgan Stanley predicts that on September 15, 2025, the S&P 500 composite index will likely rise.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,3
3,"According to Apple, the projected revenue at Amazon will likely fall in Q4 2026.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,4
4,"In Q2 2025, Wells Fargo envisions that the U.S. dollar index will likely stay stable.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,5
5,"The Dow Jones Industrial Average will likely rise in Q3 2027, according to JPMorgan Chase.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,6
6,The World Health Organization forecasts that the obesity rates at urban health centers in the United States will likely decrease in 2027.,1,health,llama-3.1-8b-instant,GROQ_CLOUD,0,1


## Observations

In [4]:
log_file_path = "data/observation_logs"
predictions = False
observations_df = log_files.read_data(notebook_dir, log_file_path, predictions)
observations_df.head(7)

Start logging batch
log_directory: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/observation_logs
save_batch_directory: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/observation_logs/batch_1-observation
CSV to DF
Load saved csv: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/observation_logs/batch_1-observation/batch_1-from_df.csv
save_batch_directory: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/observation_logs/batch_2-observation
CSV to DF
Load saved csv: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/observation_logs/batch_2-observation/batch_2-from_df.csv
save_batch_directory: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/observation_logs/batch_3-observation
CSV to DF
Load saved csv: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/observation_logs/batch_3-observation/batch_3-from_df.csv
save_batch_directory: /or

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,The financial analyst at Goldman Sachs observed that the operating income at Tesla had increased in Q4 2027.,0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On 02/20/2026 to 02/20/2027, the research advisor at Harvard University monitored the net profit at Amazon changed.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Morgan Stanley noted on 08/15/2025, the revenue at Google fell.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to the financial expert at Bloomberg, the stock price at Microsoft rose in Q1 2028.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 2029 of Q3, the senior level person at Apple envisioned that the research and development expenses at Facebook decreased.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The gross profit at Johnson & Johnson increased in 2025, according to the financial top executive at JPMorgan Chase.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,"JPMorgan Chase observed that the operating income at Amazon had remained stable from September 10, 2023, to September 10, 2024.",0,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


## Both

- Create a knowledge graph
    - Nodes: words
    - Edges: connection to other words (same/diff sentence)
- Look at code from Graphbreeding project on 2019 Mac

In [5]:
sub_dataset = True
if sub_dataset == True:
    predictions_df = predictions_df.loc[:7, ]
    observations_df = observations_df.loc[:7, ]

In [6]:
df = DataProcessing.concat_dfs([predictions_df, observations_df])
df.head(7)

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"Detravious, a financial analyst forecasts that the stock price at Johnson & Johnson will likely decrease in 2027 Q2.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,1
1,"On August 21, 2024, Goldman Sachs speculates that the operating cash flow at Microsoft will likely increase.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,2
2,"Morgan Stanley predicts that on September 15, 2025, the S&P 500 composite index will likely rise.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,3
3,"According to Apple, the projected revenue at Amazon will likely fall in Q4 2026.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,4
4,"In Q2 2025, Wells Fargo envisions that the U.S. dollar index will likely stay stable.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,5
5,"The Dow Jones Industrial Average will likely rise in Q3 2027, according to JPMorgan Chase.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,6
6,The World Health Organization forecasts that the obesity rates at urban health centers in the United States will likely decrease in 2027.,1,health,llama-3.1-8b-instant,GROQ_CLOUD,0,1


In [7]:
predictions = DataProcessing.df_to_list(predictions_df, "Base Sentence")
observations = DataProcessing.df_to_list(observations_df, "Base Sentence")

In [8]:
# predictions

In [9]:
disable_components = [""]
spacy_fe = SpacyFeatureExtraction(predictions_df, "Base Sentence")
all_pos_tags, tags, all_ner_tags, entities = spacy_fe.extract_features(disable_components)

Pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
Spacy Doc (0):  Detravious, a financial analyst forecasts that the stock price at Johnson & Johnson will likely decrease in 2027 Q2.
 POS: Detravious---PROPN---Detravious---nsubj---False
 POS: ,---PUNCT---,---punct---False
 POS: a---DET---a---det---True
 POS: financial---ADJ---financial---amod---False
 POS: analyst---NOUN---analyst---nsubj---False
 POS: forecasts---VERB---forecast---ROOT---False
 POS: that---SCONJ---that---mark---True
 POS: the---DET---the---det---True
 POS: stock---NOUN---stock---compound---False
 POS: price---NOUN---price---nsubj---False
 POS: at---ADP---at---prep---True
 POS: Johnson---PROPN---Johnson---pobj---False
 POS: &---CCONJ---&---cc---False
 POS: Johnson---PROPN---Johnson---conj---False
 POS: will---AUX---will---aux---True
 POS: likely---ADV---likely---advmod---False
 POS: decrease---VERB---decrease---ccomp---False
 POS: in---ADP---in---prep---True
 POS: 2027---NUM---2027---num

# Mapping variables : words in sentence(s)

- Sentenc/Spacy Doc (0): JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.
    1. $ p_s $: JPMorgan Chase
    2. $ p_t $: Amazon
    3. $ p_d $: Q3 of 2027
    4. $ p_{outcome} $: net profit decrease or decrease of net profit

- Sentence/Spacy Doc (1):  On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.
    1. $ p_s $: Bank of America
    2. $ p_t $: Microsoft
    3. $ p_d $: August 21, 2024
    4. $ p_o $: revenue increase or increase in revenue


- Would I want to provide mappings when generating the data?


- Fine tune an event extraction model
- Enties model
- Event: SPO (Something happended to somebody)


 I (source) predict the Pacers (target) to win (attribute) the 2024-2025 (date) NBA Finals.

 Map my work to event extraction

In [10]:
pos_df = DataProcessing.convert_tags_entities_to_dataframe(all_pos_tags, tags)
pos_df.head(1)

Unnamed: 0,VERB_2,PUNCT_2,CCONJ_1,DET_3,NOUN_4,ADJ_1,VERB_1,ADP_2,NUM_1,DET_2,PROPN_5,PUNCT_1,NOUN_1,PROPN_6,PROPN_3,NUM_2,DET_1,NOUN_3,PROPN_1,NUM_3,PUNCT_3,ADP_1,PROPN_7,SCONJ_1,NOUN_2,ADJ_2,ADV_1,ADP_3,PROPN_4,PROPN_2,ADJ_3,AUX_1,VERB_3
0,decrease,.,&,,,financial,forecasts,in,2027,the,,",",analyst,,Johnson,,a,price,Detravious,,,at,,that,stock,,likely,,Q2,Johnson,,will,


In [11]:
ner_df = DataProcessing.convert_tags_entities_to_dataframe(all_ner_tags, entities)
ner_df.head(1)

Unnamed: 0,ORG_1,ORG_2,DATE_1,GPE_1
0,Johnson & Johnson,,2027 Q2,Detravious


## DF to Text

In [12]:
predictions_df

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"Detravious, a financial analyst forecasts that the stock price at Johnson & Johnson will likely decrease in 2027 Q2.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,1
1,"On August 21, 2024, Goldman Sachs speculates that the operating cash flow at Microsoft will likely increase.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,2
2,"Morgan Stanley predicts that on September 15, 2025, the S&P 500 composite index will likely rise.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,3
3,"According to Apple, the projected revenue at Amazon will likely fall in Q4 2026.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,4
4,"In Q2 2025, Wells Fargo envisions that the U.S. dollar index will likely stay stable.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,5
5,"The Dow Jones Industrial Average will likely rise in Q3 2027, according to JPMorgan Chase.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,6
6,The World Health Organization forecasts that the obesity rates at urban health centers in the United States will likely decrease in 2027.,1,health,llama-3.1-8b-instant,GROQ_CLOUD,0,1
7,"On August 15, 2026, the American Heart Association speculates that the average physical activity levels at U.S. high schools will likely increase.",1,health,llama-3.1-8b-instant,GROQ_CLOUD,0,2


In [13]:
sentences = DataProcessing.df_to_list(predictions_df, "Base Sentence")
sentences

['Detravious, a financial analyst forecasts that the stock price at Johnson & Johnson will likely decrease in 2027 Q2.',
 'On August 21, 2024, Goldman Sachs speculates that the operating cash flow at Microsoft will likely increase.',
 'Morgan Stanley predicts that on September 15, 2025, the S&P 500 composite index will likely rise.',
 'According to Apple, the projected revenue at Amazon will likely fall in Q4 2026.',
 'In Q2 2025, Wells Fargo envisions that the U.S. dollar index will likely stay stable.',
 'The Dow Jones Industrial Average will likely rise in Q3 2027, according to JPMorgan Chase.',
 'The World Health Organization forecasts that the obesity rates at urban health centers in the United States will likely decrease in 2027.',
 'On August 15, 2026, the American Heart Association speculates that the average physical activity levels at U.S. high schools will likely increase.']

In [14]:
# !python -m spacy download en_core_web_sm

In [15]:
import spacy

words = []

nlp = spacy.load("en_core_web_sm")
for sentence in sentences:
    
    doc = nlp(sentence)
    for token in doc:
        print(token.text)
        words.append(token.text)
    words.append(" ")

Detravious
,
a
financial
analyst
forecasts
that
the
stock
price
at
Johnson
&
Johnson
will
likely
decrease
in
2027
Q2
.
On
August
21
,
2024
,
Goldman
Sachs
speculates
that
the
operating
cash
flow
at
Microsoft
will
likely
increase
.
Morgan
Stanley
predicts
that
on
September
15
,
2025
,
the
S&P
500
composite
index
will
likely
rise
.
According
to
Apple
,
the
projected
revenue
at
Amazon
will
likely
fall
in
Q4
2026
.
In
Q2
2025
,
Wells
Fargo
envisions
that
the
U.S.
dollar
index
will
likely
stay
stable
.
The
Dow
Jones
Industrial
Average
will
likely
rise
in
Q3
2027
,
according
to
JPMorgan
Chase
.
The
World
Health
Organization
forecasts
that
the
obesity
rates
at
urban
health
centers
in
the
United
States
will
likely
decrease
in
2027
.
On
August
15
,
2026
,
the
American
Heart
Association
speculates
that
the
average
physical
activity
levels
at
U.S.
high
schools
will
likely
increase
.


In [25]:
words_df = pd.DataFrame(words, columns=['Word'])
words_df['Word Label'] = np.where(words_df['Word'] == ' ', ' ', 'O')
words_df.head(3)

Unnamed: 0,Word,Word Label
0,Detravious,O
1,",",O
2,a,O


In [28]:
words_df

save_path = notebook_dir + '/../data/tagging/train'
words_df.to_csv(save_path, sep='\t') 