# Extract Features

1. Read csv files and load as dfs
2. Combine dfs
3. Get semantic cosine similarity

In [30]:
import os, sys

import numpy as np
import pandas as pd

from tqdm import tqdm

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

import log_files
from log_files import LogData
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
pd.set_option('max_colwidth', 800)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Predictions

- Use the structure from `1-generate_predictions-all_domains.ipynb`

In [3]:
log_file_path = "data/prediction_logs"
predictions = True
predictions_df = log_files.read_data(notebook_dir, log_file_path, predictions)
predictions_df.head(7)

Start logging batch


 12%|█▏        | 36/308 [00:00<00:00, 347.74it/s]

save_batch_directory: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/prediction_logs/batch_1-prediction
save_batch_directory: /orange/ufdatastudios/dj.brinkley/predictions/misc_experiments/../data/prediction_logs/batch_2-prediction


100%|██████████| 308/308 [00:00<00:00, 379.41it/s]


Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"Detravious, a financial analyst forecasts that the stock price at Johnson & Johnson will likely decrease in 2027 Q2.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,1
1,"On August 21, 2024, Goldman Sachs speculates that the operating cash flow at Microsoft will likely increase.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,2
2,"Morgan Stanley predicts that on September 15, 2025, the S&P 500 composite index will likely rise.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,3
3,"According to Apple, the projected revenue at Amazon will likely fall in Q4 2026.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,4
4,"In Q2 2025, Wells Fargo envisions that the U.S. dollar index will likely stay stable.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,5
5,"The Dow Jones Industrial Average will likely rise in Q3 2027, according to JPMorgan Chase.",1,finance,llama-3.1-8b-instant,GROQ_CLOUD,0,6
6,The World Health Organization forecasts that the obesity rates at urban health centers in the United States will likely decrease in 2027.,1,health,llama-3.1-8b-instant,GROQ_CLOUD,0,1


In [4]:
spacy_fe = SpacyFeatureExtraction(predictions_df, "Base Sentence")
# sentences_split_into_words = spacy_fe.split_words_in_sentence()
# sentences_split_into_words[:3]

In [5]:
disable_components = [""]
pos_features = spacy_fe.extract_pos_features(disable_components)
ner_features = spacy_fe.extract_ner_features(disable_components)

51it [00:00, 221.11it/s]

Spacy Doc (0):  Detravious, a financial analyst forecasts that the stock price at Johnson & Johnson will likely decrease in 2027 Q2.
Spacy Doc (1):  On August 21, 2024, Goldman Sachs speculates that the operating cash flow at Microsoft will likely increase.
Spacy Doc (2):  Morgan Stanley predicts that on September 15, 2025, the S&P 500 composite index will likely rise.
Spacy Doc (3):  According to Apple, the projected revenue at Amazon will likely fall in Q4 2026.


22784it [01:03, 359.86it/s]
51it [00:00, 232.57it/s]

Spacy Doc (0):  Detravious, a financial analyst forecasts that the stock price at Johnson & Johnson will likely decrease in 2027 Q2.
Spacy Doc (1):  On August 21, 2024, Goldman Sachs speculates that the operating cash flow at Microsoft will likely increase.
Spacy Doc (2):  Morgan Stanley predicts that on September 15, 2025, the S&P 500 composite index will likely rise.
Spacy Doc (3):  According to Apple, the projected revenue at Amazon will likely fall in Q4 2026.


22784it [01:04, 354.12it/s]


In [6]:
pos_word_df = pd.Series(pos_features[0])
pos_tag_df = pd.Series(pos_features[1])
pos_unique_label_df = pd.Series(pos_features[2])
pos_lemma_df = pd.Series(pos_features[3])
pos_dependency_df = pd.Series(pos_features[4])
pos_is_stop_word_df = pd.Series(pos_features[5])

ner_word_df = pd.Series(ner_features[0])
ner_tag_df = pd.Series(ner_features[1])
ner_unique_label_df = pd.Series(ner_features[2])
ner_start_chars_df = pd.Series(ner_features[3])
ner_end_chars_df = pd.Series(ner_features[4])

In [7]:
pos_dfs = [pos_word_df, pos_tag_df, pos_unique_label_df, pos_lemma_df, pos_dependency_df, pos_is_stop_word_df]
pos_df = DataProcessing.concat_dfs(pos_dfs, axis=1)
pos_df.rename(columns={0: "Word", 1: "POS", 2: "POS Unique Tag", 3: "Lemma", 4: "Dependency", 5: "Is Stop Word"})

Unnamed: 0,Word,POS,POS Unique Tag,Lemma,Dependency,Is Stop Word
0,Detravious,PROPN,PROPN_1,Detravious,nsubj,False
1,",",PUNCT,PUNCT_1,",",punct,False
2,a,DET,DET_1,a,det,True
3,financial,ADJ,ADJ_1,financial,amod,False
4,analyst,NOUN,NOUN_1,analyst,nsubj,False
5,forecasts,VERB,VERB_1,forecast,ROOT,False
6,that,SCONJ,SCONJ_1,that,mark,True
7,the,DET,DET_2,the,det,True
8,stock,NOUN,NOUN_2,stock,compound,False
9,price,NOUN,NOUN_3,price,nsubj,False


In [8]:
ner_dfs = [ner_word_df, ner_tag_df, ner_unique_label_df, ner_start_chars_df, ner_end_chars_df]
ner_df = DataProcessing.concat_dfs(ner_dfs, axis=1)
ner_df.rename(columns={1: "Word", 2: "NER", 3: "NER Unique Tag", 4: "Start", 5: "End"})

Unnamed: 0,0,Word,NER,NER Unique Tag,Start
0,Detravious,GPE,GPE_1,0,10
1,Johnson & Johnson,ORG,ORG_1,66,83
2,2027 Q2,DATE,DATE_1,108,115


In [9]:
# entity

In [10]:
# all_pos_tags, tags, all_ner_tags, entities = spacy_fe.extract_features(disable_components)

In [11]:
# all_pos_tags

In [12]:
# pos_df = DataProcessing.convert_tags_entities_to_dataframe(all_pos_tags, tags)
# pos_df.head(1)

In [13]:
# pos_sent_label_df = DataProcessing.include_sentence_and_label(pos_df, predictions_df)
# pos_sent_label_df.head(3)

In [14]:
# ner_df = DataProcessing.convert_tags_entities_to_dataframe(all_ner_tags, entities)
# ner_df.head(1)

In [15]:
# pos_sent_label_df["Sentence by Words"] = sentences_split_into_words
# pos_sent_label_df.head(3)

In [16]:
# extract_pos_features

In [17]:
pos_features[0][0]

'Detravious'

In [18]:
pos_features[1][0]

'PROPN'

In [19]:
# for i in range(len(pos_features[0])):
#     pos_feature = pos_features[0][i]
#     print(pos_feature)

In [23]:
def load_data(file_path: str, file_name: str, is_test_file: bool, config_index: bool = True):
    
    if config_index == True:
        if is_test_file != True:
            file =  file_path + file_name
            open_df = pd.read_table(file, sep = "\t", names=['Index', 'Term', 'BIO x Prediction Tag'], skip_blank_lines=False)
        else:
            file =  file_path + file_name
            open_df = pd.read_table(file, sep = "\t", names=['Index', 'Word'], skip_blank_lines=False)
        
    return open_df

In [24]:
path = "../data/tagging/official/"
term_bio_pred_df = load_data(path, 'dev', False)
term_bio_pred_df

Unnamed: 0,Index,Term,BIO x Prediction Tag
0,0,Detravious,B-p_s
1,1,",",I-p_s
2,2,a,I-p_s
3,3,financial,I-p_s
4,4,analyst,E-p_s
5,5,forecasts,O
6,6,that,O
7,7,the,O
8,8,stock,B-p_o
9,9,price,E-p_o


In [41]:
term_bio_pred_pos_features_df = DataProcessing.concat_dfs([term_bio_pred_df, pos_df], axis=1, ignore_index=False)
term_bio_pred_pos_features_df.rename(columns={0: "Word", 1: "POS", 2: "Unique POS Tag", 3: "Lemma", 4: "Dependency", 5: "Is Stop Word"}, inplace=True)
term_bio_pred_pos_features_df.drop(columns=['Index', 'Word'], inplace=True)
term_bio_pred_pos_features_df

Unnamed: 0,Term,BIO x Prediction Tag,POS,Unique POS Tag,Lemma,Dependency,Is Stop Word
0,Detravious,B-p_s,PROPN,PROPN_1,Detravious,nsubj,False
1,",",I-p_s,PUNCT,PUNCT_1,",",punct,False
2,a,I-p_s,DET,DET_1,a,det,True
3,financial,I-p_s,ADJ,ADJ_1,financial,amod,False
4,analyst,E-p_s,NOUN,NOUN_1,analyst,nsubj,False
5,forecasts,O,VERB,VERB_1,forecast,ROOT,False
6,that,O,SCONJ,SCONJ_1,that,mark,True
7,the,O,DET,DET_2,the,det,True
8,stock,B-p_o,NOUN,NOUN_2,stock,compound,False
9,price,E-p_o,NOUN,NOUN_3,price,nsubj,False


In [42]:
sentences = DataProcessing.df_to_list(predictions_df, 'Base Sentence')
sentences[:3]

['Detravious, a financial analyst forecasts that the stock price at Johnson & Johnson will likely decrease in 2027 Q2.',
 'On August 21, 2024, Goldman Sachs speculates that the operating cash flow at Microsoft will likely increase.',
 'Morgan Stanley predicts that on September 15, 2025, the S&P 500 composite index will likely rise.']

In [50]:
space_filt = (term_bio_pred_pos_features_df["Term"] == ' ')
sentence_break_idx = term_bio_pred_pos_features_df.loc[space_filt, ["Term"]].index.to_numpy()
sentence_break_idx

array([21, 42])

In [72]:

sample_input = []    
sentence_tokens = []
pos_features = []
bert_event_extraction_format = {}
for row in term_bio_pred_pos_features_df.itertuples():
    # print(f"{row}")
    index, term, bio_p, pos = row.Index, row.Term, row._2, row.POS
    
    if index not in sentence_break_idx:
        sentence_tokens.append(term)
        pos_features.append(pos)
        # print(f"   {pos_features}")
    elif index in sentence_break_idx:
        # print("New sentence")
        bert_event_extraction_format["tokens"] = sentence_tokens
        bert_event_extraction_format["pos-tag"] = pos_features
        sample_input.append(bert_event_extraction_format)
        sentence_tokens = []
        pos_features = []
        bert_event_extraction_format = {}
sample_input

[{'tokens': ['Detravious',
   ',',
   'a',
   'financial',
   'analyst',
   'forecasts',
   'that',
   'the',
   'stock',
   'price',
   'at',
   'Johnson',
   '&',
   'Johnson',
   'will',
   'likely',
   'decrease',
   'in',
   '2027',
   'Q2',
   '.'],
  'pos-tag': ['PROPN',
   'PUNCT',
   'DET',
   'ADJ',
   'NOUN',
   'VERB',
   'SCONJ',
   'DET',
   'NOUN',
   'NOUN',
   'ADP',
   'PROPN',
   'CCONJ',
   'PROPN',
   'AUX',
   'ADV',
   'VERB',
   'ADP',
   'NUM',
   'PROPN',
   'PUNCT']},
 {'tokens': ['On',
   'August',
   '21',
   ',',
   '2024',
   ',',
   'Goldman',
   'Sachs',
   'speculates',
   'that',
   'the',
   'operating',
   'cash',
   'flow',
   'at',
   'Microsoft',
   'will',
   'likely',
   'increase',
   '.'],
  'pos-tag': [nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan]}]