# NER Frequency

1. Read the CSV files and load them into a DataFrame
2. Use spaCy to extract named entities (NER)
3. Count the frequency of each named entity, or only of the entities of interest

In [1]:
import os, sys

import numpy as np
import pandas as pd

from tqdm import tqdm

# Directory the notebook is executed from; later cells pass it to the log reader.
notebook_dir = os.getcwd()

# Make the parent directory importable so the project-local modules imported
# below can be resolved. The path is normalised and only appended when it is
# not already present, so re-running this cell does not keep growing sys.path
# with duplicate entries.
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

import log_files
from log_files import LogData
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
# Display configuration for the whole notebook.
# Fully qualified option names are used on purpose: pandas resolves shorthand
# names such as 'max_colwidth' by pattern matching, which can become ambiguous
# (and raise) if a similarly named option is added in a later pandas release.
pd.set_option('display.max_colwidth', 800)  # show long sentences without truncation
pd.set_option('display.max_columns', None)  # None = never hide columns
pd.set_option('display.max_rows', None)     # None = never truncate rows; keep previews small with .head()

## Read the CSV files and load them into a DataFrame

In [3]:
# Directory holding the saved prediction batches, written relative to the notebook.
log_file_path = "../data/prediction_logs"
# Flag forwarded to `log_files.read_data`.
# NOTE(review): presumably selects the prediction logs rather than another
# log type; confirm against `log_files.read_data`.
predictions = True

# Load the saved batches into one DataFrame.
predictions_df = log_files.read_data(
    notebook_dir,
    log_file_path,
    predictions,
)

# Preview the first seven rows.
predictions_df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/prediction_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/prediction_logs/batch_1-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/prediction_logs/batch_1-prediction/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/prediction_logs/batch_2-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/prediction_logs/batch_2-prediction/batch_2-from_df.csv
save_batch_directory: /Users/detraviou

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.,1,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


## Use spaCy to extract named entities

In [4]:
# Pipeline components to hand to the extractor as "disabled".
# NOTE(review): a list holding one empty string presumably means "disable
# nothing"; confirm how `extract_ner_features` interprets it.
disable_components = [""]

# Build the extractor from the predictions frame and the "Base Sentence" column name.
spacy_fe = SpacyFeatureExtraction(
    predictions_df,
    "Base Sentence",
)

# Run the extraction; the result is the entity table used by the rest of the notebook.
ner_features_df = spacy_fe.extract_ner_features(
    disable_components,
)

51it [00:00, 262.60it/s]

Spacy Doc (0):  JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.
Spacy Doc (1):  On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.
Spacy Doc (2):  Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.
Spacy Doc (3):  According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.


496it [00:01, 466.39it/s]


In [5]:
# Preview the first 34 rows of the extracted entity table.
ner_features_df.head(n=34)

Unnamed: 0,Sentence,Term,NER Label,Unique NER Label,Start Char,End Char
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,JPMorgan Chase,ORG,ORG_1,0.0,14.0
1,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,Amazon,ORG,ORG_2,48.0,54.0
2,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,Q3,GPE,GPE_1,79.0,81.0
3,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,2027,DATE,DATE_1,85.0,89.0
4,,,,,,
5,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.","August 21, 2024",DATE,DATE_2,3.0,18.0
6,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",Bank of America,ORG,ORG_3,20.0,35.0
7,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",Microsoft,ORG,ORG_4,62.0,71.0
8,,,,,,
9,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",Citigroup,ORG,ORG_5,0.0,9.0


## Count the frequency of each named entity or of the entities of interest

In [6]:
# Keep only the rows whose entity label is ORG.
is_org = ner_features_df['NER Label'].eq('ORG')
org_df = ner_features_df[is_org]

# Tally how often each ORG term occurs (value_counts sorts most frequent first).
org_counts = org_df['Term'].value_counts()

print(org_counts)

Term
Amazon                                                                  19
Goldman Sachs                                                           16
the Los Angeles Lakers                                                  13
Microsoft                                                               13
the New England Patriots                                                12
Q2                                                                      12
The World Health Organization                                           10
Johnson & Johnson                                                       10
the Boston Celtics                                                      10
The American Heart Association                                           9
Wells Fargo                                                              8
Tesla                                                                    8
Morgan Stanley                                                           8
the New York Yankees

In [7]:
# NOTE: DataFrame.size is the total number of cells (rows x columns), not the
# number of rows. Use len(ner_features_df) or ner_features_df.shape if the row
# count is what is wanted here.
ner_features_df.size

11412

In [8]:
# Rows whose sentence mentions "CFO" (case-insensitive; missing sentences count as no match).
cfo_mask = ner_features_df['Sentence'].str.contains('CFO', case=False, na=False)
filtered_df = ner_features_df.loc[cfo_mask]
filtered_df

Unnamed: 0,Sentence,Term,NER Label,Unique NER Label,Start Char,End Char
656,"The CFO of Procter & Gamble predicts on 21 August 2024, the stock price at Johnson & Johnson may rise.",CFO,ORG,ORG_166,4,7
657,"The CFO of Procter & Gamble predicts on 21 August 2024, the stock price at Johnson & Johnson may rise.",Procter & Gamble,ORG,ORG_167,11,27
658,"The CFO of Procter & Gamble predicts on 21 August 2024, the stock price at Johnson & Johnson may rise.",21 August 2024,DATE,DATE_145,40,54
659,"The CFO of Procter & Gamble predicts on 21 August 2024, the stock price at Johnson & Johnson may rise.",Johnson & Johnson,ORG,ORG_168,75,92


In [9]:
# Rows whose sentence mentions "iPhones" (case-insensitive; missing sentences count as no match).
iphones_mask = ner_features_df['Sentence'].str.contains('iPhones', case=False, na=False)
filtered_df = ner_features_df.loc[iphones_mask]
filtered_df

Unnamed: 0,Sentence,Term,NER Label,Unique NER Label,Start Char,End Char
538,"According to the miscellaneous top executive at Apple, the sales of iPhones would fall in 2024-08-21.",Apple,ORG,ORG_119,48,53
539,"According to the miscellaneous top executive at Apple, the sales of iPhones would fall in 2024-08-21.",iPhones,ORG,ORG_120,68,75
540,"According to the miscellaneous top executive at Apple, the sales of iPhones would fall in 2024-08-21.",2024-08-21,DATE,DATE_117,90,100
