# Spacy Pipeline

- **Goal:** Prediction Recognition

- **Purpose:** To extract named entities (NER), part-of-speech (POS), etc.
    1. Used as additional features to train the model, since feature extraction (i.e., TF-IDF) alone isn't enough

- **Misc:**
    - `%store`: Line magic that stores the variable of interest so we can load it in another notebook (with `%store -r`)

In [1]:
import os
import sys
import spacy

import pandas as pd
# Resolve the notebook's working directory and its parent directory
# (presumably where the local `pipelines` / `data_processing` modules live).
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))
# Guard the append so re-running this cell does not keep adding the same
# entry to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from pipelines import BasePipeline
from data_processing import DataProcessing

In [2]:
pd.set_option('max_colwidth', 800)
nlp = spacy.load("en_core_web_sm")
%store -r predictions_df
%store -r non_predictions_df

python eval() and df.apply()

In [3]:
predictions_df.head(3)

Unnamed: 0,Base Sentence,Prediction Label,Model Name,Domain,Template Number
0,"On 2024-10-15, Rachel Patel, a financial analyst, predicts that the operating cash flow at General Motors will likely decrease by $5 billion to $10 billion in Q2 of 2026.",1,llama-3.3-70b-versatile,financial,1
1,"In 2024, Julian Sanchez from Bank of America, forecasts that the stock price will rise from $50 to $75 per share in 2028.",1,llama-3.3-70b-versatile,financial,2
2,"Emily Wilson, a financial expert, predicts on 20/08/2024 that the research and development expenses at Pfizer may stay stable at $15 million in 2029.",1,llama-3.3-70b-versatile,financial,3


In [4]:
base_pipeline = BasePipeline()

cleaned_predictions_df = base_pipeline.clean_predictions(predictions_df)
cleaned_predictions_df

Unnamed: 0,Base Sentence,Prediction Label,Model Name,Domain,Template Number
0,"on 2024-10-15, rachel patel, a financial analyst, predicts that the operating cash flow at general motors will likely decrease by $5 billion to $10 billion in q2 of 2026.",1,llama-3.3-70b-versatile,financial,1
1,"in 2024, julian sanchez from bank of america, forecasts that the stock price will rise from $50 to $75 per share in 2028.",1,llama-3.3-70b-versatile,financial,2
2,"emily wilson, a financial expert, predicts on 20/08/2024 that the research and development expenses at pfizer may stay stable at $15 million in 2029.",1,llama-3.3-70b-versatile,financial,3
3,"according to a senior executive from cisco, on 2024/08/20, the net profit is expected to increase beyond $8 billion in the timeframe of q4 of 2027.",1,llama-3.3-70b-versatile,financial,4
4,"in 2025-02-18, the revenue at visa has a probability of 20 percent to reach $25 billion, which is a 10% increase, as predicted by david lee, a financial reporter, on 15 oct 2024.",1,llama-3.3-70b-versatile,financial,5
5,"on wednesday, november 20, 2024, michael davis, a financial analyst, predicts that the gross profit at 3m will likely decrease by 15% to $12 billion in q1 of 2026.",1,llama-3.3-70b-versatile,financial,1
6,"in q3 of 2024, olivia brown from johnson & johnson, envisions that the operating income will rise from $10 billion to $15 billion in 2028.",1,llama-3.3-70b-versatile,financial,2
7,"kevin white, a financial expert, predicts on 10/10/2024 that the revenue at at&t may increase by $5 billion to $20 billion in 2027.",1,llama-3.3-70b-versatile,financial,3
8,"according to a top executive from intel, on 2024-07-25, the net profit is expected to increase beyond $12 billion in the timeframe of q2 of 2029.",1,llama-3.3-70b-versatile,financial,4
9,"in 2026-08-25, the stock price at mcdonald's is expected to be $200 per share, which is a 25% increase, as predicted by sophia rodriguez, a financial analyst, on 25 july 2024.",1,llama-3.3-70b-versatile,financial,5


In [5]:
# predictions_df

In [6]:
only_predictions = DataProcessing.df_to_list(cleaned_predictions_df, 'Base Sentence')
only_predictions

['on 2024-10-15, rachel patel, a financial analyst, predicts that the operating cash flow at general motors will likely decrease by $5 billion to $10 billion in q2 of 2026.',
 'in 2024, julian sanchez from bank of america, forecasts that the stock price will rise from $50 to $75 per share in 2028.',
 'emily wilson, a financial expert, predicts on 20/08/2024 that the research and development expenses at pfizer may stay stable at $15 million in 2029.',
 'according to a senior executive from cisco, on 2024/08/20, the net profit is expected to increase beyond $8 billion in the timeframe of q4 of 2027.',
 'in 2025-02-18, the revenue at visa has a probability of 20 percent to reach $25 billion, which is a 10% increase, as predicted by david lee, a financial reporter, on 15 oct 2024.',
 'on wednesday, november 20, 2024, michael davis, a financial analyst, predicts that the gross profit at 3m will likely decrease by 15% to $12 billion in q1 of 2026.',
 'in q3 of 2024, olivia brown from johnson

In [7]:
initialize_spacy = DataProcessing.setup_spacy()

### Word

In [None]:
# spaCy pipeline components to switch off for the word-level pass.
word_level_disable_components = ["lemmatizer"]

# `DataProcessing.extract_entities` returns four values, unpacked in this
# order: POS tags, POS mappings, NER entities, NER mappings.
(
    word_level_pos_tags,
    word_level_pos_mappings,
    word_level_ner_entities,
    word_level_ner_mappings,
) = DataProcessing.extract_entities(
    only_predictions,
    initialize_spacy,
    word_level_disable_components,
)

In [9]:
word_level_pos_tags

{'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'SYM',
 'VERB'}

In [10]:
type(word_level_pos_mappings[0][0])

tuple

In [11]:
# ['on 2024-10-15, rachel patel, a financial analyst, predicts that the operating cash flow at general motors will likely decrease by $5 billion to $10 billion in q2 of 2026.',
# ['on 2024-10-15,  patel, a financial analyst, predicts that  operating cash flow at  motors    by 5 billion to $10 billion in   .',

"to" isn't in "on 2024-10-15, rachel patel, a financial analyst, predicts that the operating cash flow at general motors will likely decrease by $5 billion to $10 billion in q2 of 2026."

In [12]:
all_word_level_pos_df = DataProcessing.convert_to_df(word_level_pos_tags, mapping=word_level_pos_mappings)
all_word_level_pos_df

Unnamed: 0,SCONJ,ADJ,NUM,PART,VERB,AUX,ADV,PRON,DET,NOUN,PROPN,CCONJ,SYM,ADP,PUNCT
0,that,general,2026,to,decrease,will,likely,,the,q2,rachel,,$,of,.
1,that,julian,2028,to,rise,will,,,the,share,america,,$,in,.
2,that,stable,2029,,stay,may,,,the,pfizer,wilson,and,$,in,.
3,,net,2027,to,increase,is,,,the,timeframe,q4,,$,of,.
4,,financial,2024,to,predicted,is,,,a,reporter,oct,,$,on,.
5,that,gross,2026,to,decrease,will,likely,,the,%,q1,,$,of,.
6,that,olivia,2028,to,rise,will,,,the,income,johnson,&,$,in,.
7,that,financial,2027,to,increase,may,,,the,revenue,at&t,,$,in,.
8,,net,2029,to,increase,is,,,the,q2,intel,,$,of,.
9,,financial,2024,to,predicted,is,,,a,analyst,july,,$,on,.


In [13]:
drop_word_level_pos_columns = ['PRON', 'DET', 'ADP', 'SCONJ', 'CCONJ', 'PUNCT', 'PART']
core_word_level_pos_df = DataProcessing.drop_df_columns(all_word_level_pos_df, drop_word_level_pos_columns)
core_word_level_pos_df

Unnamed: 0,ADJ,NUM,VERB,AUX,ADV,NOUN,PROPN,SYM
0,general,2026,decrease,will,likely,q2,rachel,$
1,julian,2028,rise,will,,share,america,$
2,stable,2029,stay,may,,pfizer,wilson,$
3,net,2027,increase,is,,timeframe,q4,$
4,financial,2024,predicted,is,,reporter,oct,$
5,gross,2026,decrease,will,likely,%,q1,$
6,olivia,2028,rise,will,,income,johnson,$
7,financial,2027,increase,may,,revenue,at&t,$
8,net,2029,increase,is,,q2,intel,$
9,financial,2024,predicted,is,,analyst,july,$


In [15]:
encoded_word_level_pos_df = DataProcessing.encode_tags_entities_dataframe(core_word_level_pos_df)
encoded_word_level_pos_df

Unnamed: 0,ADJ,NUM,VERB,AUX,ADV,NOUN,PROPN,SYM
0,1,1,1,1,1,1,1,1
1,1,1,1,1,0,1,1,1
2,1,1,1,1,0,1,1,1
3,1,1,1,1,0,1,1,1
4,1,1,1,1,0,1,1,1
5,1,1,1,1,1,1,1,1
6,1,1,1,1,0,1,1,1
7,1,1,1,1,0,1,1,1
8,1,1,1,1,0,1,1,1
9,1,1,1,1,0,1,1,1


In [17]:
all_word_level_ner_df = DataProcessing.convert_tags_entities_to_dataframe(word_level_ner_entities, word_level_ner_mappings)
all_word_level_ner_df

Unnamed: 0,DATE_1,PERSON_2,CARDINAL_1,MONEY_1,ORG_1,PERCENT_1,PERCENT_2,QUANTITY_1,DATE_3,GPE_1,NORP_1,DATE_2,PERSON_1,DATE_4
0,2024-10-15,,,$5 billion to $10 billion,,,,,,,,2026,rachel patel,
1,2024,,,$50 to $75,bank of america,,,,,,,2028,julian sanchez,
2,20/08/2024,,,$15 million,,,,,,,,2029,emily wilson,
3,2027,,2024/08/20,$8 billion,,,,,,cisco,,,,
4,2025-02-18,,,$25 billion,,20 percent,10%,,,,,15 oct 2024,david lee,
5,"wednesday, november 20, 2024",,3,$12 billion,,15%,,,,,,2026,michael davis,
6,q3 of 2024,,,$10 billion to $15 billion,johnson & johnson,,,,,,,2028,,
7,10/10/2024,,,$5 billion to $20 billion,at&t,,,,,,,2027,kevin white,
8,2024-07-25,,,$12 billion,intel,,,,,,,2029,,
9,2026-08-25,,,200,mcdonald's,25%,,,,,,25 july 2024,,


In [None]:
# Do not want to drop any

# drop_word_level_ner_columns = None
# core_word_level_ner_df = DataProcessing.drop_df_columns(all_word_level_ner_df, drop_word_level_pos_columns)
# core_word_level_ner_df

In [19]:
encoded_word_level_ner_df = DataProcessing.encode_tags_entities_dataframe(all_word_level_ner_df)
encoded_word_level_ner_df

Unnamed: 0,DATE_1,PERSON_2,CARDINAL_1,MONEY_1,ORG_1,PERCENT_1,PERCENT_2,QUANTITY_1,DATE_3,GPE_1,NORP_1,DATE_2,PERSON_1,DATE_4
0,1,0,0,1,0,0,0,0,0,0,0,1,1,0
1,1,0,0,1,1,0,0,0,0,0,0,1,1,0
2,1,0,0,1,0,0,0,0,0,0,0,1,1,0
3,1,0,1,1,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,1,1,0,0,0,0,1,1,0
5,1,0,1,1,0,1,0,0,0,0,0,1,1,0
6,1,0,0,1,1,0,0,0,0,0,0,1,0,0
7,1,0,0,1,1,0,0,0,0,0,0,1,1,0
8,1,0,0,1,1,0,0,0,0,0,0,1,0,0
9,1,0,0,1,1,1,0,0,0,0,0,1,0,0


In [18]:
word_level_tags_entities = [all_word_level_pos_df, all_word_level_ner_df]
word_level_tags_entities_df = DataProcessing.concat_dfs(word_level_tags_entities, axis=1, ignore_index=False)
word_level_tags_entities_df

Unnamed: 0,SCONJ,ADJ,NUM,PART,VERB,AUX,ADV,PRON,DET,NOUN,...,ORG_1,PERCENT_1,PERCENT_2,QUANTITY_1,DATE_3,GPE_1,NORP_1,DATE_2,PERSON_1,DATE_4
0,that,general,2026,to,decrease,will,likely,,the,q2,...,,,,,,,,2026,rachel patel,
1,that,julian,2028,to,rise,will,,,the,share,...,bank of america,,,,,,,2028,julian sanchez,
2,that,stable,2029,,stay,may,,,the,pfizer,...,,,,,,,,2029,emily wilson,
3,,net,2027,to,increase,is,,,the,timeframe,...,,,,,,cisco,,,,
4,,financial,2024,to,predicted,is,,,a,reporter,...,,20 percent,10%,,,,,15 oct 2024,david lee,
5,that,gross,2026,to,decrease,will,likely,,the,%,...,,15%,,,,,,2026,michael davis,
6,that,olivia,2028,to,rise,will,,,the,income,...,johnson & johnson,,,,,,,2028,,
7,that,financial,2027,to,increase,may,,,the,revenue,...,at&t,,,,,,,2027,kevin white,
8,,net,2029,to,increase,is,,,the,q2,...,intel,,,,,,,2029,,
9,,financial,2024,to,predicted,is,,,a,analyst,...,mcdonald's,25%,,,,,,25 july 2024,,


In [20]:
encoded_word_level_tags_entities_df = DataProcessing.encode_tags_entities_dataframe(word_level_tags_entities_df)
encoded_word_level_tags_entities_df

Unnamed: 0,SCONJ,ADJ,NUM,PART,VERB,AUX,ADV,PRON,DET,NOUN,...,ORG_1,PERCENT_1,PERCENT_2,QUANTITY_1,DATE_3,GPE_1,NORP_1,DATE_2,PERSON_1,DATE_4
0,1,1,1,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,1,1,0
1,1,1,1,1,1,1,0,0,1,1,...,1,0,0,0,0,0,0,1,1,0
2,1,1,1,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,1,1,0
3,0,1,1,1,1,1,0,0,1,1,...,0,0,0,0,0,1,0,0,0,0
4,0,1,1,1,1,1,0,0,1,1,...,0,1,1,0,0,0,0,1,1,0
5,1,1,1,1,1,1,1,0,1,1,...,0,1,0,0,0,0,0,1,1,0
6,1,1,1,1,1,1,0,0,1,1,...,1,0,0,0,0,0,0,1,0,0
7,1,1,1,1,1,1,0,0,1,1,...,1,0,0,0,0,0,0,1,1,0
8,0,1,1,1,1,1,0,0,1,1,...,1,0,0,0,0,0,0,1,0,0
9,0,1,1,1,1,1,0,0,1,1,...,1,1,0,0,0,0,0,1,0,0


- Need to clean. I don't think we're capturing all of the words for both POS and NER.

### Sentence

In [21]:
# spaCy pipeline components to switch off for the sentence-level pass.
# NOTE(review): with "tok2vec" disabled, the POS result displayed below
# collapses to a single 'NOUN' tag attached to "." — presumably because the
# tagger depends on tok2vec. Confirm this configuration is intended.
sentence_level_disable_components = ["tok2vec", "parser", "lemmatizer"]

# `DataProcessing.extract_entities` returns four values, unpacked in this
# order: POS tags, POS mappings, NER entities, NER mappings.
(
    sentence_level_pos_tags,
    sentence_level_pos_mappings,
    sentence_level_ner_entities,
    sentence_level_ner_mappings,
) = DataProcessing.extract_entities(
    only_predictions,
    initialize_spacy,
    sentence_level_disable_components,
)

In [22]:
sentence_level_pos_tags

{'NOUN'}

In [23]:
sentence_level_pos_df = DataProcessing.convert_tags_entities_to_dataframe(sentence_level_pos_tags, sentence_level_pos_mappings)
sentence_level_pos_df

Unnamed: 0,NOUN
0,.
1,.
2,.
3,.
4,.
5,.
6,.
7,.
8,.
9,.


In [24]:
sentence_level_ner_df = DataProcessing.convert_tags_entities_to_dataframe(sentence_level_ner_entities, sentence_level_ner_mappings)
sentence_level_ner_df

Unnamed: 0,DATE_1,PERSON_2,CARDINAL_1,MONEY_1,ORG_1,PERCENT_1,PERCENT_2,QUANTITY_1,DATE_3,GPE_1,NORP_1,DATE_2,PERSON_1,DATE_4
0,2024-10-15,,,$5 billion to $10 billion,,,,,,,,2026,rachel patel,
1,2024,,,$50 to $75,bank of america,,,,,,,2028,julian sanchez,
2,20/08/2024,,,$15 million,,,,,,,,2029,emily wilson,
3,2027,,2024/08/20,$8 billion,,,,,,cisco,,,,
4,2025-02-18,,,$25 billion,,20 percent,10%,,,,,15 oct 2024,david lee,
5,"wednesday, november 20, 2024",,3,$12 billion,,15%,,,,,,2026,michael davis,
6,q3 of 2024,,,$10 billion to $15 billion,johnson & johnson,,,,,,,2028,,
7,10/10/2024,,,$5 billion to $20 billion,at&t,,,,,,,2027,kevin white,
8,2024-07-25,,,$12 billion,intel,,,,,,,2029,,
9,2026-08-25,,,200,mcdonald's,25%,,,,,,25 july 2024,,


In [25]:
sentence_level_tags_entities = [sentence_level_pos_df, sentence_level_ner_df]
sentence_level_tags_entities_df = DataProcessing.concat_dfs(sentence_level_tags_entities, axis=1, ignore_index=False)
sentence_level_tags_entities_df

Unnamed: 0,NOUN,DATE_1,PERSON_2,CARDINAL_1,MONEY_1,ORG_1,PERCENT_1,PERCENT_2,QUANTITY_1,DATE_3,GPE_1,NORP_1,DATE_2,PERSON_1,DATE_4
0,.,2024-10-15,,,$5 billion to $10 billion,,,,,,,,2026,rachel patel,
1,.,2024,,,$50 to $75,bank of america,,,,,,,2028,julian sanchez,
2,.,20/08/2024,,,$15 million,,,,,,,,2029,emily wilson,
3,.,2027,,2024/08/20,$8 billion,,,,,,cisco,,,,
4,.,2025-02-18,,,$25 billion,,20 percent,10%,,,,,15 oct 2024,david lee,
5,.,"wednesday, november 20, 2024",,3,$12 billion,,15%,,,,,,2026,michael davis,
6,.,q3 of 2024,,,$10 billion to $15 billion,johnson & johnson,,,,,,,2028,,
7,.,10/10/2024,,,$5 billion to $20 billion,at&t,,,,,,,2027,kevin white,
8,.,2024-07-25,,,$12 billion,intel,,,,,,,2029,,
9,.,2026-08-25,,,200,mcdonald's,25%,,,,,,25 july 2024,,


In [26]:
# Reuse the `nlp` pipeline loaded in the setup cell; `spacy` is already
# imported at the top of the notebook, so no second import / model load here.
# NOTE(review): en_core_web_sm ships without static word vectors, so
# `token.vector` is presumably the context-sensitive tensor rather than a
# true word embedding — confirm (en_core_web_md / _lg include real vectors).
doc = nlp(only_predictions[0])
# One vector per token of the first prediction sentence.
word_embeddings = [token.vector for token in doc]
len(word_embeddings)

39

In [None]:
# The dangling attribute access was a SyntaxError; inspect the dimensionality
# of a single token vector instead.
word_embeddings[0].shape

In [None]:
# Token-level vectors for a single piece of text.
# NOTE(review): `text_data_sentence` is not defined in any visible cell, so
# unless it is defined elsewhere this cell raises NameError on a fresh kernel
# — TODO confirm the intended input (e.g. one sentence of `only_predictions`).
nlp = spacy.load("en_core_web_sm")
doc = nlp(text_data_sentence)
word_embeddings = [token.vector for token in doc]
word_embeddings

In [None]:
only_predictions

In [None]:
from spacy.vectors import Vectors
import numpy as np

# empty_vectors = Vectors(shape=(10000, 300))

data = np.zeros((len(only_predictions), 300), dtype='f')
vectors = Vectors(data=data, keys=only_predictions)
vectors

In [None]:
# Convert vectors to a numpy array
vector_data = vectors.data
vector_data

In [None]:

# Create a corresponding array of keys
vector_keys = np.array(only_predictions)
vector_keys

In [None]:
prediction_labels = predictions_df['Prediction Label']
prediction_labels

In [None]:
# Split features against the prediction labels. The previous call passed
# `vector_keys` (the raw sentences) as the target, leaving the
# `prediction_labels` series built in the cell above unused.
# NOTE(review): assumes `vector_data` rows and `prediction_labels` share the
# row order of `predictions_df` — confirm.
X_train, X_test, y_train, y_test = DataProcessing.split_data(vector_data, prediction_labels)
X_train

## Play

- Remove once finalized

In [None]:
# Print spaCy's description of every POS tag used as a column name.
# NOTE(review): `pos_df` is not defined in any visible cell — presumably one
# of the POS DataFrames built above (e.g. `all_word_level_pos_df`); confirm.
pos_col_names = list(pos_df.columns)
for pos_col_name in pos_col_names:
    # Interpolate the tag itself; the label used to be the literal text
    # "pos_col_name", which made every output line indistinguishable.
    print(f"{pos_col_name}: {spacy.explain(pos_col_name)}")

In [None]:
list(ner_df.columns)

In [None]:
# Print spaCy's description of every NER label used as a column name.
# NOTE(review): `ner_df` is not defined in any visible cell — presumably one
# of the NER DataFrames built above (e.g. `all_word_level_ner_df`); confirm.
ner_col_names = list(ner_df.columns)
for ner_col_name in ner_col_names:
    # The NER frames shown in this notebook use suffixed columns such as
    # "DATE_1", which are not spaCy labels; drop a trailing "_<digits>" so
    # `spacy.explain` receives the bare label. Names such as "WORK_OF_ART"
    # are left intact because their last segment is not numeric.
    base_label, _, suffix = ner_col_name.rpartition("_")
    ner_label = base_label if suffix.isdigit() else ner_col_name
    print(f"{ner_col_name}: {spacy.explain(ner_label)}")

- Patterns:
    - P1 goes to ?
        - $: SYM
        - 10: NUM
    - P2 goes to ?
        - 2024: NUM
        - -: SYM
        - 10: NUM
        - -: SYM
        - 20: NUM
- Write regex for \$\d+ and \d+-\d+-\d+? -> Manually label?

- Create new function in clean_predictions.py called `remove_symbols`

In [None]:
# Bracket-annotated example sentence, wrapped in a list.
texts = [
    "On [2024-10-20], [Samantha Thompson, a financial analyst] predicts that "
    "the [operating cash flow] at [Johnson & Johnson] [will likely] [increase] "
    "by [10 percent to $25 billion] in [2026 Q2]"
]

# Same sentence with a "$" in front of the percentage; it is immediately
# superseded by the shorter probe string assigned right after it.
texts_2 = (
    "On [2024-10-20], [Samantha Thompson, a financial analyst] predicts that "
    "the [operating cash flow] at [Johnson & Johnson] [will likely] [increase] "
    "by [$10 percent to $25 billion] in [2026 Q2]"
)

# Minimal probe containing only the date and the money/percent fragment.
texts_2 = "On [2024-10-20] $10 percent to $25 billion]"

# Strip every square bracket and wrap the result in a one-element list.
_bracket_remover = str.maketrans("", "", "[]")
texts_2_no_brackets = [texts_2.translate(_bracket_remover)]
# ",".join(texts_2_no_brackets)
# print(texts_2_no_brackets)

# text_join = texts_2_no_brackets.split()
# print(text_join)

nlp = spacy.load("en_core_web_sm")
def extract_entities(data: pd.Series, nlp: "spacy.Language"):
    """
    Print each processed text and the part-of-speech tag of every token.

    Despite its name, this scratch helper does not collect any entities yet:
    the ``ner`` component is disabled while piping, so only the tokens and
    their POS tags are printed for inspection.

    Parameters:
    -----------
    data : `pd.Series`
        Texts to process. They are handed straight to ``nlp.pipe``, so a plain
        list of strings works as well.
    nlp : `spacy.Language`
        A SpaCy NLP model.

    Returns:
    --------
    tuple
        ``(entities, all_ner_tags)``. Nothing is added to either container in
        this version, so the result is always an empty list and an empty set.
    """
    entities = []
    all_ner_tags = set()

    # NER is disabled for this pass, so only token-level POS output is shown.
    for doc in nlp.pipe(data, disable=["ner"]):
        print(doc)
        for token in doc:
            print(f"{token.text}: {token.pos_}")

    return entities, all_ner_tags

# extract_entities(texts, nlp)
# print()
print(texts_2_no_brackets)
extract_entities(texts_2_no_brackets, nlp)

2024: NUM
-: SYM
10: NUM
-: SYM
20: NUM


Create NER DATE from this

10: NUM
percent: NOUN

Johnson: PROPN
&: CCONJ
Johnson: PROPN

In [None]:
# Print spaCy's description of every POS tag used as a column name.
# NOTE(review): `pos_df` and `ner_df` are not defined in any visible cell —
# presumably the POS / NER DataFrames built earlier in the notebook; confirm.
pos_col_names = list(pos_df.columns)
for pos_col_name in pos_col_names:
    # Interpolate the tag itself; the label used to be the literal text
    # "pos_col_name", which made every output line indistinguishable.
    print(f"{pos_col_name}: {spacy.explain(pos_col_name)}")


list(ner_df.columns)

In [None]:
# Print spaCy's description of every NER label used as a column name.
# NOTE(review): `ner_df` is not defined in any visible cell — presumably one
# of the NER DataFrames built above (e.g. `all_word_level_ner_df`); confirm.
ner_col_names = list(ner_df.columns)
for ner_col_name in ner_col_names:
    # The NER frames shown in this notebook use suffixed columns such as
    # "DATE_1", which are not spaCy labels; drop a trailing "_<digits>" so
    # `spacy.explain` receives the bare label. Names such as "WORK_OF_ART"
    # are left intact because their last segment is not numeric.
    base_label, _, suffix = ner_col_name.rpartition("_")
    ner_label = base_label if suffix.isdigit() else ner_col_name
    print(f"{ner_col_name}: {spacy.explain(ner_label)}")

In [None]:
# import numpy as np
# df_to_vectorize = shuffled_df
# col_name_to_vectorize = 'Base Predictions'

# def extract_text_to_vectorize(df_to_vectorize, col_name_to_vectorize):
#     text_to_vectorize = DataProcessing.df_to_list(df_to_vectorize, col_name_to_vectorize)
#     return text_to_vectorize

# def word_feature_extraction():
#     """Extract word vector embeddings using Spacy
    
#     Returns:
#     list
#         A list containing the word vector embeddings
#     """
#     text_to_vectorize = extract_text_to_vectorize(df_to_vectorize, col_name_to_vectorize)
#     word_embeddings = []
#     words = []
#     dfs = []
#     nlp = spacy.load("en_core_web_sm")

#     for sentence in text_to_vectorize:
#         print(sentence)
#         doc = nlp(sentence)
#         for token in doc:
#             # print(token.text, token.vector.size, token.vector)
#             avg_word_vector = np.mean(token.vector, axis=0)
#             print(token.text, avg_word_vector)
#             quit()
#     #         words.append(token.text)
#     #         word_embeddings.append(token.vector)

#     #     print(words, word_embeddings)
#     #     print()
#     #     df = pd.DataFrame(word_embeddings, columns=words)
#     # dfs.append(df)
#     # return dfs

In [None]:
# word_feature_extraction()