# spaCy Pipeline

- **Goal:** Prediction Recognition

- **Purpose:** To extract named entities (NER), part-of-speech (POS) tags, etc.
    1. Use these as additional features to train the model, since feature extraction (i.e., TF-IDF) alone isn't enough

- **Misc:**
    - `%store`: Line magic that stores the variable of interest so we can load it in another notebook (with `%store -r`)

In [1]:
import os
import sys
import spacy

import pandas as pd
# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path so the local modules imported
# right after this (pipelines, data_processing, feature_extraction) resolve.
# The path is normalised and only appended when missing, so re-running this
# cell does not pile up duplicate '../' entries in sys.path.
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from pipelines import BasePipeline
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
# Restore variables that were persisted by an earlier `%store` call
# (presumably in an upstream notebook — confirm which one produces them).
# NOTE(review): `ner_dfs` is restored but not referenced in the visible cells;
# confirm it is still needed.
%store -r shuffled_base_df
%store -r shuffled_cleaned_df
%store -r ner_dfs
# Display configuration: show long sentences without truncation and never
# elide columns or rows when a frame is rendered.
# Full option names are used throughout; the bare 'max_colwidth' shorthand is
# resolved by pattern matching and can break if pandas adds a similarly named
# option.
pd.set_option('display.max_colwidth', 800)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

python eval() and df.apply()

In [3]:
# Toggle: True selects the cleaned sentences, False the base sentences.
clean_data = True

# Pick the frame every later cell works on.
shuffled_df = shuffled_cleaned_df if clean_data else shuffled_base_df

shuffled_df

Unnamed: 0,Base Sentence,Prediction Label,Model Name,Domain,Template Number
0,the financial report shows a significant increase in revenue last year.,0,llama-3.3-70b-versatile,any,
1,"in 2025-08-20, the number of investors in the new york stock exchange will likely have a 20% increase, as predicted by policy expert, david lee, on 20 october 2024.",1,llama-3.3-70b-versatile,policy,5.0
2,the research team is conducting experiments to gather more data information.,0,llama-3.3-70b-versatile,any,
3,"on 2024-10-15, dr. thompson, a health expert from the world health organization, predicts that the obesity rate at the united states will likely decrease by 3% in q2 of 2026.",1,llama-3.3-70b-versatile,health,1.0
4,"james davis, a financial expert, predicts on 2024/08/22 that the revenue at visa may rise by 15% to $25 billion in 2028.",1,llama-3.3-70b-versatile,financial,3.0
5,the sales team is traveling to attend the annual industry conference.,0,llama-3.3-70b-versatile,any,
6,"according to a policy analyst, olivia brown, from the securities and exchange commission, on 2024-08-21, the number of initial public offerings is expected to increase as much as 50% in the timeframe of q2 of 2029.",1,llama-3.3-70b-versatile,policy,4.0
7,the company is currently undergoing a major restructuring effort to improve efficiency.,0,llama-3.3-70b-versatile,any,
8,"in q2 of 2025, dr. brown, a researcher from the american heart association, foresee that the average daily physical activity levels may fall by 5% in 2029.",1,llama-3.3-70b-versatile,health,2.0
9,"according to a policy reporter, emily patel, from the federal reserve, on 08/22/2024, the inflation rate is expected to decrease beyond 2% in the timeframe of q1 of 2029.",1,llama-3.3-70b-versatile,policy,4.0


In [4]:
# Build the spaCy feature extractor over the selected frame; 'Base Sentence'
# is the column that holds the sentence text.
# NOTE(review): in the visible cells this object is only referenced from
# commented-out code.
spacy_feature_extractor = SpacyFeatureExtraction(shuffled_df, 'Base Sentence')
# The value returned here is later passed to DataProcessing.extract_entities
# (presumably the loaded spaCy pipeline — confirm against setup_spacy()).
initialize_spacy = DataProcessing.setup_spacy()

### Word

In [5]:
# word_embeddings = spacy_feature_extractor.word_feature_extraction()
# word_embeddings

In [6]:
# word_embeddings_df = spacy_feature_extractor.word_feature_scores()
# word_embeddings_df

### Sentence

In [7]:
# sent_embeddings = spacy_feature_extractor.sentence_feature_extraction()
# sent_embeddings

In [8]:
# len(sent_embeddings)

## Extract Part-of-Speech (POS) Tags and Named Entity Recognition (NER) Entities at Word Level

In [9]:
# Pull the 'Base Sentence' column out as a list of sentence strings.
# NOTE(review): despite the name, the list holds every sentence in
# shuffled_df, including rows whose Prediction Label is 0.
only_predictions = DataProcessing.df_to_list(shuffled_df, 'Base Sentence')
only_predictions

['the financial report shows a significant increase in revenue last year.',
 'in 2025-08-20, the number of investors in the new york stock exchange will likely have a 20% increase, as predicted by policy expert, david lee, on 20 october 2024.',
 'the research team is conducting experiments to gather more data information.',
 'on 2024-10-15, dr. thompson, a health expert from the world health organization, predicts that the obesity rate at the united states will likely decrease by 3% in q2 of 2026.',
 'james davis, a financial expert, predicts on 2024/08/22 that the revenue at visa may rise by 15% to $25 billion in 2028.',
 'the sales team is traveling to attend the annual industry conference.',
 'according to a policy analyst, olivia brown, from the securities and exchange commission, on 2024-08-21, the number of initial public offerings is expected to increase as much as 50% in the timeframe of q2 of 2029.',
 'the company is currently undergoing a major restructuring effort to improve

In [10]:
# Components handed to extract_entities as the ones to disable (presumably
# spaCy pipeline component names — confirm against DataProcessing).
# The variable was previously spelled `word_leveL_...` with a capital L; it is
# now consistent with the other `word_level_*` names in this notebook.
word_level_disable_components = ["lemmatizer"]

# extract_entities returns four values: POS tags, POS mappings, NER entities
# and NER mappings, in that order.
(
    word_level_pos_tags,
    word_level_pos_mappings,
    word_level_ner_entities,
    word_level_ner_mappings,
) = DataProcessing.extract_entities(
    only_predictions, initialize_spacy, word_level_disable_components
)

### Visualize as DF

In [11]:
# Tabulate the word-level POS tags as a DataFrame, using the POS mapping
# returned by extract_entities (the output has one column per POS tag).
all_word_level_pos_df = DataProcessing.convert_to_df(word_level_pos_tags, mapping=word_level_pos_mappings)
all_word_level_pos_df

Unnamed: 0,SCONJ,NUM,AUX,PART,PROPN,CCONJ,ADJ,DET,PRON,VERB,PUNCT,ADP,SYM,ADV,NOUN
0,,,,,,,last,a,,shows,.,in,,,year
1,as,2024,will,,october,,,a,,predicted,.,on,-,likely,expert
2,,,is,to,,,more,the,,gather,.,,,,information
3,that,2026,will,,states,,,the,,decrease,.,of,-,likely,q2
4,that,2028,may,to,davis,,financial,the,,rise,.,in,$,,%
5,,,is,to,,,annual,the,,attend,.,,,,conference
6,,2029,is,to,commission,and,much,the,,increase,.,of,-,as,q2
7,,,is,to,,,major,a,,improve,.,,,currently,efficiency
8,that,2029,may,,association,,physical,the,,fall,.,in,,,%
9,,2029,is,to,q1,,patel,the,,decrease,.,of,,emily,timeframe


In [12]:
# Tabulate the word-level NER entities as a DataFrame (one column per
# entity label/occurrence, e.g. ORG_1, DATE_1).
# The mapping is passed by keyword, matching the POS call to convert_to_df,
# so both call sites read the same way.
# NOTE(review): assumes the second positional parameter of convert_to_df is
# `mapping` — confirm against DataProcessing.
all_word_level_ner_df = DataProcessing.convert_to_df(word_level_ner_entities, mapping=word_level_ner_mappings)
all_word_level_ner_df

Unnamed: 0,ORG_1,DATE_1,CARDINAL_2,MONEY_1,CARDINAL_1,GPE_1,DATE_2,PERCENT_2,PERCENT_1,PERSON_1,DATE_4,GPE_2,MONEY_2,QUANTITY_1,DATE_3
0,,last year,,,,,,,,,,,,,
1,the new york stock exchange,2025-08-20,,,,,20 october 2024,,20%,david lee,,,,,
3,the world health organization,2024-10-15,,,,the united states,2026,,3%,thompson,,,,,
4,,2028,,$25 billion,2024/08/22,,,,15%,james davis,,,,,
6,the securities and exchange commission,2024-08-21,,,,,2029,,as much as 50%,olivia brown,,,,,
8,the american heart association,2025,,,,,daily,,5%,brown,2029.0,,,,may fall
9,the federal reserve,2029,,,,,,,2%,,,,,,
10,the european centre,2026,,,,germany,weekly,,10%,taylor,,,,,2024-10-05
11,,2024,,,2500,,daily,,,lee,2028.0,,,,2000
12,,2024-10-15,,,,los angeles,2026,,15%,rachel kim,,,,,


In [13]:
# Place the POS frame and the NER frame side by side (column-wise concat,
# keeping the existing index labels).
word_level_tags_entities = [
    all_word_level_pos_df,
    all_word_level_ner_df,
]
word_level_tags_entities_df = DataProcessing.concat_dfs(
    word_level_tags_entities,
    axis=1,
    ignore_index=False,
)
word_level_tags_entities_df

Unnamed: 0,SCONJ,NUM,AUX,PART,PROPN,CCONJ,ADJ,DET,PRON,VERB,PUNCT,ADP,SYM,ADV,NOUN,ORG_1,DATE_1,CARDINAL_2,MONEY_1,CARDINAL_1,GPE_1,DATE_2,PERCENT_2,PERCENT_1,PERSON_1,DATE_4,GPE_2,MONEY_2,QUANTITY_1,DATE_3
0,,,,,,,last,a,,shows,.,in,,,year,,last year,,,,,,,,,,,,,
1,as,2024,will,,october,,,a,,predicted,.,on,-,likely,expert,the new york stock exchange,2025-08-20,,,,,20 october 2024,,20%,david lee,,,,,
2,,,is,to,,,more,the,,gather,.,,,,information,,,,,,,,,,,,,,,
3,that,2026,will,,states,,,the,,decrease,.,of,-,likely,q2,the world health organization,2024-10-15,,,,the united states,2026,,3%,thompson,,,,,
4,that,2028,may,to,davis,,financial,the,,rise,.,in,$,,%,,2028,,$25 billion,2024/08/22,,,,15%,james davis,,,,,
5,,,is,to,,,annual,the,,attend,.,,,,conference,,,,,,,,,,,,,,,
6,,2029,is,to,commission,and,much,the,,increase,.,of,-,as,q2,the securities and exchange commission,2024-08-21,,,,,2029,,as much as 50%,olivia brown,,,,,
7,,,is,to,,,major,a,,improve,.,,,currently,efficiency,,,,,,,,,,,,,,,
8,that,2029,may,,association,,physical,the,,fall,.,in,,,%,the american heart association,2025,,,,,daily,,5%,brown,2029.0,,,,may fall
9,,2029,is,to,q1,,patel,the,,decrease,.,of,,emily,timeframe,the federal reserve,2029,,,,,,,2%,,,,,,


### Drop Non-Essential Columns

In [14]:
# Row count of the NER frame. Its displayed index skips some sentences
# (e.g. 2, 5 and 7), so this may be lower than the number of sentences.
len(all_word_level_ner_df)

50

In [15]:
# POS columns treated as non-essential: determiners, adpositions,
# subordinating/coordinating conjunctions, punctuation and particles.
drop_word_level_pos_columns = ['DET', 'ADP', 'SCONJ', 'CCONJ', 'PUNCT', 'PART']
# Keep only the remaining ("core") POS columns.
core_word_level_pos_df = DataProcessing.drop_df_columns(all_word_level_pos_df, drop_word_level_pos_columns)
core_word_level_pos_df

Unnamed: 0,NUM,AUX,PROPN,ADJ,PRON,VERB,SYM,ADV,NOUN
0,,,,last,,shows,,,year
1,2024,will,october,,,predicted,-,likely,expert
2,,is,,more,,gather,,,information
3,2026,will,states,,,decrease,-,likely,q2
4,2028,may,davis,financial,,rise,$,,%
5,,is,,annual,,attend,,,conference
6,2029,is,commission,much,,increase,-,as,q2
7,,is,,major,,improve,,currently,efficiency
8,2029,may,association,physical,,fall,,,%
9,2029,is,q1,patel,,decrease,,emily,timeframe


In [16]:
# Drop the same non-essential POS columns from the combined POS + NER frame.
# `drop_word_level_pos_columns` is defined once, in the cell that builds
# core_word_level_pos_df, and reused here so the two column lists cannot
# drift apart.
core_word_level_pos_ner_df = DataProcessing.drop_df_columns(word_level_tags_entities_df, drop_word_level_pos_columns)
core_word_level_pos_ner_df

Unnamed: 0,NUM,AUX,PROPN,ADJ,PRON,VERB,SYM,ADV,NOUN,ORG_1,DATE_1,CARDINAL_2,MONEY_1,CARDINAL_1,GPE_1,DATE_2,PERCENT_2,PERCENT_1,PERSON_1,DATE_4,GPE_2,MONEY_2,QUANTITY_1,DATE_3
0,,,,last,,shows,,,year,,last year,,,,,,,,,,,,,
1,2024,will,october,,,predicted,-,likely,expert,the new york stock exchange,2025-08-20,,,,,20 october 2024,,20%,david lee,,,,,
2,,is,,more,,gather,,,information,,,,,,,,,,,,,,,
3,2026,will,states,,,decrease,-,likely,q2,the world health organization,2024-10-15,,,,the united states,2026,,3%,thompson,,,,,
4,2028,may,davis,financial,,rise,$,,%,,2028,,$25 billion,2024/08/22,,,,15%,james davis,,,,,
5,,is,,annual,,attend,,,conference,,,,,,,,,,,,,,,
6,2029,is,commission,much,,increase,-,as,q2,the securities and exchange commission,2024-08-21,,,,,2029,,as much as 50%,olivia brown,,,,,
7,,is,,major,,improve,,currently,efficiency,,,,,,,,,,,,,,,
8,2029,may,association,physical,,fall,,,%,the american heart association,2025,,,,,daily,,5%,brown,2029.0,,,,may fall
9,2029,is,q1,patel,,decrease,,emily,timeframe,the federal reserve,2029,,,,,,,2%,,,,,,


### Encode

In [17]:
# Encode the core POS frame; in the output a cell is 1 where the tag had a
# value for that sentence and 0 where it was empty.
encoded_word_level_pos_df = DataProcessing.encode_tags_entities_dataframe(core_word_level_pos_df)
encoded_word_level_pos_df

Unnamed: 0,NUM,AUX,PROPN,ADJ,PRON,VERB,SYM,ADV,NOUN
0,0,0,0,1,0,1,0,0,1
1,1,1,1,0,0,1,1,1,1
2,0,1,0,1,0,1,0,0,1
3,1,1,1,0,0,1,1,1,1
4,1,1,1,1,0,1,1,0,1
5,0,1,0,1,0,1,0,0,1
6,1,1,1,1,0,1,1,1,1
7,0,1,0,1,0,1,0,1,1
8,1,1,1,1,0,1,0,0,1
9,1,1,1,1,0,1,0,1,1


In [18]:
# Encode the NER frame the same way (1 = entity present, 0 = empty).
# NOTE(review): this frame's index skips sentences such as 2, 5 and 7, so it
# does not line up row-for-row with the POS encoding.
encoded_word_level_ner_df = DataProcessing.encode_tags_entities_dataframe(all_word_level_ner_df)
encoded_word_level_ner_df

Unnamed: 0,ORG_1,DATE_1,CARDINAL_2,MONEY_1,CARDINAL_1,GPE_1,DATE_2,PERCENT_2,PERCENT_1,PERSON_1,DATE_4,GPE_2,MONEY_2,QUANTITY_1,DATE_3
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0
3,1,1,0,0,0,1,1,0,1,1,0,0,0,0,0
4,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0
6,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0
8,1,1,0,0,0,0,1,0,1,1,1,0,0,0,1
9,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0
10,1,1,0,0,0,1,1,0,1,1,0,0,0,0,1
11,0,1,0,0,1,0,1,0,0,1,1,0,0,0,1
12,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0


### Combine Encodings

- NOTE: Encoding the POS and NER frames individually and then combining them doesn't work properly, so it's best to combine them in an earlier step and encode the already-combined frame.

In [19]:
# encoded_word_level_pos_ner_list = [encoded_word_level_pos_df, encoded_word_level_ner_df]
# encoded_word_level_pos_ner_df = pd.concat(encoded_word_level_pos_ner_list, axis=1)
# encoded_word_level_pos_ner_df

In [20]:
# Encode the already-combined POS + NER frame in one pass, giving one row per
# sentence with 0/1 flags for every remaining POS and NER column.
encoded_word_level_pos_ner_df = DataProcessing.encode_tags_entities_dataframe(core_word_level_pos_ner_df)
encoded_word_level_pos_ner_df

Unnamed: 0,NUM,AUX,PROPN,ADJ,PRON,VERB,SYM,ADV,NOUN,ORG_1,DATE_1,CARDINAL_2,MONEY_1,CARDINAL_1,GPE_1,DATE_2,PERCENT_2,PERCENT_1,PERSON_1,DATE_4,GPE_2,MONEY_2,QUANTITY_1,DATE_3
0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,1,1,1,1,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0
2,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,1,1,1,1,1,1,0,0,0,1,1,0,1,1,0,0,0,0,0
4,1,1,1,1,0,1,1,0,1,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0
5,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,1,1,1,0,1,1,1,1,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0
7,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1,1,1,1,0,1,0,0,1,1,1,0,0,0,0,1,0,1,1,1,0,0,0,1
9,1,1,1,1,0,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0


## Combine Sentence Embeddings and Encodings

- Need to extract word and/or sentence embeddings
- The encodings would be much better if we can reclassify some of them (and drop certain columns)

In [21]:
# _ = [word_embeddings_df, sent_embeddings_df]
# encoded_word_level_pos_ner_df = DataProcessing.concat_dfs(_, encoded_word_level_pos_ner_df)
# encoded_word_level_pos_ner_df

In [23]:
# Re-display the combined encoding before it is stored.
# NOTE(review): the execution counts around this cell are out of order
# (23 appears before 22); verify with Restart Kernel -> Run All.
encoded_word_level_pos_ner_df

Unnamed: 0,NUM,AUX,PROPN,ADJ,PRON,VERB,SYM,ADV,NOUN,ORG_1,DATE_1,CARDINAL_2,MONEY_1,CARDINAL_1,GPE_1,DATE_2,PERCENT_2,PERCENT_1,PERSON_1,DATE_4,GPE_2,MONEY_2,QUANTITY_1,DATE_3
0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,1,1,1,1,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0
2,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,1,1,1,1,1,1,0,0,0,1,1,0,1,1,0,0,0,0,0
4,1,1,1,1,0,1,1,0,1,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0
5,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,1,1,1,0,1,1,1,1,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0
7,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1,1,1,1,0,1,0,0,1,1,1,0,0,0,0,1,0,1,1,1,0,0,0,1
9,1,1,1,1,0,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0


In [22]:
# Persist the combined encoding so it can be restored elsewhere with
# `%store -r encoded_word_level_pos_ner_df`.
%store encoded_word_level_pos_ner_df

Stored 'encoded_word_level_pos_ner_df' (DataFrame)
