# spaCy Pipeline

- **Goal:** Prediction Recognition

- **Purpose:** To extract named entities (NER), part-of-speech (POS) tags, etc.
    1. Used as additional features to train the model, since the existing feature extraction (i.e., TF-IDF) alone isn't enough

- **Misc:**
    - `%store`: Line magic that stores the variable of interest so we can load it in another notebook (with `%store -r`)

In [1]:
import os
import sys
import spacy

import pandas as pd
# Working directory of the notebook (kept under the name `notebook_dir`)
notebook_dir = os.getcwd()
# Make the parent directory importable so the project modules imported below resolve.
# The path is resolved to a canonical absolute form and only added when missing,
# so re-running this cell does not keep growing sys.path with duplicate entries.
parent_dir = os.path.realpath(os.path.join(notebook_dir, os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from pipelines import BasePipeline
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
# Restore `shuffled_base_df` from IPython's %store storage
# (presumably saved by an upstream notebook with `%store shuffled_base_df` — confirm)
%store -r shuffled_base_df
# Display settings for this notebook: allow up to 800 characters per cell and
# remove the column/row truncation limits so frames render in full.
# The fully-qualified option name is used for the column width: abbreviated names
# are resolved by pattern matching and can become ambiguous if pandas adds
# similarly named options.
pd.set_option('display.max_colwidth', 800)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

python eval() and df.apply()

In [3]:
# Inspect the restored frame (the row/column display limits were lifted in the previous cell)
shuffled_base_df

Unnamed: 0,Base Sentence,Prediction Label,Model Name,Domain,Template Number
0,The music echoed through the empty hall.,0,llama-3.3-70b-versatile,any,0
1,"According to a policy analyst, Emily Chen, from the Congressional Budget Office, on 2024-08-22, the federal budget deficit is expected to decrease beyond $1 trillion in the timeframe of Q4 of 2027.",1,llama-3.3-70b-versatile,policy,4
2,"On 2024-10-15, Dr. David Lee, a health expert, predicts that the obesity rate at the World Health Organization will likely decrease by 3% in Q2 of 2026.",1,llama-3.3-70b-versatile,health,1
3,"According to a senior level person from 3M, on 2024/08/22, the operating income is expected to increase as much as $500 million, reflecting a 20% increase, in the timeframe of Q2 of 2029.",1,llama-3.3-70b-versatile,financial,4
4,"On 2024-10-15, Rachel Patel, a financial analyst, predicts that the operating income at General Motors will likely increase by $5 billion in Q2 of 2026.",1,llama-3.3-70b-versatile,financial,1
5,She practiced yoga on the quiet morning.,0,llama-3.3-70b-versatile,any,0
6,He played with his dog in the backyard.,0,llama-3.3-70b-versatile,any,0
7,The kids played tag in the park playground.,0,llama-3.3-70b-versatile,any,0
8,They went to the movies on a Friday night.,0,llama-3.3-70b-versatile,any,0
9,He ate a healthy breakfast every morning.,0,llama-3.3-70b-versatile,any,0


In [4]:
# Build the project's SpacyFeatureExtraction helper from the frame and its 'Base Sentence' column
# NOTE(review): presumably the spaCy model is loaded inside the constructor — confirm in feature_extraction
spacy_feature_extractor = SpacyFeatureExtraction(shuffled_base_df, 'Base Sentence')
# Echo the object to confirm it was constructed
spacy_feature_extractor

<feature_extraction.SpacyFeatureExtraction at 0x130951d10>

## Extract Part-of-Speech (POS) Tags and Named Entity Recognition (NER) Entities at Word Level

In [5]:
# Pull the 'Base Sentence' column out of the frame as a list; it is the input to extract_entities below
# NOTE(review): despite the name, the list also contains the label-0 (non-prediction) sentences
only_predictions = DataProcessing.df_to_list(shuffled_base_df, 'Base Sentence')
only_predictions

['The music echoed through the empty hall.',
 'According to a policy analyst, Emily Chen, from the Congressional Budget Office, on 2024-08-22, the federal budget deficit is expected to decrease beyond $1 trillion in the timeframe of Q4 of 2027.',
 'On 2024-10-15, Dr. David Lee, a health expert, predicts that the obesity rate at the World Health Organization will likely decrease by 3% in Q2 of 2026.',
 'According to a senior level person from 3M, on 2024/08/22, the operating income is expected to increase as much as $500 million, reflecting a 20% increase, in the timeframe of Q2 of 2029.',
 'On 2024-10-15, Rachel Patel, a financial analyst, predicts that the operating income at General Motors will likely increase by $5 billion in Q2 of 2026.',
 'She practiced yoga on the quiet morning.',
 'He played with his dog in the backyard.',
 'The kids played tag in the park playground.',
 'They went to the movies on a Friday night.',
 'He ate a healthy breakfast every morning.',
 'According to a 

In [6]:
# Pipeline components to switch off for the word-level extraction
# (presumably forwarded to spaCy's `disable` option — confirm in feature_extraction)
word_level_disable_components = ["lemmatizer"]

# extract_entities returns four values, unpacked in order:
# POS tags, POS mappings, NER entities, NER mappings
(
    word_level_pos_tags,
    word_level_pos_mappings,
    word_level_ner_entities,
    word_level_ner_mappings,
) = spacy_feature_extractor.extract_entities(only_predictions, word_level_disable_components)

### Visualize as DF

In [7]:
# Tabulate the word-level POS tags, supplying the POS mappings through `mapping`
all_word_level_pos_df = DataProcessing.convert_to_df(
    word_level_pos_tags,
    mapping=word_level_pos_mappings,
)
all_word_level_pos_df

Unnamed: 0,ADV,ADP,CCONJ,DET,SYM,NUM,PART,SCONJ,PROPN,PUNCT,PRON,AUX,ADJ,NOUN,VERB
0,,through,,the,,,,,,.,,,empty,hall,echoed
1,,of,,the,$,2027.0,to,,Q4,.,,is,federal,timeframe,decrease
2,likely,of,,the,-,2026.0,,that,Q2,.,,will,,%,decrease
3,as,of,,the,$,2029.0,to,,Q2,.,,is,much,timeframe,reflecting
4,likely,of,,the,$,2026.0,,that,Q2,.,,will,financial,income,increase
5,,on,,the,,,,,,.,She,,quiet,morning,practiced
6,,in,,the,,,,,,.,his,,,backyard,played
7,,in,,the,,,,,,.,,,,playground,played
8,,on,,a,,,,,Friday,.,They,,,night,went
9,,,,every,,,,,,.,He,,healthy,morning,ate


In [8]:
# Tabulate the word-level NER entities together with their mappings
# NOTE(review): the mappings are passed positionally here but via `mapping=` in the POS cell —
# confirm both bind to the same convert_to_df parameter
all_word_level_ner_df = DataProcessing.convert_to_df(word_level_ner_entities, word_level_ner_mappings)
all_word_level_ner_df

Unnamed: 0,DATE_1,CARDINAL_3,ORG_1,PRODUCT_1,TIME_1,ORG_2,CARDINAL_1,CARDINAL_2,PERCENT_1,QUANTITY_1,DATE_3,PERSON_1,GPE_1,DATE_2,MONEY_1,MONEY_2,PERCENT_2
1,2024-08-22,,the Congressional Budget Office,,,,Q4,2027,,,,Emily Chen,,,$1 trillion,,
2,2024-10-15,,the World Health Organization,Q2,,,2026,,3%,,,David Lee,,,,,
3,,,3M,Q2 of 2029,,,2024/08/22,,20%,,,,,,as much as $500 million,,
4,2024-10-15,,General Motors,Q2 of 2026,,,,,,,,Rachel Patel,,,$5 billion,,
5,,,,,the quiet morning,,,,,,,,,,,,
8,Friday,,,,night,,,,,,,,,,,,
10,2024-11-25,,the Meteorological Service of Canada,,,,,,,20 inches,,,Toronto,2026-02-01,,,
14,2024-08-24,10 million,,,,,Q4,2027,25%,,,Daniel Hall,the United States,,,,
16,2024-10-11,,the Department of Commerce,,,Q3,2029,,,,,Lisa Nguyen,,,$20 billion,,
18,2024-07-22,,the National Weather Service,,,,2028,,10%,,,Samantha Brown,Chicago,,,,


In [9]:
# Gather the POS and NER frames and combine them column-wise (axis=1, ignore_index=False)
word_level_tags_entities = [all_word_level_pos_df, all_word_level_ner_df]
word_level_tags_entities_df = DataProcessing.concat_dfs(
    word_level_tags_entities,
    axis=1,
    ignore_index=False,
)
word_level_tags_entities_df

Unnamed: 0,ADV,ADP,CCONJ,DET,SYM,NUM,PART,SCONJ,PROPN,PUNCT,PRON,AUX,ADJ,NOUN,VERB,DATE_1,CARDINAL_3,ORG_1,PRODUCT_1,TIME_1,ORG_2,CARDINAL_1,CARDINAL_2,PERCENT_1,QUANTITY_1,DATE_3,PERSON_1,GPE_1,DATE_2,MONEY_1,MONEY_2,PERCENT_2
0,,through,,the,,,,,,.,,,empty,hall,echoed,,,,,,,,,,,,,,,,,
1,,of,,the,$,2027.0,to,,Q4,.,,is,federal,timeframe,decrease,2024-08-22,,the Congressional Budget Office,,,,Q4,2027,,,,Emily Chen,,,$1 trillion,,
2,likely,of,,the,-,2026.0,,that,Q2,.,,will,,%,decrease,2024-10-15,,the World Health Organization,Q2,,,2026,,3%,,,David Lee,,,,,
3,as,of,,the,$,2029.0,to,,Q2,.,,is,much,timeframe,reflecting,,,3M,Q2 of 2029,,,2024/08/22,,20%,,,,,,as much as $500 million,,
4,likely,of,,the,$,2026.0,,that,Q2,.,,will,financial,income,increase,2024-10-15,,General Motors,Q2 of 2026,,,,,,,,Rachel Patel,,,$5 billion,,
5,,on,,the,,,,,,.,She,,quiet,morning,practiced,,,,,the quiet morning,,,,,,,,,,,,
6,,in,,the,,,,,,.,his,,,backyard,played,,,,,,,,,,,,,,,,,
7,,in,,the,,,,,,.,,,,playground,played,,,,,,,,,,,,,,,,,
8,,on,,a,,,,,Friday,.,They,,,night,went,Friday,,,,night,,,,,,,,,,,,
9,,,,every,,,,,,.,He,,healthy,morning,ate,,,,,,,,,,,,,,,,,


### Encode

In [10]:
# Encode the combined tag/entity frame with the project helper
# (judging by the displayed result, filled cells become 1 and empty cells become 0)
encoded_word_level_tags_entities_df = DataProcessing.encode_tags_entities_df(word_level_tags_entities_df)
encoded_word_level_tags_entities_df

Unnamed: 0,ADV,ADP,CCONJ,DET,SYM,NUM,PART,SCONJ,PROPN,PUNCT,PRON,AUX,ADJ,NOUN,VERB,DATE_1,CARDINAL_3,ORG_1,PRODUCT_1,TIME_1,ORG_2,CARDINAL_1,CARDINAL_2,PERCENT_1,QUANTITY_1,DATE_3,PERSON_1,GPE_1,DATE_2,MONEY_1,MONEY_2,PERCENT_2
0,0,1,0,1,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,1,1,1,0,1,1,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0
2,1,1,0,1,1,1,0,1,1,1,0,1,0,1,1,1,0,1,1,0,0,1,0,1,0,0,1,0,0,0,0,0
3,1,1,0,1,1,1,1,0,1,1,0,1,1,1,1,0,0,1,1,0,0,1,0,1,0,0,0,0,0,1,0,0
4,1,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0
5,0,1,0,1,0,0,0,0,0,1,1,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
6,0,1,0,1,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,1,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,1,0,1,0,0,0,0,1,1,1,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# Persist the encoded frame with %store so another notebook can reload it via `%store -r`
%store encoded_word_level_tags_entities_df

Stored 'encoded_word_level_tags_entities_df' (DataFrame)
