# Spacy Pipeline

- **Goal:** Prediction Recognition

- **Purpose:** To extract named entities (NER), part-of-speech (POS) tags, etc.
    1. Use these as additional features when training the model, since statistical feature extraction (i.e., TF-IDF) alone isn't enough

- **Misc:**
    - `%store`: Line magic that stores a variable of interest so we can load it in another notebook (restore it there with `%store -r`)

In [1]:
import os
import sys
import spacy

import pandas as pd
# Resolve the kernel's current working directory (normally the notebook's folder).
notebook_dir = os.getcwd()
# Put the parent directory on sys.path so the local imports that follow
# (pipelines, data_processing, feature_extraction) can presumably be resolved
# from there.
parent_dir = os.path.join(notebook_dir, '../')
# Guard the append: re-running this cell must not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from pipelines import BasePipeline
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
# Restore variables that were persisted with `%store` (see the Misc note at the
# top of this notebook); `-r` reloads the named variable into this kernel.
# NOTE(review): `ner_dfs` is restored here, but the visible cells further down
# only reference `ner_df` — confirm which name is intended.
%store -r shuffled_base_df
%store -r shuffled_cleaned_df
%store -r ner_dfs
# Notebook-wide pandas display settings: allow up to 800 characters per cell
# and disable truncation of columns and rows.
# The fully qualified 'display.max_colwidth' key is used because pandas resolves
# abbreviated option names by pattern matching, which can break if a later
# release adds another option with a similar name.
pd.set_option('display.max_colwidth', 800)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

python eval() and df.apply()

In [3]:
# Switch selecting which restored dataframe feeds the rest of the notebook:
# True -> shuffled_cleaned_df, False -> shuffled_base_df.
clean_data = True

shuffled_df = shuffled_cleaned_df if clean_data else shuffled_base_df

# Bare last expression so the chosen dataframe is rendered as the cell output.
shuffled_df

Unnamed: 0,Base Predictions,Prediction Label
0,the weather today is mostly sunny with some clouds in the sky.,0
1,"according to david harper from accuweather, on friday, august 15, 2024, the rainfall in portland will decrease by 12% in the timeframe of early june 2025.",1
2,the hotel room was clean and comfortable for guests to stay.,0
3,**health-based predictions**,1
4,"in 2027, the average weekly exercise hours in the united states are expected to rise by 18%, as predicted by the national institutes of health on friday, december 13, 2025.",1
5,the company's mission statement is to provide excellent customer service always.,0
6,**weather-based predictions**,1
7,the team is working diligently to resolve the ongoing technical issues.,0
8,the meeting will be rescheduled due to unforeseen circumstances that occurred.,0
9,"according to julian hall from microsoft (msft), on tuesday, july 23, 2024, the net profit will increase by 10% to $25 billion in the timeframe of q3 of 2029.",1


In [4]:
# NOTE(review): scratch cell — every statement is commented out, so running it
# does nothing. It appears to compute by hand the word/sentence embeddings that
# the "Word" and "Sentence" cells below obtain through SpacyFeatureExtraction;
# confirm and delete if redundant. The array output recorded under this cell is
# left over from an earlier run when the code was still active.
# nlp = spacy.load("en_core_web_sm")
# doc = nlp(shuffled_df['Base Predictions'][0])
# word_embeddings = [token.vector for token in doc]
# # pos_tags = [token.pos_ for token in doc]
# # ner_tags = [token.ent_type_ for token in doc]

# # Get sentence-level embeddings from Spacy
# sentence_embeddings = [sent.vector for sent in doc.sents]

[array([ 0.662991  , -0.21156058, -0.20048952, -0.12118668, -0.22676182,
        -0.25694323,  0.24674413, -0.21157083, -0.09538038,  0.26959398,
         0.01399686,  0.27714017, -0.02267465,  0.3724324 , -0.17011325,
        -0.04379023, -0.13943155,  0.30492797,  0.10463957, -0.13475452,
         0.0241785 ,  0.05528714, -0.20327629, -0.26266506,  0.14887236,
        -0.08133844,  0.12936035, -0.24352883,  0.30017778,  0.3994013 ,
         0.05827643,  0.0861609 ,  0.02457866, -0.22384998, -0.0602062 ,
         0.03821513,  0.01632952,  0.04838169, -0.01543305, -0.08810471,
        -0.17859665,  0.26483986,  0.05982164, -0.20512445,  0.01541738,
        -0.07931299, -0.09695514,  0.4877248 ,  0.00624968, -0.07577613,
        -0.20204765, -0.04950607, -0.05664921, -0.23302075, -0.09530719,
        -0.16260701,  0.01315193, -0.2579068 ,  0.33751994, -0.10189056,
        -0.30145591, -0.13413303,  0.03628096, -0.23133725, -0.00782322,
         0.19889727,  0.36628872, -0.48181278, -0.3

### Word

In [9]:
# Build the project's SpacyFeatureExtraction helper over the 'Base Predictions'
# column of shuffled_df and collect its word-level features.
# NOTE(review): the recorded output is a list of NumPy arrays — presumably one
# vector per token; confirm against SpacyFeatureExtraction.
spacy_feature_extractor = SpacyFeatureExtraction(shuffled_df, 'Base Predictions')
word_embeddings = spacy_feature_extractor.word_feature_extraction()
# Bare last expression renders the entire list as the cell output.
word_embeddings

[array([ 1.2557808 , -0.2730553 , -0.05277061,  0.6520514 , -0.04622172,
        -0.60923415, -0.4833792 , -0.45323235, -0.9001082 ,  0.96256423,
        -0.04206129,  1.8058378 ,  1.0849649 ,  0.0821459 , -0.82509154,
        -0.87171376, -0.62434715,  2.415669  , -0.3986623 , -0.66857094,
        -0.18613659,  0.13227992,  0.4813207 , -0.6782787 ,  1.4480814 ,
        -0.38775215,  0.3440814 , -0.10912246,  0.39807475,  1.3983884 ,
         0.2689714 , -0.5952974 , -0.09329724, -1.4015822 ,  0.14311925,
        -0.72895664, -0.08295457,  0.49438483, -0.47703826,  1.7069368 ,
        -0.03583494,  1.4617534 , -0.54627174,  0.5320741 , -0.15453327,
        -0.01306841, -0.5807851 ,  0.90508944,  0.23614335,  0.60896003,
        -0.13250214,  0.5743287 ,  0.68293226,  1.5849355 ,  0.571682  ,
        -0.36314642, -0.9793233 ,  0.12180568, -1.1444669 , -0.15437017,
        -0.36800274,  0.05638431,  0.18841448, -0.5911322 , -0.24214824,
        -0.606945  ,  0.17194352, -1.0552365 , -0.8

In [10]:
# Number of arrays returned by word_feature_extraction().
len(word_embeddings)

873

### Sentence

In [8]:
# Build the project's SpacyFeatureExtraction helper over the 'Base Predictions'
# column of shuffled_df and collect its sentence-level features.
# NOTE(review): the recorded output is a list of NumPy arrays — presumably one
# vector per sentence; confirm against SpacyFeatureExtraction.
spacy_feature_extractor = SpacyFeatureExtraction(shuffled_df, 'Base Predictions')
sent_embeddings = spacy_feature_extractor.sentence_feature_extraction()
# Bare last expression renders the entire list as the cell output.
sent_embeddings

[array([ 0.662991  , -0.21156058, -0.20048952, -0.12118668, -0.22676182,
        -0.25694323,  0.24674413, -0.21157083, -0.09538038,  0.26959398,
         0.01399686,  0.27714017, -0.02267465,  0.3724324 , -0.17011325,
        -0.04379023, -0.13943155,  0.30492797,  0.10463957, -0.13475452,
         0.0241785 ,  0.05528714, -0.20327629, -0.26266506,  0.14887236,
        -0.08133844,  0.12936035, -0.24352883,  0.30017778,  0.3994013 ,
         0.05827643,  0.0861609 ,  0.02457866, -0.22384998, -0.0602062 ,
         0.03821513,  0.01632952,  0.04838169, -0.01543305, -0.08810471,
        -0.17859665,  0.26483986,  0.05982164, -0.20512445,  0.01541738,
        -0.07931299, -0.09695514,  0.4877248 ,  0.00624968, -0.07577613,
        -0.20204765, -0.04950607, -0.05664921, -0.23302075, -0.09530719,
        -0.16260701,  0.01315193, -0.2579068 ,  0.33751994, -0.10189056,
        -0.30145591, -0.13413303,  0.03628096, -0.23133725, -0.00782322,
         0.19889727,  0.36628872, -0.48181278, -0.3

In [11]:
# Number of arrays returned by sentence_feature_extraction().
len(sent_embeddings)

41

In [None]:
# Split into train/test parts via the project's DataProcessing.split_data helper
# (presumably features and labels, judging by the X_/y_ names — confirm).
# NOTE(review): `vector_data` and `vector_keys` are not defined or restored in
# any visible cell of this notebook, so this cell would raise NameError on a
# fresh kernel — TODO define them before this cell.
X_train, X_test, y_train, y_test = DataProcessing.split_data(vector_data, vector_keys)
# Bare last expression displays the training portion.
X_train

## Play

- Remove once finalized

In [None]:
# Print spaCy's description for every column name of pos_df.
# NOTE(review): pos_df is not defined or restored in any visible cell — confirm
# where it comes from before running on a fresh kernel.
pos_col_names = [*pos_df.columns]
for pos_col_name in pos_col_names:
    pos_explanation = spacy.explain(pos_col_name)
    print(f"pos_col_name: {pos_explanation}")

In [None]:
# Show the column names of ner_df as a plain list.
# NOTE(review): only `ner_dfs` (plural) is restored via %store at the top of
# this notebook; `ner_df` is not defined in any visible cell — confirm the name.
list(ner_df.columns)

In [None]:
# For every column name of ner_df, print the raw name and then spaCy's
# description of it on the following line.
# NOTE(review): only `ner_dfs` (plural) is restored via %store at the top of
# this notebook; `ner_df` is not defined in any visible cell — confirm the name.
ner_col_names = [*ner_df.columns]
for ner_col_name in ner_col_names:
    print(ner_col_name, f"ner_col_name: {spacy.explain(ner_col_name)}", sep="\n")