# Spacy
[spacy](https://spacy.io)

In [2]:
import numpy as np
import pyarrow as pa
import spacy
import vaex
from spacy.cli import download
from spacy.language import Language
from goldilox import Pipeline

def download_nlp(lang='en_core_web_sm'):
    try:
        nlp = spacy.load(lang)
        return True
    except:
        download(lang)
    return False

download_nlp()

True

In [5]:
# Build a spacy entities pipeline
@Language.component("ents")
def ents(doc):
    return doc.ents

nlp_entitie = spacy.load('en_core_web_sm')

texts = [
    "Net income was $9.4 million compared to the prior year of $2.7 million. Apple is doing very well",
    "IBM had their revenue exceeded twelve billion dollars, with a loss of $1b.",
]

df = vaex.from_arrays(text=texts)

@Language.component("vectorize")
def vectorize(doc):
    return doc.vector

vectorizer = spacy.load('en_core_web_sm', disable=["ner", 'parser'])
vectorizer.add_pipe('vectorize', name='vectorize', last=True)

@vaex.register_function()
def to_vector(ar):
    if not isinstance(ar, list):
        ar = ar.tolist()
    return np.array([vectorizer(doc) for doc in ar])

df.add_function('to_vector', to_vector)

df['vector'] = df.text.to_vector()
pipeline = Pipeline.from_vaex(df)
pipeline.inference(pipeline.raw)



#,text,vector
0,'Net income was $9.4 million compared to the pri...,"'array([ 0.03719226, -0.00760807, -0.16960604, ..."


# Hugging face - sentence transformer
* [huggingface](https://huggingface.co/docs/transformers/v4.15.0/en/main_classes/pipelines#transformers.FeatureExtractionPipeline)
* [sentence transformer](https://github.com/UKPLab/sentence-transformers)

In [50]:
from sentence_transformers import SentenceTransformer
import vaex
import pyarrow as pa
from goldilox import Pipeline

transformer = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")


Downloading:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/555 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/505 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [49]:
@vaex.register_function()
def to_vector(ar):
    print(ar)
    ret = transformer.encode(ar.tolist(), convert_to_numpy=True)
    print(ret)
    return ret


df = vaex.from_arrays(text=['What do you mean by by "It takes two to tango"?'])
df.add_function("to_vector", to_vector)

df['text'] = df['text'].fillna('')
df.text.to_vector()

[
  "What do you mean by by "It takes two to tango"?"
]
[
  "What do you mean by by "It takes two to tango"?"
]
[
  "What do you mean by by "It takes two to tango"?"
]
[
  "What do you mean by by "It takes two to tango"?"
]


AttributeError: 'NoneType' object has no attribute 'tokenize'

In [6]:

@vaex.register_function()
def to_vector(ar):
    if isinstance(ar, str):
        ar = [ar]    
    if hasattr(ar, 'tolist'):
        ar = ar.tolist()
    ret = pa.array(qa(question, story)['answer'] for question in ar)
    return ret


df = vaex.from_arrays(text=['What do you mean by by "It takes two to tango"?'])
df.add_function("to_vector", to_vector)

df['text'] = df['text'].fillna('')
df['vector'] = df.text.to_vector()

model = Pipeline.from_vaex(df)
model.inference({"text":"this is my life"})

No model was supplied, defaulted to distilbert-base-cased (https://huggingface.co/distilbert-base-cased)


KeyboardInterrupt: 