<a href="https://colab.research.google.com/github/kullawattana/thesis_2020_spacy_colab/blob/master/36_pipeline_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the English language class
import spacy
from spacy.lang.en import English 
# Create the nlp object from the English language class imported above
nlp = English()

#-------------Anatomy of a component (1)-------------
def custom_component(doc):
    """Minimal pipeline component: take a doc, optionally change it, return it.

    Args:
        doc: The document object handed to the component.

    Returns:
        The same ``doc`` object, so it can be passed on after this component.
    """
    # Do something to the doc here
    return doc

#-------------Anatomy of a component (2)-------------
# Register the component on the nlp object. The call has to live at module
# level: nested inside the function body it is never executed. The function
# is defined once, so it is also registered exactly once.
nlp.add_pipe(custom_component)

In [None]:
#--------------Example: a simple component (1)--------------
# Create the nlp object via spacy.load('en_core_web_sm')
nlp = spacy.load('en_core_web_sm') 
# Define a custom component
def custom_component(doc):
    """Print the length of ``doc`` and pass the doc on unchanged.

    Args:
        doc: The document object handed to the component; must support len().

    Returns:
        The same ``doc`` object that was passed in.
    """
    # len(doc) is an int: convert it before concatenating, otherwise the
    # str + int addition raises TypeError as soon as the component runs.
    print('Doc length:' + str(len(doc)))
    return doc

# Add the component first in the pipeline (first=True places it ahead of
# the components that are already present)
nlp.add_pipe(custom_component, first=True) 

# Print the pipeline component names to confirm the resulting order
print('Pipeline:', nlp.pipe_names)

Pipeline: ['custom_component', 'tagger', 'parser', 'ner']


In [None]:
#------------Example: a simple component (2)---------------
# Create the nlp object via spacy.load('en_core_web_sm')
nlp = spacy.load('en_core_web_sm') 
# Define a custom component
def custom_component(doc):
    """Report the doc's length on stdout, then hand the doc back untouched.

    Args:
        doc: The document object handed to the component; must support len().

    Returns:
        The same ``doc`` object that was passed in.
    """
    length_text = str(len(doc))
    # Same message format as before: label immediately followed by the number
    print('Doc length:' + length_text)
    return doc

# Add the component first in the pipeline
nlp.add_pipe(custom_component, first=True) 
# Process a text: calling nlp on it runs the pipeline, including the custom
# component, which prints the doc length
doc = nlp("Hello world!")

Doc length:3


In [None]:
#-----------------Setting custom attributes---------------------
# Import global classes
from spacy.tokens import Doc, Token, Span

# Set extensions on the Doc, Token and Span.
# An extension has to be registered BEFORE it is read or written through
# `._`; doing it first keeps this cell working on a fresh kernel.
# force=True allows the cell to be re-run without a "already exists" error.
Doc.set_extension('title', default=None, force=True)
Token.set_extension('is_color', default=False, force=True)
Span.set_extension('has_color', default=False, force=True)

# With 'title' registered, it can be assigned on the doc created in a
# previous cell.
doc._.title = "My document"
#token._.is_color = True
#span._.has_color = False

In [None]:
#---------------Attribute extensions-------------------
from spacy.tokens import Token

# Set extension on the Token with default value
# (kept commented out: 'is_color' is already registered on Token with
# default=False in the "Setting custom attributes" cell)
#Token.set_extension('is_color', default=False)
doc = nlp("The sky is blue.")

# Overwrite extension attribute value for the token at index 3
doc[3]._.is_color = True

Doc length:5


In [None]:
#---------------Property extensions (1)-----------------
from spacy.tokens import Token

# Define getter function
def get_is_color(token):
    """Getter: tell whether the token's text is one of the known color names.

    Args:
        token: Object exposing a ``text`` attribute.

    Returns:
        True if ``token.text`` is exactly 'red', 'yellow' or 'blue', else False.
    """
    return token.text in ('red', 'yellow', 'blue')

# Set extension on the Token with getter; force=True is passed because
# 'is_color' was already registered with a default earlier in the notebook
Token.set_extension('is_color', getter=get_is_color, force=True) 
doc = nlp("The sky is blue.")
# Show the getter result next to the token text for the token at index 3
print(doc[3]._.is_color, '-', doc[3].text)

Doc length:5
True - blue


In [None]:
#---------------Property extensions (2)-----------------
from spacy.tokens import Span
# Define getter function
def get_has_color(span):
    """Getter: tell whether any token in the span is a known color name.

    Args:
        span: Iterable of objects exposing a ``text`` attribute.

    Returns:
        True as soon as one token's text is 'red', 'yellow' or 'blue';
        False if none matches (including an empty span).
    """
    known_colors = ('red', 'yellow', 'blue')
    for token in span:
        if token.text in known_colors:
            return True
    return False

# Set extension on the Span with getter; force=True is passed because
# 'has_color' was already registered with a default earlier in the notebook
Span.set_extension('has_color', getter=get_has_color, force=True)
doc = nlp("The sky is blue.") 
# Compare a slice that contains a color word with one that does not
print(doc[1:4]._.has_color, '-', doc[1:4].text) 
print(doc[0:2]._.has_color, '-', doc[0:2].text)

Doc length:5
True - sky is blue
False - The sky


In [None]:
#---------------------Method extensions-------------------
from spacy.tokens import Doc
# Define method with arguments
def has_token(doc, token_text):
    """Check whether any token in ``doc`` has exactly the given text.

    Args:
        doc: Iterable of objects exposing a ``text`` attribute.
        token_text: The text to look for.

    Returns:
        True if at least one token's text equals ``token_text``, else False.
    """
    # The result has to be returned; computing it into a local and falling
    # off the end of the function makes every call evaluate to None.
    return any(token.text == token_text for token in doc)

# Set extension on the Doc with method
Doc.set_extension('has_token', method=has_token, force=True)
doc = nlp("The sky is blue.") 
# has_token is declared as has_token(doc, token_text); at the call site only
# token_text is supplied
print(doc._.has_token('blue'), '- blue') 
print(doc._.has_token('cloud'), '- cloud')

Doc length:5
None - blue
None - cloud


In [None]:
#---------------Passing in context (1)-----------------
# Each entry pairs a text with a dict of metadata about that text
data = [
    ('This is a text', {'id': 1, 'page_number': 15}),
    ('And another text', {'id': 2, 'page_number': 16}),
]

# as_tuples=True makes nlp.pipe yield (doc, context) pairs, so the metadata
# stays attached to the doc it belongs to
for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context['page_number'])

#---------------Passing in context (2)-----------------
from spacy.tokens import Doc

# Register one Doc extension per metadata field that will be copied over
Doc.set_extension('id', default=None, force=True)
Doc.set_extension('page_number', default=None, force=True)

# Each entry pairs a text with a dict of metadata about that text
data = [
    ('This is a text', {'id': 1, 'page_number': 15}),
    ('And another text', {'id': 2, 'page_number': 16}),
]

# Copy the context values onto the custom attributes of each doc
for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context['id']
    doc._.page_number = context['page_number']

Doc length:4
Doc length:3
This is a text 15
And another text 16
Doc length:4
Doc length:3


In [None]:
#-------------------Using only the tokenizer (2)---------------------------
# BAD: calling nlp on the text also runs the pipeline components (the custom
# component prints the doc length), which is wasted work if only the
# tokenized doc is needed
doc = nlp("Hello world")                    #BAD
# GOOD: make_doc builds the doc without triggering the component output
doc = nlp.make_doc("Hello world!")          #GOOD

Doc length:2


In [None]:
#-------------------Disabling pipeline components--------------------------
# Disable tagger and parser for the code inside the with block
with nlp.disable_pipes('tagger', 'parser'):
    # Process the text and print the entities
    text = "This is a book"
    doc = nlp(text)
    print(doc.ents)

Doc length:4
()
