In [1]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [2]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
    else:
        print('No named entities found')


In [3]:
doc=nlp('May I go to Washington, DC next May to see the Washington Monument?')
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [4]:
for ent in doc.ents:
    print(ent.text,ent.start,ent.end,ent.start_char,ent.end_char,ent.label_)

Washington, DC 4 7 12 26 GPE
next May 7 9 27 35 DATE
the Washington Monument 11 14 43 66 ORG


In [5]:
doc= nlp(u'Tesla to build a U.K. factory for $6 million')

show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [6]:
from spacy.tokens import Span

In [7]:
#Get the as value of ORG entity label
ORG=doc.vocab.strings[u'ORG']

In [8]:
#Create a Span for the new entity
new_ent=Span(doc, 0, 1,label=ORG)

In [9]:
#Add the entity to the existing Doc object
doc.ents = list(doc.ents) + [new_ent]

In [10]:
doc= nlp(u'Tesla to build a U.K. factory for $6 million')

show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [11]:
doc=nlp(u'Our company plans to introduce a new vaccum cleaner.'
        u'If successful, the vaccum cleaner will be our first product.')

show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [12]:
#Import PhraseMatcher and create a matcher object:
from spacy.matcher import PhraseMatcher
matcher=PhraseMatcher(nlp.vocab)

In [13]:
#Create the desired phrase patterns:
phrase_list=['vaccum cleaner', 'vaccum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]

In [14]:
#Apply the patterns to our matcher object:
matcher.add('newproduct',None, *phrase_patterns)

In [15]:
#Apply the matcher to our Doc object:
matches=matcher(doc)

#see what matches occur:
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [16]:
prod=doc.vocab.strings[u'PRODUCT']
new_ents=[Span(doc, match[1], match[2], label=prod) for match in matches]
doc.ents=list(doc.ents) + new_ents

In [17]:
show_ents(doc)

vaccum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vaccum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.


## Counting Entities

In [18]:
doc=nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars/')
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five - CARDINAL - Numerals that do not fall under another type


In [19]:
len([ent for ent in doc.ents if ent.label_ =='MONEY'])

1

In [22]:
#Quick function to remove ents formed on whitespace
from spacy.language import Language

@Language.component("remove_whitespace_entities")
def remove_whitespace_entities(doc):
    doc.ents = [e for e in doc.ents if not e.text.isspace()]
    return doc

#Insert this into the pipeline AFTER the ner component
nlp.add_pipe("remove_whitespace_entities", after='ner')

<function __main__.remove_whitespace_entities(doc)>

## Visualizing the NER 

In [None]:
#perform standard imports
import spacy
nlp=spacy.load('en_core_web_sm')

from spacy import displacy

In [None]:
doc=nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods fro a profit of $6 million. '
        u'By contrast, Sony sold only 7 thousand Walkman music players.')

displacy.render(doc, style='ent', jupyter=True)

In [None]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent',jupyter=True)