In [20]:
import spacy
# Load the small English pipeline once; the cells below all reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [21]:
# Write a function to display basic entity info:


In [22]:
def show_ents(doc):
  """Print one line per named entity in `doc`, or a notice if there are none.

  Each line has the form ``<text> - <label> - <explanation>``, where the
  explanation is whatever ``spacy.explain`` returns for the label,
  converted with ``str``.
  """
  if not doc.ents:
    print('No named entities found.')
    return

  for entity in doc.ents:
    fields = [entity.text, entity.label_, str(spacy.explain(entity.label_))]
    print(' - '.join(fields))


In [23]:
# The word "may" occurs here both as a modal verb and as a month name.
doc = nlp('May i go to washington , DC next may to see the washington Monuments?')
show_ents(doc)

washington - GPE - Countries, cities, states
DC - GPE - Countries, cities, states
washington - GPE - Countries, cities, states


In [24]:
# A sentence containing a money amount and a company name.
doc = nlp('i please have 500 dollars from you to pay some Microsoft stock?')

show_ents(doc)

500 dollars - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [25]:
# Example document that the following cell extends with a hand-made entity.
doc = nlp('Tesla to build a U.K. factory for $6 million')

show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [26]:
from spacy.tokens import Span

# Look up the hash value that the vocab's string store holds for the ORG label.
ORG = doc.vocab.strings['ORG']

# Create a new one-token Span covering token 0 and label it ORG.
new_ent = Span(doc, 0, 1, label=ORG)

# Add the new entity to the existing doc object, keeping the ones already there.
doc.ents = [*doc.ents, new_ent]

In [27]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [28]:
# NOTE(review): the two adjacent literals below are concatenated with no space
# between them, so the text passed to nlp contains "cleaner .If successful".
# Confirm whether a separating space was intended; adding one would presumably
# change tokenization and therefore any token offsets computed from this doc.
doc = nlp(u'Our company plans to introduce in new vacuum cleaner .'
          u'If successful , the vacuum cleaner will be our first product.')

# List whatever entities the pipeline finds in this text.
show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [29]:

# Import PhraseMatcher and create a matcher object bound to the pipeline's vocab.

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)


# Create the desired phrase patterns (one Doc per phrase).
phrase_list = ['vacuum cleaner' , 'vacuum-cleaner']
Phrase_patterns = [nlp(text) for text in phrase_list]

# Register the patterns under a single key. The patterns are passed as one
# list, which is the PhraseMatcher.add(key, docs) signature; the older
# add(key, None, *docs) call form is a legacy spelling of the same thing.
matcher.add('newproduct', Phrase_patterns)

# Apply the matcher to our doc object; each match is (match_id, start, end).
matches = matcher(doc)
matches


[(2689272359382549672, 7, 9), (2689272359382549672, 13, 15)]

In [30]:
from spacy.tokens import Span

# Hash value that the vocab's string store holds for the PRODUCT label.
PROD = doc.vocab.strings['PRODUCT']

# Turn every (match_id, start, end) match into a PRODUCT-labelled Span.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in matches]

# Extend the doc's entities with the matched spans.
doc.ents = [*doc.ents, *new_ents]

In [31]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.


In [32]:
# The text contains a newline character between the two prices.
doc = nlp('Originally priced at $29.50 ,\n the sweater was marked down to $12.49')
show_ents(doc)


29.50 - MONEY - Monetary values, including unit
12.49 - MONEY - Monetary values, including unit


In [33]:
# Count the entities labelled MONEY.
sum(1 for ent in doc.ents if ent.label_ == 'MONEY')

2

In [34]:

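# spaCy v2-style registration (function object passed straight to add_pipe),
# kept for reference only; the next cell registers the same component by name.
#
# def remove_white_space(doc):
#     doc.ents = [e for e in doc.ents if not e.text.isspace()]
#     return doc
#
# nlp.add_pipe(remove_white_space, after='ner')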

In [35]:
from spacy.language import Language

# Custom pipeline component, registered under the name "remove_white_space".
@Language.component("remove_white_space")
def remove_white_space(doc):
    """Drop entities whose text consists only of whitespace; return the doc."""
    kept = []
    for ent in doc.ents:
        if ent.text.isspace():
            continue
        kept.append(ent)
    doc.ents = kept
    return doc

# Insert the component right after NER. The guard keeps this cell re-runnable:
# add_pipe raises if a component with the same name is already in the pipeline.
if 'remove_white_space' not in nlp.pipe_names:
    nlp.add_pipe('remove_white_space', after="ner")

# Run the extended pipeline on a text that contains a run of spaces.
doc = nlp("Apple is looking at    buying U.K. startup for $1 billion.")
print([(ent.text, ent.label_) for ent in doc.ents])


[('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]


In [36]:
from spacy import displacy

In [37]:
# Two sentences built from adjacent string literals. The second literal starts
# with a space so the joined text reads "... $6 million. By contrast, ..."
# rather than "million.By contrast", which would blur the sentence boundary.
doc = nlp('Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.'
          ' By contrast, sony sold only 7 thousand Walkman music players.')

# Show the whole document with its entities highlighted.
displacy.render(doc , style='ent' , jupyter=True)

In [38]:
# Render each sentence on its own. jupyter=True is passed explicitly, as in the
# whole-document render: a value returned by render() inside a for loop is
# discarded, so the output must be displayed by displacy itself.
for sent in doc.sents:
  displacy.render(nlp(sent.text), style='ent', jupyter=True)