In [3]:
import spacy
# Load spaCy's small English pipeline and bind it to `nlp`.
nlp = spacy.load('en_core_web_sm')

In [4]:
def show_ents(doc):
  """Print one line per named entity in `doc`.

  Each line has the form ``text-label-explanation``, where the explanation
  is whatever ``spacy.explain`` returns for the label (printed as ``None``
  when it returns None). If `doc.ents` is empty, a single notice line is
  printed instead.
  """
  # Guard clause: nothing to list.
  if not doc.ents:
    print('No named entities found:')
    return
  for entity in doc.ents:
    label = entity.label_
    description = str(spacy.explain(label))
    print('-'.join((entity.text, label, description)))

In [5]:
# Process a sentence and list whatever entities the pipeline recognises in it.
doc = nlp(
    u'May I go to Washington, Dc next May tothe see the Washington Monument?'
)
show_ents(doc)

Washington, Dc-GPE-Countries, cities, states
next May-DATE-Absolute or relative dates or periods
the Washington Monument-ORG-Companies, agencies, institutions, etc.


In [6]:
doc = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock?')

# Per entity: text, token offsets (start, end), character offsets
# (start_char, end_char) and the label, all on one space-separated line.
for entity in doc.ents:
  token_bounds = (entity.start, entity.end)
  char_bounds = (entity.start_char, entity.end_char)
  print(entity.text, *token_bounds, *char_bounds, entity.label_)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


In [7]:
# Process the example sentence and list the entities found in it.
doc = nlp(
    u'Tesla to build a U.K. factory for $6 million'
)
show_ents(doc)

U.K.-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit


# Adding a Named entity to a Span

In [8]:
from spacy.tokens import Span

# Look up the hash value of the ORG entity label in the doc's string store
ORG = doc.vocab.strings[u'ORG']

# Wrap the first token (token range 0-1) in a Span carrying that label
new_ent = Span(doc, 0, 1, label=ORG)

# Re-assign doc.ents as the existing entities followed by the new one
doc.ents = [*doc.ents, new_ent]

In [9]:
# List the doc's entities again, after the manually created span was added to doc.ents.
show_ents(doc)

Tesla-ORG-Companies, agencies, institutions, etc.
U.K.-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit


In [10]:
# The two literals below are joined by implicit string concatenation, so the
# first one has to end with a space; without it the text becomes
# "...vaccum cleaner.If successful..." with the two sentences fused together.
doc = nlp(u'Our company plans to introduce a new vaccum cleaner. '
          u'If successful, the vaccum cleaner will be our first product')

show_ents(doc)

first-ORDINAL-"first", "second", etc.


In [11]:
# Import PhraseMatcher and create a matcher object:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# Create the desired phrase patterns. The matcher compares token text by
# default, so only tokenization is needed: make_doc() skips the tagger,
# parser and NER that a full nlp(text) call would run.
phrase_list = ['vaccum cleaner', 'vaccum-cleaner']
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

# Register the patterns under one key, passed as a single list. This replaces
# the legacy call form add(key, None, *patterns), where the second positional
# argument was the on_match callback.
matcher.add('newproduct', phrase_patterns)

# Apply the matcher to our Doc object
matches = matcher(doc)

# See what matches occur (each item is a (match_id, start, end) tuple):
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [12]:
# Build a PRODUCT-labelled Span for every match and append those spans to the doc's entities:
from spacy.tokens import Span

PROD = doc.vocab.strings[u'PRODUCT']
# Each match is (match_id, start, end); only the token offsets are needed here.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in matches]
doc.ents = [*doc.ents, *new_ents]
show_ents(doc)

vaccum cleaner-PRODUCT-Objects, vehicles, foods, etc. (not services)
vaccum cleaner-PRODUCT-Objects, vehicles, foods, etc. (not services)
first-ORDINAL-"first", "second", etc.


In [13]:
# Process a sentence containing two monetary amounts and list the entities found.
doc = nlp(
    u'Originally priced at $29.50, the sweater was marked down to five dollars.'
)
show_ents(doc)

29.50-MONEY-Monetary values, including unit
five dollars-MONEY-Monetary values, including unit


In [14]:
# Quick function to remove ents formed on whitespace:
from spacy.language import Language

@Language.component("remove_whitespace_entities")
def remove_whitespace_entities(doc):
  """Drop every entity whose text consists only of whitespace.

  Registered as a pipeline component under the name
  "remove_whitespace_entities". Overwrites ``doc.ents`` with the entities
  that survive the filter and returns the same doc.
  """
  kept = []
  for span in doc.ents:
    if span.text.isspace():
      continue  # whitespace-only entity: leave it out
    kept.append(span)
  doc.ents = kept
  return doc

# Insert this into the pipeline AFTER the ner component.
# The membership check keeps the cell re-runnable: adding a component whose
# name is already in the pipeline is an error, so only add it when missing.
if "remove_whitespace_entities" not in nlp.pipe_names:
  nlp.add_pipe("remove_whitespace_entities", after='ner')

# Visualising NER entities

In [None]:
import spacy
# Do not reload the model here: a fresh spacy.load() would rebind `nlp` to a
# pipeline without the custom "remove_whitespace_entities" component that was
# added after 'ner', and the next cell relies on that component being present.
assert "remove_whitespace_entities" in nlp.pipe_names

In [15]:
# Create a Doc that contains a whitespace token (note the double space after "with")
doc_with_whitespace = nlp(u'This is a test sentence with  an entity.')

# Manually label the whitespace token as an entity for testing.
# The token is located via is_space instead of a hard-coded index, so the
# span cannot silently land on a different token.
from spacy.tokens import Span
ws_index = next(tok.i for tok in doc_with_whitespace if tok.is_space)
whitespace_ent = Span(doc_with_whitespace, ws_index, ws_index + 1, label="TEST")

doc_with_whitespace.ents = list(doc_with_whitespace.ents) + [whitespace_ent]

print("Entities before removing whitespace:")
show_ents(doc_with_whitespace)

# Apply the component to the doc that actually carries the whitespace entity.
# Re-processing the raw text with nlp() would build a new doc that never had
# the manual TEST entity, so it would not show the component doing anything.
processed_doc = remove_whitespace_entities(doc_with_whitespace)

print("\nEntities after removing whitespace:")
show_ents(processed_doc)

Entities before removing whitespace:
 -TEST-None

Entities after removing whitespace:
No named entities found:




In [16]:
import spacy
# Rebind `nlp` to a newly loaded copy of the small English pipeline.
nlp = spacy.load('en_core_web_sm')

from spacy import displacy

In [17]:
# The two literals are joined by implicit string concatenation, so the first
# one must end with a space; without it the text reads "$6 million.By contrast".
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousands iPods for a profit of $6 million. '
          u'By contrast, Sony sold only 7 thousand Walkman music players.')

# Render the recognised entities inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)