In [None]:
# Named-Entity Recognition (NER)
# ------------------------------
# NER seeks to locate named-entity mentions in unstructured text and classify them
# into pre-defined categories such as person names, organizations, locations, medical
# codes, time expressions, quantities, monetary values, percentages, etc.

In [1]:
import spacy

# Load the pretrained `en_core_web_sm` pipeline once; later cells call `nlp(text)`
# to turn raw strings into Doc objects.
nlp = spacy.load('en_core_web_sm')

In [6]:
def show_ents(doc):
    """Print one line per named entity found on ``doc``.

    Each line has the form ``<text>: <label> - <explanation>``, where the
    explanation is whatever ``spacy.explain`` returns for the entity label
    (converted with ``str``, so an unknown label shows as ``None``).
    If ``doc.ents`` is empty, ``No entities found`` is printed instead.
    """
    entities = doc.ents
    if not entities:
        print('No entities found')
        return

    for entity in entities:
        description = str(spacy.explain(entity.label_))
        print(entity.text + ': ' + entity.label_ + ' - ' + description)
        
def get_entities(doc):
    """Return the named-entity spans stored on ``doc`` (its ``ents`` attribute)."""
    entities = doc.ents
    return entities

In [5]:
# Show the entities spaCy identifies in a sample sentence.
# The sentence uses "May" and "Washington" twice each in different roles, which makes it
# a handy check of how context affects tagging (see the recorded output below).
# `text` and `doc` are read again by the entity-attributes cell further down.
text = "May i go to Washington, DC next May to see the Washington Document?"
doc = nlp(text)
show_ents(doc)

Washington, DC: GPE - Countries, cities, states
next May: DATE - Absolute or relative dates or periods
the Washington Document: ORG - Companies, agencies, institutions, etc.


In [18]:
# Showing entity attributes
#
# - ent.text       -> the entity's original text
# - ent.label      -> integer ID (hash) of the entity label
# - ent.label_     -> string name of the entity label (e.g. GPE, DATE, ORG)
# - ent.start      -> index of the span's first token in the doc
# - ent.end        -> index one past the span's last token (exclusive)
# - ent.start_char -> character offset in the text where the span starts
# - ent.end_char   -> character offset in the text where the span ends (exclusive)
ents = get_entities(doc)

for ent in ents:
    print('_'*40)
    print('ent.text: ', ent.text)
    print('ent.label: ', ent.label)
    print('ent.label_:', ent.label_)
    print('ent.start: ', ent.start)
    print('ent.end: ', ent.end)
    print('ent.start_char: ', ent.start_char)
    print('ent.end_char: ', ent.end_char)

# Demonstrate, on the last entity only, that the token offsets slice the Doc (giving a
# Span) while the character offsets slice the raw string, and that both yield the same text.
# The entity is picked explicitly instead of relying on the loop variable leaking out of
# the `for` loop, and the guard avoids a NameError when the doc has no entities.
if ents:
    last_ent = ents[-1]
    print(doc[last_ent.start: last_ent.end])
    print(type(doc[last_ent.start: last_ent.end]))
    print(text[last_ent.start_char: last_ent.end_char])

________________________________________
ent.text:  Washington, DC
ent.label:  382
ent.label_: GPE
ent.start:  4
ent.end:  7
ent.start_char:  12
ent.end_char:  26
________________________________________
ent.text:  next May
ent.label:  388
ent.label_: DATE
ent.start:  7
ent.end:  9
ent.start_char:  27
ent.end_char:  35
________________________________________
ent.text:  the Washington Document
ent.label:  381
ent.label_: ORG
ent.start:  11
ent.end:  14
ent.start_char:  43
ent.end_char:  66
the Washington Document
<class 'spacy.tokens.span.Span'>
the Washington Document


In [24]:
# Adding a named entity by hand: first check what spaCy finds on its own.
# NOTE: this rebinds `text` and `doc`, replacing the values from the earlier example.
text = "Humber, in future he will be located in Colombia"
doc = nlp(text)
show_ents(doc)
# "Humber" is not recognized by spaCy; in the recorded run only "Colombia" is tagged.

Colombia: GPE - Countries, cities, states


In [26]:
from spacy.tokens import Span

# Look up the value the doc's string store holds for the "GPE" label
# (presumably the same kind of integer ID that `ent.label` reports — confirm in the spaCy docs).
new_GPE = doc.vocab.strings[u"GPE"]
# Build a span over tokens [0, 1) of `doc` — the first token, expected to be "Humber" —
# and label it GPE.
# NOTE(review): the span is only constructed here; nothing in this cell assigns it to
# `doc.ents`, so `show_ents(doc)` would still not list it — confirm a later cell does that.
# NOTE(review): these statements print nothing, so the output recorded under this cell
# appears to be stale.
new_ent = Span(doc, 0, 1, label=new_GPE)

['__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_cleanup_stale_strings', '_map', '_reset_and_load', 'add', 'from_bytes', 'from_disk', 'to_bytes', 'to_disk']
