In [18]:
import spacy

# Pipeline package name, bound once so the load and the download fallback agree.
MODEL_NAME = 'en_core_web_sm'

# Load the pipeline; if loading raises OSError, download the package and retry.
try:
    nlp = spacy.load(MODEL_NAME)
except OSError:
    print(f"Downloading {MODEL_NAME} model...")
    from spacy.cli import download
    download(MODEL_NAME)
    nlp = spacy.load(MODEL_NAME)

In [19]:
def show_ents(doc):
    """Print one line per entity in ``doc.ents``.

    Each line has the form ``<text>-<label>_<explanation>``, where the
    explanation is whatever ``spacy.explain`` returns for the label
    (rendered through ``str``). If ``doc.ents`` is empty, a single
    'No entities found' line is printed instead.
    """
    entities = doc.ents
    if not entities:
        print('No entities found')
        return
    for entity in entities:
        print(f"{entity.text}-{entity.label_}_{spacy.explain(entity.label_)}")

In [20]:
# The sentence uses "may"/"May" and "Washington" in more than one role.
doc = nlp('may I go to Washington, DC next May to see the Washington Monument?')
show_ents(doc)

Washington, DC-GPE_Countries, cities, states
next May-DATE_Absolute or relative dates or periods
the Washington Monument-ORG_Companies, agencies, institutions, etc.


In [21]:
doc = nlp('Can I please have 500 dollars from you to buy some Microsoft stock?')
# For each entity: text, token start/end, character start/end, and label.
for ent in doc.ents:
    fields = (ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)
    print(*fields)


500 dollars 4 6 18 29 MONEY
Microsoft 11 12 51 60 ORG


In [22]:
# Adding a named entity to a span.
# First list the entities the pipeline reports for this sentence on its own.
doc = nlp('Tesla to build a U.K. factory for $6 million')
show_ents(doc)

U.K.-GPE_Countries, cities, states
$6 million-MONEY_Monetary values, including unit


In [23]:
from spacy.tokens import Span

# Look up the 'ORG' entry in the vocabulary's string store.
ORG = doc.vocab.strings['ORG']
# Token span [0, 1) is the first token of the document.
new_ent = Span(doc, 0, 1, label=ORG)
# Assign a new entity list: the existing entities plus the new span.
doc.ents = [*doc.ents, new_ent]

In [24]:
# List the entities again after doc.ents was reassigned with the extra span.
show_ents(doc)

Tesla-ORG_Companies, agencies, institutions, etc.
U.K.-GPE_Countries, cities, states
$6 million-MONEY_Monetary values, including unit


Adding named entities to all matching spans

In [25]:
import spacy

# Same guarded load as the first cell: load the pipeline and, if that raises
# OSError, download the package and load again. Running it rebinds `nlp`.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Downloading en_core_web_sm model...")
    from spacy.cli import download
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

# Adjacent string literals are joined with no separator, so the first literal
# ends with a space to keep the two sentences from fusing into "cleaner.If".
doc = nlp(
    'our company plans to introduce a new vacuum cleaner. '
    'If successful, the vacuum cleaner will be our first product. '
)
show_ents(doc)

first-ORDINAL_"first", "second", etc.


In [26]:
from spacy.matcher import PhraseMatcher
# Create a phrase matcher that shares the loaded pipeline's vocabulary.
matcher = PhraseMatcher(nlp.vocab)

In [27]:
# Phrases to search for, written both with a space and with a hyphen.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
# Run each phrase through the pipeline to obtain one pattern Doc per phrase.
phrase_patterns = list(map(nlp, phrase_list))

In [28]:
# Register the pattern Docs under the 'newproduct' key. The patterns are passed
# as a single list, which is the current PhraseMatcher.add signature, instead of
# the legacy (key, None, *docs) form.
matcher.add('newproduct', phrase_patterns)
# Each match is a (match_id, start_token, end_token) tuple.
matches = matcher(doc)
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [29]:
# Import the Span class used below. The lowercase name `span` is the submodule,
# not the class, so importing it would leave this cell relying on an earlier
# cell's import of `Span`.
from spacy.tokens import Span

# Look up the 'PRODUCT' entry in the vocabulary's string store.
PROD = doc.vocab.strings['PRODUCT']
# Build one PRODUCT span per match; each match is (match_id, start, end).
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in matches]
doc.ents = list(doc.ents) + new_ents

In [30]:
# List the entities again after the matched spans were appended to doc.ents.
show_ents(doc)

vacuum cleaner-PRODUCT_Objects, vehicles, foods, etc. (not services)
vacuum cleaner-PRODUCT_Objects, vehicles, foods, etc. (not services)
first-ORDINAL_"first", "second", etc.


counting entities

In [31]:
# A sentence containing two amounts, one written with digits and one in words.
doc = nlp('Originally priced at $29.50, the sweater was marked down to five dollars.')
show_ents(doc)

29.50-MONEY_Monetary values, including unit
five dollars-MONEY_Monetary values, including unit


In [32]:
# Count the entities whose label is MONEY.
sum(1 for ent in doc.ents if ent.label_ == 'MONEY')

2

noun chunks

In [33]:
doc = nlp('Autonomous cars shift insurance liability toward manufacturers.')
# For each noun chunk: its text, its root token, the root's dependency label,
# and the text of the root's head token.
for chunk in doc.noun_chunks:
    root = chunk.root
    print(f'{chunk.text} - {root.text}-{root.dep_}-{root.head.text}')

Autonomous cars - cars-nsubj-shift
insurance liability - liability-dobj-shift
manufacturers - manufacturers-pobj-toward


In [35]:
# doc.noun_chunks is a generator, so len() cannot be applied to it directly; it
# has to be converted to a list first. The failure is the point of this cell, so
# it is caught and printed rather than left to abort a top-to-bottom run.
try:
    len(doc.noun_chunks)
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: object of type '_cython_3_1_1.generator' has no len()

In [36]:
# Count the noun chunks by consuming the generator.
sum(1 for _ in doc.noun_chunks)

3

VISUALIZING NAMED ENTITIES

In [37]:
import spacy
from spacy import displacy

# Load the English pipeline used by the visualisation cells.
nlp = spacy.load('en_core_web_sm')

In [38]:
# The first literal ends with a space so the two sentences are not fused into
# "$6million.By" when the literals are joined.
# NOTE(review): the text also contains "quater" and "$6million", which look like
# typos and may affect the entities found - confirm whether they are intentional.
doc = nlp(
    'over the last quater Apple sold nearly 20 thousand ipods for a profit of $6million. '
    'By contrast, Sony only sold 8 thousand Walkman music players.'
)
# Render the whole document with entity highlighting.
displacy.render(doc, style='ent', jupyter=True)

In [39]:
# Render each sentence separately by re-processing its text as its own Doc.
for sentence in doc.sents:
    sentence_doc = nlp(sentence.text)
    displacy.render(sentence_doc, style='ent', jupyter=True)

In [40]:
# The first literal ends with a space so the two sentences are not fused when
# the adjacent literals are joined.
doc2 = nlp(
    'over the last quater Apple sold nearly 20 thousand ipods for a profit of $6million. '
    'by contrast,my kids sold a lot of lemonde.'
)
# Iterate over the sentences of doc2, the document built just above, not the
# earlier `doc`.
for sent in doc2.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

In [41]:
# Render sentences that contain entities; print the plain text of the others.
for sentence in doc2.sents:
    sentence_doc = nlp(sentence.text)
    if not sentence_doc.ents:
        print(sentence_doc.text)
        continue
    displacy.render(sentence_doc, style='ent', jupyter=True)


In [42]:
# Pass an 'ents' option listing only the ORG and MONEY labels to the renderer.
options = dict(ents=['ORG', 'MONEY'])
displacy.render(doc, style='ent', jupyter=True, options=options)


In [43]:
# Per-label CSS colour values. CSS gradient functions are hyphenated
# (linear-gradient, radial-gradient); an underscore spelling is not valid CSS.
colors = {
    'ORG': 'linear-gradient(90deg,#aa9cfc,#fc9ce7)',
    'MONEY': 'radial-gradient(yellow,green)',
}
options = {'ents': ['ORG', 'MONEY'], 'colors': colors}
displacy.render(doc, style='ent', jupyter=True, options=options)

**SENTENCE SEGMENTATION**