# POS Basics

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [7]:
doc[4].pos_, doc[4].tag_

('VERB', 'VBD')

In [11]:
# One row per token: text, coarse POS (pos_), fine-grained tag (tag_), and
# spaCy's plain-English description of that tag.
# The `:{10}` format spec pads each column to a minimum width of 10 characters.
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective
brown      ADJ        JJ         adjective
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [18]:
# Parse a sentence in which "read" is followed by a plural object.
doc = nlp(u"I read books on NLP.")

# doc[1] is the token "read"; show its coarse POS, fine-grained tag and the
# tag's description so the tense chosen by the tagger is visible.
token = doc[1]
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBP        verb, non-3rd person singular present


In [19]:

# Parse a sentence in which "read" is followed by a singular object.
doc = nlp(u"I read a book on NLP.")

# doc[1] is the token "read"; show its coarse POS, fine-grained tag and the
# tag's description so the tense chosen by the tagger is visible.
token = doc[1]
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBD        verb, past tense


In [20]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [24]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [25]:
POS_counts

{96: 1, 83: 3, 99: 1, 84: 1, 89: 2, 91: 3, 93: 1}

In [26]:
doc.vocab[83].text

'ADJ'

In [30]:
# POS_counts maps integer attribute IDs to frequencies. doc.vocab[k].text
# turns each ID back into its string name; sorted() orders the rows by ID.
for k, v in sorted(POS_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

83. ADJ   3
84. ADP   1
89. DET   2
91. NOUN  3
93. PART  1
96. PUNCT 1
99. VERB  1


In [31]:
# Frequency of each fine-grained tag in the doc, keyed by the tag's integer ID.
TAG_counts = doc.count_by(spacy.attrs.TAG)

# Print "id. tag count", resolving each ID to its string form via the vocab.
for k, v in sorted(TAG_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

74. POS   1
1292078113972184607. IN    1
10554686591937588953. JJ    3
12646065887601541794. .     1
15267657372422890137. DT    2
15308085513773655218. NN    3
17109001835818727656. VBD   1


In [32]:
# Frequency of each dependency label in the doc, keyed by the label's integer ID.
DEP_counts = doc.count_by(spacy.attrs.DEP)

# Print "id. label count", resolving each ID to its string form via the vocab.
for k, v in sorted(DEP_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

399. amod  3
412. det   2
426. nsubj 1
436. pobj  1
437. poss  1
440. prep  1
442. punct 1
8110129090154140942. case  1
8206900633647566924. ROOT  1


# Visualizing POS

In [34]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [35]:
from spacy import displacy

In [36]:
displacy.render(doc,style='dep',jupyter=True)

In [37]:
# displaCy rendering options for the dependency view.
# `compact` is a real boolean here: the string 'True' only appeared to work
# because any non-empty string is truthy (the string 'False' would be too).
options = {'distance': 110, 'compact': True, 'color': 'yellow', 'bg': '#09a3d5', 'font': 'Times'}

In [38]:
displacy.render(doc,style='dep',jupyter=True, options =options)

In [40]:
doc2 = nlp(u"This is a sentence. This is another sentence, possibly longer than the first.")

In [41]:
spans = list(doc2.sents)

In [43]:
# Starts a local web server (port 5000 per the log output); view the
# rendering in a browser at http://127.0.0.1:5000
displacy.serve(spans, style='dep',options={'distance':110})


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer


    Shutting down server on port 5000.



# NER P.1

In [44]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [52]:
def show_ents(doc):
    """Print the named entities found in ``doc``.

    Each entity is written on its own line as
    ``<text> - <label> - <explanation of the label>``.
    When ``doc`` has no entities, a single 'No entities found' line is
    printed instead.
    """
    if not doc.ents:
        print('No entities found')
        return

    for entity in doc.ents:
        label = entity.label_
        description = str(spacy.explain(label))
        print(' - '.join((entity.text, label, description)))

In [53]:
doc = nlp(u'Hi how are you?')
show_ents(doc)

No entities found


In [98]:
doc = nlp(u"May I go to Washington, DC next May to see the Washington monument?")

In [99]:
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
Washington - GPE - Countries, cities, states


In [56]:
doc = nlp(u'Can I please have 500 dollars of Microsoft Stock?')

In [57]:
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [58]:
doc = nlp(u"Tesla to build a U.K factory for $6 million")

In [59]:
show_ents(doc)

U.K - ORG - Companies, agencies, institutions, etc.
$6 million - MONEY - Monetary values, including unit


In [60]:
from spacy.tokens import Span

In [62]:
ORG = doc.vocab.strings[u'ORG']

In [63]:
# Span over tokens [0, 1) of doc, i.e. only the first token ("Tesla"), labelled ORG.
new_ent = Span(doc, 0,1,label=ORG)

In [64]:
doc.ents = list(doc.ents) + [new_ent]

In [65]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K - ORG - Companies, agencies, institutions, etc.
$6 million - MONEY - Monetary values, including unit


# NER P.2

In [105]:
# Adjacent string literals are joined exactly as written, so the first one
# must end with a space; without it the text reads "...cleaner.This new...".
doc = nlp(u"Our company created a brand new vacuum cleaner. "
          u"This new vacuum-cleaner is the best in show.")

In [106]:
show_ents(doc)

No entities found


In [113]:
from spacy.matcher import PhraseMatcher

In [117]:
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['vacuum cleaner','vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]

In [118]:
matcher.add('newproduct',None,*phrase_patterns)

In [119]:
found_matches = matcher(doc)

In [129]:
# Each match is a tuple (match_id, start, end):
# the ID of the matched pattern, then the doc token index where the match
# starts and the doc token index where it ends.
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [120]:
from spacy.tokens import Span

In [121]:
PROD = doc.vocab.strings[u'PRODUCT']

In [130]:
doc[6:8], doc[11:14]

(vacuum cleaner, vacuum-cleaner)

In [122]:
# Wrap every matched token range in a Span carrying the PRODUCT label.
new_ents = [
    Span(doc, start, end, label=PROD)
    for _match_id, start, end in found_matches
]

In [123]:
doc.ents = list(doc.ents) + new_ents

In [124]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [132]:
doc = nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by 10 dollars.")

In [133]:
[ent for ent in doc.ents if ent.label_ == "MONEY"]

[29.95, 10 dollars]

In [134]:
len([ent for ent in doc.ents if ent.label_ == "MONEY"])

2

# Visualizing NER

In [135]:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy

In [136]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [137]:
displacy.render(doc, style='ent',jupyter=True)

In [138]:
# Adjacent string literals are joined exactly as written, so the first one
# must end with a space; without it the text reads "...$6 million.By contrast...",
# which glues the two sentences together.
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. "
          u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [139]:
# Render the entity view one sentence at a time.
# nlp(sent.text) builds a fresh Doc from the sentence's text, so the pipeline
# is run again on each sentence in isolation.
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent',jupyter=True)


In [154]:
# Background used for ORG entities. Other values tried for the same key:
#   '#aa9cfc'
#   'radial-gradient(yellow,green)'
#   'linear-gradient(yellow,red)'
colors = dict(ORG='linear-gradient(90deg, #aa9cfc, #fc9ce7)')

# Restrict the rendering to these entity labels and apply the colours above.
options = dict(ents=['PRODUCT', 'ORG'], colors=colors)


In [155]:

displacy.render(doc, style='ent',jupyter=True, options =options)

In [None]:
# also works
# displacy.serve(doc, style='ent', options=options)

# Sentence Segmentation

In [177]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [161]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence')

In [162]:
# doc.sents is a generator
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence


In [164]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [165]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [166]:
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [167]:
for sent in doc.sents:
    print(sent)
    print()

"Management is doing the right things; leadership is doing the right things."

-Peter Drucker



In [168]:
# add a new segmentation rule to the nlp pipeline

In [178]:
def set_custom_boundaries(doc):
    """Start a new sentence after every semicolon token.

    Every token except the last is inspected (nothing follows the last
    one); whenever a token's text is exactly ';', the token right after it
    gets ``is_sent_start = True``.

    Returns the same ``doc`` object that was passed in.
    """
    # Positions of the tokens that directly follow a semicolon.
    followers = [tok.i + 1 for tok in doc[:-1] if tok.text == ';']
    for position in followers:
        doc[position].is_sent_start = True
    return doc

In [179]:
# Register the custom rule as a pipeline component placed ahead of the parser,
# then list the component names to confirm where it was inserted.
nlp.add_pipe(set_custom_boundaries, before='parser')
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [180]:
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [181]:
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
-Peter Drucker


In [182]:
# change segmentation rules

In [183]:
nlp = spacy.load('en_core_web_sm')

In [184]:
mystring = u"This is a sentence. This is another.\n\nThis is a\nthird sentence."

print(mystring)

This is a sentence. This is another.

This is a
third sentence.


In [186]:
doc = nlp(mystring)

for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another.


This is a
third sentence.


In [187]:
from spacy.pipeline import SentenceSegmenter

In [189]:
def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, cutting after newline tokens.

    A token whose text starts with a newline character stays at the end of
    the current slice; the slice is emitted when the next token is reached,
    and that next token opens the following slice. Whatever remains after
    the last cut is always yielded as the final slice.
    """
    span_start = 0
    cut_pending = False

    for tok in doc:
        if not cut_pending:
            # Only tokens that do not themselves open a slice are checked.
            cut_pending = tok.text.startswith('\n')
            continue

        # The previous token was a newline: close the slice just before `tok`.
        yield doc[span_start:tok.i]
        span_start = tok.i
        cut_pending = False

    yield doc[span_start:]

In [191]:
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)


In [192]:
nlp.add_pipe(sbd)

In [193]:
doc = nlp(mystring)

In [195]:
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is another.


This is a

third sentence.
