In [1]:
import spacy

In [31]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [4]:
for token in doc:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [5]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x24d0f90ef60>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x24d0fa5de88>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x24d0fa5dee8>)]

In [6]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [7]:
doc2 = nlp(u"Tesla isn't looking into startups anymore.")

In [8]:
for token in doc2:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [9]:
doc2[0].pos_

'PROPN'

In [11]:
doc2[0].dep_

'nsubj'

In [10]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')


In [12]:
life_quote = doc3[16:30]

In [13]:
print(life_quote)

"Life is what happens to us while we are making other plans"


In [14]:
type(life_quote)

spacy.tokens.span.Span

In [15]:
type(doc3)

spacy.tokens.doc.Doc

In [16]:
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [17]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [23]:
doc4[6].is_sent_start

True

In [24]:
# Create a string that includes opening and closing quotation marks
mystring = '"We\'re moving to L.A.!"'
print(mystring)


"We're moving to L.A.!"


In [25]:
doc = nlp(mystring)

In [26]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [27]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")


In [29]:
for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [30]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')

for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [31]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [32]:
len(doc4)

11

In [34]:
len(doc4.vocab)

554

In [36]:
doc5 = nlp(u'It is better to give than to receive.')

# Retrieve the third token:
doc5[2:5]

better to give

In [37]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')

for token in doc8:
    print(token.text, end=' | ')

print('\n----')

for ent in doc8.ents:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
    

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
----
Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [38]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

for chunk in doc9.noun_chunks:
    print(chunk.text)
    

Autonomous cars
insurance liability
manufacturers


In [40]:
from spacy import displacy

doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 50})


In [41]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)


In [44]:
1+1


2

In [45]:
# Stemming
import nltk

In [46]:
from nltk.stem.porter import PorterStemmer

In [47]:
p_stemmer = PorterStemmer()


In [48]:
words = ['run','runner','running','ran','runs','easily','fairly']


In [49]:
for word in words:
    print(word+' --> '+p_stemmer.stem(word))


run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [50]:
from nltk.stem.snowball import SnowballStemmer

# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')


In [51]:
for word in words:
    print(word+' --> '+s_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair


In [52]:
words = ['generous', 'generation', 'generously', 'generate']

In [53]:
for word in words:
    print(word+' --> '+s_stemmer.stem(word))

generous --> generous
generation --> generat
generously --> generous
generate --> generat


In [3]:
# Lemmatization

doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)


I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [4]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')


In [5]:
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)


I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [6]:
# Print the set of spaCy's default stop words (remember that sets are unordered):
print(nlp.Defaults.stop_words)


{'unless', 'wherein', "'s", 'the', 'across', 'such', 'very', 'hereupon', 'beyond', 'formerly', 'above', 'been', 'serious', 'at', 'well', 'via', 'how', 'due', '’re', 'four', 'others', 'it', 'otherwise', '‘d', 'than', 'last', 'does', 'them', 'go', 'throughout', 'make', 'ever', "'m", 'your', 'wherever', '‘m', 'anyone', 'put', 'sometimes', 'therein', 'am', 'herein', 'now', 'nothing', 'whereupon', 'seem', 'that', 'if', 'in', 'this', 'name', 'using', 'sometime', '’s', 'whether', 'off', 'whereafter', 'upon', 'rather', '‘ll', 'would', 'nor', 'thus', 'while', 'under', 'give', 'mine', 'him', 'namely', 'more', 'against', 'anyhow', 'to', 'therefore', 'see', 'was', 'mostly', 'among', 'done', 'forty', 'thereby', 'with', 'but', 'between', '‘re', "'ve", 'without', 'could', 'you', 'still', 'only', 'n’t', 'show', 'whereby', 'neither', 'first', 'they', 'he', 'next', 'whole', 'their', 'itself', 'eight', 'being', 'quite', 'besides', 'be', 're', 'less', 'else', 'someone', '’d', 'top', 'everywhere', 'full', 

In [7]:
# Print the set of spaCy's default stop words (remember that sets are unordered):
len(nlp.Defaults.stop_words)

326

In [10]:
nlp.vocab['mystery'].is_stop

False

In [11]:
nlp.Defaults.stop_words.add('btw')

In [12]:
nlp.vocab['btw'].is_stop

True

In [13]:
len(nlp.Defaults.stop_words)

327

In [14]:
nlp.Defaults.stop_words.remove('beyond')

In [15]:
nlp.vocab['beyond'].is_stop = False

In [16]:
# Vocabulary and Matching
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

matcher.add('SolarPower', None, pattern1, pattern2, pattern3)


In [17]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')


In [18]:
found_matches = matcher(doc)
print(found_matches)


[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [19]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)


8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [20]:
# Redefine the patterns:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]

# Remove the old patterns to avoid duplication:
matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', None, pattern1, pattern2)


In [21]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LEMMA': 'power'}] # CHANGE THIS PATTERN

# Remove the old patterns to avoid duplication:
matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', None, pattern1, pattern2)


In [22]:
doc2 = nlp(u"Solar--power is solarpower yay!")

In [23]:
found_matches = matcher(doc2)

In [24]:
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [25]:
# Import the PhraseMatcher library
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)


In [31]:
with open('reaganomics.txt') as f:
    doc3 = nlp(f.read())


In [32]:
# First, create a list of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# Next, convert each phrase to a Doc object:
phrase_patterns = [nlp(text) for text in phrase_list]

# Pass each Doc object into matcher (note the use of the asterisk!):
matcher.add('VoodooEconomics', None, *phrase_patterns)

# Build a list of matches:
matches = matcher(doc3)


In [33]:
matches

[(3473369816841043438, 41, 45),
 (3473369816841043438, 49, 53),
 (3473369816841043438, 54, 56),
 (3473369816841043438, 61, 65),
 (3473369816841043438, 673, 677),
 (3473369816841043438, 2987, 2991)]

In [36]:
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc3[start-5:end+5]                    # get the matched span
    print(match_id, string_id, start, end, span.text)


3473369816841043438 VoodooEconomics 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3473369816841043438 VoodooEconomics 49 53 economics, referred to as trickle-down economics or voodoo economics by political
3473369816841043438 VoodooEconomics 54 56 trickle-down economics or voodoo economics by political opponents, and
3473369816841043438 VoodooEconomics 61 65 by political opponents, and free-market economics by political advocates.


3473369816841043438 VoodooEconomics 673 677 attracted a following from the supply-side economics movement, which formed in
3473369816841043438 VoodooEconomics 2987 2991 became widely known as "trickle-down economics", due to the


In [3]:
# NLP Assessment

with open('owlcreek.txt') as f:
    doc = nlp(f.read())



In [4]:
doc[:36]


AN OCCURRENCE AT OWL CREEK BRIDGE

by Ambrose Bierce

I

A man stood upon a railroad bridge in northern Alabama, looking down
into the swift water twenty feet below.  

In [5]:
# Tokens

len(doc)

4835

In [7]:
doc_sentences = [sent for sent in doc.sents]

In [8]:
len(doc_sentences)

249

In [13]:
print(doc_sentences[2].text)

A man stood upon a railroad bridge in northern Alabama, looking down
into the swift water twenty feet below.  


In [19]:
for token in doc_sentences[2]:
    print(f"{token.text:{15}} {token.pos_:{5}} {token.dep_:{10}} {token.lemma_:{15}}")


A               DET   det        a              
man             NOUN  nsubj      man            
stood           VERB  ROOT       stand          
upon            SCONJ prep       upon           
a               DET   det        a              
railroad        NOUN  compound   railroad       
bridge          NOUN  pobj       bridge         
in              ADP   prep       in             
northern        ADJ   amod       northern       
Alabama         PROPN pobj       Alabama        
,               PUNCT punct      ,              
looking         VERB  advcl      look           
down            ADV   prt        down           

               SPACE            
              
into            ADP   prep       into           
the             DET   det        the            
swift           ADJ   amod       swift          
water           NOUN  pobj       water          
twenty          NUM   nummod     twenty         
feet            NOUN  npadvmod   foot           
below           ADV 

In [20]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)


In [21]:
pattern = [{'LOWER':'swimming'},{'IS_SPACE': True,'OP':'*'},{'LOWER':'vigorously'}]

In [22]:
matcher.add('Swimming',None,pattern)

In [23]:
found_matches = matcher(doc)

In [24]:
print(found_matches)

[(12881893835109366681, 1274, 1277), (12881893835109366681, 3609, 3612)]


In [25]:
def surrounding(doc,start,end):
    print(doc[start-5:end+5])

In [26]:
surrounding(doc,1274,1277)

evade the bullets and, swimming
vigorously, reach the bank,


In [27]:
surrounding(doc,3609,3612)

shoulder; he was now swimming
vigorously with the current.  


In [32]:
for sentence in doc_sentences:
    if found_matches[0][1] < sentence.end:
        print(sentence)
        break


By diving I could evade the bullets and, swimming
vigorously, reach the bank, take to the woods and get away home.  


In [33]:
for sentence in doc_sentences:
    if found_matches[1][1] < sentence.end:
        print(sentence)
        break


The hunted man saw all this over his shoulder; he was now swimming
vigorously with the current.  


In [34]:
# Create a simple Doc object
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")


In [35]:
print(doc[4].text)

jumped


In [37]:
print(doc[4].tag_)

VBD


In [38]:
print(doc[4].pos_)

VERB


In [39]:
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

The        DET      DT     determiner
quick      ADJ      JJ     adjective
brown      ADJ      JJ     adjective
fox        NOUN     NN     noun, singular or mass
jumped     VERB     VBD    verb, past tense
over       ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
lazy       ADJ      JJ     adjective
dog        NOUN     NN     noun, singular or mass
's         PART     POS    possessive ending
back       NOUN     NN     noun, singular or mass
.          PUNCT    .      punctuation mark, sentence closer


In [42]:
doc = nlp(u'I am reading books on NLP.')
r = doc[1]

print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')


am         AUX      VBP    verb, non-3rd person singular present


In [41]:
doc = nlp(u'I read a book on NLP.')
r = doc[1]

print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')


read       VERB     VBD    verb, past tense


In [43]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Count the frequencies of different coarse-grained POS tags:
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts


{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [45]:
doc.vocab[84].text


'ADJ'

In [48]:
doc[2].pos

84

In [49]:
for k,v in sorted(POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')


84. ADJ  : 3
85. ADP  : 1
90. DET  : 2
92. NOUN : 3
94. PART : 1
97. PUNCT: 1
100. VERB : 1


In [50]:
# Count the different fine-grained tags:
TAG_counts = doc.count_by(spacy.attrs.TAG)

for k,v in sorted(TAG_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')


74. POS : 1
1292078113972184607. IN  : 1
10554686591937588953. JJ  : 3
12646065887601541794. .   : 1
15267657372422890137. DT  : 2
15308085513773655218. NN  : 3
17109001835818727656. VBD : 1


In [51]:
len(doc.vocab)

1786

In [52]:
# Count the different dependencies:
DEP_counts = doc.count_by(spacy.attrs.DEP)

for k,v in sorted(DEP_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')


402. amod: 3
415. det : 2
429. nsubj: 1
439. pobj: 1
440. poss: 1
443. prep: 1
445. punct: 1
8110129090154140942. case: 1
8206900633647566924. ROOT: 1


In [53]:
# Import the displaCy library
from spacy import displacy


In [54]:
# Create a simple Doc object
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")


In [56]:
# Render the dependency parse immediately inside Jupyter:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 50})


In [57]:
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{7}} {token.dep_:{7}} {spacy.explain(token.dep_)}')

The        DET     det     determiner
quick      ADJ     amod    adjectival modifier
brown      ADJ     amod    adjectival modifier
fox        NOUN    nsubj   nominal subject
jumped     VERB    ROOT    None
over       ADP     prep    prepositional modifier
the        DET     det     determiner
lazy       ADJ     amod    adjectival modifier
dog        NOUN    poss    possession modifier
's         PART    case    case marking
back       NOUN    pobj    object of preposition
.          PUNCT   punct   punctuation


In [None]:
displacy.serve(doc, style='dep', options={'distance': 110})


  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



In [3]:
# Write a function to display basic entity info:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')


In [4]:
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')

show_ents(doc)


Washington - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [5]:
doc = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock?')

for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)


500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


In [6]:
doc = nlp(u'Tesla to build a U.K. factory for $6 million')

show_ents(doc)


U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [8]:
from spacy.tokens import Span

# Get the hash value of the ORG entity label
ORG = doc.vocab.strings[u'ORG']  

# Create a Span for the new entity
new_ent = Span(doc, 0, 1, label=ORG)

# Add the entity to the existing Doc object
doc.ents = list(doc.ents) + [new_ent]


In [9]:
show_ents(doc)


Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [10]:
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum cleaner will be our first product.')

show_ents(doc)


first - ORDINAL - "first", "second", etc.


In [11]:
# Import PhraseMatcher and create a matcher object:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)


In [12]:
# Create the desired phrase patterns:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]


In [13]:
# Apply the patterns to our matcher object:
matcher.add('newproduct', None, *phrase_patterns)

# Apply the matcher to our Doc object:
matches = matcher(doc)

# See what matches occur:
matches


[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [14]:
# Here we create Spans from each match, and create named entities from them:
from spacy.tokens import Span

PROD = doc.vocab.strings[u'PRODUCT']

new_ents = [Span(doc, match[1],match[2],label=PROD) for match in matches]

doc.ents = list(doc.ents) + new_ents


In [15]:
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')

show_ents(doc)


29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [16]:
len([ent for ent in doc.ents if ent.label_=='MONEY'])


2

In [17]:
from spacy import displacy


In [18]:
doc = nlp(u'Originally priced at $29.50,\nthe sweater was marked down to five dollars.')

show_ents(doc)


29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [19]:
# Quick function to remove ents formed on whitespace:
def remove_whitespace_entities(doc):
    doc.ents = [e for e in doc.ents if not e.text.isspace()]
    return doc

# Insert this into the pipeline AFTER the ner component:
nlp.add_pipe(remove_whitespace_entities, after='ner')


In [20]:
# Import the displaCy library
from spacy import displacy


In [21]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')

displacy.render(doc, style='ent', jupyter=True)


In [24]:
colors = {'ORG':'#aa9cfc'}
options = {'ents': ['PRODUCT','ORG'], 'colors':colors}

In [25]:
displacy.render(doc, style='ent', jupyter=True, options = options)

In [26]:
# From Spacy Basics:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc.sents:
    print(sent)


This is the first sentence.
This is another sentence.
This is the last sentence.


In [27]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [28]:
# SPACY'S DEFAULT BEHAVIOR
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc3.sents:
    print(sent)


"Management is doing things right; leadership is doing the right things."
-Peter
Drucker


In [29]:
# ADD A NEW RULE TO THE PIPELINE
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe(set_custom_boundaries, before='parser')

nlp.pipe_names


['tagger',
 'set_custom_boundaries',
 'parser',
 'ner',
 'remove_whitespace_entities']

In [30]:
# Re-run the Doc object creation:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc4.sents:
    print(sent)


"Management is doing things right;
leadership is doing the right things."
-Peter
Drucker


In [32]:
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

# SPACY DEFAULT BEHAVIOR:
doc = nlp(mystring)

for sent in doc.sents:
    print([token.text for token in sent])


['This', 'is', 'a', 'sentence', '.']
['This', 'is', 'another', '.', '\n\n']
['This', 'is', 'a', '\n', 'third', 'sentence', '.']


In [33]:
from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'): # handles multiple occurrences
            seen_newline = True
    yield doc[start:]      # handles the last group of tokens


sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)


In [None]:
doc = nlp(mystring)
for sent in doc.sents:
    print([token.text for token in sent])


['This', 'is', 'a', 'sentence', '.', 'This', 'is', 'another', '.', '\n\n']
['This', 'is', 'a', '\n']
['third', 'sentence', '.']


In [1]:
# Part of Speech Assessment

# RUN THIS CELL to perform standard imports:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy


In [2]:
with open('peterrabbit.txt') as f:
    doc = nlp(f.read())

In [10]:
for token in list(doc.sents)[3]:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {str(spacy.explain(token.tag_)):{10}}")
                                                                   

They       PRON       PRP        pronoun, personal
lived      VERB       VBD        verb, past tense
with       ADP        IN         conjunction, subordinating or preposition
their      DET        PRP$       pronoun, possessive
Mother     PROPN      NNP        noun, proper singular
in         ADP        IN         conjunction, subordinating or preposition
a          DET        DT         determiner
sand       NOUN       NN         noun, singular or mass
-          PUNCT      HYPH       punctuation mark, hyphen
bank       NOUN       NN         noun, singular or mass
,          PUNCT      ,          punctuation mark, comma
underneath ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
root       NOUN       NN         noun, singular or mass
of         ADP        IN         conjunction, subordinating or preposition
a          DET        DT         determiner

          SPACE      _SP        None      
very       ADV        RB        

In [15]:
# To get the frequency list

POS_freq = doc.count_by(spacy.attrs.POS)
for k,v in sorted(POS_freq.items()):
    print(f"id:{k} POS:{doc.vocab[k].text} {v} counts")


id:84 POS:ADJ 50 counts
id:85 POS:ADP 123 counts
id:86 POS:ADV 67 counts
id:87 POS:AUX 48 counts
id:89 POS:CCONJ 61 counts
id:90 POS:DET 118 counts
id:92 POS:NOUN 171 counts
id:93 POS:NUM 8 counts
id:94 POS:PART 29 counts
id:95 POS:PRON 81 counts
id:96 POS:PROPN 73 counts
id:97 POS:PUNCT 174 counts
id:98 POS:SCONJ 20 counts
id:100 POS:VERB 136 counts
id:103 POS:SPACE 99 counts


In [16]:
len(doc)

1258

In [19]:
100* POS_freq[92]/len(doc)

13.593004769475357

In [22]:
displacy.render(list(doc.sents)[3],style = 'dep',jupyter = True)

In [27]:
# Show the first 2 named entities

for ent in doc.ents[:2]:
    print(ent.text + ' ' + ent.label_ + ' '+ str(spacy.explain(ent.label_)))


Peter Rabbit PERSON People, including fictional
Beatrix Potter PERSON People, including fictional


In [29]:
len(list(doc.sents))

68

In [30]:
list_of_sents = [nlp(sent.text) for sent in doc.sents]
list_of_ners = [doc for doc in list_of_sents if doc.sents]
len(list_of_ners)


68

In [33]:
displacy.render(list_of_sents[0],style = 'ent',jupyter = True)