In [2]:
### Import spacy and load language library
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
## Create a string that includes opening and closing quotation marks
mystring = '"We\'re moving to L.A.!"'
print(mystring)

"We're moving to L.A.!"


In [6]:
## Create a Doc object and explore tokens
doc = nlp(mystring)

for token in doc:
    print(token.text, end = ' | ')

" | We | 're | moving | to | L.A. | ! | " | 

In [7]:
### Example #2  _____ Prefix, suffix , infix and exception

doc2 = nlp(u"We're here to help! Send snail - mail, email support@oursite.com or visit us at www.oursite.com")

for t in doc2:
    print(t)



We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
www.oursite.com


In [10]:
### Example #3  Span : Accessing part of a large document

doc3 = nlp(u'asdjf  hasdfps ifu ahsfbia  sfdn  pvgafd asfbg ajsfg ajsfns kdfj asjdfna psdj sjfkj sfbg')

In [11]:
life_quote = doc3[2:6]
print(life_quote)

hasdfps ifu ahsfbia  


In [12]:
## Now check the type
type(life_quote)

spacy.tokens.span.Span

In [13]:
### Exampe #4 Sentences
docs4 = nlp(u'This is the first sentence. This is the second sentence. This is the third sentence.')

for sent in docs4.sents:
    print(sent)

This is the first sentence.
This is the second sentence.
This is the third sentence.


In [14]:
## Check if the sentence is starting
docs4[6].is_sent_start

True

In [15]:
## Exceptions

docs5 = nlp(u"Let's visit St. Louic in the U.S. next year.")

for t in docs5:
    print(t)

Let
's
visit
St.
Louic
in
the
U.S.
next
year
.


In [None]:
## Here the abbreivations for "Saint" and "United States" are both preserved

In [16]:
## Counting tokens
len(doc)

8

In [17]:
## Counting Vocab Entities
len(doc.vocab)

57852

In [None]:
### Retreival of tokens is possible by index position and slice

In [19]:
docs6 = nlp(u'It is better to give than receive.')

# Retreive the third token;
docs6[2]

better

In [20]:
# Retreive three token from the middle:
docs6[2:5]

better to give

In [22]:
# Retreive the last four tokens:
docs6[-4:]

give than receive.

In [23]:
## Tokens cannot be reassigned

doc7 = nlp(u'My dinner was horrible.')
doc8 = nlp(u'Your dinner was good.')

# Try to change from doc7 to doc8
doc7[3] = doc7[3]

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [25]:
# Name Entities

doc9 = nlp(u'Apple to build a Hong Kong factory for $6 million')

for token in doc9:
    print(token.text, end = ' | ')
    
print('\n----')

for ent in doc9.ents:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
    
    ## This happens because of the Dictionary

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
----
Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [27]:
## Chunking 
doc10 = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')

for chunks in doc10.noun_chunks:
    print(chunks.text)

Autonomous cars
insurance liability
manufacturers


In [29]:
doc11 = nlp(u"Red cars do not carry higher insurance rates.")

for chunks in doc11.noun_chunks:
    print(chunks.text)

Red cars
higher insurance rates


In [33]:
# Visualizing the dependency parse

from spacy import displacy

doc12= nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc12, style = 'dep', jupyter = True, options = {'distance': 110})

In [34]:
## Visualizing the entity recognizer

doc13 = nlp(u'Over the last quarter Apple sold nearly 20 thousands iPods for a profit of $6 million.')

displacy.render(doc13, style = 'ent', jupyter = True)

In [47]:
### Stemming
## Import the toolkit and the full Poster Stemmer library 

import nltk

from nltk.stem.porter import *


In [48]:
p_stemmer = PorterStemmer()

In [49]:
words = ['runs','runner','running','ran','runs', 'easily']

In [50]:
for words in words:
    print(words+' --> '+p_stemmer.stem(words))

runs --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili


In [58]:
### Generating the root word using Snowball stemmer

from nltk.stem.snowball import SnowballStemmer
# The Snowball stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language = 'english')


In [61]:
words = ['runs','runner','running','ran','runs', 'easily','fairly']

In [63]:
for word in words:
    print(word+' --> '+s_stemmer.stem(word))

runs --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair


In [64]:
## Comparison between Porter Stemmer and Porter 2 stemmer
words = ['consolingly']

In [65]:
print('Porter Stemmer: ')
for word in words:
    print(word+' --> '+p_stemmer.stem(word))

Porter Stemmer: 
consolingly --> consolingli


In [66]:
print('Porter2 Stemmer: ')
for word in words:
    print(word+' --> '+s_stemmer.stem(word))

Porter2 Stemmer: 
consolingly --> consol


In [67]:
## Drawbacks of stemming

phrase = 'I am meeting him tomorrow at the meeting'

for word in phrase.split():
    print(word+' --> '+p_stemmer.stem(word))

I --> I
am --> am
meeting --> meet
him --> him
tomorrow --> tomorrow
at --> at
the --> the
meeting --> meet


In [69]:
## Lemma : It provides the root word

def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [71]:
doc14 = nlp(u"I saw eighteen mice today")

show_lemmas(doc14)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today


In [None]:
## Notice that the lemma of "saw" is "see", "mice" is the plural of "mouse" and "eighteen" is its own number and not an expanded form of eight

In [75]:
phrase = nlp(u"I am meeting him tomorrow at the meeting")

show_lemmas(phrase)

I            PRON   561228191312463089     -PRON-
am           VERB   10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   561228191312463089     -PRON-
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting


In [None]:
## Here the lemma of meeting is determined by its Part of Speech tag.


In [103]:
## Stop words in the English language

import spacy
nlp = spacy.load('en_core_web_sm')

In [104]:
# Print the set of spacy's default stop words 
print(nlp.Defaults.stop_words)

{'show', 'down', 'nor', 'while', 'then', 'someone', 'wherein', 'done', 'why', 'does', 'amongst', 'except', 'seeming', 'other', 'in', 'where', 'forty', 'latterly', 'without', 'more', 'my', 'yourselves', 'hereby', 'fifty', 'none', 'very', 'yours', 'a', 'an', 'has', 'than', 'please', 'them', 'toward', 'whose', 'can', 'eight', 'former', 'just', 'mine', 'put', 'say', 'give', 'themselves', 'up', 'used', 'somewhere', 'were', 'off', 'whenever', 'above', 'everywhere', 'ten', 'at', 'beforehand', 'same', 'of', 'only', 'over', 'out', 'so', 'serious', 'using', 'now', 'hereafter', 'through', 'really', 'but', 'did', 'would', 'on', 'back', 'not', 'am', 'another', 'however', 'least', 'seemed', 'therein', 'whole', 'after', 'otherwise', 'no', 'per', 'hereupon', 'or', 'noone', 'own', 'top', 'me', 'although', 'empty', 'even', 'third', 'and', 'anyway', 'i', 'few', 'always', 'often', 'yourself', 'hundred', 'neither', 'next', 'five', 'within', 'during', 'thereupon', 'everyone', 'call', 'between', 'full', 'som

In [105]:
len(nlp.Defaults.stop_words)

305

In [106]:
nlp.vocab['myself'].is_stop  # Check if stop word or not

True

In [107]:
nlp.vocab['mystery'].is_stop

False

In [108]:
## Addition of stop words based on the document you possess

nlp.Defaults.stop_words.add('btw')

# Set the stop_word tag on the lexeme
nlp.vocab['btw'].is_stop = True

In [109]:
len(nlp.Defaults.stop_words)

305

In [110]:
nlp.vocab['btw'].is_stop  ## If you notice stop word 'btw' is added to the list

True

In [114]:
## Removal of stop words based on the document you possess

nlp.Defaults.stop_words.remove('are')

# Set the stop_word tag on the lexeme
nlp.vocab['are'].is_stop = False

In [115]:
len(nlp.Defaults.stop_words)

304

In [116]:
nlp.vocab['are'].is_stop  ## If you notice stop word 'beyond' is added to the list

False

In [117]:
## Addition of stop words based on the document you possess

nlp.Defaults.stop_words.add('beyond')

# Set the stop_word tag on the lexeme
nlp.vocab['beyond'].is_stop = True

In [118]:
len(nlp.Defaults.stop_words)

305

In [1]:
## Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
# Import the matcher library
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Here the matcher is the object that pairs to the current Vocab object.

In [5]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'},{'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solar'},{'IS_PUNCT': True},{'LOWER': 'power'}]
matcher.add('SolarPower', None, pattern1,pattern2,pattern3)

In [13]:
## REFER NOTES

## Applying the matcher to a Doc object

doc_ = nlp(u'The Solar power industry continues to grow as demand \for solarpower increases. Solarpower cars are gaining popularity.')

In [14]:
found_matches = matcher(doc_)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 11, 12), (8656102463236116519, 14, 15)]


In [15]:
for match_id, start,end in found_matches:
    string_id = nlp.vocab.strings [match_id] ### get string representation
    span = doc_[start:end]  ## get matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar power
8656102463236116519 SolarPower 11 12 solarpower
8656102463236116519 SolarPower 14 15 Solarpower


In [17]:
### Setting pattern options and quantifiers
## You can make token rules optional by passing an 'OP' :'*' argument. This lets us streamline our pattern list:

# Redefine th patterns:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'},{'LOWER': 'power'} ]

# Remove the old patterns to avoid duplications:
matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', None, pattern1, pattern2)


In [18]:
found_matches = matcher(doc_)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 11, 12), (8656102463236116519, 14, 15)]


In [None]:
## There are other token attributes for matching

In [24]:
## Matching Phrases

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [27]:
with open('reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [29]:
# First, create alist  of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# Next convert each phrase to a Doc object:
phrase_patterns = [nlp(text) for text in phrase_list]

#Pass each Doc object into matcher (note the use of the astericks! :
matcher.add('VoodooEconomics', None, *phrase_patterns)

#Build a list matches:
matches = matcher(doc3)

In [30]:
# (match_id, start, end)
matches

[(3473369816841043438, 41, 45),
 (3473369816841043438, 49, 53),
 (3473369816841043438, 54, 56),
 (3473369816841043438, 61, 65),
 (3473369816841043438, 673, 677),
 (3473369816841043438, 2985, 2989)]

In [32]:
## But we do not know what is in the that particular position

## Try the 4th on
doc3[665:685]

same time he attracted a following from the supply-side economics movement, which formed in opposition to Keynesian

In [33]:
 doc3[2975:2995]

against institutions.[66] His policies became widely known as "trickle-down economics", due to the significant

In [41]:
## Speech tagging
import spacy
nlp = spacy.load('en_core_web_sm')

In [53]:
# Create a simple Doc object
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [43]:
# Print the full text:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [45]:
# Print the fifth word anf associated tags:
print(doc[4].text,doc[4].pos_, doc[4].tag_,spacy.explain(doc[4].tag_))

jumped VERB VBD verb, past tense


In [56]:
## Now we can apply the same technique to the entire doc object:

for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

The        DET      DT     determiner
quick      ADJ      JJ     adjective
brown      ADJ      JJ     adjective
fox        NOUN     NN     noun, singular or mass
jumped     VERB     VBD    verb, past tense
over       ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
lazy       ADJ      JJ     adjective
dog        NOUN     NN     noun, singular or mass
's         PART     POS    possessive ending
back       NOUN     NN     noun, singular or mass
.          PUNCT    .      punctuation mark, sentence closer


In [61]:
## Adding entities to the  especially when few words are not recognized
doc = nlp(u'Tesla to build a U.K. factory for $6 million')

def show_ents(doc):
    for ent in doc.ents:
        print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
 
show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [62]:
show_ents(doc)


U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [65]:
## Right now, spaCY does not recognize "Tesla" as a company

from spacy.tokens import Span

# Get the hash value of the ORG entity label
ORG = doc.vocab.strings[u'ORG']

# Create a Span for the new entity
new_ent = Span(doc,0,1, label = ORG)

# Add the entity to the existing Doc object
doc.ents = list(doc.ents) + [new_ent]

In [66]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [67]:
### Whitespace handling
spacy.__version__

'2.0.16'

In [70]:
docu = nlp(u'Originally priced at $29.50,\n the sweater was marked down to five dollars.')

show_ents(docu)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [71]:
## Sometimes unwanted entities show up
# Quick funciton to remove ents formed on whitespace:

def remove_whitespace_entities(docu):
    docu.ents = [e for e in docu.ents if not e.text.isspace()]
    return docu

# Insert this into the pipeline AFTER the ner components:
nlp.add_pipe(remove_whitespace_entities, after = 'ner')

In [72]:
# Rereun nlp on the text above and show ents:

docu = nlp(u'Originally priced at $29.50,\n the sweater was marked down to five dollars.')

show_ents(docu)


29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [74]:
### Another rule________________________________

docu4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Driucker')

for sent in docu4.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter Driucker
