In [0]:
import spacy

In [0]:
#Load the English Language Library
#Also called loading the NLP model
nlp=spacy.load('en_core_web_sm')

In [0]:
#create a document object
doc=nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [0]:
doc

Tesla is looking at buying U.S. startup for $6 million

In [0]:
#Print the individual tokens
for token in doc:
    print(token.text)

Tesla
is
looking
at
buying
U.S.
startup
for
$
6
million


In [0]:
#Print individual tokens and its correspoding parts of speech. Here Parts of speech is mentio
for token in doc:
    print(token.text,token.pos)

Tesla 96
is 87
looking 100
at 85
buying 100
U.S. 96
startup 92
for 85
$ 99
6 93
million 93


In [0]:
#Print individual tokens and its correspoding parts of speech code and parts of speech descri
for token in doc:
    print(token.text,token.pos,token.pos_)

Tesla 96 PROPN
is 87 AUX
looking 100 VERB
at 85 ADP
buying 100 VERB
U.S. 96 PROPN
startup 92 NOUN
for 85 ADP
$ 99 SYM
6 93 NUM
million 93 NUM


In [0]:
#Basic nlp pipeline
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x18198c6fda0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x18198dd20a8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x18198dd2108>)]

In [0]:
#Basic nlp pipeline names
nlp.pipe_names

['tagger', 'parser', 'ner']

In [0]:
doc2=nlp(u"Tesla isn't lookinginto startups anymore")

In [0]:
doc2

Tesla isn't lookinginto startups anymore

In [0]:
#Syntactic dependency
#Syntactic dependency, i.e. the relation between tokens.
for token in doc2:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is AUX ROOT
n't PART neg
lookinginto VERB prep
startups NOUN pobj
anymore ADV advmod


In [0]:
#With indentation
for token in doc2:
    print(f"{token.text:{20}}{token.pos_:{10}}{token.dep_:{10}}")

Tesla               PROPN     nsubj     
is                  AUX       ROOT      
n't                 PART      neg       
lookinginto         VERB      prep      
startups            NOUN      pobj      
anymore             ADV       advmod    


In [0]:
#Even white spaces in the sentences are assigned tokens
doc2=nlp(u"Tesla isn't looking          into startups anymore")

In [0]:
doc2

Tesla isn't looking          into startups anymore

In [0]:
for token in doc2:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
          SPACE 
into ADP prep
startups NOUN pobj
anymore ADV advmod


In [0]:
#grab the tokens individually using index
doc2[0].pos

96

In [0]:
doc2[0]

Tesla

In [0]:
#Check the simple part-of-speech tag.
doc2[0].pos_

'PROPN'

In [0]:
#Check The detailed part-of-speech tag.
doc2[0].tag_

'NNP'

In [0]:
#Check the base form i.e. lemma of a particular word
doc2[0].lemma_

'Tesla'

In [0]:
#Check the base form i.e. lemma of a particular word
doc2[3].lemma_

'look'

In [0]:
#Check if a articular word is alphanumeric or not
doc2[3].is_alpha

True

In [0]:
#Check if a particular token is stopword or not
doc2[3].is_stop

False

In [0]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \the phrase "Life is what happens to us while we are making other plans" was written by \cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [0]:
doc3

Although commmonly attributed to John Lennon from his song "Beautiful Boy", 	he phrase "Life is what happens to us while we are making other plans" was written by \cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17.

In [0]:
life_quote=doc3[16:30]

In [0]:
print(life_quote)

phrase "Life is what happens to us while we are making other plans


In [0]:
type(life_quote)

spacy.tokens.span.Span

In [0]:
#The following has three sentences for us
doc4=nlp("This is the first sentence. This is another sentence. This is the last sentence.")

In [0]:
doc4

This is the first sentence. This is another sentence. This is the last sentence.

In [0]:
#Seperate each of teh sentences using Spacy
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [0]:
doc4[8]

another

In [0]:
#If a particular wrd is not the start of a sentence, it will return null
doc4[8].is_sent_start

In [0]:
#Let's check if This word in second sentence is the start of a word
doc4[6].is_sent_start

True

# Tokenization Part One

In [0]:
#Load the Spacy Library
import spacy
nlp=spacy.load('en_core_web_sm')

In [0]:
mystring='"We\'re moving to L.A.!"'

In [0]:
mystring

'"We\'re moving to L.A.!"'

In [0]:
print(mystring)

"We're moving to L.A.!"


In [0]:
doc=nlp(mystring)

In [0]:
#print the text of tokens
for token in doc:
    print(token.text)
#Punctuations are also tokenized

"
We
're
moving
to
L.A.
!
"


In [0]:
#But, Punctuations as part of the emailid are read as a single token
doc2=nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com")

In [0]:
doc2

We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com

In [0]:
for t in doc2:
    print(t)
#But the hyphen between snail and mail is read as a seperate token

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com


In [0]:
doc3=nlp(u"A 5km NYC cab ride costs $10.30")

In [0]:
doc3

A 5km NYC cab ride costs $10.30

In [0]:
for t in doc3:
    print(t)
#Here the distance unit and dollar sign is assigned a seperate token)

A
5
km
NYC
cab
ride
costs
$
10.30


In [0]:
doc4=nlp(u"Let's visit St. Louis in the U.S. next year.")

In [0]:
doc4

Let's visit St. Louis in the U.S. next year.

In [0]:
for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [0]:
#count the number of tokens
#Just check the length of doc object
len(doc4)

11

In [0]:
#Vocabentry contains a full library of items
doc4.vocab

<spacy.vocab.Vocab at 0x18198cc50c8>

In [0]:
#The number is going to change based on the language library loaded at the start
len(doc4.vocab)

512

In [0]:
#Tokens can be retrieved by their index positions
doc5=nlp("It is better to give than receive")

In [0]:
doc5[0]

It

In [0]:
doc5[1]

is

In [0]:
doc5[2]

better

In [0]:
doc5[3]

to

In [0]:
doc5[4]

give

In [0]:
doc5[5]

than

In [0]:
doc5[2:5]

better to give

In [0]:
#Tokens cannot be reassigned
#Let us try to change the token present at index zero
doc5[0]='test'

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [0]:
doc8=nlp(u"Apple to build a Hong Kong factory for $6 million")

In [0]:
for token in doc8:
    print(token)

Apple
to
build
a
Hong
Kong
factory
for
$
6
million


In [0]:
for token in doc8:
    print(token,end='|')

Apple|to|build|a|Hong|Kong|factory|for|$|6|million|

In [0]:
for token in doc8:
    print(token,end='  |  ')

Apple  |  to  |  build  |  a  |  Hong  |  Kong  |  factory  |  for  |  $  |  6  |  million  |  

In [0]:
for entity in doc8.ents:
    print(entity)
#This will outout those tokens that are something special i.e. Named Entities

Apple
Hong Kong
$6 million


In [0]:
#Get the description of the entities i.e. their corresponding label id
for entity in doc8.ents:
    print(entity,entity.label)
    print('\n')

Apple 383


Hong Kong 384


$6 million 394




In [0]:
#Get the description of the entities i.e. their corresponding label id and the label description
for entity in doc8.ents:
    print(entity)
    print(entity.label)
    print(entity.label_)
    print('\n')

Apple
383
ORG


Hong Kong
384
GPE


$6 million
394
MONEY




In [0]:
#Get the description of the entities i.e. their corresponding label id and the label description and information about the label
for entity in doc8.ents:
    print(entity)
    print(entity.label)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print('\n')

Apple
383
ORG
Companies, agencies, institutions, etc.


Hong Kong
384
GPE
Countries, cities, states


$6 million
394
MONEY
Monetary values, including unit




In [0]:
#Noun Chunks are base Noun phrases
#Noun Chunks are flat phrases that have Noun as their head
doc9=nlp(u"Autonomous cars shift insurance liability towards manufacturers.")

In [0]:
doc9

Autonomous cars shift insurance liability towards manufacturers.

In [0]:
#Autonomous cars is a noun chunk because
#cars is a noun
#Autonomous is a word describing the noun
#insurance is a noun
#liability is a word describing noun insurance
for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


# Tokenization Part Two

In [0]:
from spacy import displacy

In [0]:
doc=nlp(u"Apple is going to build a U.K. factory for $6 million")

In [0]:
doc

Apple is going to build a U.K. factory for $6 million

In [0]:
#We are visualizing syntactic dependency
#displacy.render(doc,jupyter=True,options={'distance'=:110})
displacy.render(doc,style='dep',options={'distance':110})

In [0]:
#Visualize Entity recognizer
doc=nlp(u"Over the last quarter Apple sold nearly 20 thousand ipods for a profit $6 million.")

In [0]:
doc

Over the last quarter Apple sold nearly 20 thousand ipods for a profit $6 million.

In [0]:
displacy.render(doc,style='ent',jupyter=True)

In [0]:
#To display the figure outside jupyter
doc=nlp(u"This is a sentence.")
displacy.serve(doc,jupyter=True, style='dep')
#The output figure will be displayed at 127.0.0.1:5000 in a browser

# Stemming

In [0]:
import nltk

In [0]:
from nltk.stem.porter import PorterStemmer

In [0]:
#Create instance of Porter Stemer Object
p_stemmer=PorterStemmer()

In [0]:
#Create a list of words
words=['run','runner','ran','runs','easily','fairly']
#All these words convey teh same idea, But they are just in different form

In [0]:
for word in words:
    print(word + '--------->' + p_stemmer.stem(word))

run--------->run
runner--------->runner
ran--------->ran
runs--------->run
easily--------->easili
fairly--------->fairli


In [0]:
#Snowball stemmer
from nltk.stem.snowball import SnowballStemmer

In [0]:
#Create instance of snowball stemmer
s_stemmer=SnowballStemmer(language='english')

In [0]:
for word in words:
    print(word+ '-------->' + s_stemmer.stem(word))

run-------->run
runner-------->runner
ran-------->ran
runs-------->run
easily-------->easili
fairly-------->fair


In [0]:
#Let us add more words to the list
words=['run','runner','ran','runs','easily','fairly','fairness']

In [0]:
for word in words:
    print(word+ '------>' + s_stemmer.stem(word))

run------>run
runner------>runner
ran------>ran
runs------>run
easily------>easili
fairly------>fair
fairness------>fair


In [0]:
words=['generous','generation','generously','generate']

In [0]:
words

['generous', 'generation', 'generously', 'generate']

In [0]:
for word in words:
    print(word+ '-------->' + s_stemmer.stem(word))

generous-------->generous
generation-------->generat
generously-------->generous
generate-------->generat


# Lemmatization

In [0]:
import spacy

In [0]:
nlp=spacy.load('en_core_web_sm')

In [0]:
doc1=nlp(u"I am a runner running in a race because I love to run since I ran today")

In [0]:
doc1

I am a runner running in a race because I love to run since I ran today

In [0]:
for token in doc1:
    print(token)

I
am
a
runner
running
in
a
race
because
I
love
to
run
since
I
ran
today


In [0]:
for token in doc1:
    print(token.text)

I
am
a
runner
running
in
a
race
because
I
love
to
run
since
I
ran
today


In [0]:
#Print token text and parts of speech i.e. POS
for token in doc1:
    print(token.text, '\t', token.pos_)

I 	 PRON
am 	 AUX
a 	 DET
runner 	 NOUN
running 	 VERB
in 	 ADP
a 	 DET
race 	 NOUN
because 	 SCONJ
I 	 PRON
love 	 VERB
to 	 PART
run 	 VERB
since 	 SCONJ
I 	 PRON
ran 	 VERB
today 	 NOUN


In [0]:
#Print token text and their parts of speech i.e. POS and their lemma code
for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma)

I 	 PRON 	 561228191312463089
am 	 AUX 	 10382539506755952630
a 	 DET 	 11901859001352538922
runner 	 NOUN 	 12640964157389618806
running 	 VERB 	 12767647472892411841
in 	 ADP 	 3002984154512732771
a 	 DET 	 11901859001352538922
race 	 NOUN 	 8048469955494714898
because 	 SCONJ 	 16950148841647037698
I 	 PRON 	 561228191312463089
love 	 VERB 	 3702023516439754181
to 	 PART 	 3791531372978436496
run 	 VERB 	 12767647472892411841
since 	 SCONJ 	 10066841407251338481
I 	 PRON 	 561228191312463089
ran 	 VERB 	 12767647472892411841
today 	 NOUN 	 11042482332948150395


In [0]:
#Print token text and their parts of speech i.e. POS and their lemma code and their lemma word
for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)
#Running ran run all reduced to the same word run

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [0]:
#Let us create a function to display the above output nicely
def show_lemmas(text):
    for token in text:
        print(f"{token.text} {token.pos_} {token.lemma}{token.lemma_}")

In [0]:
show_lemmas(doc1)

I PRON 561228191312463089-PRON-
am AUX 10382539506755952630be
a DET 11901859001352538922a
runner NOUN 12640964157389618806runner
running VERB 12767647472892411841run
in ADP 3002984154512732771in
a DET 11901859001352538922a
race NOUN 8048469955494714898race
because SCONJ 16950148841647037698because
I PRON 561228191312463089-PRON-
love VERB 3702023516439754181love
to PART 3791531372978436496to
run VERB 12767647472892411841run
since SCONJ 10066841407251338481since
I PRON 561228191312463089-PRON-
ran VERB 12767647472892411841run
today NOUN 11042482332948150395today


In [0]:
def show_lemmas(text):
    for token in text:
        print(f"{token.text:{10}}{token.pos_:{6}}{token.lemma:{30}}{token.lemma_}")

In [0]:
show_lemmas(doc1)

I         PRON              561228191312463089-PRON-
am        AUX             10382539506755952630be
a         DET             11901859001352538922a
runner    NOUN            12640964157389618806runner
running   VERB            12767647472892411841run
in        ADP              3002984154512732771in
a         DET             11901859001352538922a
race      NOUN             8048469955494714898race
because   SCONJ           16950148841647037698because
I         PRON              561228191312463089-PRON-
love      VERB             3702023516439754181love
to        PART             3791531372978436496to
run       VERB            12767647472892411841run
since     SCONJ           10066841407251338481since
I         PRON              561228191312463089-PRON-
ran       VERB            12767647472892411841run
today     NOUN            11042482332948150395today


In [0]:
def show_lemmas(text):
    for token in text:
        print(f"{token.text:{15}}{token.pos_:{10}}{token.lemma:<{30}}{token.lemma_}")

In [0]:
show_lemmas(doc1)

I              PRON      561228191312463089            -PRON-
am             AUX       10382539506755952630          be
a              DET       11901859001352538922          a
runner         NOUN      12640964157389618806          runner
running        VERB      12767647472892411841          run
in             ADP       3002984154512732771           in
a              DET       11901859001352538922          a
race           NOUN      8048469955494714898           race
because        SCONJ     16950148841647037698          because
I              PRON      561228191312463089            -PRON-
love           VERB      3702023516439754181           love
to             PART      3791531372978436496           to
run            VERB      12767647472892411841          run
since          SCONJ     10066841407251338481          since
I              PRON      561228191312463089            -PRON-
ran            VERB      12767647472892411841          run
today          NOUN      110424823329481503

In [0]:
doc2=nlp(u"I saw ten mice today!")

In [0]:
doc2

I saw ten mice today!

In [0]:
show_lemmas(doc2)

I              PRON      561228191312463089            -PRON-
saw            VERB      11925638236994514241          see
ten            NUM       7970704286052693043           ten
mice           NOUN      1384165645700560590           mouse
today          NOUN      11042482332948150395          today
!              PUNCT     17494803046312582752          !


# Stopwords

In [0]:
import spacy

In [0]:
nlp=spacy.load('en_core_web_sm')

In [0]:
#View the default stopwords present in the vocabulary
print(nlp.Defaults.stop_words)
#The output is a dictionary and not a set

{'hereupon', 'meanwhile', 'just', 'whereupon', 'hundred', 'anyway', '‘m', 'ever', 'there', 'whither', 'whether', '‘s', 'formerly', 'therein', 'because', 'if', 'five', 'all', 'very', "'re", 'about', 'now', 'nowhere', 'bottom', 'same', 'its', 'in', '‘re', 'cannot', 'does', 'even', 'except', 'an', 'too', 'over', 'had', 'n‘t', 'used', 'together', 'ours', 'within', 'with', 'none', 'nevertheless', '‘ll', 'sixty', 'down', 'myself', 'has', 'have', 'full', 'mine', 'various', 'each', 'other', 'our', 'through', 'hereby', 'already', 'that', 'were', 'above', 'am', 'would', 'yours', 'below', 'but', 'or', 'seems', 'serious', 'keep', 'become', 'anyhow', 'however', 'otherwise', 'toward', 'those', 'call', 'go', 'his', 'more', 'least', 'somewhere', 'due', 'fifteen', 'nobody', 'most', 'a', 'besides', 'here', '’d', 'at', 'are', 'part', 'namely', 'off', 'what', 'such', 'who', 'to', 'their', '’s', 'thereafter', 'still', 'give', 'others', 'been', 'which', 'no', 'yet', 'by', 'up', 'elsewhere', 'whoever', 'quit

In [0]:
#Check the number of default stopwords present in the vocabulary
len(nlp.Defaults.stop_words)

326

In [0]:
#Check if a particular word is a stopword. Here we are checking for word "is"
nlp.vocab['is']

<spacy.lexeme.Lexeme at 0x18767b15480>

In [0]:
nlp.vocab['is'].is_stop

True

In [0]:
#Check if word "mystery is stopword"
nlp.vocab['mystery'].is_stop

False

In [0]:
#Check if word "mystery is stopword"
nlp.vocab["mystery"].is_stop

False

In [0]:
#Check if word "btw"  is a stopword
nlp.vocab["btw"].is_stop

False

In [0]:
#Add a specific word to the list of existing stopwords
#Here we are adding the word "btw" to the stopwords
nlp.Defaults.stop_words.add("btw")

In [0]:
nlp.vocab["btw"].is_stop=True

In [0]:
#Since we added one stopword to the existing vocabulary
#The length of the stopwords should be increased by 1
#from existing 326 to 327
len(nlp.Defaults.stop_words)

327

In [0]:
#Now let us check if the word "btw" is a stopword or not
#It should return True as we added btw to the list of stopwords
nlp.vocab["btw"].is_stop

True

In [0]:
#View the default stopwords
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'btw',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from'

In [0]:
print(nlp.Defaults.stop_words)

{'hereupon', 'meanwhile', 'just', 'whereupon', 'hundred', 'anyway', '‘m', 'ever', 'there', 'whither', 'whether', '‘s', 'formerly', 'therein', 'because', 'if', 'five', 'all', 'very', "'re", 'about', 'now', 'nowhere', 'bottom', 'same', 'its', 'in', '‘re', 'cannot', 'does', 'even', 'except', 'an', 'too', 'over', 'had', 'n‘t', 'used', 'together', 'ours', 'within', 'with', 'none', 'nevertheless', '‘ll', 'sixty', 'down', 'myself', 'has', 'have', 'full', 'mine', 'various', 'each', 'other', 'our', 'through', 'hereby', 'already', 'that', 'were', 'above', 'am', 'would', 'yours', 'below', 'but', 'or', 'seems', 'serious', 'keep', 'become', 'anyhow', 'however', 'otherwise', 'toward', 'those', 'call', 'go', 'his', 'more', 'least', 'somewhere', 'due', 'fifteen', 'nobody', 'most', 'a', 'besides', 'here', '’d', 'at', 'are', 'part', 'namely', 'off', 'what', 'such', 'who', 'to', 'their', '’s', 'thereafter', 'still', 'give', 'others', 'been', 'which', 'no', 'yet', 'by', 'up', 'elsewhere', 'whoever', 'quit

In [0]:
#Remove a stopword
#Forexample let us remove the word beyond from teh default stopwords
nlp.Defaults.stop_words.remove("beyond")

In [0]:
nlp.vocab["beyond"].is_stop=False

In [0]:
#Let us check if the beyond word is present in default stopwords
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'both',
 'bottom',
 'btw',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',

In [0]:
#Let us check if the word beyond is a stopword
nlp.vocab['beyond'].is_stop

False

# Phrase Matching and Vocabulary-Part One

In [0]:
import spacy

In [0]:
nlp=spacy.load('en_core_web_sm')

In [0]:
from spacy.matcher import Matcher

In [0]:
#Create a matcher object
#Pass the current nlp vocab to this object
matcher=Matcher(nlp.vocab)

In [0]:
#Create patterns
#SolarPower
pattern1=[{'LOWER':'solarpower'}]
#Solar-power
patter2=[{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
#Solar power
pattern3=[{'LOWER':'solar'},{'LOWER':'power'}]

In [0]:
#We are going to give a name to our matcher. Fr example solarpower
matcher.add('SolarPower',None,pattern1,patter2,pattern3)

In [0]:
#Create a document
doc=nlp(u"The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing.")

In [0]:
found_matches=matcher(doc)

In [0]:
print(found_matches)
#This will return tuples containing the span of matches
#The Solar Power industry#  here index 1=Solar     index 3 means untill index2 which is Power
#continues to grow as solarpower increases# index8 and index9 are referring to the solarpower
#  Solar-power is amazing

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [0]:
#Let us print the above in an indentation format
#This should contain the matchid, start index of the match, end index of the match and also the word which is matched
for match_id,start,end in found_matches:
    string_id=nlp.vocab.strings[match_id]             #Getting the string representation
    span=doc[start:end]
    print(match_id,string_id,start,end,span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [0]:
#Remove a particular pattern
matcher.remove('solarpower')

ValueError: [E175] Can't remove rule for unknown match pattern ID: solarpower

In [0]:
#It is case-sensitive. So, type the name correctly
matcher.remove('SolarPower')

In [0]:
#solarpower SolarPower
pattern1=[{'LOWER':'solarpower'}]


#solar_power
#solar----power
#solar,.,.,.power
#Here *means it is referring to the IS_PUNCT can repeat one or more times
pattern2=[{'LOWER':'solar'},{'IS_PUNCT':True},{'OP':'*'},{'LOWER':'power'}]

In [0]:
#name the pattern as SolarPower
matcher.add('SolarPower',None,pattern1,patter2)

In [0]:
doc2=nlp(u"Solar--power is solarpower yay!")

In [0]:
doc2

Solar--power is solarpower yay!

In [0]:
found_matches=matcher(doc2)

In [0]:
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [0]:
#Let us print the found_matches with the matcherid,matchername,match start index,match end index and the word whichis matched from the text
for match_id,start,end in found_matches:
    string_id=nlp.vocab.strings[match_id]                   #Get the string representation
    span=doc2[start:end]
    print(match_id,string_id,start,end,span.text)

8656102463236116519 SolarPower 0 3 Solar--power
8656102463236116519 SolarPower 4 5 solarpower


# Phrase Matching and Vocabulary-Part Two

In [0]:
from spacy.matcher import PhraseMatcher

In [0]:
#Create an object
matcher=PhraseMatcher(nlp.vocab)

In [0]:
with open('reaganomics.txt') as f:
    doc3=nlp(f.read())
#since there is an issue with the file, we deleted more content and kept only few lines in the text file

In [0]:
doc3

REAGANOMICS
https://en.wikipedia.org/wiki/Reaganomics

Reaganomics (a portmanteau of [Ronald] Reagan and economics attributed to Paul Harvey)[1] refers to the economic policies promoted by U.S. President Ronald Reagan during the 1980s. These policies are commonly associated with supply-side economics, referred to as trickle-down economics or voodoo economics by political opponents, and free-market economics by political advocates.

The four pillars of Reagan's economic policy were to reduce the growth of government spending, reduce the federal income tax and capital gains tax, reduce government regulation, and tighten the money supply in order to reduce inflation.[2]

The results of Reaganomics are still debated. Supporters point to the end of stagflation, stronger GDP growth, and an entrepreneur revolution in the decades that followed.[3][4] Critics point to the widening income gap, an atmosphere of greed, and the national debt tripling in eight years which ultimately reversed the pos

In [0]:
#Let us  create a phrase list with the phrases that we want to search
phrase_list=['voodoo economics','supply-side economics','trickle-down economics','free-market economics']

In [0]:
phrase_patetrns=[nlp(text) for text in phrase_list]

In [0]:
#The phrase_patterns is not a strings instead itis a spacy document
type(phrase_patetrns[0])

spacy.tokens.doc.Doc

In [0]:
#We are naming the matcher as EconMatcher
#To add eachy spacy doc we need to provide a *phrase_patterns
matcher.add('EconMatcher',None,*phrase_patetrns)
#we added the patterns present in phrase_patterns to the created EconMatcher

In [0]:
#We should build a list of matches
found_matches=matcher(doc3)

In [0]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2987, 2991)]

In [0]:
#Print the stringid,matchid,start index of match, end index of match and the actual match
for match_id, start, end in found_matches:
    string_id=nlp.vocab.strings[match_id]        #get the string representation of the matchid
    span=doc3[start:end]
    print(match_id,string_id,start,end,span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2987 2991 trickle-down economics


In [0]:
#We want to get a context of where the match occurred 
#Then we will add some indexes before the start and end ins pan
for match_id,start,end in found_matches:
    string_id=nlp.vocab.strings[match_id]          #Getting the string representation
    span=doc3[start-5:end+4]                       #We are adding some indexes before the start and after the end
    print(match_id,string_id,start,end,span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents,
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.
3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed
3680293220734633682 EconMatcher 2987 2991 became widely known as "trickle-down economics", due to
