# spaCy Basics

In [2]:
import spacy

# Load the 'en_core_web_sm' language model and bind the resulting pipeline to `nlp`
nlp = spacy.load('en_core_web_sm')


In [3]:
# Create a Doc object
# by applying model to text
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')



In [5]:

# Print one line per token in the Doc
for token in doc:
    # raw text, part of speech, syntactic dependency
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [6]:
# Show the processing pipeline as a list of (name, component) pairs
nlp.pipeline
# nlp.pipe_names  -> just the component names

[('tagger', <spacy.pipeline.Tagger at 0x7fea42065fd0>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7fea29a47b30>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7fea29a63110>)]

In [7]:
# List only the names of the pipeline components
nlp.pipe_names

['tagger', 'parser', 'ner']

In [12]:
# Note the double space after "isn't": it appears as a separate SPACE token in the output
doc2 = nlp(u"Tesla isn't  looking into startups anymore.")

# Print one line per token
for token in doc2:
    # raw text, part of speech, syntactic dependency, lemma
    print(token.text, token.pos_, token.dep_, token.lemma_)

Tesla PROPN nsubj tesla
is VERB aux be
n't ADV neg not
  SPACE   
looking VERB ROOT look
into ADP prep into
startups NOUN pobj startup
anymore ADV advmod anymore
. PUNCT punct .


In [13]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [17]:
# for idx, token in enumerate(doc3):
#     print(idx, token.text)

doc3[16:30]

"Life is what happens to us while we are making other plans"

In [24]:
doc4 = nlp(u'My name is Dr. Sean. This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc4.sents:
    print(sent)


My name is Dr. Sean.
This is the first sentence.
This is another sentence.
This is the last sentence.


In [26]:
doc4[6].is_sent_start

True

# Tokenization

In [28]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [30]:
mystring = '"We\'re Moving to L.A.!"'

In [31]:
doc = nlp(mystring)

In [32]:
for token in doc:
    print(token.text)

"
We
're
Moving
to
L.A.
!
"


In [33]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")



In [35]:
for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [36]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')

for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [37]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


5km NYC - QUANTITY - Measurements, as of weight or distance
10.30 - MONEY - Monetary values, including unit


In [39]:
doc5 = nlp(u'It is better to give than to receive.')

# Retrieve a three-token span (the tokens at index 2, 3 and 4):
doc5[2:5]

better to give

In [40]:
# Retrieve the last four tokens:
doc5[-4:]

than to receive.

In [41]:
doc05[0] = 'test'

NameError: name 'doc05' is not defined

In [42]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')

# Token view: every token followed by a pipe separator, all on one line
for tok in doc8:
    print(tok.text, end=' | ')

print('\n----')

# Entity view: entity text, its label, and spaCy's explanation of that label
for entity in doc8.ents:
    label_description = str(spacy.explain(entity.label_))
    print(' - '.join([entity.text, entity.label_, label_description]))

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
----
Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [45]:
# Entities detected in doc3: text, label, and spaCy's explanation of the label
for ent in doc3.ents:
    print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')

# Blank line followed by a 15-dash rule between the two listings
print('\n' + '-' * 15)

# Same listing for doc4
for ent in doc4.ents:
    print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')

5km NYC - QUANTITY - Measurements, as of weight or distance
10.30 - MONEY - Monetary values, including unit

---------------
St. Louis - GPE - Countries, cities, states
U.S. - GPE - Countries, cities, states
next year - DATE - Absolute or relative dates or periods


In [46]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

for chunk in doc9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


# Built-In Visualizers

In [51]:
from spacy import displacy

doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 80})

In [52]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)

# Stemming

In [53]:
# Import the toolkit and the Porter stemmer class.
# The class is imported by name rather than with a wildcard so it is clear
# where `PorterStemmer` comes from and the notebook namespace stays clean.
import nltk

from nltk.stem.porter import PorterStemmer

In [54]:
p_stemmer = PorterStemmer()

In [61]:
words = ['run','runner','running','ran','runs','easily','fairly', 'fairness','fairy','fare']

In [57]:
for word in words:
    print(word + '----->' + p_stemmer.stem(word))

run----->run
runner----->runner
running----->run
ran----->ran
runs----->run
easily----->easili
fairly----->fairli


In [58]:
from nltk.stem.snowball import SnowballStemmer

In [59]:
s_stemmer = SnowballStemmer(language='english')

In [63]:
for word in words:
    print(word + ' -----> ' + s_stemmer.stem(word))

run -----> run
runner -----> runner
running -----> run
ran -----> ran
runs -----> run
easily -----> easili
fairly -----> fair
fairness -----> fair
fairy -----> fairi
fare -----> fare


In [66]:
words = ['generous','generation','generated','gernerously','generate','gene','generated','giving','gave']
for word in words:
    print(word + ' -----> ' + s_stemmer.stem(word))

generous -----> generous
generation -----> generat
generated -----> generat
gernerously -----> gerner
generate -----> generat
gene -----> gene
generated -----> generat
giving -----> give
gave -----> gave


# Lemmatization

In [68]:
# Perform standard imports:
import spacy
nlp = spacy.load('en_core_web_sm')

In [74]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma_)

I 	 PRON 	 -PRON-
am 	 VERB 	 be
a 	 DET 	 a
runner 	 NOUN 	 runner
running 	 VERB 	 run
in 	 ADP 	 in
a 	 DET 	 a
race 	 NOUN 	 race
because 	 ADP 	 because
I 	 PRON 	 -PRON-
love 	 VERB 	 love
to 	 PART 	 to
run 	 VERB 	 run
since 	 ADP 	 since
I 	 PRON 	 -PRON-
ran 	 VERB 	 run
today 	 NOUN 	 today


In [75]:
def show_lemmas(text):
    """Print one aligned row per token of ``text``.

    ``text`` is any iterable of token-like objects exposing ``text``,
    ``pos_``, ``lemma`` and ``lemma_`` (for example a spaCy Doc).

    Each row holds four space-separated columns: the token text padded to
    12 characters, the part-of-speech tag padded to 6, the ``lemma`` value
    left-aligned in 22, and finally the ``lemma_`` string.
    """
    for tok in text:
        columns = (
            format(tok.text, '12'),
            format(tok.pos_, '6'),
            format(tok.lemma, '<22'),
            format(tok.lemma_, ''),
        )
        print(' '.join(columns))

In [76]:
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [77]:
doc3 = nlp(u"I am meeting him tomorrow at the meeting.")

show_lemmas(doc3)

I            PRON   561228191312463089     -PRON-
am           VERB   10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   561228191312463089     -PRON-
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting
.            PUNCT  12646065887601541794   .



# Stop Words

In [78]:

import spacy
nlp = spacy.load('en_core_web_sm')
# Print the set of spaCy's default stop words (remember that sets are unordered):
print(nlp.Defaults.stop_words)

{'and', 'whether', 'many', 'nor', 'such', 'three', 'yet', 'few', 'does', 'because', 'anything', 'either', 'us', 'between', 'although', 'against', 'other', 'eleven', 'thereupon', 'nevertheless', 'we', 'when', 'or', 'at', 'everyone', 'rather', 'anyone', 'seem', 'to', 'very', 'among', 'into', 'doing', 'done', 'take', 'now', 'unless', 'by', 'nine', 'yourselves', 'seeming', 'both', 'forty', 'name', 'part', 'also', 'were', 'beforehand', 'becomes', 'whither', 'down', 'him', 'may', 'enough', 'everything', 'already', 'none', 'noone', 'six', 'must', 'once', 'that', 'another', 'except', 'itself', 'always', 'toward', 'besides', 'really', 'in', 'seems', 'its', 'seemed', 'however', 'too', 'i', 'mostly', 'only', 'becoming', 'off', 'mine', 'show', 'see', 'serious', 'up', 'was', 'whose', 'you', 'per', 'along', 'therein', 'as', 'your', 'onto', 'for', 'from', 'than', 'out', 'using', 'behind', 'ours', 'still', 'until', 'who', 'anyway', 'somewhere', 'not', 'various', 'before', 'each', 'have', 'again', 'am'

In [79]:
len(nlp.Defaults.stop_words)

305

In [86]:
'is' in nlp.Defaults.stop_words

True

In [87]:
nlp.vocab['myself'].is_stop

True

In [88]:
nlp.vocab['myspace'].is_stop

False

In [89]:
# Add the word to the set of stop words. Use lowercase!
nlp.Defaults.stop_words.add('btw')

# Set the stop_word tag on the lexeme
nlp.vocab['btw'].is_stop = True

In [90]:
# Remove the word from the set of stop words.
# discard() is used instead of remove() so that re-running this cell after
# 'beyond' is already gone does not raise KeyError.
nlp.Defaults.stop_words.discard('beyond')

# Remove the stop_word tag from the lexeme
nlp.vocab['beyond'].is_stop = False

In [91]:
# Confirm both edits: 'btw' was added to the stop-word set, 'beyond' was removed
('btw' in nlp.Defaults.stop_words, 'beyond' in nlp.Defaults.stop_words)

(True, False)

# Vocabulary and Matching part 1

In [93]:
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [160]:

# Redefine the patterns:
# SolarPower
pattern1 = [{'LOWER': 'solarpower'}]
# Solar-Power
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
# Solar Power
# pattern3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]


In [172]:
# Clear any previously registered 'SolarPower' patterns.
# remove() raises KeyError when the key is not registered (e.g. on a freshly
# created Matcher), so only call it when the key is actually present.
if 'SolarPower' in matcher:
    matcher.remove('SolarPower')

In [174]:
# Register the patterns under the 'SolarPower' match ID
# (spaCy v2 call shape: key, on_match callback, *patterns).
#
# pattern3 covers the two-token form "Solar Power". pattern1 needs the single
# token 'solarpower' and pattern2 needs a punctuation token between the words,
# so without pattern3 the "Solar Power" match reported for tokens 1-3 in the
# output further down cannot occur.
pattern3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [175]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [176]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [177]:
# Report every hit: numeric match ID, its string name, token offsets, matched text
for match_id, start, end in found_matches:
    print(match_id, nlp.vocab.strings[match_id], start, end, doc[start:end].text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [178]:
# Redefine the patterns:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]

# Remove the old patterns to avoid duplication:
matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', None, pattern1, pattern2)

In [181]:
doc2 = nlp(u"Solar--power is solarpower yay!")
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [182]:

show_lemmas(doc2)

Solar        ADJ    3825196732376443040    solar
--           PUNCT  10501404726543969396   --
power        NOUN   10405720708504167118   power
is           VERB   10382539506755952630   be
solarpower   NOUN   5703546853475899243    solarpower
yay          NUM    17916626178616515936   yay
!            PUNCT  17494803046312582752   !


# Vocabulary and Matching part 2

In [190]:
# Perform standard imports, reset nlp
import spacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)



In [210]:
# with open('../TextFiles/reaganomics.txt', encoding='utf-8') as f:

# encoding='cp1252' is Windows text encoding
with open('../TextFiles/reaganomics.txt', encoding='cp1252') as f:
    doc3 = nlp(f.read())

In [211]:
phrase_list = ['voodoo economics','supply-side economics','trickle-down economics','free-market economics']

In [214]:
phrase_patterns = [nlp(text) for text in phrase_list]
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [215]:
matcher.add('EconMatcher',None,*phrase_patterns)

In [216]:
found_matches = matcher(doc3)

In [220]:
# Number of surrounding tokens shown on each side of a match
context_window = 5

for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    # Matched tokens plus surrounding context. The left edge is clamped at 0
    # because a negative slice start would be counted from the end of the Doc
    # for any match within the first `context_window` tokens.
    span = doc3[max(start - context_window, 0):end + context_window]
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.


3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in
3680293220734633682 EconMatcher 2985 2989 became widely known as "trickle-down economics", due to the
