# Spacy Basics

In [1]:
import spacy

print(spacy.__version__)

2.2.4


In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
doc = nlp(u"Tesla is looking at buying U.S. startup for $6 millon")

for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM nmod
6 NUM nummod
millon NOUN pobj


<img src="../pipeline1.png" width="600">

In [4]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f85866985d0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f85856dd750>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f85856dd830>)]

In [5]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [6]:
doc2 = nlp(u"Tesla isn't   looking into startups anymore.")
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM nmod
6 NUM nummod
millon NOUN pobj


In [7]:
doc2[0],type(doc2)

(Tesla, spacy.tokens.doc.Doc)

In [9]:
# Part-of-Speech(POS)
# Dependencies

doc2[0].pos_,doc[2].dep_

('PROPN', 'ROOT')

In [10]:
spacy.explain(doc2[0].pos_),spacy.explain(doc2[0].dep_)

('proper noun', 'nominal subject')

|Tag|Description|doc2[0].tag|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`Tesla`|
|`.lemma_`|The base form of the word|`tesla`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Is the token an alpha character?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

In [14]:
print(doc2[4].text)
print(doc2[4].lemma_)
print(doc2[4].pos_)
print(doc2[4].tag_ + " / " + spacy.explain(doc2[4].tag_))

print(doc2[4].text + " : " + doc2[4].shape_)
print(doc2[2].text + " : " + doc2[2].shape_)

print(doc2[4].is_alpha)
print(doc2[4].is_stop)

looking
look
VERB
VBG / verb, gerund or present participle
looking : xxxx
n't : x'x
True
False


In [15]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [16]:
life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [17]:
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [20]:
doc4[6],doc4[6].is_sent_start

(This, True)

# Tokenization

In [22]:
import spacy 
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tagger', 'parser', 'ner']

In [23]:
mystring = '"We\'re moving to L.A.!"'
print(mystring)

"We're moving to L.A.!"


In [25]:
doc = nlp(mystring)

for token in doc:
    print(token.text, end = " | ")

" | We | 're | moving | to | L.A. | ! | " | 

In [26]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [27]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')
for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [28]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")
for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [31]:
mystring = '"We\'re moving to L.A.!"'
doc = nlp(mystring)
len(doc),len(doc.vocab)

(8, 512)

In [32]:
doc5 = nlp(u'It is better to give than to receive.')
doc5[2]

better

In [33]:
doc5[2:5]

better to give

In [34]:
doc5[-4:]

than to receive.

In [35]:
doc6 = nlp(u'My dinner was horrible.')
doc7 = nlp(u'Your dinner was delicious.')
doc6[3] = doc7[3]

TypeError: ignored

In [36]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')

for token in doc8:
    print(token.text, end = " | ")

print("\n-----")

for ent in doc8.ents:
    print(ent.text + " - " + ent.label_ + " - " + str(spacy.explain(ent.label_)))

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
-----
Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [41]:
# Noun Chunks -- > İsim Parçacıkları 

doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

for chunk in doc9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


In [42]:
doc10 = nlp(u"Red cars do not carry higher insurance rates.")

for chunk in doc10.noun_chunks:
    print(chunk.text)

Red cars
higher insurance rates


In [43]:
doc11 = nlp(u"He was a one-eyed, one-horned, flying, purple people-eater.")

for chunk in doc11.noun_chunks:
    print(chunk.text)

He
a one-eyed, one-horned, flying, purple people-eater


In [45]:
from spacy import displacy

doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style = "ent", jupyter = True, options = {"distance" : 110})

In [46]:
displacy.render(doc, style = "dep", jupyter = True, options = {"distance" : 110})

In [47]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)

In [48]:
doc = nlp(u'This is a sentence.')
#displacy.serve(doc, style='dep')

# Stemming

In [54]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [55]:
words = ['run','runner','running','ran','runs','easily','fairly']

p_stemmer = PorterStemmer()
s_stemmer = SnowballStemmer(language='english')

for word in words:
    print(word + " ------ > " + p_stemmer.stem(word))
    print(word + " ------ > " + s_stemmer.stem(word))
    print()

run ------ > run
run ------ > run

runner ------ > runner
runner ------ > runner

running ------ > run
running ------ > run

ran ------ > ran
ran ------ > ran

runs ------ > run
runs ------ > run

easily ------ > easili
easily ------ > easili

fairly ------ > fairli
fairly ------ > fair



In [56]:
words = ['consolingly']

print("Porter Stemmer : ")
for word in words:
    print(word + " ------ > " + p_stemmer.stem(word))

Porter Stemmer : 
consolingly ------ > consolingli


In [57]:
words = ['consolingly']

print("Porter2 Stemmer : ")
for word in words:
    print(word + " ------ > " + s_stemmer.stem(word))

Porter2 Stemmer : 
consolingly ------ > consol


In [58]:
phrase = 'I am meeting him tomorrow at the meeting'
for word in phrase.split():
    print(word+' --> '+p_stemmer.stem(word))

I --> I
am --> am
meeting --> meet
him --> him
tomorrow --> tomorrow
at --> at
the --> the
meeting --> meet


# Lemmatization

In [59]:
import spacy 
nlp = spacy.load("en_core_web_sm")

In [60]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

for token in doc1:
    print(token.text , "\t", token.pos_, "\t", token.lemma, "\t", token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [63]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [64]:
# mice -- > mouse 
# saw --- > see
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [65]:
# meet ile meeting farka dikkat !!!
doc3 = nlp(u"I am meeting him tomorrow at the meeting.")
show_lemmas(doc3)

I            PRON   561228191312463089     -PRON-
am           AUX    10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   561228191312463089     -PRON-
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting
.            PUNCT  12646065887601541794   .


In [66]:
doc4 = nlp(u"That's an enormous automobile")
show_lemmas(doc4)

That         DET    4380130941430378203    that
's           AUX    10382539506755952630   be
an           DET    15099054000809333061   an
enormous     ADJ    17917224542039855524   enormous
automobile   NOUN   7211811266693931283    automobile


# StopWords

In [67]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [69]:
print(nlp.Defaults.stop_words)

{'still', 'mine', 'should', 'many', '‘s', 'former', 'n’t', 'whereupon', "'re", 'seeming', 'no', 'what', 'how', 'give', 'three', 'used', 'one', 'whereby', 'while', 'when', 'during', 'much', 'almost', 'alone', 'between', '’re', 'onto', 'where', 'whom', 'make', 'become', 'thus', 'itself', 'to', 'without', 'really', 'over', 'done', 'upon', 'anything', 'thereupon', 'name', 'rather', 'others', '‘m', '’s', 'under', 'my', 'take', '‘ll', 'has', 'formerly', 'they', 'unless', 'side', 'via', 'made', 'ca', 'fifty', 'already', 'twenty', '‘re', 'this', 'quite', 'amount', 'around', 'am', 'had', 'me', 'latterly', 'below', 'throughout', 'both', 'its', 'all', 'twelve', 'therein', 'something', "'s", 'whoever', 'otherwise', 'amongst', 'well', 'not', 'full', 'anywhere', 'we', 'why', 'nine', 'if', 'whatever', 'everything', 'nevertheless', 'anyone', 'serious', 'us', 'move', 'seemed', 'ever', 'eleven', 'whither', 'then', 'top', 'themselves', 'seem', 'in', 'their', 'moreover', 'herself', 'none', 'least', 'again

In [70]:
len(nlp.Defaults.stop_words)

326

In [71]:
nlp.vocab["myself"].is_stop,nlp.vocab["mystery"].is_stop

(True, False)

In [72]:
# To add a StopWords
# btw -- > by the way

nlp.Defaults.stop_words.add("btw")

nlp.vocab["btw"].is_stop = True

In [73]:
len(nlp.Defaults.stop_words)

327

In [74]:
nlp.vocab["btw"].is_stop

True

In [75]:
# Var olan stop words

nlp.vocab["beyond"].is_stop

True

In [76]:
nlp.vocab["beyond"].is_stop = False
nlp.vocab["beyond"].is_stop

False

# Vocabulary and Matching

In [97]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

In [98]:
matcher = Matcher(nlp.vocab)

This found both two-word patterns, with and without the hyphen!

The following quantifiers can be passed to the `'OP'` key:
<table><tr><th>OP</th><th>Description</th></tr>

<tr ><td><span >\!</span></td><td>Negate the pattern, by requiring it to match exactly 0 times</td></tr>
<tr ><td><span >?</span></td><td>Make the pattern optional, by allowing it to match 0 or 1 times</td></tr>
<tr ><td><span >\+</span></td><td>Require the pattern to match 1 or more times</td></tr>
<tr ><td><span >\*</span></td><td>Allow the pattern to match zero or more times</td></tr>
</table>


In [99]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [100]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [102]:
for match_id, start, end in found_matches:

    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [104]:
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True,'OP':'*'},{'LOWER':'power'}]

matcher.remove("SolarPower")
matcher.add('SolarPower', None, pattern1, pattern2)

In [105]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [106]:
for match_id, start, end in found_matches:

    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


---

In [112]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LEMMA': 'power'}] # CHANGE THIS PATTERN

matcher.remove("SolarPower")
matcher.add("SolarPower",None,pattern1,pattern2)

In [114]:
doc2 = nlp(u'Solar-powered energy runs solar-powered cars.')

found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


In [116]:
for match_id, start, end in found_matches:

    string_id = nlp.vocab.strings[match_id]
    span = doc2[start:end]
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 0 3 Solar-powered
8656102463236116519 SolarPower 5 8 solar-powered


---

In [117]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solarpowered'}]
pattern4 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'powered'}]

matcher.remove("SolarPower")

matcher.add("SolarPower",None,pattern1,pattern2,pattern3,pattern4)

In [120]:
doc2 = nlp(u'Solar-powered energy runs solar-powered cars.')
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


In [121]:
for match_id , start, end in found_matches:

    string_id = nlp.vocab.strings[match_id]
    span = doc2[start:end]
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 0 3 Solar-powered
8656102463236116519 SolarPower 5 8 solar-powered


## Other token attributes
Besides lemmas, there are a variety of token attributes we can use to determine matching rules:
<table><tr><th>Attribute</th><th>Description</th></tr>

<tr ><td><span >`ORTH`</span></td><td>The exact verbatim text of a token</td></tr>
<tr ><td><span >`LOWER`</span></td><td>The lowercase form of the token text</td></tr>
<tr ><td><span >`LENGTH`</span></td><td>The length of the token text</td></tr>
<tr ><td><span >`IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`</span></td><td>Token text consists of alphanumeric characters, ASCII characters, digits</td></tr>
<tr ><td><span >`IS_LOWER`, `IS_UPPER`, `IS_TITLE`</span></td><td>Token text is in lowercase, uppercase, titlecase</td></tr>
<tr ><td><span >`IS_PUNCT`, `IS_SPACE`, `IS_STOP`</span></td><td>Token is punctuation, whitespace, stop word</td></tr>
<tr ><td><span >`LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`</span></td><td>Token text resembles a number, URL, email</td></tr>
<tr ><td><span >`POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`</span></td><td>The token's simple and extended part-of-speech tag, dependency label, lemma, shape</td></tr>
<tr ><td><span >`ENT_TYPE`</span></td><td>The token's entity label</td></tr>

</table>

In [123]:
import spacy
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher

In [127]:
import zipfile

zip_file = zipfile.ZipFile("/content/drive/MyDrive/NLP_Vol1/UPDATED_NLP_COURSE.zip","r")
zip_file.extractall()
zip_file.close()

In [128]:
matcher = PhraseMatcher(nlp.vocab)

In [131]:
with open("/content/UPDATED_NLP_COURSE/TextFiles/reaganomics.txt", encoding = "unicode_escape") as f:
    doc3 = nlp(f.read())

In [None]:
doc3

In [133]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

phrase_patterns = [nlp(text) for text in phrase_list]

matcher.add("VoodooEconomics",None,*phrase_patterns)

found_matches = matcher(doc3)

In [134]:
found_matches

[(3473369816841043438, 41, 45),
 (3473369816841043438, 49, 53),
 (3473369816841043438, 54, 56),
 (3473369816841043438, 61, 65),
 (3473369816841043438, 673, 677),
 (3473369816841043438, 2986, 2990)]

In [136]:
for match_id , start, end in found_matches:

    string_id = nlp.vocab.strings[match_id]
    span = doc3[start:end]
    print(match_id, string_id, start, end, span.text)

3473369816841043438 VoodooEconomics 41 45 supply-side economics
3473369816841043438 VoodooEconomics 49 53 trickle-down economics
3473369816841043438 VoodooEconomics 54 56 voodoo economics
3473369816841043438 VoodooEconomics 61 65 free-market economics
3473369816841043438 VoodooEconomics 673 677 supply-side economics
3473369816841043438 VoodooEconomics 2986 2990 trickle-down economics


In [142]:
sents = [sent for sent in doc3.sents]
len(sents)

205

In [148]:
sents[2].start,sents[3].end

(4, 70)

# Exercise

In [149]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [157]:
with open("/content/UPDATED_NLP_COURSE/TextFiles/owlcreek.txt") as f:
    doc = nlp(f.read())

In [158]:
doc[:36]

AN OCCURRENCE AT OWL CREEK BRIDGE

by Ambrose Bierce

I

A man stood upon a railroad bridge in northern Alabama, looking down
into the swift water twenty feet below.  

In [159]:
len(doc)

4835

In [160]:
sents = [sent for sent in doc.sents]
len(sents)

249

In [161]:
print(sents[2].text)

A man stood upon a railroad bridge in northern Alabama, looking down
into the swift water twenty feet below.  


In [163]:
for token in sents[2]:
    print(token.text, token.pos_, token.dep_, token.lemma_)

A DET det a
man NOUN nsubj man
stood VERB ROOT stand
upon SCONJ prep upon
a DET det a
railroad NOUN compound railroad
bridge NOUN pobj bridge
in ADP prep in
northern ADJ amod northern
Alabama PROPN pobj Alabama
, PUNCT punct ,
looking VERB advcl look
down ADV prt down

 SPACE  

into ADP prep into
the DET det the
swift ADJ amod swift
water NOUN pobj water
twenty NUM nummod twenty
feet NOUN npadvmod foot
below ADV advmod below
. PUNCT punct .
  SPACE   


In [165]:
for token in sents[2]:
    print(f'{token.text:{15}} {token.pos_:{5}} {token.dep_:{10}} {token.lemma_:{15}}')

A               DET   det        a              
man             NOUN  nsubj      man            
stood           VERB  ROOT       stand          
upon            SCONJ prep       upon           
a               DET   det        a              
railroad        NOUN  compound   railroad       
bridge          NOUN  pobj       bridge         
in              ADP   prep       in             
northern        ADJ   amod       northern       
Alabama         PROPN pobj       Alabama        
,               PUNCT punct      ,              
looking         VERB  advcl      look           
down            ADV   prt        down           

               SPACE            
              
into            ADP   prep       into           
the             DET   det        the            
swift           ADJ   amod       swift          
water           NOUN  pobj       water          
twenty          NUM   nummod     twenty         
feet            NOUN  npadvmod   foot           
below           ADV 

In [166]:
from spacy.matcher import Matcher,PhraseMatcher

In [171]:
matcher = Matcher(nlp.vocab)

pattern = [{'LOWER':'swimming'},{'IS_SPACE':True,'OP':'*'}, {'LOWER':'vigorously'}]

matcher.add("Swimming",None, pattern)

In [172]:
found_matches = matcher(doc)
print(found_matches)

[(12881893835109366681, 1274, 1277), (12881893835109366681, 3609, 3612)]


In [175]:
print(doc[1265:1290])

By diving I could evade the bullets and, swimming
vigorously, reach the bank, take to the woods and get away home


In [176]:
print(doc[3600:3615])

all this over his shoulder; he was now swimming
vigorously with the current


In [179]:
for sent in sents:
    if found_matches[0][1] < sent.end:
        print(sent)
        break

By diving I could evade the bullets and, swimming
vigorously, reach the bank, take to the woods and get away home.  


In [180]:
found_matches[0][1],sent.end

(1274, 1292)

In [181]:
for sent in sents:
    if found_matches[1][1] < sent.end:
        print(sent)
        break

The hunted man saw all this over his shoulder; he was now swimming
vigorously with the current.  


In [182]:
found_matches[1][1],sent.end

(3609, 3617)