### Spacy

In [1]:
#you put in raw text, and get back a **Doc** object, that comes with a variety of annotations.
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [2]:
nlp.pipeline  # the doc object runs trough this pipleine: text->tokeniser-> tagger->parser-> ner->doc

[('tagger', <spacy.pipeline.pipes.Tagger at 0x29bc0936cc8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x29bc16a8d08>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x29bc16c42e8>)]

In [3]:
print( doc[0].pos_ )   # Part-of-Speech Tagging (POS) : propn = proper noun
print( doc[0].tag_ )   # fine-grained tag , POS is coarse
print( doc[0].dep_ )   # dependencies assigned to each token : nomnal subject
print( doc[0].text )   #orignal word
print( doc[0].lemma_ ) # base form
print( doc[0].shape_)  # The word shape – capitalization, punctuation, digits
print( doc[0].is_stop) # Is the token part of a stop list, i.e. the most common words

PROPN
NNP
nsubj
Tesla
Tesla
Xxxxx
False


In [4]:
spacy.explain('NNP')

'noun, proper singular'

In [5]:
#span:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy". \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957.')
life_quote = doc3[16:30]
print(life_quote)
type(life_quote)

"Life is what happens to us while we are making other plans"


spacy.tokens.span.Span

#### Example only tokenize

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm',disable=['parser', 'tagger','ner'])
nlp.max_length = 1198623

In [None]:
#clean token
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in 
            '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']
d = read_file('melville-moby_dick.txt')
tokens = separate_punc(d)
#tokens are list of words

#### Sectences

In [6]:
for sent in doc3.sents:
    print(sent)

Although commmonly attributed to John Lennon from his song "Beautiful Boy".
the phrase "Life is what happens to us while we are making other plans" was written by cartoonist Allen Saunders and published in Reader's Digest in 1957.


In [7]:
sents= [sent for sent in doc3.sents] #to make it iteratable

In [10]:
print(sents[1].start, sents[1].end)

14 45


In [11]:
doc3[0].is_sent_start   # 

True

In [12]:
len(doc.vocab)

520

In [13]:
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -PeterDrucker')
for sent in doc3.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things." -PeterDrucker


In [14]:
# ADD A NEW RULE TO THE PIPELINE
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc
nlp.add_pipe(set_custom_boundaries, before='parser')
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [15]:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -PeterDrucker')
for sent in doc4.sents:
    print(sent)

"Management is doing things right;
leadership is doing the right things." -PeterDrucker


#### NER adn noun chunk

In [16]:
#NER  named entities
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')
for ent in doc8.ents:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))

Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [17]:
len([ent for ent in doc.ents if ent.label_=='MONEY'])      # count entities

1

In [18]:
# Noun chunks are "base noun phrases" 
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")
for chunk in doc9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


#### Visualize

In [19]:
from spacy import displacy
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)
#displacy.serve(doc, style='ent') # on:  http://127.0.0.1:5000

In [20]:
colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', 'PRODUCT': 'radial-gradient(yellow, green)'}
options = {'ents': ['ORG', 'PRODUCT'],'colors':colors}         #view only speicic items
for sent in doc.sents:                         # if more sentences, view in separate line
    displacy.render(nlp(sent.text), style='ent', jupyter=True,options=options)

In [21]:
options = {'distance': 110, 'compact': 'True', 'color': 'yellow', 'bg': '#09a3d5', 'font': 'Times'}
displacy.render(doc, style='dep', jupyter=True,options=options)

In [22]:
#stop words in ('en_core_web_sm')
len(nlp.Defaults.stop_words)

326

In [23]:
nlp.vocab['myself'].is_stop

True

In [24]:
# Add the word to the set of stop words. Use lowercase!
nlp.Defaults.stop_words.add('btw')
# Remove the word from the set of stop words
nlp.Defaults.stop_words.remove('beyond')

#### Stemming

In [25]:
import nltk
from nltk.stem.porter import *
p_stemmer = PorterStemmer()
words = ['run','runner','running','ran','runs','easily','fairly']
for word in words:
    print(word+' --> '+p_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [26]:
from nltk.stem.snowball import SnowballStemmer #somewath better
s_stemmer = SnowballStemmer(language='english')
for word in words:
    print(word+' --> '+s_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair


#### Lemmatization

In [27]:
#better than stemming, alos consider tenses
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')
doc2 = nlp(u"I saw eighteen mice today!")
show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


#### Rule-based Matching

In [28]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

matcher(doc) #Each tuple contains an ID for the match, with start & end tokens that map to the span

[(8656102463236116519, 1, 3),
 (8656102463236116519, 10, 11),
 (8656102463236116519, 13, 16)]

In [29]:
# Redefine the patterns:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}] #OP  for optional
matcher.remove('SolarPower')
matcher.add('SolarPower', None, pattern1, pattern2)
matcher(doc)

[(8656102463236116519, 1, 3),
 (8656102463236116519, 10, 11),
 (8656102463236116519, 13, 16)]

#### PhraseMatcher

In [39]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
with open('UPDATED_NLP_COURSE/TextFiles/reaganomics.txt', encoding='utf8') as f:
    doc3 = nlp(f.read())
    
# First, create a list of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']
# Next, convert each phrase to a Doc object:
phrase_patterns = [nlp(text) for text in phrase_list]
# Pass each Doc object into matcher (note the use of the asterisk!):
matcher.add('VoodooEconomics', None, *phrase_patterns)
# Build a list of matches:
matcher(doc3)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 6744: invalid start byte

#### Part of Speech Basics

In [31]:
#tag_ fine-grained vs. pos_ is coarse
doc = nlp(u'I read books on NLP.')
r = doc[1]
print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')

read       VERB     VBD    verb, past tense


In [32]:
# doc.count_by() 
#method accepts a specific token attribute as its argument, and returns a frequency count
#of the given attribute as a dictionary object. Keys in the dictionary are the integer values of the  attributeID.
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [33]:
doc.count_by(spacy.attrs.POS)

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [34]:
doc.count_by(spacy.attrs.TAG)

{15267657372422890137: 2,
 10554686591937588953: 3,
 15308085513773655218: 3,
 17109001835818727656: 1,
 1292078113972184607: 1,
 74: 1,
 12646065887601541794: 1}