In [1]:
import nltk
import spacy

# Load the small English pipeline by its full package name. The 'en' shortcut
# only resolves through a locally created shortcut link in spaCy v2 and is
# rejected by spaCy v3, so the explicit name keeps this cell working on a
# fresh install (python -m spacy download en_core_web_sm).
nlp = spacy.load('en_core_web_sm')

In [2]:
# Called with no arguments, nltk.download() starts NLTK's interactive downloader
# (the "showing info ..." line in the output) instead of fetching a named package.
# NOTE(review): the cells below call sent_tokenize, word_tokenize and nltk.pos_tag;
# presumably their data packages are selected here by hand. Consider downloading
# them by name so the notebook can run unattended.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# Sample paragraph used by every cell below: three sentences joined by single spaces.
text = (
    "Each autumn, businesses flock to elite universities like Harvard and Stanford to recruit engineers for their first post-university jobs. "
    "Curious students pile into classrooms to hear recruiters deliver their best pitches. "
    "These are the first moments when prospective employees size up a company’s culture and assess whether they can see themselves reflected in its future."
)

## Tokenization

### Sentence Tokenization

##### Using spaCy

In [4]:
# Run the loaded spaCy pipeline over the sample text. The cells below read the
# sentences from the result through doc.sents.
doc = nlp(text)

In [16]:
# doc.sents yields one Span per detected sentence (see the printed types below).
# The trailing '\n' argument leaves a blank line between the printed sentences.
for index, sent in enumerate(doc.sents):
    print(index, type(sent), sent, '\n')

0 <class 'spacy.tokens.span.Span'> Each autumn, businesses flock to elite universities like Harvard and Stanford to recruit engineers for their first post-university jobs. 

1 <class 'spacy.tokens.span.Span'> Curious students pile into classrooms to hear recruiters deliver their best pitches. 

2 <class 'spacy.tokens.span.Span'> These are the first moments when prospective employees size up a company’s culture and assess whether they can see themselves reflected in its future. 



In [13]:
# Collect the sentence Spans into a list so later cells can pick one by position
# (spacy_sents[0] is reused throughout the notebook).
spacy_sents = [*doc.sents]

##### Using NLTK

In [9]:
from nltk import sent_tokenize

In [20]:
# Same walk as the spaCy cell, but sent_tokenize returns plain str objects
# (compare the printed types); '\n' again separates the sentences with a blank line.
for index, sent in enumerate(sent_tokenize(text)):
    print(index, type(sent), sent, '\n')

0 <class 'str'> Each autumn, businesses flock to elite universities like Harvard and Stanford to recruit engineers for their first post-university jobs. 

1 <class 'str'> Curious students pile into classrooms to hear recruiters deliver their best pitches. 

2 <class 'str'> These are the first moments when prospective employees size up a company’s culture and assess whether they can see themselves reflected in its future. 



In [14]:
# Keep the NLTK sentence strings so later cells can index them (nltk_sents[0] is reused below).
nltk_sents = sent_tokenize(text)

### Word Tokenization

##### spaCy

In [18]:
# Iterating over a sentence Span yields its tokens one by one; each is a spaCy
# Token object, as the printed types show.
for index, token in enumerate(spacy_sents[0]):
    print(index, type(token), token)

0 <class 'spacy.tokens.token.Token'> Each
1 <class 'spacy.tokens.token.Token'> autumn
2 <class 'spacy.tokens.token.Token'> ,
3 <class 'spacy.tokens.token.Token'> businesses
4 <class 'spacy.tokens.token.Token'> flock
5 <class 'spacy.tokens.token.Token'> to
6 <class 'spacy.tokens.token.Token'> elite
7 <class 'spacy.tokens.token.Token'> universities
8 <class 'spacy.tokens.token.Token'> like
9 <class 'spacy.tokens.token.Token'> Harvard
10 <class 'spacy.tokens.token.Token'> and
11 <class 'spacy.tokens.token.Token'> Stanford
12 <class 'spacy.tokens.token.Token'> to
13 <class 'spacy.tokens.token.Token'> recruit
14 <class 'spacy.tokens.token.Token'> engineers
15 <class 'spacy.tokens.token.Token'> for
16 <class 'spacy.tokens.token.Token'> their
17 <class 'spacy.tokens.token.Token'> first
18 <class 'spacy.tokens.token.Token'> post
19 <class 'spacy.tokens.token.Token'> -
20 <class 'spacy.tokens.token.Token'> university
21 <class 'spacy.tokens.token.Token'> jobs
22 <class 'spacy.tokens.token.Token

##### NLTK

In [19]:
from nltk import word_tokenize

In [21]:
# word_tokenize returns plain strings. Note in the output that 'post-university'
# stays a single token here, whereas spaCy split it into three.
for index, token in enumerate(word_tokenize(nltk_sents[0])):
    print(index, type(token), token)

0 <class 'str'> Each
1 <class 'str'> autumn
2 <class 'str'> ,
3 <class 'str'> businesses
4 <class 'str'> flock
5 <class 'str'> to
6 <class 'str'> elite
7 <class 'str'> universities
8 <class 'str'> like
9 <class 'str'> Harvard
10 <class 'str'> and
11 <class 'str'> Stanford
12 <class 'str'> to
13 <class 'str'> recruit
14 <class 'str'> engineers
15 <class 'str'> for
16 <class 'str'> their
17 <class 'str'> first
18 <class 'str'> post-university
19 <class 'str'> jobs
20 <class 'str'> .


In [79]:
# Keep the first sentence's NLTK tokens; the POS-tagging and stemming cells reuse this list.
nltk_tokens = word_tokenize(nltk_sents[0])

##### using Regular Expressions (regex)

In [22]:
import re

In [74]:
# \W+ matches one or more consecutive non-word characters; the next cell uses it
# as the delimiter to split on.
WORDS_RE = re.compile(r'\W+')

In [75]:
# Split the first sentence wherever the delimiter pattern matches and list the pieces.
for index, token in enumerate(WORDS_RE.split(nltk_sents[0])):
    print(index, type(token), token)

0 <class 'str'> Each
1 <class 'str'> autumn
2 <class 'str'> businesses
3 <class 'str'> flock
4 <class 'str'> to
5 <class 'str'> elite
6 <class 'str'> universities
7 <class 'str'> like
8 <class 'str'> Harvard
9 <class 'str'> and
10 <class 'str'> Stanford
11 <class 'str'> to
12 <class 'str'> recruit
13 <class 'str'> engineers
14 <class 'str'> for
15 <class 'str'> their
16 <class 'str'> first
17 <class 'str'> post
18 <class 'str'> university
19 <class 'str'> jobs
20 <class 'str'> 


But we lost the punctuation, and we have empty strings.

In [76]:
# Match either a run of word characters (\w+) or a single punctuation mark
# from the set , . ! ? -
WORDS_AND_PUNCT_RE = re.compile(r'\w+|[\,\.\!\?\-]')

In [77]:
# Collect every word or punctuation match from the first sentence, in order of appearance.
for index, token in enumerate(WORDS_AND_PUNCT_RE.findall(nltk_sents[0])):
    print(index, type(token), token)

0 <class 'str'> Each
1 <class 'str'> autumn
2 <class 'str'> ,
3 <class 'str'> businesses
4 <class 'str'> flock
5 <class 'str'> to
6 <class 'str'> elite
7 <class 'str'> universities
8 <class 'str'> like
9 <class 'str'> Harvard
10 <class 'str'> and
11 <class 'str'> Stanford
12 <class 'str'> to
13 <class 'str'> recruit
14 <class 'str'> engineers
15 <class 'str'> for
16 <class 'str'> their
17 <class 'str'> first
18 <class 'str'> post
19 <class 'str'> -
20 <class 'str'> university
21 <class 'str'> jobs
22 <class 'str'> .


See [this tutorial](https://www.datacamp.com/community/tutorials/python-regular-expression-tutorial) to learn more about regular expressions, and visit [pythex.org](https://pythex.org) to play around.

# Part of Speech (POS) Tagging

##### spaCy

The best part about spaCy is that it does everything for you right out of the box.

In [78]:
# For every token of the first sentence: the token, its pos_ label and its tag_ label.
[
    (token, token.pos_, token.tag_)
    for token in spacy_sents[0]
]

[(Each, 'DET', 'DT'),
 (autumn, 'NOUN', 'NN'),
 (,, 'PUNCT', ','),
 (businesses, 'NOUN', 'NNS'),
 (flock, 'NOUN', 'NN'),
 (to, 'ADP', 'IN'),
 (elite, 'ADJ', 'JJ'),
 (universities, 'NOUN', 'NNS'),
 (like, 'ADP', 'IN'),
 (Harvard, 'PROPN', 'NNP'),
 (and, 'CCONJ', 'CC'),
 (Stanford, 'PROPN', 'NNP'),
 (to, 'PART', 'TO'),
 (recruit, 'VERB', 'VB'),
 (engineers, 'NOUN', 'NNS'),
 (for, 'ADP', 'IN'),
 (their, 'ADJ', 'PRP$'),
 (first, 'ADJ', 'JJ'),
 (post, 'NOUN', 'NN'),
 (-, 'PUNCT', 'HYPH'),
 (university, 'NOUN', 'NN'),
 (jobs, 'NOUN', 'NNS'),
 (., 'PUNCT', '.')]

##### NLTK

In [80]:
# Tag the NLTK tokens of the first sentence; the result displays as a list of (token, tag) pairs.
nltk.pos_tag(nltk_tokens)

[('Each', 'DT'),
 ('autumn', 'NN'),
 (',', ','),
 ('businesses', 'NNS'),
 ('flock', 'VBP'),
 ('to', 'TO'),
 ('elite', 'VB'),
 ('universities', 'NNS'),
 ('like', 'IN'),
 ('Harvard', 'NNP'),
 ('and', 'CC'),
 ('Stanford', 'NNP'),
 ('to', 'TO'),
 ('recruit', 'VB'),
 ('engineers', 'NNS'),
 ('for', 'IN'),
 ('their', 'PRP$'),
 ('first', 'JJ'),
 ('post-university', 'NN'),
 ('jobs', 'NNS'),
 ('.', '.')]

# Word Normalization: Stemming and Lemmatization
[Read more](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)

##### spaCy

spaCy doesn't support stemming, but lemmatization (like everything else) is already built in.

In [82]:
# Pair each token of the first sentence with the lemma spaCy assigned to it.
[
    (token, 'LEMMA:', token.lemma_)
    for token in spacy_sents[0]
]

[(Each, 'LEMMA:', 'each'),
 (autumn, 'LEMMA:', 'autumn'),
 (,, 'LEMMA:', ','),
 (businesses, 'LEMMA:', 'business'),
 (flock, 'LEMMA:', 'flock'),
 (to, 'LEMMA:', 'to'),
 (elite, 'LEMMA:', 'elite'),
 (universities, 'LEMMA:', 'university'),
 (like, 'LEMMA:', 'like'),
 (Harvard, 'LEMMA:', 'harvard'),
 (and, 'LEMMA:', 'and'),
 (Stanford, 'LEMMA:', 'stanford'),
 (to, 'LEMMA:', 'to'),
 (recruit, 'LEMMA:', 'recruit'),
 (engineers, 'LEMMA:', 'engineer'),
 (for, 'LEMMA:', 'for'),
 (their, 'LEMMA:', '-PRON-'),
 (first, 'LEMMA:', 'first'),
 (post, 'LEMMA:', 'post'),
 (-, 'LEMMA:', '-'),
 (university, 'LEMMA:', 'university'),
 (jobs, 'LEMMA:', 'job'),
 (., 'LEMMA:', '.')]

##### NLTK

NLTK has multiple stemmers that you can use.

In [85]:
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer

In [88]:
# One instance of each NLTK stemmer to compare; Snowball is the only one that
# takes a language argument here.
lancaster_stem, porter_stem, snowball_stem = (
    LancasterStemmer(),
    PorterStemmer(),
    SnowballStemmer('english'),
)

In [91]:
# Print each token next to the stem produced by the three stemmers, in the
# order Lancaster, Porter, Snowball.
for token in nltk_tokens:
    print('token: {} ---  Lancaster: {}  |  Porter: {}  |  Snowball: {}'.format(
        token,
        *(stemmer.stem(token) for stemmer in (lancaster_stem, porter_stem, snowball_stem)),
    ))

token: Each ---  Lancaster: each  |  Porter: each  |  Snowball: each
token: autumn ---  Lancaster: autumn  |  Porter: autumn  |  Snowball: autumn
token: , ---  Lancaster: ,  |  Porter: ,  |  Snowball: ,
token: businesses ---  Lancaster: busy  |  Porter: busi  |  Snowball: busi
token: flock ---  Lancaster: flock  |  Porter: flock  |  Snowball: flock
token: to ---  Lancaster: to  |  Porter: to  |  Snowball: to
token: elite ---  Lancaster: elit  |  Porter: elit  |  Snowball: elit
token: universities ---  Lancaster: univers  |  Porter: univers  |  Snowball: univers
token: like ---  Lancaster: lik  |  Porter: like  |  Snowball: like
token: Harvard ---  Lancaster: harvard  |  Porter: harvard  |  Snowball: harvard
token: and ---  Lancaster: and  |  Porter: and  |  Snowball: and
token: Stanford ---  Lancaster: stanford  |  Porter: stanford  |  Snowball: stanford
token: to ---  Lancaster: to  |  Porter: to  |  Snowball: to
token: recruit ---  Lancaster: recruit  |  Porter: recruit  |  Snowball:

# Analyzing Sentence Structure

There are two main approaches to analyzing sentence structure: Dependency-based and Constituency-based<br>
From [Wikipedia](https://en.wikipedia.org/wiki/Dependency_grammar#Dependency_vs._constituency):<br>
"Dependency is a one-to-one correspondence: for every element (e.g. word or morph) in the sentence, there is exactly one node in the structure of that sentence that corresponds to that element. The result of this one-to-one correspondence is that dependency grammars are word (or morph) grammars. All that exist are the elements and the dependencies that connect the elements into a structure. This situation should be compared with the constituency relation of phrase structure grammars. Constituency is a one-to-one-or-more correspondence, which means that, for every element in a sentence, there are one or more nodes in the structure that correspond to that element. The result of this difference is that dependency structures are minimal[7] compared to their constituency structure counterparts, since they tend to contain much fewer nodes."

##### spaCy

spaCy supports dependency parsing for analyzing sentence structure.

In [92]:
# The parse is already attached to the tokens: show each token, its dependency
# label (dep_) and the head token it attaches to.
[
    (token, token.dep_, token.head)
    for token in spacy_sents[0]
]

[(Each, 'det', autumn),
 (autumn, 'npadvmod', flock),
 (,, 'punct', flock),
 (businesses, 'nsubj', flock),
 (flock, 'ROOT', flock),
 (to, 'aux', elite),
 (elite, 'advcl', flock),
 (universities, 'dobj', elite),
 (like, 'prep', universities),
 (Harvard, 'pobj', like),
 (and, 'cc', Harvard),
 (Stanford, 'conj', Harvard),
 (to, 'aux', recruit),
 (recruit, 'xcomp', elite),
 (engineers, 'dobj', recruit),
 (for, 'prep', engineers),
 (their, 'poss', jobs),
 (first, 'amod', jobs),
 (post, 'compound', university),
 (-, 'punct', university),
 (university, 'compound', jobs),
 (jobs, 'pobj', for),
 (., 'punct', flock)]

In [93]:
from spacy import displacy

In [100]:
# A sentence Span can be turned into a standalone Doc with as_doc();
# the type of the source object is printed to show that it starts out as a Span.
sent_span = spacy_sents[0]
sent_doc = sent_span.as_doc()
print(type(sent_span))

<class 'spacy.tokens.span.Span'>


In [103]:
# Draw the dependency parse of the first sentence inline in the notebook.
# The 'distance' option sets the spacing between words in the drawing.
displacy.render(
    sent_doc,
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


##### NLTK

NLTK can perform both types of parsing. However, both require you to define your own grammars, which is outside the scope of this course. NLTK also provides support for using Stanford's CoreNLP software to perform dependency and constituency parsing.

In [104]:
# nltk.CFG() expects a start symbol plus a list of productions, so handing it a
# raw sentence fails. Catch the TypeError and print it, so the point is still
# made without aborting a full "Restart & Run All".
try:
    nltk.CFG(nltk_sents[0])
except TypeError as err:
    print('TypeError:', err)

TypeError: __init__() missing 1 required positional argument: 'productions'