1. Using a Tagger: tagging is the second step of NLP pipeline, first is tokenization

In [2]:
import nltk
from nltk.tokenize import word_tokenize

text = word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [3]:
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [4]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')

man time day year car moment world house family child country boy
state job place way war girl work word


2. Tagged Corpora

In [5]:
tagged_token = nltk.tag.str2tuple('fly/NN')
tagged_token

('fly', 'NN')

In [6]:
# If a corpus contains tagged words, it will have tagged_words method
nltk.corpus.brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [7]:
# Tagset: not all corpora use the same tagset
nltk.corpus.brown.tagged_words(tagset="universal")

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [8]:
nltk.corpus.sinica_treebank.tagged_words()

[('一', 'Neu'), ('友情', 'Nad'), ('嘉珍', 'Nba'), ...]

In [9]:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories="news", tagset="universal")
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()

[('NOUN', 30654),
 ('VERB', 14399),
 ('ADP', 12355),
 ('.', 11928),
 ('DET', 11389),
 ('ADJ', 6706),
 ('ADV', 3349),
 ('CONJ', 2717),
 ('PRON', 2535),
 ('PRT', 2264),
 ('NUM', 2166),
 ('X', 92)]

In [10]:
word_tag_pairs = nltk.bigrams(brown_news_tagged)
noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN']
fdist = nltk.FreqDist(noun_preceders)
[tag for (tag, _) in fdist.most_common()]

['NOUN',
 'DET',
 'ADJ',
 'ADP',
 '.',
 'VERB',
 'CONJ',
 'NUM',
 'ADV',
 'PRT',
 'PRON',
 'X']

In [12]:
wsj = nltk.corpus.treebank.tagged_words(tagset="universal")
word_tag_fd = nltk.FreqDist(wsj)
[wt for (wt, _) in word_tag_fd.most_common() if wt[1] == 'VERB']

[('is', 'VERB'),
 ('said', 'VERB'),
 ('was', 'VERB'),
 ('are', 'VERB'),
 ('be', 'VERB'),
 ('has', 'VERB'),
 ('have', 'VERB'),
 ('will', 'VERB'),
 ('says', 'VERB'),
 ('would', 'VERB'),
 ('were', 'VERB'),
 ('had', 'VERB'),
 ('been', 'VERB'),
 ('could', 'VERB'),
 ("'s", 'VERB'),
 ('can', 'VERB'),
 ('do', 'VERB'),
 ('say', 'VERB'),
 ('make', 'VERB'),
 ('may', 'VERB'),
 ('did', 'VERB'),
 ('rose', 'VERB'),
 ('made', 'VERB'),
 ('does', 'VERB'),
 ('expected', 'VERB'),
 ('buy', 'VERB'),
 ('take', 'VERB'),
 ('get', 'VERB'),
 ('might', 'VERB'),
 ('sell', 'VERB'),
 ('added', 'VERB'),
 ('sold', 'VERB'),
 ('help', 'VERB'),
 ('including', 'VERB'),
 ('should', 'VERB'),
 ('reported', 'VERB'),
 ('according', 'VERB'),
 ('pay', 'VERB'),
 ('compared', 'VERB'),
 ('being', 'VERB'),
 ('fell', 'VERB'),
 ('began', 'VERB'),
 ('based', 'VERB'),
 ('used', 'VERB'),
 ('closed', 'VERB'),
 ("'re", 'VERB'),
 ('want', 'VERB'),
 ('see', 'VERB'),
 ('took', 'VERB'),
 ('yield', 'VERB'),
 ('offered', 'VERB'),
 ('set', 'VERB'

In [13]:
cfd1 = nltk.ConditionalFreqDist(wsj)
cfd1['yield'].most_common()

[('VERB', 28), ('NOUN', 20)]

In [21]:
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj)
list(cfd2['NOUN'])

['Pierre',
 'Vinken',
 'years',
 'board',
 'director',
 'Nov.',
 'Mr.',
 'chairman',
 'Elsevier',
 'N.V.',
 'Dutch',
 'group',
 'Rudolph',
 'Agnew',
 'Consolidated',
 'Gold',
 'Fields',
 'PLC',
 'conglomerate',
 'form',
 'asbestos',
 'Kent',
 'cigarette',
 'filters',
 'percentage',
 'cancer',
 'deaths',
 'workers',
 'researchers',
 'fiber',
 'crocidolite',
 'lungs',
 'exposures',
 'symptoms',
 'decades',
 'Lorillard',
 'Inc.',
 'unit',
 'Loews',
 'Corp.',
 'cigarettes',
 'Micronite',
 'findings',
 'year',
 'results',
 'today',
 'New',
 'England',
 'Journal',
 'Medicine',
 'forum',
 'attention',
 'problem',
 'spokewoman',
 'story',
 'anyone',
 'properties',
 'products',
 'research',
 'smokers',
 'information',
 'users',
 'risk',
 'James',
 'A.',
 'Talcott',
 'Boston',
 'Dana-Farber',
 'Cancer',
 'Institute',
 'Dr.',
 'team',
 'National',
 'schools',
 'Harvard',
 'University',
 'spokeswoman',
 'amounts',
 'paper',
 'type',
 'filter',
 'company',
 'men',
 'substance',
 'times',
 'number',

3 Mapping Words to Properties using Python Dictionaries

In [23]:
pos = {}
pos['colorness'] = 'ADJ'
pos

{'colorness': 'ADJ'}

In [24]:
pos['ideas'] = 'N'
pos['sleep'] = 'V'
pos['furiously'] = 'ADV'

In [25]:
for word in sorted(pos):
    print(word + ":", pos[word])

colorness: ADJ
furiously: ADV
ideas: N
sleep: V


In [26]:
for key, val in sorted(pos.items()):
    print(key + ":", val)

colorness: ADJ
furiously: ADV
ideas: N
sleep: V


In [27]:
# we'd like the dictionary to create a default value if it does not exist
from collections import defaultdict

frequency = defaultdict(int)
frequency['colorness'] = 4
frequency['ideas']

0

In [28]:
pos = defaultdict(lambda: 'NOUN')
pos['colorness'] = 'ADJ'
pos['blog']

'NOUN'

In [29]:
# create a default dictionary that maps each word to its replacement, UNK for all unknown words
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
v1000 = [word for (word, _) in vocab.most_common(1000)]
mapping = defaultdict(lambda: "UNK")
for v in v1000:
    mapping[v] = v

alice2 = [mapping[v] for v in alice]
alice2[:100]

['[',
 'Alice',
 "'",
 's',
 'Adventures',
 'in',
 'Wonderland',
 'by',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'CHAPTER',
 'I',
 '.',
 'Down',
 'the',
 'Rabbit',
 '-',
 'UNK',
 'Alice',
 'was',
 'beginning',
 'to',
 'get',
 'very',
 'tired',
 'of',
 'sitting',
 'by',
 'her',
 'sister',
 'on',
 'the',
 'bank',
 ',',
 'and',
 'of',
 'having',
 'nothing',
 'to',
 'do',
 ':',
 'once',
 'or',
 'twice',
 'she',
 'had',
 'peeped',
 'into',
 'the',
 'book',
 'her',
 'sister',
 'was',
 'reading',
 ',',
 'but',
 'it',
 'had',
 'no',
 'pictures',
 'or',
 'UNK',
 'in',
 'it',
 ',',
 "'",
 'and',
 'what',
 'is',
 'the',
 'use',
 'of',
 'a',
 'book',
 ",'",
 'thought',
 'Alice',
 "'",
 'without',
 'pictures',
 'or',
 'conversation',
 "?'",
 'So',
 'she',
 'was',
 'considering',
 'in',
 'her',
 'own',
 'mind',
 '(',
 'as',
 'well',
 'as',
 'she',
 'could',
 ',']

In [31]:
last_letters = defaultdict(list)
words = nltk.corpus.words.words('en')
for word in words:
    key = word[-2:]
    last_letters[key].append(word)

last_letters['ly']

['abactinally',
 'abandonedly',
 'abasedly',
 'abashedly',
 'abashlessly',
 'abbreviately',
 'abdominally',
 'abhorrently',
 'abidingly',
 'abiogenetically',
 'abiologically',
 'abjectly',
 'ableptically',
 'ably',
 'abnormally',
 'abominably',
 'aborally',
 'aboriginally',
 'abortively',
 'aboundingly',
 'abridgedly',
 'abruptedly',
 'abruptly',
 'abscondedly',
 'absently',
 'absentmindedly',
 'absolutely',
 'absolutistically',
 'absorbedly',
 'absorbingly',
 'absorptively',
 'abstemiously',
 'abstinently',
 'abstractedly',
 'abstractively',
 'abstractly',
 'abstrusely',
 'absurdly',
 'abundantly',
 'abusedly',
 'abusefully',
 'abusively',
 'abysmally',
 'academically',
 'acceleratedly',
 'accentually',
 'acceptably',
 'acceptedly',
 'accessarily',
 'accessibly',
 'accessively',
 'accessorily',
 'accidentally',
 'accidently',
 'accommodately',
 'accommodatingly',
 'accordantly',
 'accordingly',
 'accountably',
 'accumulatively',
 'accurately',
 'accursedly',
 'accusably',
 'accusative

In [32]:
anagrams = defaultdict(list)
for word in words:
    key = ''.join(sorted(word))
    anagrams[key].append(word)

anagrams['aeilnrt']

['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']

In [33]:
counts = defaultdict(int)
for word in nltk.corpus.gutenberg.words('milton-paradise.txt'):
    counts[word] += 1

[key for (key, value) in counts.items() if value == 32]

['mortal',
 'Against',
 'Him',
 'There',
 'brought',
 'King',
 'virtue',
 'every',
 'been',
 'thine']

4. Automatic Tagging

In [35]:
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories="news")
brown_sents = brown.sents(categories="news")
brown_sents

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [36]:
# Default tagger: assign most frequent tag for each word
tags = [tag for (_, tag) in brown.tagged_words(categories="news")]
nltk.FreqDist(tags).max()

'NN'

In [37]:
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = word_tokenize(raw)
default_tagger = nltk.DefaultTagger("NN")
default_tagger.tag(tokens)

[('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('green', 'NN'),
 ('eggs', 'NN'),
 ('and', 'NN'),
 ('ham', 'NN'),
 (',', 'NN'),
 ('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('them', 'NN'),
 ('Sam', 'NN'),
 ('I', 'NN'),
 ('am', 'NN'),
 ('!', 'NN')]

In [39]:
# The regular expression tagger
patterns = [
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    (r'.*es$', 'VBZ'),
    (r'.*ould$', 'MD'),
    (r'.*\'s$', 'NN$'),
    (r'.*s$', 'NNS'),
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
    (r'.*', 'NN')
]

regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3])


[('``', 'NN'),
 ('Only', 'NN'),
 ('a', 'NN'),
 ('relative', 'NN'),
 ('handful', 'NN'),
 ('of', 'NN'),
 ('such', 'NN'),
 ('reports', 'NNS'),
 ('was', 'NNS'),
 ('received', 'VBD'),
 ("''", 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('jury', 'NN'),
 ('said', 'NN'),
 (',', 'NN'),
 ('``', 'NN'),
 ('considering', 'VBG'),
 ('the', 'NN'),
 ('widespread', 'NN'),
 ('interest', 'NN'),
 ('in', 'NN'),
 ('the', 'NN'),
 ('election', 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('number', 'NN'),
 ('of', 'NN'),
 ('voters', 'NNS'),
 ('and', 'NN'),
 ('the', 'NN'),
 ('size', 'NN'),
 ('of', 'NN'),
 ('this', 'NNS'),
 ('city', 'NN'),
 ("''", 'NN'),
 ('.', 'NN')]

In [40]:
regexp_tagger.evaluate(brown_tagged_sents)

0.20326391789486245

In [41]:
# The lookup tagger
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))
most_freq_words = fd.most_common(100)
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagged_sents)

0.45578495136941344

In [42]:
sent = brown.sents(categories="news")[3]
baseline_tagger.tag(sent)

[('``', '``'),
 ('Only', None),
 ('a', 'AT'),
 ('relative', None),
 ('handful', None),
 ('of', 'IN'),
 ('such', None),
 ('reports', None),
 ('was', 'BEDZ'),
 ('received', None),
 ("''", "''"),
 (',', ','),
 ('the', 'AT'),
 ('jury', None),
 ('said', 'VBD'),
 (',', ','),
 ('``', '``'),
 ('considering', None),
 ('the', 'AT'),
 ('widespread', None),
 ('interest', None),
 ('in', 'IN'),
 ('the', 'AT'),
 ('election', None),
 (',', ','),
 ('the', 'AT'),
 ('number', None),
 ('of', 'IN'),
 ('voters', None),
 ('and', 'CC'),
 ('the', 'AT'),
 ('size', None),
 ('of', 'IN'),
 ('this', 'DT'),
 ('city', None),
 ("''", "''"),
 ('.', '.')]

In [43]:
# We would like to fall back to the default tagger (backoff)
# So if the lookup tagger finds None, it can fallback to the backoff tagger
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger("NN"))

In [44]:
baseline_tagger.tag(sent)

[('``', '``'),
 ('Only', 'NN'),
 ('a', 'AT'),
 ('relative', 'NN'),
 ('handful', 'NN'),
 ('of', 'IN'),
 ('such', 'NN'),
 ('reports', 'NN'),
 ('was', 'BEDZ'),
 ('received', 'NN'),
 ("''", "''"),
 (',', ','),
 ('the', 'AT'),
 ('jury', 'NN'),
 ('said', 'VBD'),
 (',', ','),
 ('``', '``'),
 ('considering', 'NN'),
 ('the', 'AT'),
 ('widespread', 'NN'),
 ('interest', 'NN'),
 ('in', 'IN'),
 ('the', 'AT'),
 ('election', 'NN'),
 (',', ','),
 ('the', 'AT'),
 ('number', 'NN'),
 ('of', 'IN'),
 ('voters', 'NN'),
 ('and', 'CC'),
 ('the', 'AT'),
 ('size', 'NN'),
 ('of', 'IN'),
 ('this', 'DT'),
 ('city', 'NN'),
 ("''", "''"),
 ('.', '.')]

In [45]:
baseline_tagger.evaluate(brown_tagged_sents)

0.5817769556656125

5. N-Gram Tagging

In [46]:
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories="news")
brown_sents = brown.sents(categories="news")
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'QL'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [47]:
unigram_tagger.evaluate(brown_tagged_sents)

0.9349006503968017

In [48]:
size = int(len(brown_tagged_sents) * 0.9)
size

4160

In [49]:
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)

0.8121200039868434

In [50]:
bigram_tagger = nltk.BigramTagger(train_sents)
bigram_tagger.tag(brown_sents[2007])

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'CS'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [51]:
unseen_sent = brown_sents[4203]
bigram_tagger.tag(unseen_sent)

[('The', 'AT'),
 ('population', 'NN'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('Congo', 'NP'),
 ('is', 'BEZ'),
 ('13.5', None),
 ('million', None),
 (',', None),
 ('divided', None),
 ('into', None),
 ('at', None),
 ('least', None),
 ('seven', None),
 ('major', None),
 ('``', None),
 ('culture', None),
 ('clusters', None),
 ("''", None),
 ('and', None),
 ('innumerable', None),
 ('tribes', None),
 ('speaking', None),
 ('400', None),
 ('separate', None),
 ('dialects', None),
 ('.', None)]

In [52]:
# If the tagger does not see a word during training, then it does not know
# how to tag it during testing, even if the following words is seen during training
bigram_tagger.evaluate(test_sents)

0.10206319146815508

In [55]:
# Combining taggers
# Use more accurate algorithms when we can, but fall back to algorithms with wider coverage when necessary

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t3 = nltk.TrigramTagger(train_sents, backoff=t2)

t2.evaluate(test_sents)

0.8452108043456593

In [56]:
t3.evaluate(test_sents)

0.843317053722715

In [57]:
from pickle import dump
output = open('t2.pkl', 'wb')
dump(t2, output, -1)
output.close()

In [58]:
from pickle import load
input = open('t2.pkl', 'rb')
tagger = load(input)
input.close()

In [59]:
text = """The board's action shows what free enterprise
is up against in our complex maze of regulatory laws ."""

tokens = text.split()
tagger.tag(tokens)

[('The', 'AT'),
 ("board's", 'NN$'),
 ('action', 'NN'),
 ('shows', 'NNS'),
 ('what', 'WDT'),
 ('free', 'JJ'),
 ('enterprise', 'NN'),
 ('is', 'BEZ'),
 ('up', 'RP'),
 ('against', 'IN'),
 ('in', 'IN'),
 ('our', 'PP$'),
 ('complex', 'JJ'),
 ('maze', 'NN'),
 ('of', 'IN'),
 ('regulatory', 'NN'),
 ('laws', 'NNS'),
 ('.', '.')]