# Creating POS taggers

Source: https://streamhacker.com/2008/11/03/part-of-speech-tagging-with-nltk-part-1/ 

Make sure `nltk` is installed (it already is if you use e.g. Anaconda). Then get the corpora used below (`brown`, `conll2000` and `treebank`):
`>>> import nltk; nltk.download()`

In [1]:
%%time
import nltk.corpus, nltk.tag, itertools
# Brown corpus: tagged sentences from three genres.
brown = nltk.corpus.brown
brown_review_sents = brown.tagged_sents(categories=['reviews'])
brown_lore_sents = brown.tagged_sents(categories=['lore'])
brown_romance_sents = brown.tagged_sents(categories=['romance'])

# Sentences 0-999 of each genre go to training, sentences 1000-1999 to testing.
brown_genres = (brown_review_sents, brown_lore_sents, brown_romance_sents)
brown_train = [sent for genre in brown_genres for sent in genre[:1000]]
brown_test = [sent for genre in brown_genres for sent in genre[1000:2000]]

# CoNLL-2000: first 4000 sentences for training, the next 4000 for testing.
conll_sents = nltk.corpus.conll2000.tagged_sents()
conll_train, conll_test = list(conll_sents[:4000]), list(conll_sents[4000:8000])

# Treebank sample: first 1500 sentences for training, the next 1500 for testing.
treebank_sents = nltk.corpus.treebank.tagged_sents()
treebank_train, treebank_test = list(treebank_sents[:1500]), list(treebank_sents[1500:3000])

Wall time: 34.3 s


In [2]:
# The blog post never defines `train_sents`; here the three training splits are
# pooled into a single list. (Possibly the post meant to train on each corpus
# separately instead.)
# NOTE(review): the tagging demos further down print both 'DT' and 'AT' for
# determiners, which suggests the pooled corpora do not share one tag set --
# confirm before comparing accuracies across corpora.
train_sents = [*treebank_train, *conll_train, *brown_train]

In [3]:
%%time
def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
    """Train a chain of taggers in which each one backs off to the previous.

    Args:
        tagged_sents: Training data, handed unchanged to every tagger class.
        tagger_classes: Sequence of callables. Each is invoked as
            ``cls(tagged_sents, backoff=previous)``; when no ``backoff`` is
            supplied the first one is invoked as ``cls(tagged_sents)`` and
            becomes the innermost fallback. The sequence is never modified.
        backoff: Optional, already-built tagger to use as the innermost
            fallback.

    Returns:
        The tagger built from the last entry of ``tagger_classes`` (or
        ``backoff`` itself when ``tagger_classes`` is empty).

    Raises:
        IndexError: If ``tagger_classes`` is empty and no ``backoff`` is given.
    """
    # Work on a private copy: the caller's list must survive the call intact
    # so that it can be reused, and tuples are accepted as well.
    remaining = list(tagger_classes)

    if not backoff:
        # No fallback supplied: the first class is trained without one.
        backoff = remaining.pop(0)(tagged_sents)

    for cls in remaining:
        # Each new tagger falls back on the one built just before it.
        backoff = cls(tagged_sents, backoff=backoff)

    return backoff
 
# Short aliases for the three n-gram tagger classes.
Uni, Bi, Tri = nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger

# One tagger per ordering of the three classes. The variable name spells the
# training order: the first letter is the innermost fallback, the last letter
# is the tagger that is consulted first.
ubt_tagger = backoff_tagger(train_sents, [Uni, Bi, Tri])
utb_tagger = backoff_tagger(train_sents, [Uni, Tri, Bi])
but_tagger = backoff_tagger(train_sents, [Bi, Uni, Tri])
btu_tagger = backoff_tagger(train_sents, [Bi, Tri, Uni])
tub_tagger = backoff_tagger(train_sents, [Tri, Uni, Bi])
tbu_tagger = backoff_tagger(train_sents, [Tri, Bi, Uni])


Wall time: 51.3 s


In [4]:
# Score the tagger built from [Unigram, Trigram, Bigram] on the Brown test split.
# NOTE(review): newer NLTK releases reportedly deprecate `evaluate` in favour of
# `accuracy` -- confirm against the installed version.
utb_tagger.evaluate(brown_test)

0.6303110707677269

In [5]:
# Pool the three test splits into one combined evaluation set.
all_test = [*treebank_test, *brown_test, *conll_test]

In [6]:
# Names of every trained tagger in the notebook namespace, in sorted order.
# Candidates are recognised by their `_tagger` suffix and by being an actual
# tagger object; the type check (rather than a fixed name length) is what keeps
# the `backoff_tagger` helper function out of the list.
taggers = [
    name for name in dir()
    if name.endswith('_tagger') and isinstance(globals()[name], nltk.tag.TaggerI)
]
taggers

['btu_tagger',
 'but_tagger',
 'tbu_tagger',
 'tub_tagger',
 'ubt_tagger',
 'utb_tagger']

In [7]:
%%time
import pandas as pd

# Live view of the notebook namespace, used to look objects up by name.
v = vars()
# Every variable whose name ends in '_test' counts as an evaluation set.
tests = list(filter(lambda candidate: candidate.endswith('_test'), dir()))

# Accuracy of every tagger on every evaluation set. Outer keys (tagger names)
# become the columns of the frame, inner keys (test-set names) its rows.
scores = {}
for tagger_name in taggers:
    tagger = v[tagger_name]
    scores[tagger_name] = {test_name: tagger.evaluate(v[test_name]) for test_name in tests}

df = pd.DataFrame.from_dict(scores)

df

Wall time: 30.2 s


In [8]:
# Mean accuracy of each tagger over the individual corpora, worst first.
# 'all_test' is the concatenation of the other test sets, so it is excluded by
# label (not by row position), and `mean()` adapts to however many test sets
# remain instead of dividing by a hard-coded 3.
df.drop(index='all_test', errors='ignore').mean().sort_values()

tbu_tagger    0.380357
tub_tagger    0.385595
btu_tagger    0.546765
but_tagger    0.573388
utb_tagger    0.784914
ubt_tagger    0.789297
dtype: float64

In [9]:
# Names of the evaluation sets that were picked up for the accuracy table.
tests

['all_test', 'brown_test', 'conll_test', 'treebank_test']

In [13]:
# Peek at the first sentence of the Brown test split: a list of (word, tag) pairs.
brown_test[0]

[('To', 'TO'),
 ('begin', 'VB'),
 ('with', 'IN'),
 (',', ','),
 ('Scapin', 'NP'),
 ('is', 'BEZ'),
 ('a', 'AT'),
 ('trickster', 'NN'),
 ('in', 'IN'),
 ('the', 'AT'),
 ('old', 'JJ'),
 ('tradition', 'NN'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('clever', 'JJ'),
 ('servant', 'NN'),
 ('who', 'WPS'),
 ('plots', 'VBZ'),
 ('the', 'AT'),
 ('strategy', 'NN'),
 ('of', 'IN'),
 ('courtship', 'NN'),
 ('for', 'IN'),
 ('his', 'PP$'),
 ('master', 'NN'),
 ('.', '.')]

In [11]:
# Tag a hand-tokenised sentence; the closing '.' is a token of its own.
mary_tokens = ['Mary', 'sees', 'a', 'little', 'joy', 'passing', '.']
utb_tagger.tag(mary_tokens)

[('Mary', 'NNP'),
 ('sees', 'VBZ'),
 ('a', 'DT'),
 ('little', 'JJ'),
 ('joy', 'NN'),
 ('passing', 'VBG'),
 ('.', '.')]

In [12]:
# Tag a second sentence, tokenised by splitting on whitespace.
slack_tokens = 'Time to pick up the slack .'.split()
utb_tagger.tag(slack_tokens)

[('Time', 'NN'),
 ('to', 'TO'),
 ('pick', 'VB'),
 ('up', 'RP'),
 ('the', 'AT'),
 ('slack', 'NN'),
 ('.', '.')]