# Part-of-Speech Tagging in NLTK

## Reading Tagged Corpora

In [52]:
from nltk import FreqDist
from nltk.corpus import treebank
from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk import DefaultTagger
from nltk import UnigramTagger
from nltk import BigramTagger
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag

In [2]:
# Print the first tagged sentence of the Brown corpus.
# A tagged sentence is a list of (word, tag) tuples.
print(brown.tagged_sents()[0])

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]


In [3]:
# Print the first 30 tagged words of the Brown corpus.
# tagged_words() is a flat sequence of (word, tag) tuples, so this slice
# continues past the end of the first sentence.
print(brown.tagged_words()[:30])

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.'), ('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN')]


In [4]:
# Pull one (word, tag) pair out of the Brown corpus and display it
# beneath a heading (heading, blank line, then the pair).
wordtag = brown.tagged_words()[0]
print("Tagged word:\n", wordtag, sep="\n")

Tagged word:

('The', 'AT')


In [5]:
# Show the Python type of a single tagged word (heading, blank line, type).
print("Type:\n", type(wordtag), sep="\n")

Type:

<class 'tuple'>


In [6]:
# Display each element of the (word, tag) tuple under its own heading.
for heading, element in (
    ("First element of tuple is the word:\n", wordtag[0]),
    ("\nSecond element of tuple is the tag:\n", wordtag[1]),
):
    print(heading)
    print(element)

First element of tuple is the word:

The

Second element of tuple is the tag:

AT


The Brown corpus is organized into different types of text, which can be selected by the categories argument, and it also allows you to map the tags to a simplified tag set, described in table 5.1 in the NLTK book.

In [7]:
# all the categories (text types) in brown corpus
# (bare expression: the notebook displays the returned list as the cell output)
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [8]:
# Restrict Brown to the 'humor' category and request tags from the
# universal tagset, then preview the first 50 (word, tag) pairs.
brown_humor_tagged = brown.tagged_words(
    categories='humor',
    tagset='universal',
)
humor_preview = brown_humor_tagged[:50]
print(humor_preview)

[('It', 'PRON'), ('was', 'VERB'), ('among', 'ADP'), ('these', 'DET'), ('that', 'ADP'), ('Hinkle', 'NOUN'), ('identified', 'VERB'), ('a', 'DET'), ('photograph', 'NOUN'), ('of', 'ADP'), ('Barco', 'NOUN'), ('!', '.'), ('!', '.'), ('For', 'ADP'), ('it', 'PRON'), ('seems', 'VERB'), ('that', 'ADP'), ('Barco', 'NOUN'), (',', '.'), ('fancying', 'VERB'), ('himself', 'PRON'), ('a', 'DET'), ("ladies'", 'NOUN'), ('man', 'NOUN'), ('(', '.'), ('and', 'CONJ'), ('why', 'ADV'), ('not', 'ADV'), (',', '.'), ('after', 'ADP'), ('seven', 'NUM'), ('marriages', 'NOUN'), ('?', '.'), ('?', '.'), (')', '.'), (',', '.'), ('had', 'VERB'), ('listed', 'VERB'), ('himself', 'PRON'), ('for', 'ADP'), ('Mormon', 'NOUN'), ('Beard', 'NOUN'), ('roles', 'NOUN'), ('at', 'ADP'), ('the', 'DET'), ('instigation', 'NOUN'), ('of', 'ADP'), ('his', 'DET'), ('fourth', 'ADJ'), ('murder', 'NOUN')]


Other tagged corpora also come with the tagged_words method.  Note that the chat corpus is tagged with Penn Treebank POS tags.

In [9]:
# nps_chat corpus uses Penn Treebank POS tags.
from nltk.corpus import nps_chat
# Preview the first 50 (word, tag) pairs of the chat corpus.
chat_tagged_words = nps_chat.tagged_words()
print(chat_tagged_words[:50])

[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ('with', 'IN'), ('this', 'DT'), ('gay', 'JJ'), ('name', 'NN'), (':P', 'UH'), ('PART', 'VB'), ('hey', 'UH'), ('everyone', 'NN'), ('ah', 'UH'), ('well', 'UH'), ('NICK', 'NN'), (':', ':'), ('U7', 'NNP'), ('U7', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('gay', 'JJ'), ('name', 'NN'), ('.', '.'), ('.', 'SYM'), ('ACTION', 'NN'), ('gives', 'VBZ'), ('U121', 'NNP'), ('a', 'DT'), ('golf', 'NN'), ('clap', 'NN'), ('.', '.'), (':)', 'UH'), ('JOIN', 'VB'), ('hi', 'UH'), ('U59', 'NNP'), ('26', 'CD'), ('/', 'CC'), ('m', 'NN'), ('/', 'CC'), ('ky', 'NNP'), ('women', 'NNS'), ('that', 'WDT'), ('are', 'VBP'), ('nice', 'JJ'), ('please', 'VB'), ('pm', 'VB'), ('me', 'PRP'), ('JOIN', 'VB'), ('PART', 'VB'), ('there', 'RB'), ('ya', 'PRP')]


In this class, we will mostly use the Penn Treebank tag set, as it is the most widely used.  The Treebank has the tagged_words and tagged_sents methods, as well as the words method that we used before to get the tokens.

In [10]:
from nltk.corpus import treebank
# raw() returns the corpus text as a string; preview its first 150 characters.
treebank_text = treebank.raw()
text_preview = treebank_text[:150]
print("This is a string:")
print(text_preview, '\n')

This is a string:

( (S 
    (NP-SBJ 
      (NP (NNP Pierre) (NNP Vinken) )
      (, ,) 
      (ADJP 
        (NP (CD 61) (NNS years) )
        (JJ old) )
      (, ,) ) 



In [11]:
# words() gives the Treebank tokens without their tags; preview the first 50.
treebank_tokens = treebank.words()
first_fifty_tokens = treebank_tokens[:50]
print(first_fifty_tokens)

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.', 'Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group', '.', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', '*-1', 'a']


In [12]:
# Keep only the first 50 (word, tag) pairs and print them under a heading.
treebank_tagged_words = treebank.tagged_words()[:50]
print("First 50 tagged words in Penn TreeBank:\n", treebank_tagged_words, sep="\n")

First 50 tagged words in Penn TreeBank:

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.'), ('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.'), ('Rudolph', 'NNP'), ('Agnew', 'NNP'), (',', ','), ('55', 'CD'), ('years', 'NNS'), ('old', 'JJ'), ('and', 'CC'), ('former', 'JJ'), ('chairman', 'NN'), ('of', 'IN'), ('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP'), ('PLC', 'NNP'), (',', ','), ('was', 'VBD'), ('named', 'VBN'), ('*-1', '-NONE-'), ('a', 'DT')]


In [13]:
# Keep only the first two tagged sentences and print them under a heading.
treebank_tagged = treebank.tagged_sents()[:2]
for output_item in ("First 2 tagged sentences in Penn Treebank:\n", treebank_tagged):
    print(output_item)

First 2 tagged sentences in Penn Treebank:

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')]]


The NLTK has almost 4,000 sentences of tagged data from Penn Treebank, while the actual Treebank has much more.  This will limit the accuracy of the POS taggers (and later parsers) that we can define in lab, but also make the running times short enough for labs.

## Tag Frequencies

In [21]:
from nltk import FreqDist
# Count how often each POS tag occurs over all tagged words in the Treebank.
all_treebank_pairs = treebank.tagged_words()
tag_fd = FreqDist(pos for (_word, pos) in all_treebank_pairs)
print("Tags given on all tagged words in Penn Treebank:\n")
# The keys of the frequency distribution are the distinct tags.
print(tag_fd.keys(), "\n")

Tags given on all tagged words in Penn Treebank:

dict_keys(['NNP', ',', 'CD', 'NNS', 'JJ', 'MD', 'VB', 'DT', 'NN', 'IN', '.', 'VBZ', 'VBG', 'CC', 'VBD', 'VBN', '-NONE-', 'RB', 'TO', 'PRP', 'RBR', 'WDT', 'VBP', 'RP', 'PRP$', 'JJS', 'POS', '``', 'EX', "''", 'WP', ':', 'JJR', 'WRB', '$', 'NNPS', 'WP$', '-LRB-', '-RRB-', 'PDT', 'RBS', 'FW', 'UH', 'SYM', 'LS', '#']) 



In [23]:
print("Top 10 Most Frequent Tags in Treebank:\n")
# most_common(10) yields (tag, count) pairs; unpack each pair straight into print.
for tag_count in tag_fd.most_common(10):
    print(*tag_count)

Top 10 Most Frequent Tags in Treebank:

NN 13166
IN 9857
NNP 9410
DT 8165
-NONE- 6592
NNS 6047
JJ 5834
, 4886
. 3874
CD 3546


In [27]:
# Collapse each POS tag to its first character to form coarse tag classes
first_letters = (pos[0] for (_word, pos) in treebank.tagged_words())
tag_classes_fd = FreqDist(first_letters)
# Top 10 tag classes
print("Top 10 Most Frequent Tag Classes in Treebank:\n")
for tag_class, count in tag_classes_fd.most_common(10):
    print(tag_class, count)

Top 10 Most Frequent Tag Classes in Treebank:

N 28867
V 12637
I 9857
D 8165
- 6838
J 6397
C 5811
, 4886
. 3874
P 3333


## Tagger Training Setup
* We will use the tagged sentences and words from the Penn Treebank
* We separate our tagged data into a training set, where we'll learn the probabilities of the words and their tags, and a test set to evaluate how our taggers perform
* This allows us to test the tagger’s accuracy on similar, but not the same, data that it was trained on
* The training set is the first 90% of the sentences and the test set is the remaining 10%

In [33]:
# train/test split
TRAIN_FRACTION = 0.9  # share of sentences used for training; the rest is the test set
treebank_tagged = treebank.tagged_sents()  # all tagged sentences of treebank
size = int(len(treebank_tagged) * TRAIN_FRACTION)  # index of the first test sentence
# Split by position: leading sentences train, trailing sentences test.
treebank_train, treebank_test = treebank_tagged[:size], treebank_tagged[size:]

In the NLTK, a number of POS taggers are included in the tag module, including one that we can use that has been trained on all of Penn Treebank.  But for instructional purposes, we will develop a sequence of N-gram taggers whose performance improves.

## Create a default tagger
This default tagger just tags everything with the most frequent tag: NN. This simple tagger doesn't actually use the training set.

In [36]:
from nltk import DefaultTagger
# A DefaultTagger built with 'NN' assigns that one tag to every token.
t0 = DefaultTagger('NN')
# Tag the first 20 Treebank tokens to see the effect
# (bare expression: the result is displayed as the cell output).
first_twenty_tokens = treebank_tokens[:20]
t0.tag(first_twenty_tokens)

[('Pierre', 'NN'),
 ('Vinken', 'NN'),
 (',', 'NN'),
 ('61', 'NN'),
 ('years', 'NN'),
 ('old', 'NN'),
 (',', 'NN'),
 ('will', 'NN'),
 ('join', 'NN'),
 ('the', 'NN'),
 ('board', 'NN'),
 ('as', 'NN'),
 ('a', 'NN'),
 ('nonexecutive', 'NN'),
 ('director', 'NN'),
 ('Nov.', 'NN'),
 ('29', 'NN'),
 ('.', 'NN'),
 ('Mr.', 'NN'),
 ('Vinken', 'NN')]

The NLTK includes a function for taggers that computes tagging accuracy by comparing the result of a tagger with the original “gold standard” tagged text.  Here we use the NLTK function “evaluate” to apply the default tagger (to the untagged text) and compare it with the gold standard tagged text in the test set.

In [38]:
# Accuracy of the default tagger on the test set, i.e. the result of
# tagging every test token as 'NN'.
t0.evaluate(treebank_test)

0.14697201017811704

* The evaluate function first takes the tagged text and removes the tags, so that only tokens are left.  
* Then it runs the tagger, in this case t0, to tag all the text.  
* Then it compares the tags predicted by the tagger with the “gold standard” tags already given.  
* It reports the accuracy, which is the fraction of words with correct tags (a value between 0 and 1, so 0.147 means about 14.7% of the words were tagged correctly).

Other simple taggers described in the NLTK book are the Regular Expression Tagger and the Lookup Tagger.

## Training Unigram Tagger
* It tags each word with the most frequent tag that word has in the corpus.
* For example, if the word “bank” occurs 30 times with the tag “NN” and 10 times with the tag “VB”, we’ll just tag it with “NN”. 

In [42]:
from nltk import UnigramTagger
# For demonstration, train a unigram tagger on the entire tagged corpus.
t1 = UnigramTagger(treebank_tagged)
# Tag the first 20 Treebank tokens to see the effect
# (bare expression: the result is displayed as the cell output).
demo_tokens = treebank_tokens[:20]
t1.tag(demo_tokens)

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.'),
 ('Mr.', 'NNP'),
 ('Vinken', 'NNP')]

Train the tagger on the training set and evaluate on the test set.

In [43]:
# Rebind t1 to a unigram tagger trained on the training split only, then
# measure its accuracy on the separate test split.
t1 = UnigramTagger(treebank_train) # unigram tagger trained on training set only
t1.evaluate(treebank_test) # evaluate on the test set

0.8627989821882952

## Training Bigram Tagger with backoff to Unigram and Default Tagger
Finally, NLTK has a Bigram tagger that is trained on two-item contexts: it chooses the tag for a word based on the word itself and the tag of the preceding word. 
But there will be unknown frequencies in the test data for the bigram tagger, and unknown words for the unigram tagger, so we can use the backoff tagger capability of NLTK to create a combined tagger.  This tagger uses bigram frequencies to tag as much as possible.  If a word doesn’t occur in a bigram context seen during training, it uses the unigram tagger to tag that word.  If the word is unknown to the unigram tagger, then we use the default tagger to tag it as ‘NN’.


In [45]:
from nltk import BigramTagger
# Build the backoff chain from the simplest tagger upwards: each tagger is
# given the simpler tagger to fall back on through its backoff argument.
t0 = DefaultTagger("NN") # Default, tag words as 'NN'
t1 = UnigramTagger(treebank_train, backoff=t0) # unigram tagger; backoff to default tagger
t2 = BigramTagger(treebank_train, backoff=t1) # bigram tagger; backoff to unigram tagger
# Accuracy of the combined tagger on the test set
t2.evaluate(treebank_test)

0.8905852417302799

## Applying N-Gram Tagger to Text Data
* The tagger is trained on individual sentences, so it will work better if we first use the sentence splitter (also called the sentence tokenizer) to produce a list of text strings, one per sentence.

In [47]:
# Two-sentence example passage used to try out the taggers.
# Adjacent string literals are concatenated into one string.
text = (
    "Three Calgarians have found a rather unusual way of leaving snow and ice behind. "
    "They set off this week on foot and by camels on a grueling trek across the burning Arabian desert."
)

In [48]:
from nltk import sent_tokenize
# Split the raw text into a list of sentence strings
textsplit = sent_tokenize(text)
print(textsplit)

['Three Calgarians have found a rather unusual way of leaving snow and ice behind.', 'They set off this week on foot and by camels on a grueling trek across the burning Arabian desert.']


After producing the list of sentence texts, apply the word tokenizer to each sentence.

In [50]:
from nltk import word_tokenize
# Tokenize every sentence string, giving one token list per sentence.
tokentext = list(map(word_tokenize, textsplit))
print(tokentext)

[['Three', 'Calgarians', 'have', 'found', 'a', 'rather', 'unusual', 'way', 'of', 'leaving', 'snow', 'and', 'ice', 'behind', '.'], ['They', 'set', 'off', 'this', 'week', 'on', 'foot', 'and', 'by', 'camels', 'on', 'a', 'grueling', 'trek', 'across', 'the', 'burning', 'Arabian', 'desert', '.']]


Now apply the t2 bigram POS tagger to each sentence of tokens in the list.

In [51]:
# Run the t2 tagger over each tokenized sentence, one tagged list per sentence.
taggedtext = list(map(t2.tag, tokentext))
print(taggedtext)

[[('Three', 'CD'), ('Calgarians', 'NN'), ('have', 'VBP'), ('found', 'VBN'), ('a', 'DT'), ('rather', 'RB'), ('unusual', 'JJ'), ('way', 'NN'), ('of', 'IN'), ('leaving', 'VBG'), ('snow', 'NN'), ('and', 'CC'), ('ice', 'NN'), ('behind', 'IN'), ('.', '.')], [('They', 'PRP'), ('set', 'VBN'), ('off', 'RP'), ('this', 'DT'), ('week', 'NN'), ('on', 'IN'), ('foot', 'NN'), ('and', 'CC'), ('by', 'IN'), ('camels', 'NN'), ('on', 'IN'), ('a', 'DT'), ('grueling', 'NN'), ('trek', 'NN'), ('across', 'IN'), ('the', 'DT'), ('burning', 'NN'), ('Arabian', 'NN'), ('desert', 'NN'), ('.', '.')]]


We observe that this text has quite a few words that appear to be unknown to this tagger, i.e. words that did not occur in the data it was trained on.  Examples of this are “Calgarians” and “camels”: both are tagged as NN instead of the correct tags of NNPS and NNS, respectively.  This points out the benefit of adding sequence information, such as an HMM tagger would use, and lexical information, such as a Maximum Entropy tagger could use if you defined such features.  In the NLTK, another strategy would be to use a Regular Expression tagger as a backoff tagger that could take into account word features.

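NLTK also includes a pre-trained Penn Treebank POS tagger, which is used by the standard nltk.pos_tag function.  In older NLTK releases this is the maximum entropy model stored at 'taggers/maxent_treebank_pos_tagger/english.pickle'; newer releases (3.1 and later) use an averaged perceptron tagger by default.  Note that this is NLTK's own model rather than the Stanford POS tagger, despite the variable names used in the cells below.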

In [53]:
# use the standard nltk POS tagger on the same example text
from nltk import pos_tag
# Apply pos_tag to each tokenized sentence in turn.
taggedtextStanford = list(map(pos_tag, tokentext))
print(taggedtextStanford)

[[('Three', 'CD'), ('Calgarians', 'NNPS'), ('have', 'VBP'), ('found', 'VBN'), ('a', 'DT'), ('rather', 'RB'), ('unusual', 'JJ'), ('way', 'NN'), ('of', 'IN'), ('leaving', 'VBG'), ('snow', 'NN'), ('and', 'CC'), ('ice', 'NN'), ('behind', 'NN'), ('.', '.')], [('They', 'PRP'), ('set', 'VBD'), ('off', 'RP'), ('this', 'DT'), ('week', 'NN'), ('on', 'IN'), ('foot', 'NN'), ('and', 'CC'), ('by', 'IN'), ('camels', 'NNS'), ('on', 'IN'), ('a', 'DT'), ('grueling', 'NN'), ('trek', 'NN'), ('across', 'IN'), ('the', 'DT'), ('burning', 'NN'), ('Arabian', 'JJ'), ('desert', 'NN'), ('.', '.')]]


In [54]:
# Flatten the per-sentence lists into a single list holding all of the
# tagged tokens from each of the sentences, in order.
taggedtext_flat = []
for tagged_sent in taggedtext:
    taggedtext_flat.extend(tagged_sent)
print(taggedtext_flat)

[('Three', 'CD'), ('Calgarians', 'NN'), ('have', 'VBP'), ('found', 'VBN'), ('a', 'DT'), ('rather', 'RB'), ('unusual', 'JJ'), ('way', 'NN'), ('of', 'IN'), ('leaving', 'VBG'), ('snow', 'NN'), ('and', 'CC'), ('ice', 'NN'), ('behind', 'IN'), ('.', '.'), ('They', 'PRP'), ('set', 'VBN'), ('off', 'RP'), ('this', 'DT'), ('week', 'NN'), ('on', 'IN'), ('foot', 'NN'), ('and', 'CC'), ('by', 'IN'), ('camels', 'NN'), ('on', 'IN'), ('a', 'DT'), ('grueling', 'NN'), ('trek', 'NN'), ('across', 'IN'), ('the', 'DT'), ('burning', 'NN'), ('Arabian', 'NN'), ('desert', 'NN'), ('.', '.')]


In [55]:
# Flatten the pos_tag output the same way: one list of (word, tag) pairs.
taggedtextStanford_flat = []
for tagged_sent in taggedtextStanford:
    taggedtextStanford_flat.extend(tagged_sent)
print(taggedtextStanford_flat)

[('Three', 'CD'), ('Calgarians', 'NNPS'), ('have', 'VBP'), ('found', 'VBN'), ('a', 'DT'), ('rather', 'RB'), ('unusual', 'JJ'), ('way', 'NN'), ('of', 'IN'), ('leaving', 'VBG'), ('snow', 'NN'), ('and', 'CC'), ('ice', 'NN'), ('behind', 'NN'), ('.', '.'), ('They', 'PRP'), ('set', 'VBD'), ('off', 'RP'), ('this', 'DT'), ('week', 'NN'), ('on', 'IN'), ('foot', 'NN'), ('and', 'CC'), ('by', 'IN'), ('camels', 'NNS'), ('on', 'IN'), ('a', 'DT'), ('grueling', 'NN'), ('trek', 'NN'), ('across', 'IN'), ('the', 'DT'), ('burning', 'NN'), ('Arabian', 'JJ'), ('desert', 'NN'), ('.', '.')]


## Run a POS Tagger on a Large Text and Look at Tag Frequencies

In [62]:
# Extract text from the book Emma
from nltk.corpus import gutenberg
# sent_tokenize and word_tokenize are exported by the top-level nltk package,
# not by nltk.corpus; importing sent_tokenize from nltk.corpus raises ImportError.
from nltk import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk import FreqDist

# Take the first file id in the Gutenberg corpus (this is the file treated
# as Emma) and load its full text as one raw string.
file0 = gutenberg.fileids()[0]
emmatext = gutenberg.raw(file0)

In [66]:
# Process text
# Step 1: split the raw text into sentence strings.
sentlist = sent_tokenize(emmatext)
# Step 2: tokenize each sentence, giving one token list per sentence.
sentlisttokens = list(map(word_tokenize, sentlist))

In [68]:
# Tokens of the first item returned by the sentence splitter
print(sentlisttokens[0])

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', ';', 'and', 'had', 'lived', 'nearly', 'twenty-one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her', '.']


In [78]:
# run tagger
# Apply pos_tag to every tokenized sentence; preview the first three results.
taggedtext = list(map(pos_tag, sentlisttokens))
first_three_tagged = taggedtext[:3]
print(first_three_tagged)

[[('[', 'NNS'), ('Emma', 'NNP'), ('by', 'IN'), ('Jane', 'NNP'), ('Austen', 'NNP'), ('1816', 'CD'), (']', 'NNP'), ('VOLUME', 'NNP'), ('I', 'PRP'), ('CHAPTER', 'VBP'), ('I', 'PRP'), ('Emma', 'NNP'), ('Woodhouse', 'NNP'), (',', ','), ('handsome', 'NN'), (',', ','), ('clever', 'NN'), (',', ','), ('and', 'CC'), ('rich', 'JJ'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('comfortable', 'JJ'), ('home', 'NN'), ('and', 'CC'), ('happy', 'JJ'), ('disposition', 'NN'), (',', ','), ('seemed', 'VBD'), ('to', 'TO'), ('unite', 'VB'), ('some', 'DT'), ('of', 'IN'), ('the', 'DT'), ('best', 'JJS'), ('blessings', 'NNS'), ('of', 'IN'), ('existence', 'NN'), (';', ':'), ('and', 'CC'), ('had', 'VBD'), ('lived', 'VBN'), ('nearly', 'RB'), ('twenty-one', 'CD'), ('years', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('with', 'IN'), ('very', 'RB'), ('little', 'JJ'), ('to', 'TO'), ('distress', 'VB'), ('or', 'CC'), ('vex', 'VB'), ('her', 'PRP'), ('.', '.')], [('She', 'PRP'), ('was', 'VBD'), ('the', 'DT'), ('you

In [79]:
# flatten the list to get just one list of (word, tag)
taggedtext_flat = []
for tagged_sent in taggedtext:
    taggedtext_flat.extend(tagged_sent)
# Preview the first 50 pairs only
print(taggedtext_flat[:50])

[('[', 'NNS'), ('Emma', 'NNP'), ('by', 'IN'), ('Jane', 'NNP'), ('Austen', 'NNP'), ('1816', 'CD'), (']', 'NNP'), ('VOLUME', 'NNP'), ('I', 'PRP'), ('CHAPTER', 'VBP'), ('I', 'PRP'), ('Emma', 'NNP'), ('Woodhouse', 'NNP'), (',', ','), ('handsome', 'NN'), (',', ','), ('clever', 'NN'), (',', ','), ('and', 'CC'), ('rich', 'JJ'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('comfortable', 'JJ'), ('home', 'NN'), ('and', 'CC'), ('happy', 'JJ'), ('disposition', 'NN'), (',', ','), ('seemed', 'VBD'), ('to', 'TO'), ('unite', 'VB'), ('some', 'DT'), ('of', 'IN'), ('the', 'DT'), ('best', 'JJS'), ('blessings', 'NNS'), ('of', 'IN'), ('existence', 'NN'), (';', ':'), ('and', 'CC'), ('had', 'VBD'), ('lived', 'VBN'), ('nearly', 'RB'), ('twenty-one', 'CD'), ('years', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('with', 'IN')]


In [80]:
# Frequency Distribution of tags
tags_fd = FreqDist(pos for (_word, pos) in taggedtext_flat)
# Top 10 tags
print("Top 10 Most Frequent Tags in Emma:\n")
# most_common(10) yields (tag, count) pairs; unpack each pair straight into print.
for tag_count in tags_fd.most_common(10):
    print(*tag_count)

Top 10 Most Frequent Tags in Emma:

NN 19330
IN 17880
PRP 15619
RB 12997
DT 12743
, 12016
JJ 10249
NNP 9095
VBD 9049
VB 8941
