In [1]:
import nltk
from collections import defaultdict

In [2]:
# Select the King James Bible file from NLTK's Gutenberg corpus reader
# and load its word tokens.
bible = 'bible-kjv.txt'
corpus = nltk.corpus.gutenberg
bible_words = corpus.words(fileids=bible)

In [3]:
# Tag every token with NLTK's default tagger; yields (word, tag) pairs.
# NOTE(review): the whole corpus is passed as one token sequence, with no
# sentence segmentation — confirm that is intended for this experiment.
bible_pos = nltk.pos_tag(tokens=bible_words)

In [4]:
# Collect, for every distinct word, the set of POS tags the tagger assigned
# to it. defaultdict(set) supplies an empty set the first time a word is seen.
tags = defaultdict(set)
for word, tag in bible_pos:
    tags[word].add(tag)

# look at words with more than 8 tags:
# The tags are sorted before printing: the iteration order of a set of
# strings can change from one interpreter run to the next (string hash
# randomisation), so printing the raw set would make this cell's output
# differ on every fresh kernel.
for word, tag_list in tags.items():
    if len(tag_list) > 8:
        print("{}: {}".format(word, sorted(tag_list)))

unto: {'RP', 'IN', 'NNS', 'JJ', 'CC', 'RB', 'VBP', 'NNP', 'PRP$', 'VBZ', 'VB', 'MD', 'RBR', 'VBD', 'NN'}
forth: {'RP', 'IN', 'NNS', 'JJ', 'PDT', 'JJS', 'VBN', 'RB', 'VBP', 'VBZ', 'VB', 'VBD', 'NN'}
hath: {'IN', 'JJ', 'PDT', 'PRP', 'RB', 'VBP', 'VBZ', 'VB', 'MD', 'VBD', 'NN'}
wherein: {'IN', 'WP', 'JJ', 'NN', "''", 'CC', 'JJR', 'RB', 'VBP', 'NNP', 'WRB', 'VBZ', 'VB', 'WDT', 'VBD', 'EX'}
behold: {'JJ', 'CC', 'VBN', 'UH', 'RB', 'VBP', 'VB', 'VBD', 'NN'}
till: {'IN', 'JJ', 'CC', 'RB', 'VBP', 'VBZ', 'VB', 'EX', 'NN'}
evil: {'NNS', 'JJ', 'CC', 'VBN', 'RB', 'VBP', 'FW', 'VBZ', 'VB', 'EX', 'VBD', 'NN'}
goeth: {'VBG', 'NNS', 'JJ', 'RB', 'VBP', 'VBZ', 'VB', 'VBD', 'NN'}
thou: {'NNS', 'IN', 'RP', 'JJ', 'PRP', "''", 'CC', 'VBN', 'JJR', 'RB', 'VBP', 'NNP', 'VBZ', 'VB', 'EX', 'MD', 'VBD', 'NN'}
eat: {'IN', 'JJ', 'RB', 'VBP', 'NNP', 'VBZ', 'VB', 'VBD', 'NN'}
shalt: {'NNS', 'JJ', 'PRP', 'VBN', 'JJR', 'RB', 'VBP', 'FW', 'VBZ', 'VB', 'MD', 'VBD', 'NN'}
thereof: {'NNS', 'RP', 'JJ', 'PRP', 'RB', 'VBP', 'V

In [5]:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
style = "en-ptb"  # penn treebank POS


def simplify(tag):
    """Map a tag from the ``style`` tagset onto NLTK's "universal" tagset."""
    return nltk.map_tag(style, "universal", tag)


def simple_pos(text):
    """Tag ``text`` and return a list of (word, universal_tag) pairs.

    ``text`` is handed to ``nltk.pos_tag`` unchanged, so it must be a
    sequence of tokens; each resulting tag is then passed through
    ``simplify``.
    """
    return [(word, simplify(tag)) for word, tag in nltk.pos_tag(text)]


# create a set as the default entry value
tags = defaultdict(set)
# Reuse the tagging already stored in `bible_pos` instead of calling
# simple_pos(bible_words): that would run nltk.pos_tag over the entire
# corpus a second time only to obtain the same tags before simplifying them.
for word, tag in bible_pos:
    tags[word].add(simplify(tag))

# look at words with more than 8 tags:
# Tags are sorted before printing because set iteration order for strings
# is not stable across interpreter runs, which would make the output
# irreproducible.
for word, tag_list in tags.items():
    if len(tag_list) > 8:
        print("{}: {}".format(word, sorted(tag_list)))

wherein: {'.', 'ADP', 'PRON', 'VERB', 'CONJ', 'NOUN', 'DET', 'ADV', 'ADJ'}
thou: {'.', 'ADP', 'PRON', 'VERB', 'CONJ', 'NOUN', 'PRT', 'DET', 'ADV', 'ADJ'}
ye: {'.', 'NUM', 'ADP', 'X', 'VERB', 'PRON', 'CONJ', 'NOUN', 'PRT', 'DET', 'ADV', 'ADJ'}
doth: {'ADP', 'X', 'VERB', 'CONJ', 'NOUN', 'PRT', 'DET', 'ADV', 'ADJ'}
thee: {'ADP', 'PRON', 'VERB', 'CONJ', 'NOUN', 'PRT', 'DET', 'ADV', 'ADJ'}


In [6]:
# Rebind `corpus` to the Brown corpus reader and preview its first ten
# (word, tag) pairs, with no tagset conversion requested.
corpus = nltk.corpus.brown
corpus.tagged_words()[0:10]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN')]

In [7]:
# Preview the first ten tagged words of `corpus`, asking the reader to
# report tags in the "universal" tagset.
corpus.tagged_words(tagset='universal')[0:10]

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP')]