### In this tutorial, we will look at some off-the-shelf POS taggers and give a minimal introduction to training them

In [19]:
import nltk

# Fetch every NLTK resource the notebook relies on up front, so that a fresh
# kernel on a fresh machine can run all cells from top to bottom:
#   tagsets                    -> descriptions shown by the nltk.help *_tagset() helpers
#   averaged_perceptron_tagger -> model used by nltk.pos_tag
#   treebank                   -> corpus used to train the HMM tagger
#   brown                      -> corpus used by the unigram / regexp examples
#   punkt                      -> tokenizer model used by nltk.word_tokenize
# NOTE(review): recent NLTK releases may look for differently named resources
# (e.g. 'punkt_tab'); confirm against the installed version.
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')
nltk.download('treebank')
nltk.download('brown')
nltk.download('punkt')
from nltk.help import upenn_tagset, claws5_tagset, brown_tagset

[nltk_data] Downloading package tagsets to C:\Users\Admin_server
[nltk_data]     APN\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin_server APN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package treebank to C:\Users\Admin_server
[nltk_data]     APN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.


Different tagsets exist. Choose one depending on the requirements of the end application. Some tagsets have coarse tags while others have very fine-grained tags.

In [5]:
print("Printing upenn_tagset")
# upenn_tagset() writes the tag descriptions to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
upenn_tagset()

Printing upenn_tagset
$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis 

In [6]:
print("Printing brown_tagset")
# brown_tagset() writes the tag descriptions to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
brown_tagset()

Printing brown_tagset
(: opening parenthesis
    (
): closing parenthesis
    )
*: negator
    not n't
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ? ; ! :
:: colon
    :
ABL: determiner/pronoun, pre-qualifier
    quite such rather
ABN: determiner/pronoun, pre-quantifier
    all half many nary
ABX: determiner/pronoun, double conjunction or pre-quantifier
    both
AP: determiner/pronoun, post-determiner
    many other next more last former little several enough most least only
    very few fewer past same Last latter less single plenty 'nough lesser
    certain various manye next-to-last particular final previous present
    nuf
AP$: determiner/pronoun, post-determiner, genitive
    other's
AP+AP: determiner/pronoun, post-determiner, hyphenated pair
    many-much
AT: article
    the an no a every th' ever' ye
BE: verb 'to be', infinitive or imperative
    be
BED: verb 'to be', past tense, 2nd person singular or all persons plural
    were
BED*: verb 'to be', past tense, 2

In [7]:
print("Printing claws5_tagset")
# claws5_tagset() writes the tag descriptions to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
claws5_tagset()

Printing claws5_tagset
AJ0: adjective (unmarked)
    good, old
AJC: comparative adjective
    better, older
AJS: superlative adjective
    best, oldest
AT0: article
    THE, A, AN
AV0: adverb (unmarked)
    often, well, longer, furthest
AVP: adverb particle
    up, off, out
AVQ: wh-adverb
    when, how, why
CJC: coordinating conjunction
    and, or
CJS: subordinating conjunction
    although, when
CJT: the conjunction THAT
    that
CRD: cardinal numeral
    3, fifty-five, 6609 (excl one)
DPS: possessive determiner form
    your, their
DT0: general determiner
    these, some
DTQ: wh-determiner
    whose, which
EX0: existential THERE
    there
ITJ: interjection or other isolate
    oh, yes, mhm
NN0: noun (neutral for number)
    aircraft, data
NN1: singular noun
    pencil, goose
NN2: plural noun
    pencils, geese
NP0: proper noun
    London, Michael, Mars
NULL: the null tag (for items not to be tagged)
ORD: ordinal
    sixth, 77th, last
PNI: indefinite pronoun
    none, everything
PNP:

In [8]:
# DefaultTagger is the simplest possible baseline: it ignores the word itself
# and assigns the same fixed tag (here the literal string 'None') to every token.
from nltk.tag.sequential import DefaultTagger

print("Default tagger")
default_tagger = DefaultTagger('None')

sent = "I love to play cricket"
default_tagged = default_tagger.tag(sent.split())  # whitespace tokenisation
print(default_tagged)

Default tagger
[('I', 'None'), ('love', 'None'), ('to', 'None'), ('play', 'None'), ('cricket', 'None')]


In [11]:
# Averaged perceptron tagger: tokenize a sentence with NLTK, then tag the
# tokens with nltk.pos_tag.
import nltk

perceptron_sentence = "He is playing cricket on a playground"
words = nltk.word_tokenize(perceptron_sentence)
tagged = nltk.pos_tag(words)
print(tagged)

[('He', 'PRP'), ('is', 'VBZ'), ('playing', 'VBG'), ('cricket', 'NN'), ('on', 'IN'), ('a', 'DT'), ('playground', 'NN')]


In [12]:
# Build training and test sets of tagged sentences from the 'news' category of
# the Brown Corpus, a well-known corpus of about 1 million words.
from nltk.corpus import brown

brown_news_tagged = brown.tagged_sents(categories='news')
brown_train = list(brown_news_tagged[:500])      # first 500 sentences for training
brown_test = list(brown_news_tagged[500:600])    # next 100 sentences held out for evaluation

In [13]:
# UnigramTagger assigns each word its most frequent tag from the training data
from nltk.tag import UnigramTagger

print("Unigram tagger")

# Learn per-word tag statistics from the Brown training sentences
unigram_tagger = UnigramTagger(brown_train)

# Score the tagger on the held-out test sentences
# NOTE(review): newer NLTK releases appear to deprecate evaluate() in favour of
# accuracy() — confirm against the installed version before switching.
unigram_score = unigram_tagger.evaluate(brown_test)
print(unigram_score)

# Words that do not get a tag from the model come back as None (see the output below)
sent = "I love to play cricket"
print(unigram_tagger.tag(sent.split()))

Unigram tagger
0.7345971563981043
[('I', 'PPSS'), ('love', 'VB'), ('to', 'TO'), ('play', None), ('cricket', None)]


In [22]:
# Tag two more whitespace-tokenised examples with the unigram tagger; tokens
# that receive no tag from the model are returned with the tag None.
for sent in ("Sachin score is 100 runs in test match", "run running runs ran"):
    print(unigram_tagger.tag(sent.split()))

[('Sachin', None), ('score', 'NN'), ('is', 'BEZ'), ('100', 'CD'), ('runs', None), ('in', 'IN'), ('test', 'NN'), ('match', None)]
[('run', 'VB'), ('running', 'VBG'), ('runs', None), ('ran', None)]


In [23]:
# Peek at the first 'news' sentence of the Brown Corpus, first as plain tokens
# and then as (token, tag) pairs.
from nltk.corpus import brown

for brown_reader in (brown.sents, brown.tagged_sents):
    test_sent = brown_reader(categories='news')[0]
    print(test_sent)

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]


##### RegexpTagger is a rule-based tagger: we provide the rules in the form of regular expressions

In [24]:
# RegexpTagger: the patterns are tried in the order listed and the first one
# that matches a token decides its tag, so the catch-all rule must stay last.
from nltk.corpus import brown
from nltk.tag.sequential import RegexpTagger
test_sent = brown.sents(categories='news')[0]
regexp_tagger = RegexpTagger(
        # The decimal point is escaped: an unescaped '.' matches any character,
        # which would tag tokens such as '1x5' or '12-34' as cardinal numbers.
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                (r'.*able$', 'JJ'),                # adjectives
                (r'.*ness$', 'NN'),                # nouns formed from adjectives
                (r'.*ly$', 'RB'),                  # adverbs
                (r'.*s$', 'NNS'),                  # plural nouns
                (r'.*ing$', 'VBG'),                # gerunds
                (r'.*ed$', 'VBD'),                 # past tense verbs
                (r'(from|on|to|into|of)$', 'PREP'),   # prepositions
                (r'.*', 'NN')                      # nouns (default)
        ])
# str.split() keeps punctuation attached to words ('Cricket.', 'runs.'), so
# those tokens fall through to the default rule.
sent = "Sachin plays a game of Cricket. He scored 100 runs."
print(regexp_tagger.tag(sent.split()))
print(regexp_tagger.tag(test_sent))


[('Sachin', 'NN'), ('plays', 'NNS'), ('a', 'AT'), ('game', 'NN'), ('of', 'PREP'), ('Cricket.', 'NN'), ('He', 'NN'), ('scored', 'VBD'), ('100', 'CD'), ('runs.', 'NN')]
[('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'), ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'PREP'), ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'), ('that', 'NN'), ('any', 'NN'), ('irregularities', 'NNS'), ('took', 'NN'), ('place', 'NN'), ('.', 'NN')]


#### Let us try to train an HMM POS tagger on the Penn Treebank sample corpus

In [25]:
# Corpus of pre-tagged sentences, and the HMM tagging module
from nltk.corpus import treebank
from nltk.tag import hmm

# Training data: the first 3000 tagged sentences of the treebank corpus
train_data = treebank.tagged_sents()[:3000]
print(train_data[0])

# Create a trainer with its default settings (no arguments supplied) and fit
# it on the tagged sentences (supervised training)
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data)

# Printing the tagger shows its basic data (number of states and output symbols)
print(tagger)

# Tag a few whitespace-tokenised example sentences with the trained model
hmm_examples = (
    "Today is a good day .",
    "Joe met Joanne in Delhi .",
    "Chicago is the birthplace of Ginny",
)
for hmm_example in hmm_examples:
    print(tagger.tag(hmm_example.split()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
<HiddenMarkovModelTagger 46 states and 10779 output symbols>
[('Today', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('good', 'JJ'), ('day', 'NN'), ('.', '.')]
[('Joe', 'NNP'), ('met', 'VBD'), ('Joanne', 'NNP'), ('in', 'IN'), ('Delhi', 'NNP'), ('.', 'NNP')]
[('Chicago', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('birthplace', 'NNP'), ('of', 'NNP'), ('Ginny', 'NNP')]
