# CS5616 - NLP - A2 - POS Tagging

## 209338R - KATS Jayathilaka

#### -----------------------------------------------------------------------------------------------------

### Importing and downloading NLTK Library and Data

In [1]:
import nltk
import os

##### Change the path below to the location of the NLTK data on your system. If the path does not exist, `nltk.download()` is invoked on every run.

In [2]:
# Author note: I am KATS Jayathilaka - SINGHABAHU is my nickname.
# Filesystem location that the following cell tests to decide whether the NLTK
# data is already present. This is a machine-specific absolute path.
NLTK_DATA_PATH_CHECK = '/home/singhabahu/nltk_data/'

In [3]:
# Only invoke the NLTK downloader when the configured data directory is missing.
# NOTE(review): download() is called with no arguments — presumably this starts
# NLTK's interactive downloader rather than fetching specific packages; confirm.
if not os.path.exists(NLTK_DATA_PATH_CHECK):
    nltk.download()

### Reading `annotate.txt` file

In [4]:
# Read the whole annotated text into memory as UTF-8. The context manager
# closes the handle even if read() raises (for example on a decoding error),
# which the previous open()/read()/close() sequence did not guarantee.
with open('annotate.txt', 'r', encoding='utf-8') as datafile:
    raw_data = datafile.read()

### Removing all numbers

In [5]:
import re

# Delete every run of one or more digit characters from the raw text;
# everything else (letters, punctuation, whitespace) is left in place.
nonum_data = re.compile(r'\d+').sub('', raw_data)

### Removing all punctuation marks

In [6]:
import string

# Translation table mapping each character of string.punctuation to None, which
# makes str.translate() delete it. Apostrophes are removed as well, which is why
# tokens such as 'cant' and 'Its' show up in the tagger outputs further down.
nopunc_data = nonum_data.translate(
    dict.fromkeys(map(ord, string.punctuation))
)

### Removing leading and trailing whitespaces

In [7]:
# strip() with no argument trims whitespace at both ends only; whitespace
# between words is left untouched.
stripped_data = nopunc_data.strip()

### Preprocessed data

In [8]:
# Final preprocessed text (digits removed, punctuation removed, ends trimmed);
# this is the string handed to the tokenizer.
data = stripped_data

#### -----------------------------------------------------------------------------------------------------

### Tokenization

In [9]:
# Split the preprocessed text into word tokens. `tokens` is the input passed to
# every tagger call in the cells below.
tokens = nltk.word_tokenize(data)

### POS Tagging

#### Checking different POS tag sets

In [74]:
# Print NLTK's help listing for the Penn Treebank ("upenn") tag set: each tag
# followed by its description and example words. The output is long.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [76]:
# Print NLTK's help listing for the CLAWS5 tag set (tag, description, examples).
nltk.help.claws5_tagset()

AJ0: adjective (unmarked)
    good, old
AJC: comparative adjective
    better, older
AJS: superlative adjective
    best, oldest
AT0: article
    THE, A, AN
AV0: adverb (unmarked)
    often, well, longer, furthest
AVP: adverb particle
    up, off, out
AVQ: wh-adverb
    when, how, why
CJC: coordinating conjunction
    and, or
CJS: subordinating conjunction
    although, when
CJT: the conjunction THAT
    that
CRD: cardinal numeral
    3, fifty-five, 6609 (excl one)
DPS: possessive determiner form
    your, their
DT0: general determiner
    these, some
DTQ: wh-determiner
    whose, which
EX0: existential THERE
    there
ITJ: interjection or other isolate
    oh, yes, mhm
NN0: noun (neutral for number)
    aircraft, data
NN1: singular noun
    pencil, goose
NN2: plural noun
    pencils, geese
NP0: proper noun
    London, Michael, Mars
NULL: the null tag (for items not to be tagged)
ORD: ordinal
    sixth, 77th, last
PNI: indefinite pronoun
    none, everything
PNP: personal pronoun
    y

In [77]:
# Print NLTK's help listing for the Brown tag set (tag, description, examples).
# These are the tags produced by the Brown-trained taggers later on.
nltk.help.brown_tagset()

(: opening parenthesis
    (
): closing parenthesis
    )
*: negator
    not n't
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ? ; ! :
:: colon
    :
ABL: determiner/pronoun, pre-qualifier
    quite such rather
ABN: determiner/pronoun, pre-quantifier
    all half many nary
ABX: determiner/pronoun, double conjunction or pre-quantifier
    both
AP: determiner/pronoun, post-determiner
    many other next more last former little several enough most least only
    very few fewer past same Last latter less single plenty 'nough lesser
    certain various manye next-to-last particular final previous present
    nuf
AP$: determiner/pronoun, post-determiner, genitive
    other's
AP+AP: determiner/pronoun, post-determiner, hyphenated pair
    many-much
AT: article
    the an no a every th' ever' ye
BE: verb 'to be', infinitive or imperative
    be
BED: verb 'to be', past tense, 2nd person singular or all persons plural
    were
BED*: verb 'to be', past tense, 2nd person singular or 

#### Checking different corpora in NLTK

In [88]:
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk.corpus import gutenberg

In [97]:
# Report the number of tokens in NLTK's Treebank corpus, then preview its word list.
print(len(treebank.words()))
treebank.words()

100676


['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', ...]

In [98]:
# Report the number of tokens in the Brown corpus, then preview its word list.
print(len(brown.words()))
brown.words()

1161192


['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [99]:
# Report the number of tokens in the Gutenberg corpus, then preview its word list.
print(len(gutenberg.words()))
gutenberg.words()

2621613


['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

### Training different POS Taggers using Brown Corpus

#### Defining common functions to output the tagged corpus for the dataset `annotate.txt`

In [141]:
def create_tagged_corpus(tagged_list):
    """Serialize tagged tokens into a single ``word/TAG`` string.

    Args:
        tagged_list: iterable of ``(word, tag)`` pairs, e.g. the result of
            ``nltk.pos_tag(tokens)`` or ``some_tagger.tag(tokens)``.

    Returns:
        A string in which every token is rendered as ``word/TAG`` followed by
        one space (so a non-empty result ends with a trailing space). An empty
        input yields ``''``.

    Notes:
        The previous implementation iterated over the global ``penn_tagged``
        and ignored ``tagged_list``, so every saved corpus contained the Penn
        tags regardless of which tagger's output was passed in.
    """
    rendered = []
    for word, tag in tagged_list:
        # A tagger with no backoff (e.g. a plain UnigramTagger) may emit None
        # for words it has never seen; write the bare word in that case, the
        # same convention as nltk.tag.util.tuple2str, instead of raising
        # TypeError from the string concatenation.
        rendered.append(word if tag is None else word + '/' + tag)
    return ''.join(item + ' ' for item in rendered)

def save_corpus(name, corpus):
    """Write a serialized tagged corpus to disk, replacing any existing file.

    Args:
        name: path of the output file.
        corpus: text to write, typically the string returned by
            ``create_tagged_corpus``.

    The file is written as UTF-8 to match the encoding used when reading
    ``annotate.txt``, rather than depending on the platform default encoding.
    The context manager closes the handle even if the write fails.
    """
    with open(name, 'w', encoding='utf-8') as corpus_file:
        corpus_file.write(corpus)

#### Splitting Brown corpus to train & test sets

In [144]:
# Sequential 80/20 split of Brown's tagged sentences: the leading 80% is used
# for training and the remaining tail is held out for evaluation.
brown_tagged_sents = brown.tagged_sents()
brown_ts_len = len(brown_tagged_sents)
split_index = int(brown_ts_len*0.8)
train_sents, test_sents = (
    brown_tagged_sents[:split_index],
    brown_tagged_sents[split_index:],
)

#### 1. Training & using UnigramTagger

In [138]:
# Train a unigram tagger on the 80% training portion of Brown.
# The class is reached through the already-imported `nltk` package: in this
# notebook the `from nltk.tag import UnigramTagger` cell appears further down,
# so a bare `UnigramTagger` raises NameError on a fresh top-to-bottom run.
unigram_tagger = nltk.UnigramTagger(train_sents)

In [139]:
# Score the unigram tagger against the held-out 20% of Brown's tagged sentences
# (recorded result: about 0.877).
unigram_tagger.evaluate(test_sents)

0.8773754310202373

In [147]:
# Tag the tokens of annotate.txt with the Brown-trained unigram tagger and
# display the (word, tag) pairs; the tags are Brown-style (e.g. 'BED', 'PPSS').
tagged_corpus_by_unigram = unigram_tagger.tag(tokens)
tagged_corpus_by_unigram

[('Lectures', 'NNS-TL'),
 ('were', 'BED'),
 ('really', 'RB'),
 ('good', 'JJ'),
 ('There', 'EX'),
 ('were', 'BED'),
 ('lot', 'NN'),
 ('of', 'IN'),
 ('people', 'NNS'),
 ('who', 'WPS'),
 ('came', 'VBD'),
 ('their', 'PP$'),
 ('without', 'IN'),
 ('any', 'DTI'),
 ('Java', 'NP'),
 ('knowledge', 'NN'),
 ('and', 'CC'),
 ('yet', 'RB'),
 ('you', 'PPSS'),
 ('were', 'BED'),
 ('very', 'QL'),
 ('supportive', 'JJ'),
 ('for', 'IN'),
 ('me', 'PPO'),
 ('Its', 'PP$'),
 ('really', 'RB'),
 ('good', 'JJ'),
 ('that', 'CS'),
 ('the', 'AT'),
 ('lecturer', 'NN'),
 ('explains', 'VBZ'),
 ('most', 'QL'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('concepts', 'NNS'),
 ('using', 'VBG'),
 ('examples', 'NNS'),
 ('It', 'PPS'),
 ('helps', 'VBZ'),
 ('us', 'PPO'),
 ('to', 'TO'),
 ('understand', 'VB'),
 ('them', 'PPO'),
 ('better', 'JJR'),
 ('But', 'CC'),
 ('if', 'CS'),
 ('you', 'PPSS'),
 ('are', 'BER'),
 ('sitting', 'VBG'),
 ('at', 'IN'),
 ('the', 'AT'),
 ('back', 'RB'),
 ('you', 'PPSS'),
 ('cant', 'NN'),
 ('properly', 'RB'),
 ('see

In [148]:
# Pass the unigram tagging through create_tagged_corpus() and write the
# resulting string to tagged_corpus_by_unigram.txt.
save_corpus('tagged_corpus_by_unigram.txt', create_tagged_corpus(tagged_corpus_by_unigram))

In [119]:
# NOTE(review): the recorded output for this cell is a TypeError — the
# constructor also requires a `rules` argument, so a BrillTagger cannot be built
# from tagged sentences alone. Presumably the tagger is meant to be produced by
# NLTK's BrillTaggerTrainer(...).train(...) instead — confirm against the NLTK docs.
brill_tagger = BrillTagger(brown.tagged_sents())

TypeError: __init__() missing 1 required positional argument: 'rules'

In [118]:
# NOTE(review): the recorded output for this cell is a TypeError — the
# constructor requires `states`, `transitions`, `outputs` and `priors` in
# addition to its first argument, so it does not train from tagged sentences.
# Presumably a training entry point such as HiddenMarkovModelTagger.train(...)
# is what was intended — confirm against the NLTK docs.
hmm_tagger = HiddenMarkovModelTagger(brown.tagged_sents())

TypeError: __init__() missing 4 required positional arguments: 'states', 'transitions', 'outputs', and 'priors'

In [117]:
# Display the unigram tagging of `tokens` (the same call whose result is stored
# in tagged_corpus_by_unigram in an earlier cell).
unigram_tagger.tag(tokens)

[('Lectures', 'NNS-TL'),
 ('were', 'BED'),
 ('really', 'RB'),
 ('good', 'JJ'),
 ('There', 'EX'),
 ('were', 'BED'),
 ('lot', 'NN'),
 ('of', 'IN'),
 ('people', 'NNS'),
 ('who', 'WPS'),
 ('came', 'VBD'),
 ('their', 'PP$'),
 ('without', 'IN'),
 ('any', 'DTI'),
 ('Java', 'NP'),
 ('knowledge', 'NN'),
 ('and', 'CC'),
 ('yet', 'RB'),
 ('you', 'PPSS'),
 ('were', 'BED'),
 ('very', 'QL'),
 ('supportive', 'JJ'),
 ('for', 'IN'),
 ('me', 'PPO'),
 ('Its', 'PP$'),
 ('really', 'RB'),
 ('good', 'JJ'),
 ('that', 'CS'),
 ('the', 'AT'),
 ('lecturer', 'NN'),
 ('explains', 'VBZ'),
 ('most', 'QL'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('concepts', 'NNS'),
 ('using', 'VBG'),
 ('examples', 'NNS'),
 ('It', 'PPS'),
 ('helps', 'VBZ'),
 ('us', 'PPO'),
 ('to', 'TO'),
 ('understand', 'VB'),
 ('them', 'PPO'),
 ('better', 'JJR'),
 ('But', 'CC'),
 ('if', 'CS'),
 ('you', 'PPSS'),
 ('are', 'BER'),
 ('sitting', 'VBG'),
 ('at', 'IN'),
 ('the', 'AT'),
 ('back', 'RB'),
 ('you', 'PPSS'),
 ('cant', 'NN'),
 ('properly', 'RB'),
 ('see

#### 1. Penn Treebank Tagset tagger (default)

In [13]:
# Tag the tokens with nltk.pos_tag using its default settings and display the
# result; the tags are Penn Treebank style (e.g. 'VBD', 'PRP$').
penn_tagged = nltk.pos_tag(tokens)
penn_tagged

[('Lectures', 'NNS'),
 ('were', 'VBD'),
 ('really', 'RB'),
 ('good', 'JJ'),
 ('There', 'EX'),
 ('were', 'VBD'),
 ('lot', 'NN'),
 ('of', 'IN'),
 ('people', 'NNS'),
 ('who', 'WP'),
 ('came', 'VBD'),
 ('their', 'PRP$'),
 ('without', 'IN'),
 ('any', 'DT'),
 ('Java', 'NNP'),
 ('knowledge', 'NN'),
 ('and', 'CC'),
 ('yet', 'RB'),
 ('you', 'PRP'),
 ('were', 'VBD'),
 ('very', 'RB'),
 ('supportive', 'JJ'),
 ('for', 'IN'),
 ('me', 'PRP'),
 ('Its', 'PRP$'),
 ('really', 'RB'),
 ('good', 'JJ'),
 ('that', 'IN'),
 ('the', 'DT'),
 ('lecturer', 'NN'),
 ('explains', 'VBZ'),
 ('most', 'JJS'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('concepts', 'NNS'),
 ('using', 'VBG'),
 ('examples', 'NNS'),
 ('It', 'PRP'),
 ('helps', 'VBZ'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('understand', 'VB'),
 ('them', 'PRP'),
 ('better', 'JJR'),
 ('But', 'CC'),
 ('if', 'IN'),
 ('you', 'PRP'),
 ('are', 'VBP'),
 ('sitting', 'VBG'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('back', 'NN'),
 ('you', 'PRP'),
 ('cant', 'VBP'),
 ('properly', 'JJ'),
 ('see', 

In [14]:
# Serialize the Penn-tagged tokens and write them to tagged_penn.txt.
save_corpus('tagged_penn.txt', create_tagged_corpus(penn_tagged))

#### 2. Universal Tagset tagger

In [15]:
# Same nltk.pos_tag call, but requesting the 'universal' tag set; the displayed
# result uses coarse tags such as 'NOUN', 'VERB' and 'ADP'.
univ_tagged = nltk.pos_tag(tokens, tagset='universal')
univ_tagged

[('Lectures', 'NOUN'),
 ('were', 'VERB'),
 ('really', 'ADV'),
 ('good', 'ADJ'),
 ('There', 'DET'),
 ('were', 'VERB'),
 ('lot', 'NOUN'),
 ('of', 'ADP'),
 ('people', 'NOUN'),
 ('who', 'PRON'),
 ('came', 'VERB'),
 ('their', 'PRON'),
 ('without', 'ADP'),
 ('any', 'DET'),
 ('Java', 'NOUN'),
 ('knowledge', 'NOUN'),
 ('and', 'CONJ'),
 ('yet', 'ADV'),
 ('you', 'PRON'),
 ('were', 'VERB'),
 ('very', 'ADV'),
 ('supportive', 'ADJ'),
 ('for', 'ADP'),
 ('me', 'PRON'),
 ('Its', 'PRON'),
 ('really', 'ADV'),
 ('good', 'ADJ'),
 ('that', 'ADP'),
 ('the', 'DET'),
 ('lecturer', 'NOUN'),
 ('explains', 'VERB'),
 ('most', 'ADJ'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('concepts', 'NOUN'),
 ('using', 'VERB'),
 ('examples', 'NOUN'),
 ('It', 'PRON'),
 ('helps', 'VERB'),
 ('us', 'PRON'),
 ('to', 'PRT'),
 ('understand', 'VERB'),
 ('them', 'PRON'),
 ('better', 'ADJ'),
 ('But', 'CONJ'),
 ('if', 'ADP'),
 ('you', 'PRON'),
 ('are', 'VERB'),
 ('sitting', 'VERB'),
 ('at', 'ADP'),
 ('the', 'DET'),
 ('back', 'NOUN'),
 ('you', 

In [16]:
# Pass the universal-tagset tagging through create_tagged_corpus() and write the
# resulting string to tagged_univ.txt.
save_corpus('tagged_univ.txt', create_tagged_corpus(univ_tagged))

#### 3. POS tagger

In [36]:
from nltk.tag import UnigramTagger
from nltk.tag import RegexpTagger

In [38]:
# NOTE(review): RegexpTagger is given the (word, tag) pairs produced by
# nltk.pos_tag. Presumably it expects (regex pattern, tag) pairs, in which case
# each token here is treated as a pattern — confirm this is intended.
# The recorded repr shows size=2075, matching the number of tagged tokens.
rt = RegexpTagger(penn_tagged)
rt

<Regexp Tagger: size=2075>

In [53]:
from nltk.data import load

In [65]:
# Load the pickled CLAWS5 tag-set help table from the NLTK data; the displayed
# value is a dict mapping tag -> (description, examples).
load('help/tagsets/claws5_tagset.pickle')

{'VBI': ('infinitive of the verb "BE"', 'be '),
 'NULL': ('the null tag (for items not to be tagged)', ''),
 'VBN': ('past participle of the verb "BE"', 'been '),
 'PRF': ('the preposition OF', 'of '),
 'VBB': ('the "base forms" of the verb "BE" (except the infinitive)',
  'am, are '),
 'VHZ': ('-s form of the verb "HAVE"', "has, 's "),
 'CJS': ('subordinating conjunction', 'although, when '),
 'NN2': ('plural noun', 'pencils, geese '),
 'VBG': ('-ing form of the verb "BE"', 'being '),
 'VHG': ('-ing form of the verb "HAVE"', 'having '),
 'VBZ': ('-s form of the verb "BE"', "is, 's "),
 'VHB': ('base form of the verb "HAVE" (except the infinitive)', 'have '),
 'NP0': ('proper noun', 'London, Michael, Mars '),
 'AV0': ('adverb (unmarked)', 'often, well, longer, furthest '),
 'VHN': ('past participle of the verb "HAVE"', 'had '),
 'PRP': ('preposition (except for OF)', 'for, above, to '),
 'VHI': ('infinitive of the verb "HAVE"', 'have '),
 'AJC': ('comparative adjective', 'better, older

In [62]:
# Load the pickled Penn Treebank tag-set help table from the NLTK data; the
# displayed value is a dict mapping tag -> (description, examples).
load('help/tagsets/upenn_tagset.pickle')

{'LS': ('list item marker',
  'A A. B B. C C. D E F First G H I J K One SP-44001 SP-44002 SP-44005 SP-44007 Second Third Three Two * a b c d first five four one six three two '),
 'TO': ('"to" as preposition or infinitive marker', 'to '),
 'VBN': ('verb, past participle',
  'multihulled dilapidated aerosolized chaired languished panelized used experimented flourished imitated reunifed factored condensed sheared unsettled primed dubbed desired ... '),
 "''": ('closing quotation mark', "' '' "),
 'WP': ('WH-pronoun',
  'that what whatever whatsoever which who whom whosoever '),
 'UH': ('interjection',
  'Goodbye Goody Gosh Wow Jeepers Jee-sus Hubba Hey Kee-reist Oops amen huh howdy uh dammit whammo shucks heck anyways whodunnit honey golly man baby diddle hush sonuvabitch ... '),
 'VBG': ('verb, present participle or gerund',
  "telegraphing stirring focusing angering judging stalling lactating hankerin' alleging veering capping approaching traveling besieging encrypting interrupting era

In [35]:
# NOTE(review): the recorded output is "ValueError: too many values to unpack
# (expected 2)". `penn_tagged` is a flat list of (word, tag) pairs, whereas the
# successful call elsewhere in this notebook trains on a list of tagged
# sentences (train_sents) — presumably the input needs that nested shape; confirm.
# This cell also rebinds `unigram_tagger`, replacing the Brown-trained tagger.
unigram_tagger = UnigramTagger(penn_tagged)
unigram_tagger

ValueError: too many values to unpack (expected 2)

In [18]:
# NOTE(review): `moka_tagged` is not assigned in any cell visible in this
# notebook, so this cell presumably fails with NameError on a fresh run —
# confirm, then define it or remove the cell.
save_corpus('tagged_moka.txt', create_tagged_corpus(moka_tagged))

In [19]:
from nltk.tag import UnigramTagger
# NOTE(review): the recorded output is "ValueError: not enough values to unpack
# (expected 2, got 1)". `tokens` holds plain word strings from
# nltk.word_tokenize, not tagged data, so it presumably cannot serve as
# training input — confirm.
tagger = UnigramTagger(tokens)

ValueError: not enough values to unpack (expected 2, got 1)

In [26]:
from nltk.corpus import brown

In [27]:
# Preview the first ten (word, tag) pairs of the Brown corpus.
brown.tagged_words()[:10]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN')]

In [28]:
# Number of tagged tokens produced for annotate.txt (recorded output: 2075).
len(nltk.pos_tag(tokens))

2075

In [29]:
from nltk.tag import UnigramTagger

In [31]:
# Brown corpus as a sequence of sentences, each a list of (word, tag) pairs.
ts = brown.tagged_sents()

In [32]:
# Display the tagged sentences. The recorded output is very long; a slice such
# as ts[:2] would be enough to show the structure.
ts

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

In [24]:
# Train a unigram tagger on the first 500 tagged sentences in `ts`.
# NOTE(review): the recorded output is "NameError: name 'ts' is not defined" —
# this cell's execution count (24) precedes that of the cell assigning `ts` (31),
# so the saved output comes from an out-of-order run.
tagger = UnigramTagger(ts[:500])

NameError: name 'ts' is not defined

In [25]:
# NOTE(review): the recorded output is "NameError: name 'tagger' is not defined",
# i.e. this cell ran when no `tagger` had been created successfully.
tagger.evaluate(tokens)

NameError: name 'tagger' is not defined

In [211]:
# NOTE(review): the recorded output is "ValueError: not enough values to unpack
# (expected 2, got 1)". `tokens` is a list of plain word strings, while the
# successful evaluate() call elsewhere in this notebook receives tagged
# sentences (test_sents) — presumably gold-standard tagged data is required; confirm.
tagger.evaluate(tokens)

ValueError: not enough values to unpack (expected 2, got 1)