# Lab06

# POS Tagging


## Regular Expressioon Tagging

In [45]:
import nltk

nltk.download('punkt')
nltk.download('brown')

from nltk import word_tokenize
from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

[nltk_data] Downloading package punkt to /Users/jessicaxu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /Users/jessicaxu/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [46]:
 patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                     # nouns (default)
    ]

In [47]:
regexp_tagger = nltk.RegexpTagger(patterns)

print(brown_sents[3])
print(regexp_tagger.tag(brown_sents[3]))

['``', 'Only', 'a', 'relative', 'handful', 'of', 'such', 'reports', 'was', 'received', "''", ',', 'the', 'jury', 'said', ',', '``', 'considering', 'the', 'widespread', 'interest', 'in', 'the', 'election', ',', 'the', 'number', 'of', 'voters', 'and', 'the', 'size', 'of', 'this', 'city', "''", '.']
[('``', 'NN'), ('Only', 'NN'), ('a', 'NN'), ('relative', 'NN'), ('handful', 'NN'), ('of', 'NN'), ('such', 'NN'), ('reports', 'NNS'), ('was', 'NNS'), ('received', 'VBD'), ("''", 'NN'), (',', 'NN'), ('the', 'NN'), ('jury', 'NN'), ('said', 'NN'), (',', 'NN'), ('``', 'NN'), ('considering', 'VBG'), ('the', 'NN'), ('widespread', 'NN'), ('interest', 'NN'), ('in', 'NN'), ('the', 'NN'), ('election', 'NN'), (',', 'NN'), ('the', 'NN'), ('number', 'NN'), ('of', 'NN'), ('voters', 'NNS'), ('and', 'NN'), ('the', 'NN'), ('size', 'NN'), ('of', 'NN'), ('this', 'NNS'), ('city', 'NN'), ("''", 'NN'), ('.', 'NN')]


In [48]:
regexp_tagger.evaluate(brown_tagged_sents)

0.20326391789486245

In [49]:
raw = 'This race is awesome, I want to race too'
tokens = word_tokenize(raw)

print(regexp_tagger.tag(tokens))

[('This', 'NNS'), ('race', 'NN'), ('is', 'NNS'), ('awesome', 'NN'), (',', 'NN'), ('I', 'NN'), ('want', 'NN'), ('to', 'NN'), ('race', 'NN'), ('too', 'NN')]


# Hidden Markov Models 



In [50]:
# Hidden Markov Models in Python
# Katrin Erk, March 2013 updated March 2016
#
# This HMM addresses the problem of part-of-speech tagging. It estimates
# the probability of a tag sequence for a given word sequence as follows:
#
# Say words = w1....wN
# and tags = t1..tN
#
# then
# P(tags | words) is_proportional_to  product P(ti | t{i-1}) P(wi | ti)
#
# To find the best tag sequence for a given sequence of words,
# we want to find the tag sequence that has the maximum P(tags | words)
import nltk
import sys
nltk.download('brown')

from nltk.corpus import brown
from nltk.corpus import treebank


[nltk_data] Downloading package brown to /Users/jessicaxu/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [13]:
# Estimating P(wi | ti) from corpus data using Maximum Likelihood Estimation (MLE):
# P(wi | ti) = count(wi, ti) / count(ti)
#
# We add an artificial "start" tag at the beginning of each sentence, and
# We add an artificial "end" tag at the end of each sentence.
# So we start out with the brown tagged sentences,
# add the two artificial tags,
# and then make one long list of all the tag/word pairs.

brown_tags_words = []
brown_tagged_sents = brown.tagged_sents()

for sent in brown_tagged_sents:
    # sent is a list of word/tag pairs
    # add START/START at the beginning
    brown_tags_words.append( ("START", "START") )
    # then all the tag/word pairs for the word/tag pairs in the sentence.
    # shorten tags to 2 characters each
    brown_tags_words.extend([ (tag[:2], word) for (word, tag) in sent ])
    # then END/END
    brown_tags_words.append( ("END", "END") )

# conditional frequency distribution
cfd_tagwords = nltk.ConditionalFreqDist(brown_tags_words)
# conditional probability distribution
cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

print("The probability of an adjective (JJ) being 'new' is", cpd_tagwords["JJ"].prob("new"))
print("The probability of a verb (VB) being 'duck' is", cpd_tagwords["VB"].prob("duck"))

# Estimating P(ti | t{i-1}) from corpus data using Maximum Likelihood Estimation (MLE):
# P(ti | t{i-1}) = count(t{i-1}, ti) / count(t{i-1})
brown_tags = [tag for (tag, word) in brown_tags_words ]

# make conditional frequency distribution:
# count(t{i-1} ti)
cfd_tags= nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))
# make conditional probability distribution, using
# maximum likelihood estimate:
# P(ti | t{i-1})
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

print("If we have just seen 'DT', the probability of 'NN' is", cpd_tags["DT"].prob("NN"))
print( "If we have just seen 'VB', the probability of 'JJ' is", cpd_tags["VB"].prob("DT"))
print( "If we have just seen 'VB', the probability of 'NN' is", cpd_tags["VB"].prob("NN"))


The probability of an adjective (JJ) being 'new' is 0.01472344917632025
The probability of a verb (VB) being 'duck' is 6.042713350943527e-05
If we have just seen 'DT', the probability of 'NN' is 0.5057722522030194
If we have just seen 'VB', the probability of 'JJ' is 0.016885067592065053
If we have just seen 'VB', the probability of 'NN' is 0.10970977711020183


In [14]:
#####
# Viterbi:
# If we have a word sequence, what is the best tag sequence?
#
# The method above lets us determine the probability for a single tag sequence.
# But in order to find the best tag sequence, we need the probability
# for _all_ tag sequence.
# What Viterbi gives us is just a good way of computing all those many probabilities
# as fast as possible.

# what is the list of all tags?
distinct_tags = set(brown_tags)

sentence = ["This", "race", "is", "awesome", ",", "I", "want", "to", "race", "too" ]
#sentence = ["I", "saw", "her", "duck" ]
sentlen = len(sentence)

# viterbi:
# for each step i in 1 .. sentlen,
# store a dictionary
# that maps each tag X
# to the probability of the best tag sequence of length i that ends in X
viterbi = [ ]

# backpointer:
# for each step i in 1..sentlen,
# store a dictionary
# that maps each tag X
# to the previous tag in the best tag sequence of length i that ends in X
backpointer = [ ]

first_viterbi = { }
first_backpointer = { }
for tag in distinct_tags:
    # don't record anything for the START tag
    if tag == "START": continue
    first_viterbi[ tag ] = cpd_tags["START"].prob(tag) * cpd_tagwords[tag].prob( sentence[0] )
    first_backpointer[ tag ] = "START"

print(first_viterbi)
print(first_backpointer)
    
viterbi.append(first_viterbi)
backpointer.append(first_backpointer)

currbest = max(first_viterbi.keys(), key = lambda tag: first_viterbi[ tag ])
print( "Word", "'" + sentence[0] + "'", "current best two-tag sequence:", first_backpointer[ currbest], currbest)
# print( "Word", "'" + sentence[0] + "'", "current best tag:", currbest)

for wordindex in range(1, len(sentence)):
    this_viterbi = { }
    this_backpointer = { }
    prev_viterbi = viterbi[-1]
    
    for tag in distinct_tags:
        # don't record anything for the START tag
        if tag == "START": continue

        # if this tag is X and the current word is w, then 
        # find the previous tag Y such that
        # the best tag sequence that ends in X
        # actually ends in Y X
        # that is, the Y that maximizes
        # prev_viterbi[ Y ] * P(X | Y) * P( w | X)
        # The following command has the same notation
        # that you saw in the sorted() command.
        best_previous = max(prev_viterbi.keys(),
                            key = lambda prevtag: \
            prev_viterbi[ prevtag ] * cpd_tags[prevtag].prob(tag) * cpd_tagwords[tag].prob(sentence[wordindex]))

        # Instead, we can also use the following longer code:
        # best_previous = None
        # best_prob = 0.0
        # for prevtag in distinct_tags:
        #    prob = prev_viterbi[ prevtag ] * cpd_tags[prevtag].prob(tag) * cpd_tagwords[tag].prob(sentence[wordindex])
        #    if prob > best_prob:
        #        best_previous= prevtag
        #        best_prob = prob
        #
        this_viterbi[ tag ] = prev_viterbi[ best_previous] * \
            cpd_tags[ best_previous ].prob(tag) * cpd_tagwords[ tag].prob(sentence[wordindex])
        this_backpointer[ tag ] = best_previous

    currbest = max(this_viterbi.keys(), key = lambda tag: this_viterbi[ tag ])
    print( "Word", "'" + sentence[ wordindex] + "'", "current best two-tag sequence:", this_backpointer[ currbest], currbest)
    # print( "Word", "'" + sentence[ wordindex] + "'", "current best tag:", currbest)


    # done with all tags in this iteration
    # so store the current viterbi step
    viterbi.append(this_viterbi)
    backpointer.append(this_backpointer)


# done with all words in the sentence.
# now find the probability of each tag
# to have "END" as the next tag,
# and use that to find the overall best sequence
prev_viterbi = viterbi[-1]
best_previous = max(prev_viterbi.keys(),
                    key = lambda prevtag: prev_viterbi[ prevtag ] * cpd_tags[prevtag].prob("END"))

prob_tagsequence = prev_viterbi[ best_previous ] * cpd_tags[ best_previous].prob("END")

# best tagsequence: we store this in reverse for now, will invert later
best_tagsequence = [ "END", best_previous ]
# invert the list of backpointers
backpointer.reverse()

# go backwards through the list of backpointers
# (or in this case forward, because we have inverter the backpointer list)
# in each case:
# the following best tag is the one listed under
# the backpointer for the current best tag
current_best_tag = best_previous
for bp in backpointer:
    best_tagsequence.append(bp[current_best_tag])
    current_best_tag = bp[current_best_tag]

best_tagsequence.reverse()
print( "The sentence was:", end = " ")
for w in sentence: print( w, end = " ")
print("\n")
print( "The best tag sequence is:", end = " ")
for t in best_tagsequence: print (t, end = " ")
print("\n")
print( "The probability of the best tag sequence is:", prob_tagsequence)



{'NR': 0.0, 'WR': 0.0, ':': 0.0, 'MD': 0.0, '--': 0.0, 'VB': 0.0, "'": 0.0, 'HV': 0.0, 'CC': 0.0, 'CS': 0.0, '(': 0.0, '*': 0.0, 'DT': 0.0033218181276236437, 'AB': 0.0, ')-': 0.0, 'FW': 0.0, 'RN': 0.0, 'RB': 0.0, 'END': 0.0, ',': 0.0, '.-': 0.0, '(-': 0.0, '.': 0.0, 'PN': 0.0, 'AT': 0.0, 'PP': 0.0, "''": 0.0, 'BE': 0.0, 'CD': 0.0, ',-': 0.0, 'NN': 0.0, 'JJ': 0.0, 'WD': 0.0, '``': 0.0, 'WP': 0.0, 'RP': 0.0, 'NI': 0.0, 'DO': 0.0, 'TO': 0.0, 'OD': 0.0, 'NP': 0.0, ')': 0.0, '*-': 0.0, 'QL': 0.0, ':-': 0.0, 'EX': 0.0, 'UH': 0.0, 'WQ': 0.0, 'IN': 0.0, 'AP': 0.0}
{'NR': 'START', 'WR': 'START', ':': 'START', 'MD': 'START', '--': 'START', 'VB': 'START', "'": 'START', 'HV': 'START', 'CC': 'START', 'CS': 'START', '(': 'START', '*': 'START', 'DT': 'START', 'AB': 'START', ')-': 'START', 'FW': 'START', 'RN': 'START', 'RB': 'START', 'END': 'START', ',': 'START', '.-': 'START', '(-': 'START', '.': 'START', 'PN': 'START', 'AT': 'START', 'PP': 'START', "''": 'START', 'BE': 'START', 'CD': 'START', ',-': 

The code is implemented by [Katrin Erk](http://www.katrinerk.com/courses/python-worksheets/hidden-markov-models-for-pos-tagging-in-python)

##  Train HMM Tagger with NLTK HMM Trainer

In [15]:
# Pretagged training data
brown_tagged_sents = brown.tagged_sents()

print(brown_tagged_sents)

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

In [16]:
# Import HMM module
from nltk.tag import hmm

# Setup a trainer with default(None) values
# And train with the data
trainer = hmm.HiddenMarkovModelTrainer()
trained_tagger = trainer.train_supervised(brown_tagged_sents)

print (trained_tagger)
# Prints the basic data about the tagger

tokens = word_tokenize("This race is awesome, I want to race too")
print(trained_tagger.tag(tokens))



<HiddenMarkovModelTagger 472 states and 56057 output symbols>
[('This', 'DT'), ('race', 'NN'), ('is', 'BEZ'), ('awesome', 'JJ'), (',', ','), ('I', 'PPSS'), ('want', 'VB'), ('to', 'TO'), ('race', 'VB'), ('too', 'QL')]


# LSTM based POS Tagger (Keras)

## Training data

In [51]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

nltk.download('treebank')
from nltk.corpus import treebank

import numpy as np
from sklearn.model_selection import train_test_split
 

[nltk_data] Downloading package punkt to /Users/jessicaxu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     /Users/jessicaxu/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [52]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))
 

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 100676


In [53]:
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))
 
print(sentences[5])
print(sentence_tags[5])

['Lorillard' 'Inc.' ',' 'the' 'unit' 'of' 'New' 'York-based' 'Loews'
 'Corp.' 'that' '*T*-2' 'makes' 'Kent' 'cigarettes' ',' 'stopped' 'using'
 'crocidolite' 'in' 'its' 'Micronite' 'cigarette' 'filters' 'in' '1956'
 '.']
['NNP' 'NNP' ',' 'DT' 'NN' 'IN' 'JJ' 'JJ' 'NNP' 'NNP' 'WDT' '-NONE-' 'VBZ'
 'NNP' 'NNS' ',' 'VBD' 'VBG' 'NN' 'IN' 'PRP$' 'NN' 'NN' 'NNS' 'IN' 'CD'
 '.']


In [54]:
(train_sentences, 
 test_sentences, 
 train_tags, 
 test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)

### Making vocabs with special tokens: padding (PAD) and unknown (OOV)

*OOV: Out Of Vocabulary*

In [55]:
words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w.lower())

for ts in train_tags:
    for t in ts:
        tags.add(t)

word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 2 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding
tag2index['-OOV-'] = 1  # The special value used for OOVs

def tag_to_index(tag):
    if tag in tag2index:
        return tag2index[tag]
    else:
        return tag2index['-OOV-']

In [56]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []

for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])

    train_sentences_X.append(s_int)

for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])

    test_sentences_X.append(s_int)

for s in train_tags:
    train_tags_y.append([tag_to_index(t) for t in s])

for s in test_tags:
    test_tags_y.append([tag_to_index(t) for t in s])

print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[6646, 5414, 1210, 5961, 2488, 10017, 5961, 10110, 3445, 8328, 10017, 7267, 1322, 5221, 5178, 10017, 5309, 595, 3056, 8787, 3776, 3711, 7486, 10017, 418, 5713, 2583, 2249, 10017, 4234, 10017, 1210, 5961, 9675, 10017, 2167, 8793, 3890, 7573]
[1210, 8787, 8960, 9373, 10017, 9947, 8197, 5360, 8764, 133, 7206, 8142, 5071, 9639, 6775, 4165, 1, 1718, 4342, 3224, 3876, 5836, 5961, 5677, 2281, 9479, 4342, 5177, 8832, 6191, 1297, 9639, 544, 5961, 1049, 4342, 5961, 937, 9371, 10017, 466, 4038, 1, 5077, 5961, 6726, 9371, 7573]
[5, 5, 12, 34, 15, 14, 34, 24, 15, 11, 14, 8, 45, 12, 11, 14, 42, 5, 12, 34, 24, 17, 11, 14, 45, 38, 16, 38, 14, 5, 14, 12, 34, 24, 14, 16, 24, 15, 26]
[12, 34, 24, 11, 14, 24, 11, 5, 25, 5, 28, 24, 45, 44, 28, 5, 8, 5, 12, 36, 36, 12, 34, 36, 36, 36, 12, 27, 13, 38, 45, 44, 28, 34, 15, 12, 34, 36, 36, 14, 35, 45, 13, 12, 34, 36, 36, 26]


### Getting max length of sequence

In [57]:
MAX_LENGTH = len(max(train_sentences_X, key=len))
print(MAX_LENGTH) 

271


In [58]:
from keras.preprocessing.sequence import pad_sequences
 
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')

## Keras Model (Bidirectional LSTM)

In [59]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
 

model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 271)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 271, 128)          1305856   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 271, 512)          788480    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 271, 48)           24624     
_________________________________________________________________
activation_2 (Activation)    (None, 271, 48)           0         
Total params: 2,118,960
Trainable params: 2,118,960
Non-trainable params: 0
_________________________________________________________________


In [60]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

## Train model

In [61]:
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))

model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=40, validation_split=0.2)

scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}") 
  
    

Train on 2504 samples, validate on 627 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
acc: 99.18140625131542


## Testing with sentence

In [64]:
test_samples = [
    word_tokenize("This race is awesome, I want to race too.")
]

# Converting sentence (tokens) word to index
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)

test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')

#decode the result to have actual tags
def decode_result(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences


predictions = model.predict(test_samples_X)

print(test_samples)
print(decode_result(predictions, {i: t for t, i in tag2index.items()}))


[['This', 'race', 'is', 'awesome', ',', 'I', 'want', 'to', 'race', 'too', '.']]
[['DT', 'NN', 'VBZ', 'NN', ',', 'PRP', 'VBP', 'TO', 'VB', 'RB', '.', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD

# Exercise



In this exercise, you are required to implement a program to retrieve top 10 frequent words for adjective tag and noun tag.  You are free to choose training dataset but the POS tagger should be either HMM or LSTM based.


For counting words, you can use [FreqDist()](http://www.nltk.org/api/nltk.html?highlight=freqdist) in  NLTK Probability module

In [78]:
import os
import pickle
import numpy as np

import nltk

#nltk.download('punkt')
#nltk.download('brown')

from nltk.corpus import brown
from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.probability import FreqDist

import wikipedia

from nltk.tag import hmm

In [80]:



class POSWordCounter():
    
    _word = ""
    _trained_tagger = None
    _text=""
    noun_words = []
    adj_words = []

    # add more class attributes if required
    
    
    
    def __init__(self, word):
        self._word = word
        self._text = wikipedia.page(self._word).content
        
    def train_pos_tagger():
        # Getting wikipedia contents of "University of Sydney"
       
        
        tokens=word_tokenize(text)
        training=[]
        for s in tokens:
            s_int=[]
            for w in s:
                try:
                    s_int.append(word2index[w.lower()])
                except keyError:
                    s_int.append(word2index['_oov_'])
            training.append(s_int)
        
        training=pad_sequences(training,maxlen=MAX_LENGTH,padding='post')
        #using the model trained above with BiLSTM
        predictions=model.predict(training)
        self._trained_tagger=decode_result(predictions, {i: t for t, i in tag2index.items()})
        
        for i in range(tokens):
            if _trained_tagger[i]=='JJ':
                self.adj_words.append(i)
            elif _trained_tagger[i]=='NN':
                self.noun_words.append(i)
                
       
        

        
    def count_words():

        # conditional frequency distribution
        adj_tagwords = nltk.ConditionalFreqDist(((len(word),word) for word in adj_words))
        noun_tagwords= nltk.ConditionalFreqDist(((len(word),word) for word in noun_words))
        
        self.adj_tagwords=adj_tagwords
        self.noun_tagwords=noun_tagwords
        
        
        

        


    def get_top5_noun_words():
        top5_noun=noun_tagwords[5]

        return top5_noun

    def get_top5_adj_words():
        top5_adj=adj_tagwords[5]
        

        return top5_adj

    # add more class methods if required
    
    
    
    
word = "University of Sydney"
pwc = POSWordCounter(word)
pwc.train_pos_tagger()
pwc.count_words()

print(pwc.get_top5_noun_words())
print(pwc.get_top5_adj_words())

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


## Sample Ouput
```
[('university', 30), ('campus', 8), ('cent', 4), ('program', 3), ('faculty', 3)]
[('Australian', 3), ('new', 3), ('rare', 3), ('current', 2), ('senior', 2)]
```

