In [6]:
import numpy as np
import pandas as pd
import nltk
import random
from sklearn.model_selection import train_test_split
import pprint, time

nltk.download('treebank')
nltk.download('universal_tagset')

nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset = 'universal'))

print(nltk_data[:2])

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


In [7]:
train_tagged_words = [tup for sent in train_set for tup in sent]
test_tagged_words = [tup for sent in test_set for tup in sent]
print(len(train_tagged_words))
print(len(test_tagged_words))

80310
20366


In [8]:
for sent in nltk_data[:2]:
    for tuple in sent:
        print(tuple)

('Pierre', 'NOUN')
('Vinken', 'NOUN')
(',', '.')
('61', 'NUM')
('years', 'NOUN')
('old', 'ADJ')
(',', '.')
('will', 'VERB')
('join', 'VERB')
('the', 'DET')
('board', 'NOUN')
('as', 'ADP')
('a', 'DET')
('nonexecutive', 'ADJ')
('director', 'NOUN')
('Nov.', 'NOUN')
('29', 'NUM')
('.', '.')
('Mr.', 'NOUN')
('Vinken', 'NOUN')
('is', 'VERB')
('chairman', 'NOUN')
('of', 'ADP')
('Elsevier', 'NOUN')
('N.V.', 'NOUN')
(',', '.')
('the', 'DET')
('Dutch', 'NOUN')
('publishing', 'VERB')
('group', 'NOUN')
('.', '.')


In [9]:
train_set, test_set = train_test_split(nltk_data, train_size = 0.8, random_state = 101)

In [10]:
print(train_set[0])
print(train_tagged_words[0])

[('Drink', 'NOUN'), ('Carrier', 'NOUN'), ('Competes', 'VERB'), ('With', 'ADP'), ('Cartons', 'NOUN')]
('Drink', 'NOUN')


In [11]:
tags = {tag for word, tag in train_tagged_words}
print(len(tags))
print(tags)

vocab = {word for word, tag in train_tagged_words}
print(len(vocab))

12
{'PRON', 'CONJ', 'ADP', 'NUM', 'ADV', 'PRT', 'VERB', 'X', 'DET', 'ADJ', '.', 'NOUN'}
11052


In [12]:
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1] == tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0] == word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [13]:
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t == t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index] == t1 and tags[index + 1] == t2:
            count_t2_t1 += 1
            return (count_t2_t1, count_t1)

In [14]:
tags_matrix = np.zeros((len(tags), len(tags)), dtype = 'float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)):
        tags_matrix[i ,j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [15]:
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index = list(tags))
display(tags_df)

Unnamed: 0,PRON,CONJ,ADP,NUM,ADV,PRT,VERB,X,DET,ADJ,.,NOUN
PRON,0.000456,0.000456,0.000456,0.000456,0.000456,0.000456,0.000456,0.000456,0.000456,0.000456,0.000456,0.000456
CONJ,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549
ADP,0.000127,0.000127,0.000127,0.000127,0.000127,0.000127,0.000127,0.000127,0.000127,0.000127,0.000127,0.000127
NUM,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357
ADV,0.000388,0.000388,0.000388,0.000388,0.000388,0.000388,0.000388,0.000388,0.000388,0.000388,0.000388,0.000388
PRT,0.000391,0.000391,0.000391,0.000391,0.000391,0.000391,0.000391,0.000391,0.000391,0.000391,0.000391,0.000391
VERB,9.2e-05,9.2e-05,9.2e-05,9.2e-05,9.2e-05,9.2e-05,9.2e-05,9.2e-05,9.2e-05,9.2e-05,9.2e-05,9.2e-05
X,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192
DET,0.000144,0.000144,0.000144,0.000144,0.000144,0.000144,0.000144,0.000144,0.000144,0.000144,0.000144,0.000144
ADJ,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194


In [16]:
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set(pair[1] for pair in train_bag))
    
    for key, word in enumerate(words):
        p = []
        for tag in T:
            if key == 0:
                trainsition_p = tags_df.loc['.',tag]
            else:
                trainsition_p = tags_df.loc[state[-1], tag]
            
            emission_p = word_given_tag(words[key], tag)[0]/ word_given_tag(words[key], tag)[1]
            state_probability = emission_p * trainsition_p
            p.append(state_probability)
        
        pmax = max(p)
        state_max = T[p.index(pmax)]
        state.append(state_max)
    return list(zip(words, state))

In [17]:
random.seed(1234)
rndom = [random.randint(1, len(test_set)) for x in range(10)]
test_run = [test_set[i] for i in rndom]
test_run_base = [tup for sent in test_run for tup in sent]
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [18]:
tagged_seq = Viterbi(test_tagged_words)
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j]

accuracy = len(check)/ len(tagged_seq)
print('Viterbi Algorithm Accuracy: ', accuracy*100)

Viterbi Algorithm Accuracy:  92.82296650717703


Use Libary

In [26]:
from nltk.corpus import treebank

train_data = treebank.tagged_sents()[:1000]
train_data

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]

In [19]:
from nltk.tag import hmm

trainer = hmm.HiddenMarkovModelTrainer()
# target = trainer.train_supervised(train_data)