# Parts-of-Speech Tagging (POS) Using Hidden Markov Models and Dynamic Programming with the Viterbi Algorithm

# Imports

In [1]:
from collections import Counter
from pos_tagger import PartsOfSpeechTaggerHMM

# Loading Data

This demonstration uses two tagged data sets collected from the **Wall Street Journal (WSJ)**.

[Here](http://relearn.be/2015/training-common-sense/sources/software/pattern-2.6-critical-fork/docs/html/mbsp-tags.html) is an example tag set: a reference listing each two- or three-letter part-of-speech tag and its meaning.
- One data set (**WSJ_02-21.pos**) will be used for **training**.
- The other (**WSJ_24.pos**) for **testing**.

In [2]:
# Relative paths to the tagged WSJ corpora: one file for training, one for testing.
training_corpus_path = "./data/WSJ_02-21.pos"
test_corpus_path = "./data/WSJ_24.pos"

def load_data(path, load_word_list = False):
    """Read a tagged corpus file into a list of raw lines.

    Args:
        path: Location of the corpus file to read.
        load_word_list: When true, also return the first tab-separated
            field of every line.

    Returns:
        The list of lines exactly as read (line endings included) when
        ``load_word_list`` is false; otherwise a ``(corpus, word_list)``
        tuple in which ``word_list`` has one entry per line and a line
        consisting only of a newline is represented by ``""``.
    """
    with open(path, 'r') as handle:
        corpus = handle.readlines()

    # Guard clause: callers that only need the raw lines are done here.
    if not load_word_list:
        return corpus

    word_list = []
    for entry in corpus:
        first_field = entry.split("\t")[0]
        # A bare newline marks a sentence boundary; record it as an empty word.
        word_list.append("" if first_field == "\n" else first_field)

    return corpus, word_list

# Read each corpus once, keeping both the raw lines and the per-line word column.
train_corpus, train_words = load_data(training_corpus_path, load_word_list = True)
test_corpus, test_words = load_data(test_corpus_path, load_word_list = True)

# Extracting Vocab from Corpus Words

In [3]:
def extract_vocab(corpus_words):
    """Build a word-to-index vocabulary from a sequence of corpus words.

    Blank entries (empty or whitespace-only strings) are counted under the
    placeholder token ``"--n--"``. Words that occur more than once are kept,
    the unknown-word placeholder tokens are always added, and indices are
    assigned in sorted order.

    Args:
        corpus_words: Iterable of word strings, one per corpus line.

    Returns:
        dict mapping each vocabulary word to a unique index; the indices
        are exactly ``0 .. len(vocab) - 1``.
    """
    unknown_tokens = (
        "--unk_digit--", "--unk_punct--", "--unk_upper--", "--unk_noun--",
        "--unk_verb--", "--unk_adj--", "--unk_adv--", "--unk--",
    )

    word_counts = Counter(
        word if word.strip() else "--n--" for word in corpus_words
    )

    # A set (rather than list concatenation) keeps a placeholder token that
    # also occurs in the corpus from being listed twice; a duplicate would
    # make enumerate() skip an index and leave the mapping non-contiguous.
    kept_words = {word for word, count in word_counts.items() if count > 1}
    kept_words.update(unknown_tokens)

    return {word: index for index, word in enumerate(sorted(kept_words))}

# Build the word -> index vocabulary from the training words only.
vocab = extract_vocab(train_words)

# Initializing and Fitting the POS Model

In [4]:
# Create the HMM tagger and fit it on the training corpus with the training vocabulary.
# NOTE(review): fit() lives in pos_tagger (not shown here); the recorded output
# shows it printing a "word count" progress line every 50,000 words.
tagger = PartsOfSpeechTaggerHMM()
tagger.fit(vocab, train_corpus)

word count = 50000
word count = 100000
word count = 150000
word count = 200000
word count = 250000
word count = 300000
word count = 350000
word count = 400000
word count = 450000
word count = 500000
word count = 550000
word count = 600000
word count = 650000
word count = 700000
word count = 750000
word count = 800000
word count = 850000
word count = 900000
word count = 950000


# Predicting POS and Calculating Accuracy

In [5]:
# Tag the test words. Per the recorded output, predict_pos prints progress and the
# resulting accuracy. NOTE(review): the structure of the returned predictions is
# not visible here — check pos_tagger.
predictions = tagger.predict_pos(test_words, test_corpus)

Words processed:     5000
Words processed:    10000
Words processed:    15000
Words processed:    20000
Words processed:    25000
Words processed:    30000
Accuracy of the Viterbi algorithm is 0.9545
