In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict
from mlreading_hub.pml.mle_hmm import *

In [2]:
def loadData(PATH):
    """Read a whitespace-separated tagged corpus into per-sentence lists.

    Every line is stripped and split on single spaces. A line with at least
    two fields contributes its first field as a word and its second field as
    that word's tag (further fields are ignored). Any other line (normally a
    blank line) closes the sentence collected so far.

    Compared with a plain "append on every separator" reader, this version
    - keeps the last sentence even when the file has no trailing blank line,
    - never emits empty sentences for leading or repeated separator lines,
    - streams the file instead of loading all lines into memory first.

    Parameters
    ----------
    PATH : str or path-like
        Location of the corpus file.

    Returns
    -------
    words : list[list[str]]
        One list of words per sentence.
    tags : list[list[str]]
        One list of tags per sentence, aligned with ``words``.
    vocab : set[str]
        Distinct words seen in the file.
    vocabtag : set[str]
        Distinct tags seen in the file.
    """
    words, tags = [], []
    vocab, vocabtag = set(), set()
    sent, senttag = [], []
    with open(PATH) as f:
        for line in f:
            tokens = line.strip().split(" ")
            if len(tokens) > 1:
                sent.append(tokens[0])
                senttag.append(tokens[1])
                vocab.add(tokens[0])
                vocabtag.add(tokens[1])
            elif sent:
                # Separator line: close the current sentence, but only if it
                # actually holds tokens (skips repeated/leading separators).
                words.append(sent)
                tags.append(senttag)
                sent, senttag = [], []
    if sent:
        # File ended without a trailing separator: keep the pending sentence.
        words.append(sent)
        tags.append(senttag)
    return words, tags, vocab, vocabtag

#### Part of Speech Tagging
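- Data: CoNLL-2000 chunking shared-task corpus (each token line also carries a part-of-speech tag, which is what is used here): https://www.clips.uantwerpen.be/conll2000/chunking/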

In [3]:
# Load the training split. The path is an absolute local path, so it has to
# be edited before the notebook can run on another machine.
# loadData returns per-sentence word lists, per-sentence tag lists, and the
# sets of distinct words and distinct tags.
trainDataPATH = "/home/yui/Documents/data/nlp/pos/train.txt"
words,tags,vocab,vocabtag = loadData(trainDataPATH)

In [4]:
# Load the test split (absolute local path, same caveat as the training path).
# The two vocabulary sets returned for the test file are discarded (bound to
# `_`), so `vocab` and `vocabtag` keep describing the training data only.
testDataPATH = "/home/yui/Documents/data/nlp/pos/test.txt"
words_,tags_,_,_ = loadData(testDataPATH)

In [5]:
# Sanity check of the loaded data: size of the training vocabulary and tag
# set, then the number of sentences per split. The word-list and tag-list
# counts are printed side by side; the two numbers on each line should match.
print("Number of vocabularies: ",len(vocab))
print("Number of tags: ",len(vocabtag))
print("Number of sentences for training: ",len(words),len(tags))
print("Number of sentences for testing: ",len(words_),len(tags_))

Number of vocabularies:  19122
Number of tags:  44
Number of sentences for training:  8936 8936
Number of sentences for testing:  2012 2012


In [6]:
# Build the HMM and fit it on the training sentences, one sentence per call.
# HMM and add_patch are not defined in this notebook; presumably they come
# from the star import of mlreading_hub.pml.mle_hmm.
model = HMM()
# NOTE(review): the method name reads "hiddens and observations", but the word
# vocabulary is passed first and the tag set second, whereas learn() below is
# given tags before words — confirm the expected argument order in the HMM API.
# NOTE(review): list(set) of strings has no stable order across interpreter
# runs (string hashing is randomized by default); presumably this only changes
# internal indexing — confirm if run-to-run reproducibility matters.
model.setDistinctHiddensAndObservations(list(vocab),
            list(vocabtag))
# add_patch's effect on the model is not visible from here.
model = add_patch(model)
# One learn() call per training sentence: tag sequence first, word sequence second.
for i in tqdm(range(len(words))):
    model.learn(tags[i],words[i])

100%|██████████| 8936/8936 [46:25<00:00,  3.21it/s]


#### Metrics
1. Accuracy $\frac{TP+TN}{TP+TN+FP+FN}$
2. Precision $\frac{TP}{TP+FP}$
3. Recall $\frac{TP}{TP+FN}$
4. F1 score $2\frac{P\times R}{P+R}$, where $P$ is the precision and $R$ is the recall


In [8]:
def evalPerformance(model,words,tags,name="Train"):
    """Print the token-level tagging accuracy of ``model`` on a set of sentences.

    Each sentence ``words[i]`` is passed to ``model.decode``; the second
    element of the returned pair is taken as the predicted tag sequence and
    compared position by position with ``tags[i]``.

    A sentence whose decoding or scoring raises an ``Exception`` is skipped as
    a whole (no partial counts) and tallied; the number of skipped sentences
    is printed so the accuracy cannot silently cover only part of the data.
    ``KeyboardInterrupt`` and other non-``Exception`` signals are not caught,
    so a long evaluation can still be interrupted.

    Parameters
    ----------
    model : object
        Anything exposing ``decode(sentence)`` that returns a pair whose
        second element is an indexable sequence of predicted tags.
    words : sequence of sequences
        Sentences to tag.
    tags : sequence of sequences
        Reference tags, aligned with ``words``.
    name : str
        Label used as a prefix in the printed report.

    Returns
    -------
    None
        The result is only printed. When no token could be scored the
        accuracy is reported as ``nan`` instead of raising ZeroDivisionError.
    """
    correct, total, skipped = 0, 0, 0
    for i in tqdm(range(len(words))):
        try:
            _, predicted = model.decode(words[i])
            gold = tags[i]
            # Score the full sentence before touching the running totals so
            # that a failure halfway through leaves them untouched.
            hits = sum(predicted[j] == gold[j] for j in range(len(predicted)))
            n_tokens = len(predicted)
        except Exception:
            # Best effort: keep evaluating, but remember that this sentence
            # did not contribute to the score.
            skipped += 1
            continue
        correct += hits
        total += n_tokens
    accuracy = correct / total if total else float("nan")
    print("{} Accuracy: ".format(name), accuracy)
    if skipped:
        print("{}: skipped {} of {} sentences that could not be decoded or scored".format(
            name, skipped, len(words)))

In [9]:
# Report token-level accuracy on the training sentences and on the test
# sentences. evalPerformance skips sentences that raise during decoding, so
# each figure may cover only part of its split.
evalPerformance(model,words,tags,name="Train")
evalPerformance(model,words_,tags_,name="Test")

100%|██████████| 8936/8936 [04:37<00:00, 32.21it/s]
  1%|          | 22/2012 [00:00<00:28, 69.33it/s] 

Train Accuracy:  0.9231415927113689


100%|██████████| 2012/2012 [00:27<00:00, 74.36it/s]

Test Accuracy:  0.9368231046931408



