In [170]:
# train-hmm.py
from collections import defaultdict

# For a tiny sanity-check corpus, point this at "../../test/05-train-input.txt".
train_input_path = "../../data/wiki-en-train.norm_pos"
model_path = "tutorial04.txt"


def count_hmm_events(lines):
    """Count the events needed to estimate a bigram HMM tagger.

    Args:
        lines: iterable of strings, each a whitespace-separated sequence of
            ``word_TAG`` tokens (one sentence per line).

    Returns:
        A ``(transition, emission, possible_tags)`` tuple of ``defaultdict``s:
        ``transition["prev next"]`` counts tag bigrams (with ``<s>`` / ``</s>``
        as sentence boundaries), ``emission["tag word"]`` counts tag/word
        pairs, and ``possible_tags[tag]`` counts how often each tag (and
        ``<s>``) occurs as a context.

    Raises:
        ValueError: if a token has no ``_`` separator, or an empty word or tag.
    """
    transition = defaultdict(int)
    emission = defaultdict(int)
    possible_tags = defaultdict(int)

    for line in lines:
        previous = "<s>"
        possible_tags[previous] += 1
        for token in line.split():
            # Split on the LAST underscore: the word itself may contain "_",
            # the tag never does.
            word, sep, tag = token.rpartition("_")
            if not sep or not word or not tag:
                raise ValueError(f"expected a word_TAG token, got {token!r}")
            transition[f"{previous} {tag}"] += 1
            possible_tags[tag] += 1
            emission[f"{tag} {word}"] += 1
            previous = tag
        transition[f"{previous} </s>"] += 1

    return transition, emission, possible_tags


def format_hmm_model(transition, emission, possible_tags):
    """Turn raw counts into model-file lines.

    Each count is divided by the count of its context tag (the first field of
    the key), giving ``"T prev next prob"`` lines followed by
    ``"E tag word prob"`` lines, in the insertion order of the count tables.

    Returns:
        list of str, without trailing newlines.
    """
    model_lines = []
    for kind, counts in (("T", transition), ("E", emission)):
        for key, count in counts.items():
            context = key.split(" ", 1)[0]
            model_lines.append(f"{kind} {key} {count / possible_tags[context]}")
    return model_lines


# True both inside a notebook kernel and when the cell is run as a script;
# keeps the helpers above importable without touching the file system.
if __name__ == "__main__":
    with open(train_input_path) as f:
        transition, emission, possible_tags = count_hmm_events(f)
    with open(model_path, mode="w") as fw:
        for output in format_hmm_model(transition, emission, possible_tags):
            fw.write(output + "\n")

In [175]:
# test-hmm.py
import math
from collections import defaultdict

input_model_path = "tutorial04.txt"
# For a tiny sanity-check input, point this at "../../test/05-test-input.txt".
test_input_path = "../../data/wiki-en-test.norm"
output_path = "my_answer.pos"
UNKNOWN_RATE = 0.05  # probability mass reserved for unseen (tag, word) pairs
N = 1e6              # vocabulary size used to spread UNKNOWN_RATE uniformly


def load_hmm(lines):
    """Parse model-file lines into probability tables.

    Args:
        lines: iterable of ``"T prev next prob"`` / ``"E tag word prob"``
            strings; blank lines are ignored.

    Returns:
        ``(transition, emission, possible_tags)``. ``transition`` and
        ``emission`` map ``"context word"`` keys to floats; ``possible_tags``
        is used as an insertion-ordered set of every context seen.
    """
    transition = defaultdict(lambda: 0)
    emission = defaultdict(lambda: 0)
    possible_tags = defaultdict(lambda: 0)
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        typ, context, word, prob = fields
        possible_tags[context] = 1
        if typ == "T":
            transition[f"{context} {word}"] = float(prob)
        else:
            emission[f"{context} {word}"] = float(prob)
    return transition, emission, possible_tags


def viterbi_tags(words, transition, emission, possible_tags,
                 unknown_rate=UNKNOWN_RATE, vocab_size=N):
    """Return the lowest-cost tag sequence for ``words`` (Viterbi search).

    Costs are negative log2 probabilities. Emissions are interpolated with a
    uniform unknown-word distribution:
    ``(1 - unknown_rate) * P(word | tag) + unknown_rate / vocab_size``.

    Args:
        words: list of tokens for one sentence.
        transition, emission, possible_tags: tables as returned by
            ``load_hmm``. They are only read (via ``.get``), never grown.
        unknown_rate: interpolation weight of the unknown-word distribution.
        vocab_size: assumed vocabulary size for the unknown-word distribution.

    Returns:
        list of tags, one per word; ``[]`` for an empty sentence.

    Raises:
        ValueError: if the model has no tag path from ``<s>`` to ``</s>`` of
            the required length.
    """
    if not words:
        # A blank input line maps to a blank output line so both files stay
        # line-aligned for grading.
        return []

    start = (0, "<s>")
    best_score = {start: 0.0}
    best_edge = {start: None}
    tags = list(possible_tags)

    # Forward step. Iteration order (prev_tag outer, next_tag inner) and the
    # strict ">" comparison determine tie-breaking, so keep both.
    for i, word in enumerate(words):
        for prev_tag in tags:
            prev_score = best_score.get((i, prev_tag))
            if prev_score is None:
                continue
            for next_tag in tags:
                pt = transition.get(f"{prev_tag} {next_tag}", 0)
                if pt <= 0:
                    continue
                pe = ((1 - unknown_rate) * emission.get(f"{next_tag} {word}", 0)
                      + unknown_rate / vocab_size)
                score = prev_score - math.log(pt, 2) - math.log(pe, 2)
                node = (i + 1, next_tag)
                if node not in best_score or best_score[node] > score:
                    best_score[node] = score
                    best_edge[node] = (i, prev_tag)

    # Transition into </s>. The end marker is not a word, so only the
    # transition probability contributes (no emission term).
    last = len(words)
    end_score = None
    end_edge = None
    for tag in tags:
        tag_score = best_score.get((last, tag))
        pt = transition.get(f"{tag} </s>", 0)
        if tag_score is None or pt <= 0:
            continue
        score = tag_score - math.log(pt, 2)
        if end_edge is None or end_score > score:
            end_score = score
            end_edge = (last, tag)

    if end_edge is None:
        raise ValueError(
            f"model has no tag path from <s> to </s> for a {last}-word sentence"
        )

    # Backward step: follow the back-pointers from </s> to <s>.
    path = []
    node = end_edge
    while node is not None and node != start:
        path.append(node[1])
        node = best_edge[node]
    path.reverse()
    return path


# True both inside a notebook kernel and when the cell is run as a script;
# keeps the helpers above importable without touching the file system.
if __name__ == "__main__":
    with open(input_model_path) as fr:
        transition, emission, possible_tags = load_hmm(fr)

    with open(test_input_path) as f, open(output_path, mode="w") as fw:
        for line in f:
            tags = viterbi_tags(line.split(), transition, emission, possible_tags)
            fw.write(" ".join(tags) + "\n")

In [140]:
# Show the first 10 lines of the model file.
!head -n 10 tutorial04.txt

T <s> X 1.0
T X Y 0.6666666666666666
T Y Z 0.5
T Z </s> 1.0
T X X 0.3333333333333333
T Y </s> 0.5
E X a 0.6666666666666666
E Y b 1.0
E Z a 1.0
E X c 0.3333333333333333


In [141]:
# Show the first 10 lines of the small training fixture.
!head -n 10 ../../test/05-train-input.txt

a_X b_Y a_Z
a_X c_X b_Y


In [142]:
# Show the first 10 lines of the small test fixture.
!head -n 10 ../../test/05-test-input.txt

a b a
a c b


In [143]:
# Show the first 10 lines of the expected model for the small training fixture.
!head -n 10 ../../test/05-train-answer.txt

T <s> X 1.000000
T X X 0.333333
T X Y 0.666667
T Y </s> 0.500000
T Y Z 0.500000
T Z </s> 1.000000
E X a 0.666667
E X c 0.333333
E Y b 1.000000
E Z a 1.000000


In [144]:
# Show the first 10 lines of the expected tags for the small test fixture.
!head -n 10 ../../test/05-test-answer.txt

X Y Z
X X Y


In [176]:
# Show the first 10 lines of the tagger output.
!head -n 10 my_answer.pos

IN JJ NNS , DT NN -LRB- NN -RRB- VBZ DT JJ NN IN JJ NN NN , WDT VBZ DT NN IN VBG DT NN IN DT NN -LRB- FW NN -RRB- VBZ VBN IN DT NN , WRB DT NN VBZ JJ NNS -LRB- NN -RRB- .
DT NN TO DT NN IN JJ NN NN , JJ IN NN , VBG NN IN NN NNS , NN NN , NN , NN FW NN .
NNP VBZ RB RB TO DT NN WRB JJ NNS VB RB JJ NNS IN NN IN DT NN IN NN NNS CC NNS .
DT JJ NN IN NNS VBP VBN VBN , IN JJ NNS WDT VBP DT NN NN IN JJ NNS , TO JJ NN NN NNS IN WDT DT NN VBZ VBN IN DT JJ NN IN DT NN IN RB JJ NNS , TO RB JJ NNS IN NN NN IN NNS , RB JJ NN NNS .
IN DT , JJ NN NNS VBP VBN DT RBS JJ NNS TO NN .
JJ NN VBZ JJ TO NN IN DT NN IN NN .
IN NNP , NN IN DT JJ -LRB- NN -RRB- NN VBZ VBN IN CD NN , IN DT NNS IN JJ NNS VBG IN DT NN .
IN DT NN NNS , JJ NNS IN DT NN TO DT NN VBP VBN VBN IN JJ NN NNS -LRB- FW , NN -RRB- , WRB DT JJ NN IN DT JJS JJ NN IN RB IN DT RBS JJ NN VBD DT NN CC JJ NN , RB .
DT NN VBZ CD -RRB- : `` JJ NN '' CC `` DT NNS '' NN .
DT NN VBZ IN DT NN IN DT JJ NN IN NN NNS WDT VBD RB VBN , IN IN DT JJ DT DT NNS IN

In [177]:
# Show the first 10 lines of the reference tags.
!head -n 10 ../../data/wiki-en-test.pos

IN JJ NNS , JJ NN -LRB- NN -RRB- VBZ DT JJ NN IN JJ NN NN , WDT VBZ DT NN IN VBG WDT NN IN DT NN -LRB- FW NN -RRB- VBZ VBN IN DT NN , WRB DT NN VBZ JJ NNS -LRB- NN -RRB- .
DT NN TO DT NN VBZ JJ JJ NN , JJ IN NN , VBG NN IN NN NNS , NN NN , NN , NN FW FW .
NNP VBZ VBN RB TO DT NN WRB NNP NNS VBP RB JJ NNS IN NN IN DT NN IN NN NNS CC NNS .
DT JJ NN IN NNS VBP VBN VBN , IN JJ NNS WDT VBP DT NN VBN IN JJ NNS , TO JJ NN NN NNS IN WDT DT NN VBZ VBN IN DT JJ NN IN DT NN IN RB JJ NNS , TO RB JJ NNS WDT VBP NNS IN NNS , RB VBG NN NNS .
IN DT , JJ NN NNS VBP VBN DT RBS JJ NNS TO NN .
JJ NN VBZ JJ TO NN IN DT NN IN NNS .
IN NNP , NN IN DT JJ -LRB- JJ -RRB- NN VBZ RB IN CD NN , IN DT NNS IN JJ NNS VBG IN CD NN .
IN JJ NN NNS , JJ NNS IN CD NN TO CD NN VBP VBN VBN IN JJ NN NNS -LRB- NN , NN -RRB- , WRB DT NN NN IN DT JJS JJ NN IN RB VBG DT RBS JJ NN VBD CD NN CC CD NN , RB .
NN NN VBZ CD NNS : `` JJ NN '' CC `` DT NNS '' NN .
DT JJ VBZ VBG DT NNS IN DT JJ NN IN NN NNS WDT VBD RB VBN , IN IN DT JJ P

In [178]:
# Show the first 10 lines of the untagged test sentences.
!head -n 10 ../../data/wiki-en-test.norm

In computational linguistics , word-sense disambiguation -LRB- WSD -RRB- is an open problem of natural language processing , which governs the process of identifying which sense of a word -LRB- i.e. meaning -RRB- is used in a sentence , when the word has multiple meanings -LRB- polysemy -RRB- .
The solution to this problem impacts other computer-related writing , such as discourse , improving relevance of search engines , anaphora resolution , coherence , inference et cetera .
Research has progressed steadily to the point where WSD systems achieve sufficiently high levels of accuracy on a variety of word types and ambiguities .
A rich variety of techniques have been researched , from dictionary-based methods that use the knowledge encoded in lexical resources , to supervised machine learning methods in which a classifier is trained for each distinct word on a corpus of manually sense-annotated examples , to completely unsupervised methods that cluster occurrences of words , thereby ind

In [179]:
# Run gradepos.pl on the reference tags (first argument) and the tagger output
# (second argument); it prints the accuracy and the most common mistakes.
!../../script/gradepos.pl ../../data/wiki-en-test.pos my_answer.pos

Accuracy: 90.82% (4144/4563)

Most common mistakes:
NNS --> NN	45
NN --> JJ	27
NNP --> NN	22
JJ --> DT	22
JJ --> NN	12
VBN --> NN	12
NN --> IN	11
NN --> DT	10
NNP --> JJ	8
JJ --> VBN	7
