In [1]:
import utils
import emission
import transition

In [2]:
# Folder holding the English corpus; the trailing slash is part of the
# value so that file names can be appended to it directly.
dataset_folder = "data/EN/"
train_data = f"{dataset_folder}train"
# Read the training file through the project helper; later cells treat the
# result as a list of strings in which "" marks a sentence boundary.
lines = utils.read_file_to_lines(train_data)

In [3]:
# Build the emission statistics from the raw training lines.
emission_data = emission.generate_emission_table(lines)
hashmap = emission_data["x_hashmap"]
# Measure the vocabulary before smoothing, in case add_unk touches its input.
og_vocab_size = len(hashmap)

# Smoothing: utils.add_unk returns the word table used from here on.
# NOTE(review): presumably rare words are folded into an unknown-word entry
# and k is the frequency threshold -- confirm against utils.add_unk.
smoothed_hashmap = utils.add_unk(hashmap, k=1)
smooth_vocab_size = len(smoothed_hashmap)
print("Reduced vocab from", og_vocab_size, "to", smooth_vocab_size)

# Swap the smoothed table into the emission data and list its words.
emission_data["x_hashmap"] = smoothed_hashmap
x_vocab = list(smoothed_hashmap)

Skipped 1 lines: [''] 

Reduced vocab from 18213 to 8898


In [4]:
# Extract transition information from the training lines. The result is a
# mapping; the next cell reads its "Y_pairs" and "y_vocab" entries.
transition_pairs = transition.generate_transition_pairs(lines)

In [5]:
# Pull out the two entries used downstream.
# y_pairs: presumably (previous tag, next tag) pairs -- confirm against
# transition.generate_transition_pairs.
y_pairs = transition_pairs["Y_pairs"]
# y_vocab: the label vocabulary; it is later used to fit pos_tokenizer.
y_vocab = transition_pairs["y_vocab"]

In [6]:
# Build the transition statistics that transition.get_mle and
# hmm.build_transition_weights consume later in the notebook.
transition_data = transition.generate_transition_data(y_pairs, y_vocab)

In [7]:
# Quick sanity checks on the transition estimates.
# Pairs that can never occur must get an estimate of exactly zero.
assert all(
    transition.get_mle(first, second, transition_data) == 0
    for first, second in (
        ("##START##", ""),
        ("O", "##START##"),
        ("##END##", "O"),
    )
)
# Pairs that do occur must get a strictly positive estimate.
assert all(
    transition.get_mle(first, second, transition_data) > 0
    for first, second in (
        ("##START##", "B-NP"),
        ("O", "##END##"),
    )
)

In [8]:
# Bundle the fitted statistics and the label vocabulary into one mapping.
data = dict(
    emission_data=emission_data,
    transition_data=transition_data,
    y_vocab=y_vocab,
)

In [9]:
import viterbi

In [10]:
# Tokenizer for observations, fitted on the smoothed word vocabulary.
word_tokenizer = viterbi.Tokenizer()
word_tokenizer.fit_on_text(x_vocab)

# Separate Tokenizer instance for labels, fitted on the label vocabulary.
pos_tokenizer = viterbi.Tokenizer()
pos_tokenizer.fit_on_text(y_vocab)

In [11]:
# Isolate the first sentence: every line before the first blank line.
# list.index raises ValueError when the value is absent, so if the data has
# no blank separator at all, treat the whole list as one sentence instead
# of crashing.
first_break = lines.index("") if "" in lines else len(lines)
sentence_xy = lines[:first_break]

In [12]:
# Split each "word tag" line of the first sentence into two parallel lists.
words, pos_list = [], []
for token in sentence_xy:
    # Split on the LAST space only: the tag is the final field, so a word
    # that itself contains a space no longer breaks the two-way unpacking.
    # A line with no space at all still raises ValueError, as before.
    word, pos = token.rsplit(" ", 1)
    words.append(word)
    pos_list.append(pos)
# Re-assemble the plain sentence text for display.
sentence_x = " ".join(words).strip()
sentence_x

'Municipal bonds are generally a bit safer than corporate bonds in a recession , but not as safe as bonds issued by the federal government .'

In [13]:
# Show the labels read from the training file for the first sentence;
# pos_list[i] is the label of words[i].
print(pos_list)

['B-NP', 'I-NP', 'B-VP', 'B-ADVP', 'B-ADJP', 'I-ADJP', 'I-ADJP', 'B-PP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'O', 'O', 'B-ADJP', 'I-ADJP', 'I-ADJP', 'B-PP', 'B-NP', 'B-VP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'O']


In [14]:
# Encode the sentence's words with the fitted word tokenizer.
# NOTE(review): the result is later used with len() and .tobytes(), so it is
# presumably a NumPy array of vocabulary indices -- confirm against
# viterbi.Tokenizer.return_sequence.
tokens = word_tokenizer.return_sequence(words)

In [15]:
# Create the model object; its weight tables are filled in by the
# build_transition_weights / build_emission_weights calls that follow.
hmm = viterbi.HMM()

In [16]:
# Populate hmm.transition_weights from the transition statistics, then
# display its shape as a sanity check.
hmm.build_transition_weights(transition_data)
hmm.transition_weights.shape

(23, 23)

In [17]:
# Populate hmm.emission_weights from the (smoothed) emission data, then
# display its shape; the second dimension should equal the smoothed
# vocabulary size printed earlier.
hmm.build_emission_weights(emission_data)
hmm.emission_weights.shape

(23, 8898)

In [18]:
# Disabled debugging probe of hmm.b for state index 22 at position j.
# NOTE(review): `j` is not defined until a later cell, so this line would
# raise NameError on a fresh top-to-bottom run if re-enabled here.
#hmm.b(22, tokens[j])

In [20]:
# Show the first word of the sentence together with its label.
# NOTE(review): this cell's execution count (20) is higher than that of the
# cell after it (19), i.e. the notebook was last run out of order.
print(words[0], pos_list[0])

Municipal B-NP


In [19]:
# Call hmm.pi near the end of the sentence and display what it returns.
n = len(tokens)
j = n - 1  # index of the last token
print(words[j], pos_list[j])
# NOTE(review): 22 is a hard-coded state index; the weight tables have 23
# rows, so it is the last one -- confirm which label it corresponds to.
# NOTE(review): the word printed above is at position j, but pi is called
# with j-1 -- confirm that this offset is intended.
# NOTE(review): tokens.tobytes() passes the sequence as raw bytes,
# presumably so the argument is hashable for caching -- TODO confirm.
hmm.pi(j-1, 22, tokens.tobytes())

. O
B: B: B: B: B: B: B: B: B: B: B: B: B: B: B: B: B: B: B: B: B: B: B: B: 1 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0
B: 1 1 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0
B: 1 2 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0
B: 1 3 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0
B: 1 4 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0
B: 1 5 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0
B: 1 6 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0
B: 1 7 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

0.0