In [None]:
import utils
import emission
import transition

In [2]:
# Location of the English dataset; the training split is the file named "train".
dataset_folder = "data/EN/"
train_data = f"{dataset_folder}train"
# Read the training file. Later cells treat the result as a list of line
# strings (they call .index("") on it and slice it).
lines = utils.read_file_to_lines(train_data)

In [3]:
# Build the emission statistics from the training lines and pull out the
# word table stored under "x_hashmap".
emission_data = emission.generate_emission_table(lines)
hashmap = emission_data["x_hashmap"]
og_vocab_size = len(hashmap)

# Smoothing via utils.add_unk with k=1. In the recorded run this shrinks the
# vocabulary (presumably rare words are folded into an unknown-word entry;
# confirm against utils.add_unk).
smoothed_hashmap = utils.add_unk(hashmap, k=1)
smooth_vocab_size = len(smoothed_hashmap)
print("Reduced vocab from", og_vocab_size, "to", smooth_vocab_size)

# Swap the smoothed table into emission_data and keep its keys as the word
# vocabulary used to fit the word tokenizer later on.
emission_data["x_hashmap"] = smoothed_hashmap
x_vocab = list(smoothed_hashmap)

Skipped 1 lines: [''] 

Reduced vocab from 18213 to 8898


In [4]:
# Collect tag-transition information from the training lines. The result is
# indexed by the "Y_pairs" and "y_vocab" keys in the following cell.
transition_pairs = transition.generate_transition_pairs(lines)

In [5]:
# Unpack the two entries of transition_pairs that the rest of the notebook uses.
y_pairs, y_vocab = transition_pairs["Y_pairs"], transition_pairs["y_vocab"]

In [6]:
# Build the transition statistics from the tag pairs and the tag vocabulary.
# This object is what transition.get_mle and hmm.build_transition_weights
# receive further down.
transition_data = transition.generate_transition_data(y_pairs, y_vocab)

In [7]:
# Quick sanity test of the transition MLE.
# Impossible cases: each (first tag, second tag) pair below must score exactly 0.
assert all(
    transition.get_mle(first_tag, second_tag, transition_data) == 0
    for first_tag, second_tag in (
        ("##START##", ""),
        ("O", "##START##"),
        ("##END##", "O"),
    )
)
# Pairs that must have a strictly positive estimate.
assert all(
    transition.get_mle(first_tag, second_tag, transition_data) > 0
    for first_tag, second_tag in (
        ("##START##", "B-NP"),
        ("O", "##END##"),
    )
)

In [8]:
# Bundle the emission table, the transition table and the tag vocabulary
# into a single dict.
data = dict(
    emission_data=emission_data,
    transition_data=transition_data,
    y_vocab=y_vocab,
)

In [9]:
import viterbi

In [10]:
# Take the first sentence: every line before the first empty-string entry.
# list.index raises ValueError when there is no empty entry at all (e.g. the
# input holds a single sentence with no trailing blank line), so fall back to
# using the whole list instead of crashing.
try:
    first_break = lines.index("")
except ValueError:
    first_break = len(lines)
sentence_xy = lines[:first_break]

In [11]:
# Each line of the sentence is "<word> <tag>" separated by one space; split
# every line once, then peel the two columns apart. A line that does not
# split into exactly two fields raises ValueError during unpacking.
pairs = [token.split(" ") for token in sentence_xy]
words = [word for word, _ in pairs]
pos_list = [pos for _, pos in pairs]
# Re-join the words into the plain sentence string handed to the tagger.
sentence_x = " ".join(words).strip()
sentence_x

'Municipal bonds are generally a bit safer than corporate bonds in a recession , but not as safe as bonds issued by the federal government .'

In [13]:
# Create the tagger object. Its tokenizers and weight matrices are filled in
# by the fit_* / build_* calls in the following cells.
hmm = viterbi.HMM()

In [14]:
# Register the word vocabulary (keys of the smoothed emission table) and the
# tag vocabulary with the model's two tokenizers.
# NOTE(review): presumably these map words/tags to integer indices used by
# the weight matrices -- confirm in viterbi.HMM.
hmm.fit_word_tokenizer(x_vocab)
hmm.fit_pos_tokenizer(y_vocab)

In [15]:
# Populate hmm.transition_weights from the transition statistics, then show
# its shape as the cell output (recorded run: (23, 23)).
hmm.build_transition_weights(transition_data)
hmm.transition_weights.shape

(23, 23)

In [16]:
# Populate hmm.emission_weights from the smoothed emission statistics, then
# show its shape as the cell output (recorded run: (23, 8898); 8898 matches
# the smoothed vocabulary size printed earlier).
hmm.build_emission_weights(emission_data)
hmm.emission_weights.shape

(23, 8898)

In [17]:
# Decode the sentence with Viterbi and convert the decoder's output straight
# into tag labels, so `pred` only ever holds the label form.
pred = hmm.pos_tokens_to_labels(hmm.viterbi_predict(sentence_x))

In [23]:
# Gold tags of the first training sentence, printed for comparison with the
# model's prediction.
print(pos_list)

['B-NP', 'I-NP', 'B-VP', 'B-ADVP', 'B-ADJP', 'I-ADJP', 'I-ADJP', 'B-PP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'O', 'O', 'B-ADJP', 'I-ADJP', 'I-ADJP', 'B-PP', 'B-NP', 'B-VP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'O']


In [24]:
# Predicted tags for the same sentence (compare against the gold tags above).
print(pred)

['B-NP' 'B-VP' 'I-VP' 'B-NP' 'I-NP' 'I-NP' 'B-PP' 'B-NP' 'I-NP' 'B-PP'
 'B-NP' 'I-NP' 'O' 'B-CONJP' 'I-CONJP' 'I-CONJP' 'I-CONJP' 'B-PP' 'B-NP'
 'B-VP' 'I-VP' 'B-NP' 'I-NP' 'I-NP' 'I-NP' 'O']
