In [1]:
import utils
import emission
import transition
import viterbi

import copy
from random import randint, choice
import numpy as np

In [2]:
LOWER = True
NORM_TENSE = True
REP_NUM = True
REP_YEAR = True
REP_SYM = True

In [3]:
dataset_folder = "data/EN/"
train_data = dataset_folder + "train"
lines = utils.read_file_to_lines(train_data)

In [4]:
emission_data = emission.generate_emission_table(lines,
                                                 lower=LOWER,
                                                 norm_tense=NORM_TENSE,
                                                 replace_number=REP_NUM,
                                                 replace_year=REP_YEAR,
                                                 replace_symbol=REP_SYM)
hashmap = emission_data["x_hashmap"]
word_freq = emission_data["x_word_freq"]
smoothed_hashmap = utils.add_unk(hashmap, word_freq, k=5)
emission_data["x_hashmap"] = smoothed_hashmap

x_vocab = utils.get_emission_vocab(smoothed_hashmap)
print("Vocab size:", len(x_vocab))

Vocab size: 3348


In [5]:
print(x_vocab)

['!', '#', '##UNK##', '##YEAR##', '$', '%', '&', "'", "''", "'d", "'ll", "'m", "'re", "'s", "'ve", ',', '-', '-lcb-', '-lrb-', '-rcb-', '-rrb-', '.', '...', '0-3', '0-4', '1-10', '1-3', '1-4', '1-5', '1-6', '1-7', '1-8', '1-9', '11\\/16', '1\\/2', '1\\/4', '1\\/8', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '3-3', '3-4', '3-5', '3-6', '3-7', '3-8', '3\\/4', '3\\/8', '4-3', '4-4', '4-5', '4-6', '4-7', '4-8', '5-3', '5-4', '5-5', '5-6', '5-7', '5-8', '5\\/8', '6-3', '6-4', '6-5', '6-6', '6-7', '6-8', '7-3', '7-4', '7-5', '7-7', '7\\/8', '8-3', '8-4', '8-5', '8-6', '9-3', '9-4', '9-5', '9-6', '9\\/16', ':', ';', '?', 'NUM-THING', '`', '``', 'a', 'a.', 'a.m.', 'a.p.', 'ab', 'abandoned', 'ability', 'able', 'abortion', 'abortion-rights', 'about', 'above', 'abroad', 'abuse', 'accept', 'accepted', 'access', 'accord', 'account', 'accountant', 'accounted', 'accused', 'achieved', 'acknowledges', 'acquire', 'acquired', 'acquiring', 'acquisition', 'across', 'act', 'action', 'active', 'activis

In [6]:
transition_pairs = transition.generate_transition_pairs(lines)

In [7]:
y_pairs = transition_pairs["Y_pairs"]
y_vocab = transition_pairs["y_vocab"]
y_freq = transition_pairs["y_freq"]

In [8]:
transition_data = transition.generate_transition_data(y_pairs, y_vocab)

In [9]:
hmm = viterbi.HMM()
hmm.fit_word_tokenizer(x_vocab)
hmm.fit_pos_tokenizer(y_vocab)
hmm.build_transition_weights(y_freq, transition_data)
hmm.build_emission_weights(emission_data)

In [10]:
train_data = dataset_folder + "dev.in"
lines = utils.read_file_to_lines(train_data)

sentences = []

while len(lines) > 1:
    sentence_break = lines.index("")
    sentence_xy = lines[:sentence_break]
    words = [utils.preprocess_text(token,
                                   lower=LOWER,
                                   norm_tense=NORM_TENSE,
                                   replace_number=REP_NUM,
                                   replace_year=REP_YEAR,
                                   replace_symbol=REP_SYM)
             for token in sentence_xy]
    sentence = " ".join(words).strip()
    sentences.append(sentence)
    lines = lines[sentence_break+1:]

In [11]:
new_words = []

for line in sentences:
    for word in line.split(" "):
        if word not in x_vocab:
            new_words.append(word)
        
new_words = list(set(new_words))
new_words.sort()
print("New words", len(new_words))

New words 2199


In [12]:
# only for the progress bar!
try:
    from tqdm import tqdm
    USE_TQDM = True
except Exception as e:
    print(e, "TQDM import error, disable progress bar")

if USE_TQDM:
    sentences_it = tqdm(sentences)
else:
    sentences_it = sentences

  0%|          | 0/1094 [00:00<?, ?it/s]

In [13]:
preds = []

for line in sentences_it:
    pred = hmm.viterbi_predict(line)
    pred = hmm.pos_tokens_to_labels(pred)
    preds.append(pred)
    
assert len(sentences) == len(preds)

100%|██████████| 1094/1094 [00:32<00:00, 33.79it/s]


In [14]:
outfile = dataset_folder + "dev.p5.out"

with open(outfile, "w") as f:
    for sentence, pred in zip(sentences, preds):
        word_array = sentence.split(" ")
        try:
            assert len(word_array) == len(pred)
            for i, word in enumerate(word_array):
                f.write(word + " " + pred[i] +"\n")
        except:
            print(word_array)
            print(pred)
            break
        f.write("\n")

In [15]:
gold_data = dataset_folder + "dev.out"
pred_data = dataset_folder + "dev.p5.out"

data = utils.run_eval(gold_data, pred_data)

print("Entity F:", data["entity_f"])
print("Sentiment F:", data["sentiment_f"])

Entity F: 0.8396
Sentiment F: 0.8096
