In [1]:
import utils
import emission
import transition

In [2]:
# Text-preprocessing switches shared by the whole notebook. Each flag is
# forwarded as a keyword argument to emission.generate_emission_table (training
# data) and to utils.preprocess_text (dev input); the exact transformation each
# one triggers is defined in those helpers, not here.
LOWER = True       # forwarded as lower=
NORM_TENSE = True  # forwarded as norm_tense=
REP_NUM = True     # forwarded as replace_number=
REP_YEAR = True    # forwarded as replace_year=
REP_SYM = False    # forwarded as replace_symbol=

In [3]:
# Root folder of the dataset (trailing slash included) and the path of its
# training split.
dataset_folder = "data/AL/"
train_data = f"{dataset_folder}train"

# Training file contents as returned by utils.read_file_to_lines.
lines = utils.read_file_to_lines(train_data)

In [4]:
# Build the emission table from the training lines, using the notebook-wide
# preprocessing switches.
preprocess_flags = {
    "lower": LOWER,
    "norm_tense": NORM_TENSE,
    "replace_number": REP_NUM,
    "replace_year": REP_YEAR,
    "replace_symbol": REP_SYM,
}
emission_data = emission.generate_emission_table(lines, **preprocess_flags)

# Replace the raw table with the one returned by utils.add_unk (called with
# k=4); presumably this folds rare words into an unknown-word entry -- confirm
# against utils.add_unk.
word_freq = emission_data["x_word_freq"]
hashmap = emission_data["x_hashmap"]
smoothed_hashmap = utils.add_unk(hashmap, word_freq, k=4)
emission_data["x_hashmap"] = smoothed_hashmap

# Vocabulary derived from the post-add_unk table; later used to fit the HMM
# word tokenizer.
x_vocab = utils.get_emission_vocab(smoothed_hashmap)
print("Vocab size:", len(x_vocab))

not enough values to unpack (expected 2, got 1)
Skipped 1 lines:  ['']
Vocab size: 1078


In [5]:
# Derive the transition statistics from the training lines. The result is
# indexed below with the keys "Y_pairs", "y_vocab" and "y_freq".
transition_pairs = transition.generate_transition_pairs(lines)

In [6]:
# Pull the three entries of transition_pairs that the following cells use.
y_pairs, y_vocab, y_freq = (
    transition_pairs["Y_pairs"],
    transition_pairs["y_vocab"],
    transition_pairs["y_freq"],
)

In [7]:
# Turn the label pairs and label vocabulary into the transition data that
# hmm.build_transition_weights consumes later in the notebook.
transition_data = transition.generate_transition_data(y_pairs, y_vocab)

In [8]:
import viterbi

In [9]:
# Assemble the HMM: fit the word and label tokenizers on the two vocabularies,
# then load the transition and emission weights computed above.
# NOTE(review): the tokenizers are fitted before the weights are built; this
# ordering is presumably required by viterbi.HMM -- confirm before reordering.
hmm = viterbi.HMM()
hmm.fit_word_tokenizer(x_vocab)
hmm.fit_pos_tokenizer(y_vocab)
hmm.build_transition_weights(y_freq, transition_data)
hmm.build_emission_weights(emission_data)

In [10]:
# Read the dev inputs into their own names so the training globals
# (`train_data`, `lines`) keep pointing at the training split; re-running the
# training cells after this one therefore still uses the training data.
dev_data = dataset_folder + "dev.in"
dev_lines = utils.read_file_to_lines(dev_data)

sentences = []

# The file holds one token per line, with an empty line closing each sentence.
# Walk it with a cursor instead of re-slicing the list for every sentence.
start = 0
total = len(dev_lines)
while total - start > 1:
    try:
        sentence_break = dev_lines.index("", start)
    except ValueError:
        # No closing blank line: the remaining tokens form the last sentence.
        sentence_break = total
    # Apply the same preprocessing switches that were used for training.
    words = [utils.preprocess_text(token,
                                   lower=LOWER,
                                   norm_tense=NORM_TENSE,
                                   replace_number=REP_NUM,
                                   replace_year=REP_YEAR,
                                   replace_symbol=REP_SYM)
             for token in dev_lines[start:sentence_break]]
    sentences.append(" ".join(words).strip())
    start = sentence_break + 1

In [11]:
# Spot-check ten of the preprocessed dev sentences (tokens joined by spaces).
sentences[100:110]

['娴 姹 鐪 鏉 宸 甯 涓 鍩 鍖 鍗 鏄 琛 閬 澶 鍏 璺 濂 鏈 瑙 5 骞',
 '涓 鍗 璺 216 鍙 14 鍙 妤 鍗 7 妤',
 '娴 姹 鐪 姹 灞 甯 鍙 濉 琛 閬 闄 鏉 鏉 闄 鏉 309 鍙',
 '鏉 宸 鎷 澧 鍖 鏂 瀹 鑺 鑻 135 鏍 7 - 1121',
 '娴 姹 鐪 瀹 娉 甯 浣 濮 甯 鍏 姹 琛 閬 涓 鍑 妗 鏉',
 '婀 闃 闀 娌 鍗 鐪 鍞 娌 鍘 婀 闃 闀 婀 闃 鏈 琛',
 '涓 娌 鍥 鍙 澶 琛 鍖 閾 鍏 瀵 16 骞 2299 瀹',
 '娴 姹 鐪 缁 鍏 甯 宓 宸 甯 宕 浠 闀 婀 鏉 妗 鏉',
 '闃 鏄 琛 閬 鏃 灞 鏉 鐐 鍏 灞 392 鍙',
 '瀹 娉 姹 涓 鍖 涓 灞 璺 608 鍙 閾 娉 鐧 璐 9 妤 澶 骞 楦 鐢 瑁']

In [12]:
# only for the progress bar!
# tqdm is optional: when it cannot be imported, fall back to iterating over the
# plain list. USE_TQDM is assigned on both paths so the check below never hits
# an undefined name.
try:
    from tqdm import tqdm
    USE_TQDM = True
except Exception as e:
    USE_TQDM = False
    print(e, "TQDM import error, disable progress bar")

# Iterable consumed by the prediction loop: progress-bar wrapper if available.
sentences_it = tqdm(sentences) if USE_TQDM else sentences

  0%|          | 0/1492 [00:00<?, ?it/s]

In [13]:
# Decode every dev sentence with Viterbi and map the predicted tokens back to
# label strings, one prediction list per sentence.
preds = [
    hmm.pos_tokens_to_labels(hmm.viterbi_predict(sentence))
    for sentence in sentences_it
]

# Every input sentence must have produced exactly one prediction.
assert len(sentences) == len(preds)

100%|██████████| 1492/1492 [01:57<00:00, 12.68it/s]


In [14]:
# Write predictions as one "<word> <label>" line per token, with an empty line
# after each sentence.
outfile = dataset_folder + "dev.p5.out"

with open(outfile, "w") as f:
    for sentence, pred in zip(sentences, preds):
        word_array = sentence.split(" ")
        # A token/label count mismatch means the prediction cannot be aligned:
        # show the offending pair and stop writing. This is an explicit check
        # rather than assert + bare except, so unrelated failures (I/O errors,
        # KeyboardInterrupt, ...) propagate instead of being reported as a
        # mismatch.
        if len(word_array) != len(pred):
            print(word_array)
            print(pred)
            break
        for word, label in zip(word_array, pred):
            f.write(word + " " + label + "\n")
        f.write("\n")

In [15]:
!python3 evalResult.py ./data/AL/dev.out ./data/AL/dev.p5.out


#Entity in gold data: 8408
#Entity in prediction: 9213

#Correct Entity : 5308
Entity  precision: 0.5761
Entity  recall: 0.6313
Entity  F: 0.6025

#Correct Sentiment : 4562
Sentiment  precision: 0.4952
Sentiment  recall: 0.5426
Sentiment  F: 0.5178
