In [1]:
import utils
import emission
import transition

In [2]:
# Prefix for all SG dataset files; later cells build the dev input and the
# prediction output paths from this same prefix.
dataset_folder = "data/SG/"
train_data = dataset_folder + "train"
# Training file as a list of lines. Later cells treat an empty string in
# this list as the separator between sentences.
lines = utils.read_file_to_lines(train_data)

In [3]:
# Build the emission statistics from the training lines.
emission_data = emission.generate_emission_table(lines)
hashmap = emission_data["x_hashmap"]
word_freq = emission_data["x_word_freq"]
# NOTE(review): presumably folds words seen fewer than k times into an
# unknown-word entry -- confirm against utils.add_unk.
smoothed_hashmap = utils.add_unk(hashmap, word_freq, k=3)
# Put the smoothed table back so that hmm.build_emission_weights, which is
# given `emission_data` later, uses it instead of the raw table.
emission_data["x_hashmap"] = smoothed_hashmap

# Vocabulary derived from the smoothed table; used to fit the HMM's word tokenizer.
x_vocab = utils.get_emission_vocab(smoothed_hashmap)

Skipped 1 lines:  ['']


In [4]:
# Transition statistics from the training lines; the next cell reads the
# "Y_pairs", "y_vocab" and "y_freq" entries of the result.
transition_pairs = transition.generate_transition_pairs(lines)

In [5]:
# Unpack the pieces used below: y_pairs and y_vocab feed
# generate_transition_data, y_vocab also fits the HMM's tag tokenizer, and
# y_freq is passed to get_mle / build_transition_weights.
y_pairs = transition_pairs["Y_pairs"]
y_vocab = transition_pairs["y_vocab"]
y_freq = transition_pairs["y_freq"]

In [6]:
# Transition table consumed by transition.get_mle and hmm.build_transition_weights.
transition_data = transition.generate_transition_data(y_pairs, y_vocab)

In [7]:
# Quick sanity check on impossible cases: the MLE for the ##END## / ##START##
# pair must be 0 in both argument orders.
# NOTE(review): which argument is the "from" tag and which is the "to" tag is
# not visible here -- confirm against transition.get_mle.
assert transition.get_mle("##END##", "##START##", y_freq, transition_data) == 0
assert transition.get_mle("##START##", "##END##", y_freq, transition_data) == 0

In [8]:
import viterbi

In [9]:
# Take the first training sentence as a smoke-test input: it is every line
# before the first blank line, each holding a word and its tag separated by
# a single space.
first_break = lines.index("")
sentence_xy = lines[:first_break]

# Split each line once, then pull the two columns apart.
word_tag_pairs = [token.split(" ") for token in sentence_xy]
words = [word for word, _ in word_tag_pairs]
pos_list = [pos for _, pos in word_tag_pairs]

# Space-joined words are the input format passed to hmm.viterbi_predict below.
sentence_x = " ".join(words).strip()
sentence_x

"I'm about to see Adam Levine and I'm probably going to drop dead https://t.co/MpRIS0FqSA"

In [10]:
# Fresh HMM instance; the following cells fit its tokenizers and build its
# transition and emission weights.
hmm = viterbi.HMM()

In [11]:
# Fit the word tokenizer on the emission vocabulary and the tag tokenizer on
# the tag vocabulary collected from the training data.
hmm.fit_word_tokenizer(x_vocab)
hmm.fit_pos_tokenizer(y_vocab)

In [12]:
# Build the transition weight matrix from the tag frequencies and the
# transition table, then display its shape as a quick check.
hmm.build_transition_weights(y_freq, transition_data)
hmm.transition_weights.shape

(9, 9)

In [13]:
# Build the emission weight matrix from the (smoothed) emission data, then
# display its shape as a quick check.
# NOTE(review): presumably (number of tags, vocabulary size) -- confirm in viterbi.HMM.
hmm.build_emission_weights(emission_data)
hmm.emission_weights.shape

(9, 10733)

In [14]:
# Decode the sample sentence and map the predicted tag tokens back to their
# label strings in one step.
pred = hmm.pos_tokens_to_labels(hmm.viterbi_predict(sentence_x))

# Show prediction and ground truth one above the other for eyeballing.
print("preds", pred)
print("truth", pos_list)

# The decoder must emit exactly one label per input word.
assert len(pos_list) == len(pred)

preds ['O', 'O', 'O', 'O', 'B-positive', 'I-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
truth ['O', 'O', 'O', 'O', 'B-positive', 'I-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [15]:
# Load the unlabeled dev split under its own names. The training `lines`
# and `train_data` are left intact, so earlier cells still work if re-run.
dev_in_path = dataset_folder + "dev.in"
dev_lines = utils.read_file_to_lines(dev_in_path)

# Group the one-word-per-line input into space-joined sentences. A blank
# line ends the current sentence.
# A single pass replaces repeated list re-slicing, which was quadratic.
sentences = []
current_words = []
for raw_line in dev_lines:
    word = raw_line.strip()
    if word:
        current_words.append(word)
    elif current_words:
        # Blank line: close the sentence collected so far. Consecutive blank
        # lines are skipped rather than producing empty sentences.
        sentences.append(" ".join(current_words))
        current_words = []

# Keep the final sentence even when the file does not end with a blank line.
# Searching for the next "" would either raise ValueError or drop it.
if current_words:
    sentences.append(" ".join(current_words))

In [16]:
# Optional progress bar: wrap the sentences in tqdm when it is installed,
# otherwise iterate over the plain list.
try:
    from tqdm import tqdm
    USE_TQDM = True
except Exception as e:
    # The flag must be set on this path too; otherwise the check below
    # raises NameError instead of falling back.
    USE_TQDM = False
    print(e, "TQDM import error, disable progress bar")

sentences_it = tqdm(sentences) if USE_TQDM else sentences

  0%|          | 0/3107 [00:00<?, ?it/s]

In [17]:
# Decode every dev sentence and convert the predicted tag tokens back to
# label strings; one list of labels per sentence, in input order.
preds = [
    hmm.pos_tokens_to_labels(hmm.viterbi_predict(line))
    for line in sentences_it
]

# Exactly one prediction per input sentence.
assert len(preds) == len(sentences)

100%|██████████| 3107/3107 [00:06<00:00, 499.02it/s]


In [18]:
outfile = dataset_folder + "dev.p3.out"

# Validate every sentence/prediction pair before opening the file. A length
# mismatch then fails loudly instead of leaving a truncated output file that
# the evaluation step would still score.
for sentence, pred in zip(sentences, preds):
    word_count = len(sentence.split(" "))
    if word_count != len(pred):
        raise ValueError(
            "word/label count mismatch ({} vs {}): words={!r} labels={!r}".format(
                word_count, len(pred), sentence.split(" "), pred
            )
        )

# Output format: one "word label" line per token, and a blank line after
# each sentence.
with open(outfile, "w") as f:
    for sentence, pred in zip(sentences, preds):
        for word, label in zip(sentence.split(" "), pred):
            f.write(word + " " + label + "\n")
        f.write("\n")

In [19]:
!python3 evalResult.py ./data/SG/dev.out ./data/SG/dev.p3.out


#Entity in gold data: 4537
#Entity in prediction: 2984

#Correct Entity : 1605
Entity  precision: 0.5379
Entity  recall: 0.3538
Entity  F: 0.4268

#Correct Sentiment : 999
Sentiment  precision: 0.3348
Sentiment  recall: 0.2202
Sentiment  F: 0.2657
