In [1]:
import utils
import emission
import transition
import viterbi

In [2]:
# Derive two parallel training sets from the combined "<word> <entity>-<sentiment>"
# training file: one labelled with the entity part of each tag, one with the
# sentiment part.
dataset_folder = "data/AL/"
train_data = dataset_folder + "train"
lines = utils.read_file_to_lines(train_data)

entity_lines = []
sentiment_lines = []

for line in lines:
    try:
        # Either unpacking raises ValueError when the line does not consist of
        # exactly two space-separated fields, or when the tag does not contain
        # exactly one "-" (e.g. a tag without any "-", or an empty line).
        word, tag = line.split(" ")
        entity, sentiment = tag.split("-")
    except ValueError:
        # Lines that cannot be split are passed through unchanged to both
        # training sets. Only ValueError is caught so that KeyboardInterrupt
        # and genuine programming errors are no longer silently swallowed.
        entity_lines.append(line)
        sentiment_lines.append(line)
    else:
        entity_lines.append(word + " " + entity)
        sentiment_lines.append(word + " " + sentiment)

In [3]:
# Group the one-token-per-line test file into sentences; an empty line ("")
# terminates a sentence.
test_data = dataset_folder + "dev.in"
lines = utils.read_file_to_lines(test_data)

sentences = []

# Walk the list with a cursor instead of re-slicing `lines` after every
# sentence, which copied the remaining list each time (quadratic overall).
cursor = 0
total_lines = len(lines)

while total_lines - cursor > 1:
    try:
        sentence_break = lines.index("", cursor)
    except ValueError:
        # No terminating blank line is left: the remainder is handled below
        # instead of aborting the cell.
        break
    words = [token.strip() for token in lines[cursor:sentence_break]]
    sentences.append(" ".join(words).strip())
    cursor = sentence_break + 1

# Whatever follows the last blank line is either nothing, a lone trailing
# blank, or a final sentence that was not terminated by a blank line. Keep
# the latter instead of dropping it or raising.
trailing_words = [token.strip() for token in lines[cursor:]]
trailing_sentence = " ".join(trailing_words).strip()
if trailing_sentence:
    sentences.append(trailing_sentence)

In [4]:
def predict_lines(lines, test_sentences=None):
    """Fit an HMM on tagged training lines and Viterbi-decode test sentences.

    Args:
        lines: Training lines handed to ``emission.generate_emission_table``
            and ``transition.generate_transition_pairs``.
        test_sentences: Sentences to decode, each passed as-is to
            ``hmm.viterbi_predict``. Defaults to the notebook-level
            ``sentences`` list, which is what the original cell always used.

    Returns:
        A list with one entry per decoded sentence, each being the result of
        ``hmm.pos_tokens_to_labels`` applied to the Viterbi output.
    """
    if test_sentences is None:
        # Backward-compatible default: fall back to the notebook global.
        test_sentences = sentences

    emission_data = emission.generate_emission_table(lines)
    hashmap = emission_data["x_hashmap"]
    word_freq = emission_data["x_word_freq"]
    # NOTE(review): presumably folds words seen fewer than k times into an
    # unknown-word bucket -- confirm against utils.add_unk.
    smoothed_hashmap = utils.add_unk(hashmap, word_freq, k=3)
    emission_data["x_hashmap"] = smoothed_hashmap

    x_vocab = utils.get_emission_vocab(smoothed_hashmap)

    transition_pairs = transition.generate_transition_pairs(lines)

    y_pairs = transition_pairs["Y_pairs"]
    y_vocab = transition_pairs["y_vocab"]
    y_freq = transition_pairs["y_freq"]

    transition_data = transition.generate_transition_data(y_pairs, y_vocab)

    hmm = viterbi.HMM()

    hmm.fit_word_tokenizer(x_vocab)
    hmm.fit_pos_tokenizer(y_vocab)

    hmm.build_transition_weights(y_freq, transition_data)
    hmm.build_emission_weights(emission_data)

    # tqdm is only used for the progress bar. Previously a failed import left
    # the USE_TQDM flag unassigned, so the fallback path raised NameError;
    # now the plain iterable is selected directly in the handler.
    try:
        from tqdm import tqdm
    except Exception as e:
        print(e, "TQDM import error, disable progress bar")
        sentences_it = test_sentences
    else:
        sentences_it = tqdm(test_sentences)

    preds = []

    for line in sentences_it:
        pred = hmm.viterbi_predict(line)
        pred = hmm.pos_tokens_to_labels(pred)
        preds.append(pred)

    assert len(test_sentences) == len(preds)

    return preds

In [5]:
# Decode the test sentences twice: once with the model trained on the
# entity-only tags, once with the model trained on the sentiment-only tags.
entity_preds = predict_lines(lines=entity_lines)
sentiment_preds = predict_lines(lines=sentiment_lines)

100%|██████████| 1492/1492 [00:01<00:00, 1337.64it/s]
100%|██████████| 1492/1492 [00:33<00:00, 45.19it/s]


In [6]:
# Merge the entity and sentiment predictions back into combined
# "<entity>-<sentiment>" tags and write them one token per line, with a
# blank line after every sentence.
outfile = dataset_folder + "dev.p5.out"

with open(outfile, "w") as f:
    for sentence, entity_pred, sentiment_pred in zip(sentences, entity_preds, sentiment_preds):
        word_array = sentence.split(" ")
        try:
            # Build the whole sentence first so that a failure does not leave
            # a half-written sentence in the output file.
            out_lines = []
            for i, word in enumerate(word_array):
                # A token is tagged only when both models agree it is not "O".
                if entity_pred[i] == "O" or sentiment_pred[i] == "O":
                    out_lines.append(word + " O\n")
                else:
                    out_lines.append(word + " " + entity_pred[i] + "-" + sentiment_pred[i] + "\n")
        except (IndexError, TypeError):
            # A prediction is shorter than the sentence or is not indexable.
            # Show the offending inputs and stop writing. The previous handler
            # printed an undefined name (`pred`) and so raised NameError itself.
            print(word_array)
            print(entity_pred)
            print(sentiment_pred)
            break
        f.writelines(out_lines)
        f.write("\n")

In [7]:
# Score the written predictions against the gold dev annotations and print
# the entity and sentiment F-score / precision / recall.
gold_data = dataset_folder + "dev.out"
pred_data = dataset_folder + "dev.p5.out"

data = utils.run_eval(gold_data, pred_data)

# (label, key in the evaluation result) in the order they are reported.
report_rows = (
    ("Entity F:", "entity_f"),
    ("Entity precision:", "entity_p"),
    ("Entity recall:", "entity_r"),
    ("Sentiment F:", "sentiment_f"),
    ("Sentiment precision:", "sentiment_p"),
    ("Sentiment recall:", "sentiment_r"),
)

for label, metric_key in report_rows:
    print(label, data[metric_key])

Entity F: 0.5097
Entity precision: 0.4392
Entity recall: 0.607
Sentiment F: 0.4436
Sentiment precision: 0.3823
Sentiment recall: 0.5283
