In [13]:
import numpy as np
import itertools
import time
import sys

from data_pre import data_preprocessing
from misc import compute_prob_log,compute_tag_acc
from inference import local_predictor, greedy_LR_predictor, greedy_RL_predictor, Viterbi_predictor

In [14]:
(data_train,data_dev,word2ix, ix2word, tag2ix, ix2tag, em_prob, trans_prob) = data_preprocessing()

In [15]:
em_prob[em_prob == 0] = sys.float_info.min
trans_prob[trans_prob == 0] = sys.float_info.min

In [16]:
# top_tag = 5
# ix_sorted = np.argsort(trans_prob[tag2ix['<s>'],:])[::-1]
# ix = ix_sorted[:top_tag]
# print("top {} tags with after '<s>' and probablity:".format(top_tag))
# for i in ix:
#     print("{} : {}".format(ix2tag[i],trans_prob[tag2ix['<s>'],i] ))

In [17]:
# top_word = 10
# ix_sorted = np.argsort(em_prob[tag2ix['JJ'],:])[::-1]
# ix = ix_sorted[:top_word]
# print("top {} words with tag 'JJ' and probablity:".format(top_word))
# for i in ix:
#     print("{} : {}".format(ix2word[i],em_prob[tag2ix['JJ'],i] ))

# Preliminaries for Inference with HMMs 

## Log-probability calculation
code is in `code/misc.py/`

In [18]:
# (corpus, tags) = data_dev
# print("log probability of dev is {}".format(compute_prob_log(corpus, tags, trans_prob, em_prob, word2ix, tag2ix)))

## Local predictor baseline

In [19]:
# (corpus, tags) = data_dev
# start = time.time()
# tags_pred = local_predictor(corpus, em_prob, word2ix, ix2tag)   
# runtime = time.time() - start
# print("baseline accuracy using local predictor is {}".format(compute_tag_acc(tags_pred, tags)))
# print("log prob of baseline prediction is {}".format(compute_prob_log(corpus, tags_pred, trans_prob, em_prob, word2ix, tag2ix)))
# print("runtime for local predictor is {}".format(runtime))

In [20]:
# (corpus, tags) = data_dev
# start = time.time()
# tags_pred = greedy_LR_predictor(corpus, em_prob, trans_prob, word2ix, tag2ix, ix2tag) 
# runtime = time.time() - start
# print("accuracy using greedy lr predictor is {}".format(compute_tag_acc(tags_pred, tags)))
# print("log prob of greedy lr predictor is {}".format(compute_prob_log(corpus, tags_pred, trans_prob, em_prob, word2ix, tag2ix)))
# print("runtime for greedy lr predictor is {}".format(runtime))

In [21]:
# (corpus, tags) = data_dev
# start = time.time()
# tags_pred = greedy_RL_predictor(corpus, em_prob, trans_prob, word2ix, tag2ix, ix2tag) 
# runtime = time.time() - start
# print("accuracy using greedy rl predictor is {}".format(compute_tag_acc(tags_pred, tags)))
# print("log prob of greedy rl predictor is {}".format(compute_prob_log(corpus, tags_pred, trans_prob, em_prob, word2ix, tag2ix)))
# print("runtime for greedy rl predictor is {}".format(runtime))

In [22]:
# (corpus, tags) = data_dev
# start = time.time()
# tags_pred = Viterbi_predictor(corpus, em_prob, trans_prob, word2ix, tag2ix, ix2tag) 
# runtime = time.time() - start
# print("accuracy using Viterbi_predictor is {}".format(compute_tag_acc(tags_pred, tags)))
# print("log prob of Viterbi_predictor is {}".format(compute_prob_log(corpus, tags_pred, trans_prob, em_prob, word2ix, tag2ix)))
# print("runtime for Viterbi_predictor is {}".format(runtime))

In [83]:
def beam_search_predictor(corpus, em_prob, trans_prob, word2ix, tag2ix, ix2tag, top = 5):
    tags_pred = []

    for Xs in corpus:
        xs = [word2ix[x] for x in Xs]
        B = beam_search(xs,em_prob, trans_prob, tag2ix,top)
        last = B.pop()
        (y,v,i) = last[0]
        tags = [y] ##yT
        while len(B) > 0:
            last = B.pop()
            (y,v,i) = last[i]
            tags.append(y) 
        tags = tags[::-1]
        tags = [ix2tag[y] for y in tags]
        tags_pred.append(tags)

    return tags_pred


def beam_search(xs,em_prob, trans_prob, tag2ix, top):
    B = []
    ## A(1), B(1)
    x = xs[0]
    vs = np.log(em_prob[:,x]) + np.log(trans_prob[tag2ix['<s>'],:])
    item = [(ix,v,-1) for ix,v in enumerate(vs)]
    B.append(maxb(item,top))
    ## A(t), B(t)
    for x in xs[1:-1]:
        item = []
        for j,b in enumerate(B[-1]):
            (yh,vh,ih) = b
            vs = np.log(em_prob[:,x]) + np.log(trans_prob[yh,:]) + vh
            As = [(ix,v,j) for ix,v in enumerate(vs)] ## this part is too slow!!
            item.append(As)
        item = list(itertools.chain.from_iterable(item))
        B.append(maxb(item,top))

    ## A(T),B(T)
    x = xs[-1]
    item = []
    for j,b in enumerate(B[-1]):
        (yh,vh,ih) = b
        vs = np.log(em_prob[:,x]) + np.log(trans_prob[yh,:]) + vh + np.log(trans_prob[:,tag2ix['</s>']])
        As = [(ix,v,j) for ix,v in enumerate(vs)]
        item.append(As)
    item = list(itertools.chain.from_iterable(item))
    B.append(maxb(item,1))

    return B

def maxb(item, b):
    item = sorted(item, key=lambda x: x[1])[::-1]
    return item[:b]


In [84]:
b = 100
(corpus, tags) = data_dev
start = time.time()
tags_pred = beam_search_predictor(corpus, em_prob, trans_prob, word2ix, tag2ix, ix2tag) 
runtime = time.time() - start
print("accuracy using Viterbi_predictor is {}".format(compute_tag_acc(tags_pred, tags)))
print("log prob of Viterbi_predictor is {}".format(compute_prob_log(corpus, tags_pred, trans_prob, em_prob, word2ix, tag2ix)))
print("runtime for Viterbi_predictor is {}".format(runtime))

accuracy using Viterbi_predictor is 0.8901386138613862
log prob of Viterbi_predictor is -163348.66072238702
runtime for Viterbi_predictor is 3.1500351428985596
