In [None]:
%matplotlib inline
import tensorflow as tf
import numpy as np
import matplotlib
import os
import math
import json

In [None]:
# NER tag string -> integer class id used as the model's target label.
# The ids cover 0..7 exactly once but are not in listing order ('O' is 2,
# 'I-PER' is 3). There is no 'B-PER' entry, so a data line carrying that tag
# fails the lookup in load_sequence.
tags = dict([
    ('I-LOC', 0),
    ('B-ORG', 1),
    ('I-PER', 3),
    ('O', 2),
    ('I-MISC', 4),
    ('B-MISC', 5),
    ('I-ORG', 6),
    ('B-LOC', 7),
])

In [None]:
def load_vectors(data_file, dim=50):
    """Load pre-trained word vectors from a JSON file.

    The file must contain one JSON object that maps each word to its vector
    (a list of ``dim`` numbers).

    Args:
        data_file: path of the JSON file to read.
        dim: length of every word vector; also the length of the all-zero
            row appended for the 'UUUNKKK' token.

    Returns:
        A ``(wv, vocabulary)`` pair. ``wv`` is a float64 array of shape
        ``(number_of_words + 1, dim)`` whose last row is all zeros.
        ``vocabulary`` maps each word to its row index in ``wv`` and maps
        'UUUNKKK' to the index of the zero row.

    Raises:
        ValueError: if the stored vectors do not all have length ``dim``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(data_file, 'r') as fh:
        vectors = json.load(fh)

    # Freeze a single word order and use it for both the ids and the matrix
    # rows, instead of relying on keys() and values() lining up.
    words = list(vectors)
    vocabulary = dict((word, wid) for wid, word in enumerate(words))
    # 'UUUNKKK' always points at the extra zero row. If the file itself
    # contains 'UUUNKKK', its stored vector stays in the matrix but is no
    # longer referenced by the vocabulary (same as before).
    vocabulary['UUUNKKK'] = len(vocabulary)

    known = np.asarray([vectors[word] for word in words], dtype=np.float64)
    if not words:
        # An empty file still yields a well-formed (1, dim) matrix.
        known = known.reshape(0, dim)
    elif known.ndim != 2 or known.shape[1] != dim:
        raise ValueError(
            'expected word vectors of length {}, got array of shape {}'.format(
                dim, known.shape))
    wv = np.vstack([known, np.zeros((1, dim))])

    print('Vocabulary Size: {}'.format(len(vocabulary)))
    print('Word Vectors Size: {}'.format(wv.shape))
    del vectors
    return wv, vocabulary

def load_sequence(data_file, vocabulary, tags, seq_len=2):
    """Read a token-per-line tagged file and build fixed-length word windows.

    Each non-blank line that does not contain 'DOCSTART' is split on single
    spaces; the first field is the word and the last field is its tag. For
    the token at position ``i`` the example is the ``seq_len`` word ids ending
    at ``i`` (the token itself is last). Positions before the start of the
    file are filled with the 'UUUNKKK' id.

    Blank lines are skipped rather than treated as separators, so a window
    can reach back into the preceding sentence.

    Args:
        data_file: path of the tagged text file.
        vocabulary: dict mapping word -> integer id; must contain 'UUUNKKK'.
        tags: dict mapping tag string -> integer label id.
        seq_len: number of word ids in every window.

    Returns:
        A ``(dataset, labels)`` pair of equal-length lists: ``dataset[i]`` is
        the list of ``seq_len`` word ids for token ``i`` and ``labels[i]`` is
        its label id.

    Raises:
        KeyError: if 'UUUNKKK' is missing from ``vocabulary`` or a tag is
            missing from ``tags``.
    """
    unk_id = vocabulary['UUUNKKK']

    words = []
    labels = []
    with open(data_file, 'r') as fh:
        for line in fh:
            line = line.strip()
            if line == '' or 'DOCSTART' in line:
                continue
            instance = line.split(' ')
            # Out-of-vocabulary words fall back to the unknown-token id
            # instead of aborting the whole load with a KeyError.
            words.append(vocabulary.get(instance[0], unk_id))
            labels.append(tags[instance[-1]])

    dataset = []
    for i in range(len(words)):
        # j never exceeds i, so only the lower bound needs checking.
        window = [words[j] if j >= 0 else unk_id
                  for j in range(i - seq_len + 1, i + 1)]
        dataset.append(window)
    return dataset, labels

In [None]:
def generate_batch_data(dataset, labels, batch_size):
    """Yield consecutive mini-batches of examples and their labels.

    Args:
        dataset: sequence of examples (anything that supports slicing).
        labels: sequence of labels aligned with ``dataset``.
        batch_size: maximum number of examples per batch; must be positive.

    Yields:
        ``(batch_data, batch_label)`` slices in order. Every batch holds
        ``batch_size`` items except possibly the last, which holds the
        remainder. Nothing is yielded for an empty ``dataset``.
    """
    # Stepping through start offsets covers the trailing partial batch on
    # every Python version. The previous ceil(len / batch_size) lost it
    # under Python 2, where `/` on two ints is floor division.
    for start in range(0, len(dataset), batch_size):
        stop = start + batch_size
        yield dataset[start:stop], labels[start:stop]

In [None]:
# Resolve the input files relative to the notebook's data directory.
data_dir = os.path.join('../data')
vectors_path = os.path.join(data_dir, 'conll_vectors_100.json')
train_path = os.path.join(data_dir, 'eng.train')
valid_path = os.path.join(data_dir, 'eng.testa')

# 100-dimensional word vectors plus the word -> row-index vocabulary.
wv, vocabulary = load_vectors(vectors_path, dim=100)

# Training and validation examples: 5-word windows with one label per window.
train_data, train_labels = load_sequence(
    train_path, vocabulary, tags, seq_len=5)
valid_data, valid_labels = load_sequence(
    valid_path, vocabulary, tags, seq_len=5)

# Peek at the first five windows and their labels.
print('{} {}'.format(train_data[:5], train_labels[:5]))

In [None]:
# Hyperparameters.
seq_len = 5            # words per input window; must match load_sequence's seq_len
hidden_size = 150      # size of the LSTM state
learning_rate = 0.001  # Adam step size
l2 = 0.001             # weight of the L2 penalty added to the cross-entropy
layers = 1             # number of stacked LSTM layers
label_size = 8         # number of output classes; must equal the number of entries in `tags`

# Identity matrix used as a lookup table: row k is the one-hot vector of class k.
label_embed = np.eye(label_size, dtype=np.float32)

graph = tf.Graph()
with graph.as_default():
    # A batch of word-id windows and the label id of each window.
    input_placeholder = tf.placeholder(shape=[None, seq_len], dtype=tf.int32)
    label_placeholder = tf.placeholder(shape=[None], dtype=tf.int64)
    with tf.name_scope('embedding_layer'):
        with tf.device('/cpu:0'):
            # `wv` is passed as a plain array, not a tf.Variable, so it does
            # not appear in tf.trainable_variables().
            embed = tf.nn.embedding_lookup(wv, input_placeholder)
            window = tf.to_float(embed)
            # Split along the window axis into `seq_len` per-position tensors,
            # the list form that tf.nn.rnn consumes.
            window = [tf.squeeze(_input, [1]) for _input in tf.split(1, seq_len, window)]
            # One-hot targets looked up from the identity matrix.
            true_labels = tf.nn.embedding_lookup(label_embed, label_placeholder)
    with tf.name_scope('rnn_layer'):
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size, forget_bias=0.0,
                                                state_is_tuple=True, activation=tf.tanh)
        # NOTE(review): the keep probabilities are constants, so dropout is
        # also applied whenever this graph is run for validation or testing.
        lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, input_keep_prob=0.5,
                                                  output_keep_prob=0.5)
        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * layers, state_is_tuple=True)
        output, state = tf.nn.rnn(cell, window, dtype=tf.float32)
    with tf.name_scope('linear_layer'):
        weight = tf.get_variable('W', shape=[hidden_size, label_size],
                                initializer = tf.contrib.layers.xavier_initializer())
        bias = tf.get_variable('bias',
                              initializer=tf.zeros_initializer(shape=[1, label_size]))
        # Classify from the output at the last position of the window.
        logit = tf.matmul(output[-1], weight) + bias
    with tf.name_scope('loss'):
        ce_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logit, true_labels))
        # L2 penalty over every trainable variable whose name does not
        # contain 'bias' (case-insensitive).
        reg_loss = 0
        for w in tf.trainable_variables():
            if 'bias' not in w.name.lower():
                reg_loss += tf.nn.l2_loss(w)
        loss = ce_loss + l2 * reg_loss
    with tf.name_scope('optimizer'):
        # Minimise the regularised objective. Minimising `ce_loss` alone left
        # `l2` without any effect on training while `loss` was still the
        # value being reported.
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
    with tf.name_scope('prediction'):
        yhat = tf.argmax(tf.nn.softmax(logit), dimension=1)
    with tf.name_scope('accuracy'):
        acc = tf.reduce_mean(tf.cast(tf.equal(label_placeholder, yhat), dtype=tf.float32))
    with tf.name_scope('confusion_matrix'):
        # Predictions are passed first and true labels second; the matrix
        # size follows `label_size` instead of a separate hard-coded 8.
        matrix = tf.contrib.metrics.confusion_matrix(yhat, label_placeholder,
                                                    num_classes=tf.constant(label_size, dtype=tf.int64),
                                                     dtype = tf.float32)

In [None]:
def run_epoch(sess, batch_size=128):
    """Run one training pass over ``train_data`` in mini-batches.

    Every batch executes the ``optimizer`` op and records the batch loss,
    accuracy and confusion matrix.

    Args:
        sess: session bound to the graph that holds the ops.
        batch_size: number of examples per mini-batch.

    Returns:
        ``(mean_cost, mean_accuracy, cm)``: the unweighted mean over batches
        of the loss and of the accuracy, and the confusion matrix summed over
        all batches of the epoch.

    Raises:
        ValueError: if ``train_data`` produces no batches.
    """
    costs = []
    accuracies = []
    epoch_cm = None
    batches = generate_batch_data(train_data, train_labels, batch_size=batch_size)
    for X_batch, y_batch in batches:
        _, cost, accuracy, batch_cm = sess.run(
            [optimizer, loss, acc, matrix],
            feed_dict={
                input_placeholder: X_batch,
                label_placeholder: y_batch
            })
        costs.append(cost)
        accuracies.append(accuracy)
        # Accumulate so the returned matrix describes the whole epoch rather
        # than only the final mini-batch.
        epoch_cm = batch_cm if epoch_cm is None else epoch_cm + batch_cm
    if epoch_cm is None:
        raise ValueError('train_data is empty; nothing to train on')
    return np.mean(costs), np.mean(accuracies), epoch_cm

In [None]:
max_epochs = 500


def print_tag_metrics(cm):
    """Print precision, recall and F1 for every entry of ``tags``.

    For class id ``i`` the precision denominator is the sum of row ``i`` of
    ``cm`` and the recall denominator is the sum of column ``i``. A ratio
    whose denominator is zero is reported as 0.0.

    Args:
        cm: square confusion matrix indexable as ``cm[i, j]``.
    """
    for tag, i in tags.items():
        hits = float(cm[i, i])
        row_total = float(np.sum(cm[i, :]))
        col_total = float(np.sum(cm[:, i]))
        precision = hits / row_total if row_total > 0 else 0.0
        recall = hits / col_total if col_total > 0 else 0.0
        denom = precision + recall
        f1 = 2 * precision * recall / denom if denom > 0 else 0.0
        print('Tag: {}, P: {}, R: {}, F1: {}'.format(tag, precision, recall, f1))


with tf.Session(graph=graph) as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)
    for epoch in range(max_epochs):
        avg_cost, avg_acc, train_cm = run_epoch(sess)
        if epoch % 10 == 0:
            # Evaluation only: `optimizer` is deliberately not fetched, so
            # the validation set never updates the weights.
            # NOTE(review): the graph's dropout keep probabilities are
            # constants, so dropout is still active during this run.
            cost, accuracy, cm = sess.run([loss, acc, matrix], feed_dict={
                input_placeholder: valid_data,
                label_placeholder: valid_labels
            })
            print('Epoch: {}'.format(epoch))
            print('Train Loss: {}, Train Accuracy: {}'.format(avg_cost, avg_acc))
            print('Valid Loss: {}, Valid Accuracy: {}'.format(cost, accuracy))
            print_tag_metrics(cm)

    X_test, y_test = load_sequence(os.path.join(data_dir, 'eng.testb'), vocabulary, tags,
                                   seq_len=5)
    # Same evaluation-only fetch list for the test set; `cm` now comes from
    # this run, so the per-tag report below describes the test data.
    cost, accuracy, cm = sess.run([loss, acc, matrix], feed_dict={
        input_placeholder: X_test,
        label_placeholder: y_test
    })
    print('Test Loss: {}, Test Accuracy: {}'.format(cost, accuracy))
    print_tag_metrics(cm)