In [2]:
%matplotlib inline
import tensorflow as tf
import numpy as np
from pyspark.mllib.feature import Word2Vec
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import math

In [3]:
data_dir = os.path.join('../data')
conll_data = os.path.join(data_dir, 'eng.conll')
train_data = os.path.join(data_dir, 'eng.train')
valid_data = os.path.join(data_dir, 'eng.testa')
test_data = os.path.join(data_dir, 'eng.testb')

In [6]:
def get_text(data_path):
    data = (sc.textFile(conll_data)
            .map(lambda line: line.strip())
            .filter(lambda line: line != '' and 'DOCSTART' not in line)
            .flatMap(lambda line: [line.split()[0]])
           ).collect()
    print 'Words..'
    print data[:20]
    return ' '.join(data)

def generate_vectors(data_path, dim=50):
    text = get_text(data_path)
    data = (sc.parallelize([text])
            .map(lambda line: line.split(' '))
           )
    model = Word2Vec().setVectorSize(dim).setMinCount(1).fit(data)
    vectors = dict(model.getVectors())
    for i in vectors:
        vectors[i] = list(vectors[i])
    with open(os.path.join(data_dir, 'conll_vectors_%s.json'%dim), 'w') as outfile:
        outfile.write(json.dumps(vectors))
    print 'Vocabulary Size: {}'.format(len(vectors))
    del data
    del vectors

generate_vectors(conll_data, 300)

Words..
[u'EU', u'rejects', u'German', u'call', u'to', u'boycott', u'British', u'lamb', u'.', u'Peter', u'Blackburn', u'BRUSSELS', u'1996-08-22', u'The', u'European', u'Commission', u'said', u'on', u'Thursday', u'it']
Vocabulary Size: 30289


In [3]:
def load_vectors(data_file):
    vectors = json.load(open(data_file, 'r'))
    vocabulary = dict([(word, wid) for wid, word in enumerate(vectors.keys())])
    vocabulary['UUUNKKK'] = len(vocabulary)
    wv = np.vstack([vectors.values(), np.zeros(300)])
    print 'Vocabulary Size: {}'.format(len(vocabulary))
    print 'Word Vectors Size: {}'.format(wv.shape)
    del vectors
    return wv, vocabulary

def load_data(data_file, vocabulary, tags, context_size=1):
    dataset = []
    labels = []
    
    words = []
    with open(data_file, 'r') as fh:
        for line in fh:
            line = line.strip()
            if line != '' and 'DOCSTART' not in line:
                instance = line.split(' ')
                words.append(vocabulary[instance[0]])
                labels.append(tags[instance[-1]])
    
    for i in range(len(words)):
        instance = [words[j] if j >= 0 and j < len(words) else vocabulary['UUUNKKK'] \
                    for j in range(i-context_size, i+context_size+1)]
        dataset.append(instance)
    del words
    return dataset, labels

def generate_batch_data(dataset, labels, batch_size):
    steps = int(math.ceil(len(dataset)/batch_size))
    index = 0
    for _ in range(steps):
        batch_data = dataset[index: index+batch_size]
        batch_label = labels[index: index+batch_size]
        index += batch_size
        yield batch_data, batch_label

tag_stats = dict((sc.textFile(train_data)
             .filter(lambda line: line != '' and 'DOCSTART' not in line)
             .map(lambda line: (line.strip().split(' ')[-1], 1))
             .reduceByKey(lambda x,y: x+y)
            ).collect())
tags = dict([(t,tid) for tid,t in enumerate(tag_stats.keys())])
print tags
vectors, vocabulary = load_vectors(os.path.join(data_dir, 'conll_vectors.json'))
reverse_vocabulary = dict([(v,k) for k,v in vocabulary.items()])
X_train, y_train = load_data(train_data, vocabulary, tags, context_size=1)

{u'I-LOC': 0, u'B-ORG': 1, u'I-PER': 3, u'O': 2, u'I-MISC': 4, u'B-MISC': 5, u'I-ORG': 6, u'B-LOC': 7}
Vocabulary Size: 30290
Word Vectors Size: (30290, 300)


In [4]:
print 'Data:'
for i in range(5):
    print X_train[i], map(lambda j: reverse_vocabulary[j], X_train[i])

Data:
[30289, 27407, 24523] ['UUUNKKK', u'EU', u'rejects']
[27407, 24523, 13654] [u'EU', u'rejects', u'German']
[24523, 13654, 26305] [u'rejects', u'German', u'call']
[13654, 26305, 15865] [u'German', u'call', u'to']
[26305, 15865, 25375] [u'call', u'to', u'boycott']


In [20]:
batch_size = 128
num_labels = 8
context_size = 1
feature_size = 300

filter_sizes = [1, 2]

label_encoding = np.zeros((8,8), dtype=np.float32)
np.fill_diagonal(label_encoding, 1.0)

cnn_graph = tf.Graph()
with cnn_graph.as_default():
    input_placeholder = tf.placeholder(shape=(None, 2*context_size+1), 
                                       dtype=tf.int32, name='data')
    label_placeholder = tf.placeholder(shape=[None], dtype=tf.int64)
    with tf.variable_scope('embedding'):
        with tf.device('/cpu:0'):
            embed = tf.nn.embedding_lookup(vectors, input_placeholder)
            window = tf.to_float(tf.reshape(embed, shape=[-1, 2*context_size+1, 
                                                          feature_size, 1]))
            label = tf.to_float(tf.nn.embedding_lookup(label_encoding, label_placeholder))
    with tf.variable_scope('convolution'):
        convolution_outputs = []
        for filter_size in filter_sizes:
            kernel_shape = [filter_size, feature_size, 1, 2]
            kernel = tf.get_variable(name='kernel%s' % filter_size,
                                        shape=kernel_shape,
                                        initializer=tf.truncated_normal_initializer())
            bias = tf.get_variable(name='bias%s' % filter_size,
                                      initializer=tf.zeros_initializer(shape=[2]))
            conv = tf.nn.conv2d(window, kernel, [1, 1, 1, 1], padding='VALID')
            hidden = tf.nn.relu(tf.nn.bias_add(conv, bias))
            convolution_outputs.append(hidden)
        concat = tf.concat(1, convolution_outputs)
        features = tf.reshape(concat, shape=[-1, 10])
    with tf.name_scope('hidden_layers'):
        h_window = tf.to_float(tf.reshape(embed, shape=[-1, (2*context_size+1)*feature_size]))
        window = tf.concat(1, [features, h_window])
        with tf.variable_scope('hidden1'):
            weights = tf.get_variable(name='W', shape=[910, 1800],
                                     initializer=tf.contrib.layers.xavier_initializer())
            bias = tf.get_variable(name='b1', initializer=tf.zeros_initializer(shape=[1,1800]))
            hidden1 = tf.nn.relu(tf.matmul(window, weights) + bias)
        with tf.variable_scope('hidden2'):
            weights = tf.get_variable(name='V', shape=[1800, 900],
                                     initializer=tf.contrib.layers.xavier_initializer())
            bias = tf.get_variable(name='b2', initializer=tf.zeros_initializer(shape=[1,900]))
            hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + bias)
        with tf.variable_scope('hidden3'):
            weights = tf.get_variable(name='T', shape=[900, 300],
                                     initializer=tf.contrib.layers.xavier_initializer())
            bias = tf.get_variable(name='b3', initializer=tf.zeros_initializer(shape=[1,300]))
            hidden3 = tf.nn.relu(tf.matmul(hidden2, weights) + bias)
    with tf.variable_scope('linear'):
        weight = tf.get_variable(name='U',
                                     shape=[300, num_labels],
                                     initializer=tf.contrib.layers.xavier_initializer())
        bias = tf.get_variable(name='b',
                                  initializer=tf.zeros_initializer(shape=[1, num_labels]))
        logits = tf.matmul(hidden3, weight) + bias
    with tf.variable_scope('softmax'):
        prediction = tf.nn.softmax(logits)
    with tf.name_scope('loss'):
        ce_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, label))
        with tf.variable_scope('linear', reuse=True):
            u = tf.get_variable(name='U')
            u_loss = tf.nn.l2_loss(u)
        with tf.variable_scope('hidden1', reuse=True):
            w = tf.get_variable(name='W')
            w_loss = tf.nn.l2_loss(w)
        with tf.variable_scope('hidden2', reuse=True):
            v = tf.get_variable(name='V')
            v_loss = tf.nn.l2_loss(v)
        with tf.variable_scope('hidden3', reuse=True):
            t = tf.get_variable(name='T')
            t_loss = tf.nn.l2_loss(t)
        loss = ce_loss + 0.001 * (u_loss + w_loss + v_loss + t_loss)
    with tf.variable_scope('Optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    with tf.variable_scope('prediction'):
        y_hat = tf.argmax(prediction, dimension=1)
        acc = tf.reduce_mean(tf.cast(tf.equal(y_hat, label_placeholder), dtype=tf.float32))
        confusion_matrix = tf.contrib.metrics.confusion_matrix(y_hat, label_placeholder,
                                                              dtype=tf.float32)

In [None]:
max_epochs = 100
X_valid, y_valid = load_data(valid_data, vocabulary, tags, context_size=1)
with tf.Session(graph=cnn_graph) as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)
    for epoch in range(max_epochs):
        costs=[]
        accs=[]
        for X_batch, y_batch in generate_batch_data(X_train, y_train, batch_size=128):
            feed_dict = {
                input_placeholder: X_batch,
                label_placeholder: y_batch
            }
            _, cost, accuracy = sess.run([optimizer, loss, acc], feed_dict=feed_dict)
            costs.append(cost)
            accs.append(accuracy)
        avg_cost = np.mean(costs)
        avg_acc = np.mean(accs)
        if epoch % 10 == 0:
            print 'Epoch: {}, Loss: {}, Acc: {}'.format(epoch, avg_cost, avg_acc)
            _, cost, accuracy, cm = sess.run([optimizer, loss, acc, confusion_matrix], 
                                             feed_dict={
                    input_placeholder: X_valid,
                    label_placeholder: y_valid
                })
            print 'Validation Loss:{}, Accuracy: {}'.format(cost, accuracy)
            # print cm
            try:
                for tag, i in tags.items():
                    precision = cm[i,i]/np.sum(cm[i, :], dtype=np.float32)
                    recall = cm[i,i]/np.sum(cm[:, i], dtype=np.float32)
                    f1 = 2*precision*recall/(precision + recall)
                    print 'Tag: {}, P: {}, R:{}, F1: {}'.format(tag, precision, recall, f1)
            except:
                pass

Epoch: 0, Loss: 0.573065817356, Acc: 0.863546609879
Validation Loss:0.525649785995, Accuracy: 0.873349964619
Tag: I-LOC, P: 0.674515247345, R:0.465138494968, F1: 0.55059359127
Tag: B-ORG, P: nan, R:nan, F1: nan
Tag: I-PER, P: 0.611987352371, R:0.677675426006, F1: 0.643158471368
Tag: O, P: 0.899215936661, R:0.976332485676, F1: 0.936188848066
Tag: I-MISC, P: 0.0, R:0.0, F1: nan
Tag: B-MISC, P: nan, R:0.0, F1: nan
Tag: I-ORG, P: 0.5, R:0.000956022937316, F1: 0.00190839702373


