In [None]:
%matplotlib inline
import tensorflow as tf
import numpy as np
from pyspark.mllib.feature import Word2Vec
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import math
import seaborn

In [None]:
# Paths of the data files used by this notebook; every file sits directly
# inside ``data_dir``.
data_dir = os.path.join('../data')
conll_data, train_data, valid_data, test_data = (
    os.path.join(data_dir, split_file)
    for split_file in ('eng.conll', 'eng.train', 'eng.testa', 'eng.testb')
)

In [None]:
def load_vectors(data_file, dim):
    """Load word vectors from a JSON file and build the matching vocabulary.

    Args:
        data_file: path to a JSON file holding an object that maps each
            word to its vector (a list of ``dim`` numbers).
        dim: length of every word vector; also the width of the extra
            all-zero row appended for the unknown-word token.

    Returns:
        A ``(wv, vocabulary)`` tuple. ``wv`` is a float array of shape
        ``(number_of_words + 1, dim)`` whose last row is all zeros.
        ``vocabulary`` maps each word to its row index in ``wv`` and maps
        ``'UUUNKKK'`` to the index of the final zero row.

    Raises:
        ValueError: if the stored vectors do not all have length ``dim``.
    """
    # Use a context manager so the file handle is closed even if parsing fails.
    with open(data_file, 'r') as handle:
        vectors = json.load(handle)

    # Fix the word order once and derive both the ids and the matrix rows
    # from it, so row i always belongs to the word with id i (this also
    # avoids feeding a dict view to NumPy, which does not work on Python 3).
    words = list(vectors)
    vocabulary = dict((word, wid) for wid, word in enumerate(words))
    vocabulary['UUUNKKK'] = len(vocabulary)

    # Pre-allocate with the trailing zero row; an empty file then yields
    # just that single row instead of a stacking error.
    wv = np.zeros((len(words) + 1, dim))
    if words:
        wv[:-1] = [vectors[word] for word in words]

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Vocabulary Size: {}'.format(len(vocabulary)))
    print('Word Vectors Size: {}'.format(wv.shape))
    del vectors
    return wv, vocabulary

def load_chars_set(data_file):
    """Load the character set from a JSON file and assign an id to each char.

    Args:
        data_file: path to a JSON file holding an object whose keys are the
            known characters (the values are not used).

    Returns:
        A dict mapping every character to a distinct integer id in
        ``range(len(result))``. The padding symbol ``'~'`` is always
        present: it keeps its id when it is already one of the characters,
        otherwise it receives the next free id.
    """
    # Use a context manager so the file handle is closed even if parsing fails.
    with open(data_file, 'r') as handle:
        char_set = json.load(handle)

    char_map = dict((char, cid) for cid, char in enumerate(char_set))
    # setdefault instead of plain assignment: overwriting an existing '~'
    # with len(char_map) would hand it an id equal to the table size, i.e.
    # one past the last valid row of a table with len(char_map) rows.
    char_map.setdefault('~', len(char_map))
    del char_set
    return char_map

def pad(word, char_map, length, symbol='~'):
    """Right-pad a list of character ids to ``length`` entries.

    Args:
        word: list of character ids; it is extended in place.
        char_map: dict mapping characters to ids; must contain ``symbol``.
        length: target number of entries.
        symbol: character whose id is used as filler.

    Returns:
        The same ``word`` list, padded with ``char_map[symbol]`` up to
        ``length`` entries. Lists that already have ``length`` or more
        entries are returned unchanged (nothing is truncated).

    Raises:
        KeyError: if ``symbol`` is not a key of ``char_map``.
    """
    pad_id = char_map[symbol]
    missing = length - len(word)
    if missing > 0:
        # Wrap the id in a list before repeating it: multiplying the bare
        # integer id by the count yields a number, not a run of pad ids,
        # and cannot be appended to the word.
        word += [pad_id] * missing
    return word

# TODO: implement the dataset loader.
def load_data(data_file, vocabulary, tags, context_size=1):
    """Placeholder for the dataset loader; not implemented yet.

    All arguments are accepted but ignored, and the function currently
    returns ``None``.
    """
    return None

def generate_batch_data(dataset, labels, batch_size):
    """Yield consecutive ``(batch_data, batch_label)`` slices.

    Args:
        dataset: sliceable sequence of examples.
        labels: sliceable sequence of labels, aligned with ``dataset``.
        batch_size: maximum number of examples per batch.

    Yields:
        Tuples ``(dataset[i:i + batch_size], labels[i:i + batch_size])``
        for ``i = 0, batch_size, 2 * batch_size, ...``. The final batch is
        shorter when ``len(dataset)`` is not a multiple of ``batch_size``;
        an empty dataset yields nothing.

    Raises:
        ValueError: if ``batch_size`` is zero (raised on first iteration).
    """
    # Step over the start offsets directly instead of computing
    # ceil(len / batch_size): under Python 2 that division is integer
    # division, so the ceil was a no-op and the trailing partial batch
    # was silently dropped.
    for start in range(0, len(dataset), batch_size):
        stop = start + batch_size
        yield dataset[start:stop], labels[start:stop]

In [None]:
# ---- Hyper-parameters ----------------------------------------------------
batch_size = 128
char_embed_size = 128   # width of every character embedding vector
word_embed_size = 100
# Number of character ids per row of `chars_placeholder`
# (presumably the padded word length -- TODO confirm against the data loader).
max_word_length = 61
num_filters = 2         # output channels of each convolution kernel
num_labels = 8          # number of rows/columns of the one-hot label table

# Heights of the convolution kernels slid along the character axis.
kernel_sizes = [2, 3, 5]

# One-hot table: row i encodes label id i. NumPy requires a NumPy dtype here;
# a TensorFlow DType such as tf.float32 is not a valid `dtype` argument.
label_encoding = np.eye(num_labels, dtype=np.float32)

# NOTE(review): `char_map` and `vectors` are not defined in any cell above
# this one; presumably they are meant to come from load_chars_set() and
# load_vectors() -- confirm and add the loading cell before running.
# NOTE(review): tf.concat(axis, values) and tf.zeros_initializer(shape=...)
# below follow the pre-1.0 TensorFlow signatures; they are left unchanged.
graph = tf.Graph()
with graph.as_default():
    input_placeholder = tf.placeholder(shape=[None], dtype=tf.int32)
    label_placeholder = tf.placeholder(shape=[None], dtype=tf.int64)
    chars_placeholder = tf.placeholder(shape=[None, max_word_length],
                                       dtype=tf.int32)
    # Trainable character embedding table, one row per entry of char_map.
    chars_embedding = tf.Variable(
        tf.truncated_normal(shape=[len(char_map), char_embed_size]),
        trainable=True, name='char_embedding',
        dtype=tf.float32)
    with tf.variable_scope('embedding'):
        with tf.device('/cpu:0'):
            word_embedding = tf.nn.embedding_lookup(vectors, input_placeholder)
            char_embedding = tf.nn.embedding_lookup(chars_embedding, chars_placeholder)
            # Label ids -> one-hot rows of label_encoding.
            y = tf.nn.embedding_lookup(label_encoding, label_placeholder)
            # Append a channel axis of size 1 so conv2d receives a 4-D input.
            char_embed = tf.reshape(
                char_embedding,
                shape=[-1, max_word_length, char_embed_size, 1])
    with tf.variable_scope('conv'):
        convolution_outputs = []
        for kernel_size in kernel_sizes:
            # Each kernel covers `kernel_size` characters and the full
            # embedding width.
            kernel_shape = [kernel_size, char_embed_size, 1, num_filters]
            kernel = tf.get_variable(name='kernel%s'%kernel_size,
                                     shape=kernel_shape,
                                     initializer=tf.truncated_normal_initializer())
            bias = tf.get_variable(name='bias%s'%kernel_size,
                                   initializer=tf.zeros_initializer(shape=[num_filters]))
            conv = tf.nn.conv2d(char_embed, kernel, [1, 1, 1, 1], padding='VALID')
            hidden = tf.nn.relu(tf.nn.bias_add(conv, bias))
            convolution_outputs.append(hidden)
        # Join the feature maps of all kernel sizes along axis 1.
        concat = tf.concat(1, convolution_outputs)
        
        