In [1]:
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
import numpy as np

tf.set_random_seed(42)
np.random.seed(42)

tf.__version__

'1.6.0'

In [2]:
class HierarchicalAttentionNetwork(object):
    def __init__(self, embedding_matrix, num_class, hidden_dim = None):
        super(HierarchicalAttentionNetwork, self).__init__()
        self._embedding_matrix = embedding_matrix
        self._num_class = num_class
        self._hidden_dim = \
            hidden_dim if hidden_dim is not None else self._embedding_matrix.shape[1]
    def _make_rnn_cell(self):
        cell = tf.nn.rnn_cell.GRUCell(num_units=self._hidden_dim)
        return cell
    def _make_graph_batch(self, graph):
        with graph.as_default():
            words = tf.placeholder(tf.int32, [None, None, None], name='words')
            words_length = tf.placeholder(tf.int32, [None, None], name='words_length')
            sentences_length = tf.placeholder(tf.int32, [None], name='sentences_length')
            labels = tf.placeholder(tf.int32, [None], name='labels')
            init_state = tf.placeholder(tf.float32, [None, None, self._hidden_dim * 2], name='init_state')
            
            with tf.variable_scope('embeddings'):
                embedding = \
                    tf.get_variable('parameter', 
                                    shape=self._embedding_matrix.shape, 
                                    initializer=tf.constant_initializer(embedding_matrix), 
                                    dtype=tf.float32, trainable=True)
                embedded  = tf.nn.embedding_lookup(embedding, words, name='lookup')
            with tf.variable_scope('words_lstm'):
                cell_fw = self._make_rnn_cell()
                cell_bw = self._make_rnn_cell()
                def step(state, inp):
                    data = inp[0]
                    length = inp[1]
                    fw_state = tf.split(tf.map_fn(lambda x: x[0][x[1], :], (state, length - 1), dtype=tf.float32), 2, axis=1)[0]
                    bw_state = tf.split(state[:, 0, :], 2, axis=1)[1]
                    (outputs_fw, outputs_bw), _ = \
                        tf.nn.bidirectional_dynamic_rnn(
                            cell_fw, cell_bw, data, sequence_length=length, 
                            initial_state_fw=fw_state, initial_state_bw=bw_state, dtype=tf.float32
                        )
                    outputs = tf.concat([outputs_fw, outputs_bw], axis=2)
                    return outputs
                embedded_t = tf.transpose(embedded, perm=[1, 0, 2, 3])
                words_length_t = tf.transpose(words_length)
                outputs = tf.scan(step, (embedded_t, words_length_t), initializer=init_state)
                outputs = tf.transpose(outputs, perm=[1, 0, 2, 3])
#                 def fn(inp):
#                     (outputs_fw, outputs_bw), _ = \
#                         tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, inp[0], sequence_length=inp[1], dtype=tf.float32)
#                     return tf.concat([outputs_fw, outputs_bw], axis=2)
#                 outputs = tf.map_fn(fn, (embedded, words_length), dtype=tf.float32)
            with tf.variable_scope('words_attention'):
                hidden = \
                    tf.layers.dense(outputs, units=self._hidden_dim * 2, 
                                    activation=tf.nn.tanh, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = \
                    tf.layers.dense(hidden, units=1, 
                                    activation=None, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = tf.transpose(tf.nn.softmax(tf.transpose(attention, perm=[0, 1, 3, 2])), perm=[0, 1, 3, 2])
            outputs = tf.reduce_sum(outputs * attention, axis=2)
            with tf.variable_scope('sentence_lstm'):
                cell_fw = self._make_rnn_cell()
                cell_bw = self._make_rnn_cell()
                (outputs_fw, outputs_bw), _ = \
                    tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, outputs, sequence_length=sentences_length, dtype=tf.float32)
            outputs = tf.concat([outputs_fw, outputs_bw], axis=2)
            with tf.variable_scope('sentence_attention'):
                hidden = \
                    tf.layers.dense(outputs, units=self._hidden_dim * 2, 
                                    activation=tf.nn.tanh, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = \
                    tf.layers.dense(hidden, units=1, 
                                    activation=None, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = tf.transpose(tf.nn.softmax(tf.transpose(attention, perm=[0, 2, 1])), perm=[0, 2, 1])
            outputs = tf.reduce_sum(outputs * attention, axis=1)
            logits = tf.layers.dense(outputs, units=self._num_class, activation=None)
            loss = tf.log(tf.boolean_mask(tf.nn.softmax(logits), tf.one_hot(labels, self._num_class, on_value=True, off_value=False)))
            loss = -tf.reduce_sum(loss)
            training_op = tf.train.AdamOptimizer(learning_rate=0.01).minimize(loss)
            return words, words_length, sentences_length, labels, init_state, logits, loss, training_op
    def _make_graph(self, graph):
        with graph.as_default():
            words = tf.placeholder(tf.int32, [None, None], name='words')
            length = tf.placeholder(tf.int32, [None], name='length')
            labels = tf.placeholder(tf.int32, (), name='labels')
            init_state = tf.placeholder(tf.float32, [None, self._hidden_dim * 2], name='initial_state')
            
            with tf.variable_scope('embeddings'):
                embedding = \
                    tf.get_variable('parameter', 
                                    shape=self._embedding_matrix.shape, 
                                    initializer=tf.constant_initializer(embedding_matrix), 
                                    dtype=tf.float32, trainable=True)
                embedded  = tf.nn.embedding_lookup(embedding, words, name='lookup')
            with tf.variable_scope('words_lstm'):
                cell_fw = self._make_rnn_cell()
                cell_bw = self._make_rnn_cell()
                def step(state, inp):
                    data = tf.expand_dims(inp[0], axis=0)
                    length = tf.expand_dims(inp[1], axis=0)
                    fw_state = tf.split(state[inp[1] - 1, :], 2)[0]
                    bw_state = tf.split(state[0, :], 2)[1]
                    fw_state = tf.expand_dims(fw_state, axis=0)
                    bw_state = tf.expand_dims(bw_state, axis=0)
                    (outputs_fw, outputs_bw), _ = \
                        tf.nn.bidirectional_dynamic_rnn(
                            cell_fw, cell_bw, data, sequence_length=length, 
                            initial_state_fw=fw_state, initial_state_bw=bw_state, dtype=tf.float32
                        )
                    outputs = tf.squeeze(tf.concat([outputs_fw, outputs_bw], axis=2), axis=[0])
                    return outputs
                outputs = tf.scan(step, (embedded, length), initializer=init_state)
            with tf.variable_scope('words_attention'):
                hidden = \
                    tf.layers.dense(outputs, units=self._hidden_dim * 2, 
                                    activation=tf.nn.tanh, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = \
                    tf.layers.dense(hidden, units=1, 
                                    activation=None, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = tf.transpose(tf.nn.softmax(tf.transpose(attention, perm=[0, 2, 1])), perm=[0, 2, 1])
            sentence_embedding = tf.reduce_sum(outputs * attention, axis=1)
            sentence_embedding = tf.expand_dims(sentence_embedding, axis=0)
            
            with tf.variable_scope('sentence_lstm'):
                cell_fw = self._make_rnn_cell()
                cell_bw = self._make_rnn_cell()
                (outputs_fw, outputs_bw), _ = \
                    tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, sentence_embedding, dtype=tf.float32)
            outputs = tf.squeeze(tf.concat([outputs_fw, outputs_bw], axis=2), axis=[0])
            with tf.variable_scope('sentence_attention'):
                hidden = \
                    tf.layers.dense(outputs, units=self._hidden_dim * 2, 
                                    activation=tf.nn.tanh, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = \
                    tf.layers.dense(hidden, units=1, 
                                    activation=None, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = tf.transpose(tf.nn.softmax(tf.transpose(attention)))                
            outputs = tf.reduce_sum(outputs * attention, axis=0)
            outputs = tf.expand_dims(outputs, axis=0)
            logits = tf.layers.dense(outputs, units=self._num_class, activation=None)
            loss = -tf.log(tf.nn.softmax(logits)[:, labels], name='loss')
#             loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=tf.expand_dims(labels, axis=0))
#             loss = tf.squeeze(loss, axis=[0])
            training_op = tf.train.AdamOptimizer(learning_rate=0.01).minimize(loss)
            return words, length, labels, init_state, logits, loss, training_op            
    def evaluate_batch(self, train_words_seq, train_length_seq, train_labels_seq, eval_words_seq, eval_length_seq, eval_labels_seq, num_epochs=20, batch_size=16, model=None):
        def prepare_data_batch(words_seq, length_seq, label_seq):
            max_words_len = max([max([w.shape[0] for w in ws]) for ws in words_seq])
            max_sents_len = max([ws.shape[0] for ws in words_seq])
            def pad(array):
                expand = np.zeros((max_sents_len, max_words_len), dtype=np.int32)
                expand[:array.shape[0], :array.shape[1]] = array
                return expand
            words = np.stack([pad(ws) for ws in words_seq])    
            words_length = tf.keras.preprocessing.sequence.pad_sequences(length_seq, padding='post', value=0.0)
            sentence_length = np.array([w.shape[0] for w in words_seq])
            labels = np.array(label_seq)
            return words, words_length, sentence_length, labels        
        
        graph = tf.Graph()
        words, words_length, sents_length, labels, init_state, logits, loss, training_op = \
            self._make_graph_batch(graph)
        
        with graph.as_default():
            predictions = \
                tf.to_float(tf.nn.in_top_k(logits, labels, k=1))
        
        # pre calculate the padded training sequence
        train_words_batch, train_words_length_batch, train_sents_length_batch, train_labels_batch = \
            prepare_data_batch(train_words_seq, train_length_seq, train_labels_seq)
        init_state_batch = np.zeros((train_words_batch.shape[0], train_words_batch.shape[2], 2 * self._hidden_dim), dtype=np.float32)
        train_feed_dict = \
            { 
                words : train_words_batch, 
                words_length : train_words_length_batch, 
                sents_length : train_sents_length_batch, 
                labels : train_labels_batch,
                init_state : init_state_batch
            } 
        # pre calculate the padded eval sequence
        eval_words_batch, eval_words_length_batch, eval_sents_length_batch, eval_labels_batch = \
            prepare_data_batch(eval_words_seq, eval_length_seq, eval_labels_seq)
        init_state_val = np.zeros((eval_words_batch.shape[0], eval_words_batch.shape[2], 2 * self._hidden_dim), dtype=np.float32)
        eval_feed_dict = \
            { 
                words : eval_words_batch, 
                words_length : eval_words_length_batch, 
                sents_length : eval_sents_length_batch, 
                labels : eval_labels_batch,
                init_state : init_state_val
            }
        
        with tf.Session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(num_epochs):
                st = None
                en = None
                while True:
                    if st is None: st = 0
                    else: st += batch_size
                    if en is None: en = batch_size
                    else: en += batch_size
                        
                    if not train_words_seq[st:en]: break
                    train_words_batch, train_words_length_batch, train_sents_length_batch, train_labels_batch = \
                        prepare_data_batch(train_words_seq[st:en], train_length_seq[st:en], train_labels_seq[st:en])
                    init_state_batch = np.zeros((train_words_batch.shape[0], train_words_batch.shape[2], 2 * self._hidden_dim), dtype=np.float32)
                    feed_dict = \
                        { 
                            words : train_words_batch, 
                            words_length : train_words_length_batch, 
                            sents_length : train_sents_length_batch, 
                            labels : train_labels_batch,
                            init_state : init_state_batch
                        }
                    sess.run(training_op, feed_dict=feed_dict)
                loss_val, pred_val = sess.run([loss, predictions], feed_dict=train_feed_dict)
                eval_pred_val = sess.run(predictions, feed_dict=eval_feed_dict)                
                print('Epoch [%d/%d], Loss %.3f, accuracy %.3f, eval accuracy %.3f' % (epoch + 1, num_epochs, loss_val, np.mean(pred_val), np.mean(eval_pred_val)))
            if model is not None:
                with graph().as_default():
                    tf.train.Saver().save(sess, model)
    def evaluate(self, train_words_seq, train_length_seq, train_labels_seq, eval_words_seq, eval_length_seq, eval_labels_seq, num_epochs=20, model=None):
        graph = tf.Graph()
#         words, length, labels, logits, loss, training_op = self._make_graph(graph)
        words, length, labels, init_state, logits, loss, training_op = self._make_graph(graph)
        with graph.as_default():
            prediction = \
                tf.to_float(tf.nn.in_top_k(logits, tf.expand_dims(labels, axis=0), k=1))
        with tf.Session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(num_epochs):
                for words_val, length_val, labels_val in zip(train_words_seq, train_length_seq, train_labels_seq):
                    init_state_val = np.zeros((words_val.shape[-1], 2 * self._hidden_dim), dtype=np.float32)
                    feed_dict = { words : words_val, length : length_val, labels : labels_val, init_state : init_state_val }
                    sess.run(training_op, feed_dict=feed_dict)
                losses = []
                preds = []
                for words_val, length_val, labels_val in zip(train_words_seq, train_length_seq, train_labels_seq):
                    init_state_val = np.zeros((words_val.shape[-1], 2 * self._hidden_dim), dtype=np.float32)
                    feed_dict = { words : words_val, length : length_val, labels : labels_val, init_state : init_state_val }
                    loss_val, pred_val = sess.run([loss, prediction], feed_dict=feed_dict)
                    losses.append(loss_val)
                    preds.append(pred_val)
                eval_preds = []
                for words_val, length_val, labels_val in zip(eval_words_seq, eval_length_seq, eval_labels_seq):
                    init_state_val = np.zeros((words_val.shape[-1], 2 * self._hidden_dim), dtype=np.float32)
                    feed_dict = { words : words_val, length : length_val, labels : labels_val, init_state : init_state_val }
                    pred_val = sess.run(prediction, feed_dict=feed_dict)
                    eval_preds.append(pred_val)
                print('Epoch [%d/%d], Loss %.3f, accuracy %.3f, eval accuracy %.3f' % (epoch + 1, num_epochs, np.sum(losses), np.mean(preds), np.mean(eval_preds)))
            if model is not None:
                with graph.as_default():
                    tf.train.Saver().save(sess, model)

In [3]:
from sklearn.datasets import fetch_20newsgroups
import spacy
# from gensim.scripts.glove2word2vec import glove2word2vec
# from gensim.models import KeyedVectors

train_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_data  = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
nlp = spacy.load('en')

# glove2word2vec('data/glove.6B.100d.txt', 'data/glove.6B.100d.converted.txt')
# embeddings = KeyedVectors.load_word2vec_format('data/glove.6B.100d.converted.txt')

print('train_data shape: ', train_data.target.shape[0])
print('test_data shape: ', test_data.target.shape[0])
# print('embeddings shape: ', embeddings.syn0.shape)

train_data shape:  11314
test_data shape:  7532


In [4]:
tag_to_ix = dict([(n, i) for i, n in enumerate(train_data.target_names)])

train_data_size = 1000
test_data_size = 100

batch_size = train_data_size / 10

def prepare_embeddings(data):
    embeddings = dict()
    for i, doc in enumerate(data):
        for token in nlp(doc):
            if token.is_punct: continue
            w = token.lower_.strip()
            if not w: continue
            embeddings[w] = len(embeddings)
#         if (i + 1) % batch_size == 0: print('processed %d documents' % (i + 1))
    return embeddings

def prepare_data(data, labels, embeddings):
    sequences_seq = []
    for i, doc in enumerate(data):
        sequences = []
        for sent in nlp(doc).sents:
            sequence = []
            for token in sent:
                if token.is_punct: continue
                w = token.lower_.strip()
                if not w: continue
                
                if w in embeddings: sequence.append(embeddings[w] + 1)
                else: sequence.append(0)
            if sequence: sequences.append(sequence)
        if sequences: sequences_seq.append(sequences)
#         if (i + 1) % batch_size == 0: print('processed %d documents' % (i + 1))
                
    words_seq = []
    length_seq = []
    label_seq = []
    for i, (sequences, label) in enumerate(zip(sequences_seq, labels)):
        words = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post', value=-1)
        length = np.apply_along_axis(lambda ws : next(i for i, d in enumerate(ws) if d < 0), axis=1, arr=np.c_[words, np.ones((len(words), 1)) * -1])
        words_seq.append(words + np.where(words < 0, 1, 0).astype(np.int32))
        length_seq.append(length)
        label_seq.append(label)
#         if (i + 1) % batch_size == 0: print('padded %d documents' % (i + 1))
    return words_seq, length_seq, label_seq

# print('preparing embeddings')
embeddings = prepare_embeddings(train_data.data[:train_data_size])# + test_data.data[:test_data_size])
# print('converting training documents')
train_words_seq, train_length_seq, train_label_seq = \
    prepare_data(train_data.data[:train_data_size], train_data.target[:train_data_size], embeddings)
# print('converting test documents')
test_words_seq, test_length_seq, test_label_seq = \
    prepare_data(test_data.data[:test_data_size], test_data.target[:test_data_size], embeddings)                                
#     prepare_data(train_data.data[train_data_size:train_data_size+test_data_size], train_data.target[train_data_size:train_data_size+test_data_size], embeddings)

In [7]:
print('online training (feeding one training example at a time)')
embedding_matrix = np.random.randn(len(embeddings) + 2, 10)
model = HierarchicalAttentionNetwork(embedding_matrix, len(tag_to_ix), hidden_dim=10)
model.evaluate(train_words_seq, train_length_seq, train_label_seq, test_words_seq, test_length_seq, test_label_seq, 40)

online training (feeding one training example at a time)
Epoch [1/40], Loss 2897.101, accuracy 0.070, eval accuracy 0.071
Epoch [2/40], Loss 2818.146, accuracy 0.128, eval accuracy 0.040
Epoch [3/40], Loss 2617.001, accuracy 0.156, eval accuracy 0.030
Epoch [4/40], Loss 2173.917, accuracy 0.288, eval accuracy 0.040
Epoch [5/40], Loss 1915.840, accuracy 0.351, eval accuracy 0.051
Epoch [6/40], Loss 1545.377, accuracy 0.492, eval accuracy 0.040
Epoch [7/40], Loss 1295.022, accuracy 0.554, eval accuracy 0.040
Epoch [8/40], Loss 1179.530, accuracy 0.598, eval accuracy 0.121
Epoch [9/40], Loss 751.712, accuracy 0.753, eval accuracy 0.020
Epoch [10/40], Loss 729.572, accuracy 0.755, eval accuracy 0.051
Epoch [11/40], Loss 920.071, accuracy 0.709, eval accuracy 0.030
Epoch [12/40], Loss 695.593, accuracy 0.776, eval accuracy 0.000
Epoch [13/40], Loss 382.206, accuracy 0.879, eval accuracy 0.030
Epoch [14/40], Loss 515.913, accuracy 0.836, eval accuracy 0.030
Epoch [15/40], Loss 485.860, accur

In [6]:
batch_size = 100
print('mini batch training (batch size = %d)' % batch_size)
embedding_matrix = np.random.randn(len(embeddings) + 2, 10)
model = HierarchicalAttentionNetwork(embedding_matrix, len(tag_to_ix), hidden_dim=10)
model.evaluate_batch(train_words_seq, train_length_seq, train_label_seq, test_words_seq, test_length_seq, test_label_seq, num_epochs=40, batch_size=batch_size)

mini batch training (batch size = 100)
Epoch [1/40], Loss 2910.292, accuracy 0.062, eval accuracy 0.051
Epoch [2/40], Loss 2906.813, accuracy 0.068, eval accuracy 0.040
Epoch [3/40], Loss 2901.758, accuracy 0.068, eval accuracy 0.081
Epoch [4/40], Loss 2901.286, accuracy 0.068, eval accuracy 0.071
Epoch [5/40], Loss 2898.722, accuracy 0.070, eval accuracy 0.081
Epoch [6/40], Loss 2897.740, accuracy 0.067, eval accuracy 0.061
Epoch [7/40], Loss 2889.184, accuracy 0.084, eval accuracy 0.051
Epoch [8/40], Loss 2813.171, accuracy 0.111, eval accuracy 0.051
Epoch [9/40], Loss 2598.731, accuracy 0.172, eval accuracy 0.040
Epoch [10/40], Loss 2480.859, accuracy 0.199, eval accuracy 0.030
Epoch [11/40], Loss 2452.607, accuracy 0.229, eval accuracy 0.040
Epoch [12/40], Loss 2432.214, accuracy 0.225, eval accuracy 0.010
Epoch [13/40], Loss 2357.286, accuracy 0.191, eval accuracy 0.020
Epoch [14/40], Loss 2241.114, accuracy 0.293, eval accuracy 0.010
Epoch [15/40], Loss 2037.580, accuracy 0.364, 