In [1]:
import tensorflow as tf
from elmo import elmo_embedding
import numpy as np
import os
from tqdm import tqdm
import json
from model_v2 import LanguageModel

In [2]:
# Tag-name -> integer-id lookup tables, one JSON file per label set.
with open('pos2idx.json', 'r') as pos_file:
    pos2idx = json.loads(pos_file.read())
with open('ner2idx.json', 'r') as ner_file:
    ner2idx = json.loads(ner_file.read())

In [3]:
def tagger(inputs, labels, seq_lens, n_units, n_classes, drop_i, drop_o, name='tagger', reuse=False, is_training=True):
    """Build a bidirectional GRU + linear-chain CRF tagger on top of `inputs`.

    Args:
        inputs: float tensor passed to the bidirectional RNN with
            ``time_major=False``. It is indexed as three dimensions below and
            its last dimension must be static, since it sizes the dropout
            wrapper and the projection matrix.
        labels: integer tag ids, used as CRF targets and compared with the
            decoded tags for the accuracy.
        seq_lens: per-example sequence lengths.
        n_units: hidden size of each GRU direction.
        n_classes: number of tags scored by the projection / CRF.
        drop_i: input dropout rate (the wrapper gets ``1.0 - drop_i``).
        drop_o: output dropout rate (the wrapper gets ``1.0 - drop_o``).
        name: variable scope holding every variable created here.
        reuse: forwarded to the variable scope and to the GRU cells.
        is_training: Python bool; the dropout wrapper is only added when true.

    Returns:
        Tuple ``(outputs, loss, transition_params, decode_tags, best_scores,
        acc)``: per-position class scores, the batch mean of the negative CRF
        log-likelihood, the CRF transition matrix, the decoded tags with their
        scores, and the fraction of positions inside ``seq_lens`` whose decoded
        tag equals the label.
    """
    with tf.variable_scope(name, reuse=reuse):
        def _make_cell():
            cell = tf.contrib.rnn.GRUBlockCellV2(n_units, name='cell', reuse=reuse)
            if is_training:
                cell = tf.nn.rnn_cell.DropoutWrapper(
                    cell=cell,
                    input_keep_prob=1.0 - drop_i,
                    output_keep_prob=1.0 - drop_o,
                    variational_recurrent=True,
                    input_size=inputs.shape[-1],
                    dtype=tf.float32
                )
            return cell

        outputs, _ = tf.nn.bidirectional_dynamic_rnn(_make_cell(), _make_cell(), inputs, seq_lens, time_major=False, dtype=tf.float32)
        # Concatenate forward states, backward states and the raw inputs.
        outputs = tf.concat(outputs + (inputs,), axis=-1)
        s = tf.shape(outputs)
        W = tf.get_variable(name='W', shape=(outputs.shape[2], n_classes), initializer=tf.glorot_uniform_initializer(), trainable=True)
        b = tf.get_variable(name='b', shape=(n_classes, ), initializer=tf.zeros_initializer(), trainable=True)
        # Flatten the first two dimensions so a single matmul scores every position.
        outputs = tf.reshape(outputs, (s[0] * s[1], s[2]), name='before_proj')
        outputs = tf.nn.xw_plus_b(outputs, W, b)
        outputs = tf.reshape(outputs, (s[0], s[1], n_classes), name='after_proj')
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            inputs=outputs,
            tag_indices=labels,
            sequence_lengths=seq_lens
        )
        loss = tf.reduce_mean(-log_likelihood)
        decode_tags, best_scores = tf.contrib.crf.crf_decode(
            potentials=outputs,
            transition_params=transition_params,
            sequence_length=seq_lens
        )
        # Pin the mask width to the padded length s[1]. Without maxlen the mask
        # is only max(seq_lens) wide and stops lining up with decode_tags when
        # a batch is padded beyond its longest sequence.
        mask = tf.sequence_mask(seq_lens, maxlen=s[1], dtype=tf.float32)
        correct = tf.cast(tf.equal(decode_tags, labels), tf.float32)
        acc = tf.reduce_sum(correct * mask) / tf.reduce_sum(mask)
    return outputs, loss, transition_params, decode_tags, best_scores, acc

In [None]:
# Training arrays, keyed by the integer at the start of each file name.
train_embed, train_pos, train_ner = {}, {}, {}
for dirpath, _, filenames in os.walk('VLSP/train'):
    npy_names = [fname for fname in filenames if fname.endswith('.npy')]
    for fname in tqdm(npy_names):
        with open(os.path.join(dirpath, fname), 'rb') as handle:
            loaded = np.load(handle)
        # Names end in '<kind>.npy': everything before the last five characters
        # is the numeric key, the character right before '.npy' picks the dict.
        sample_id = int(fname[:-5])
        kind = fname[-5]
        if kind == 'e':
            train_embed[sample_id] = loaded
        elif kind == 'n':
            train_ner[sample_id] = [ner2idx[tag] for tag in loaded]
        elif kind == 'p':
            train_pos[sample_id] = [pos2idx[tag] for tag in loaded]
        else:
            raise ValueError()

 68%|██████▊   | 34602/50577 [05:50<02:41, 98.84it/s] 

In [None]:
# Test arrays, keyed by the integer at the start of each file name.
test_embed, test_pos, test_ner = {}, {}, {}
for dirpath, _, filenames in os.walk('VLSP/test'):
    npy_names = [fname for fname in filenames if fname.endswith('.npy')]
    for fname in tqdm(npy_names):
        with open(os.path.join(dirpath, fname), 'rb') as handle:
            loaded = np.load(handle)
        # Names end in '<kind>.npy': everything before the last five characters
        # is the numeric key, the character right before '.npy' picks the dict.
        sample_id = int(fname[:-5])
        kind = fname[-5]
        if kind == 'e':
            test_embed[sample_id] = loaded
        elif kind == 'n':
            test_ner[sample_id] = [ner2idx[tag] for tag in loaded]
        elif kind == 'p':
            test_pos[sample_id] = [pos2idx[tag] for tag in loaded]
        else:
            raise ValueError()

In [None]:
def pad_sequences(sequences):
    """Zero-pad variable-length sequences into one batch-major array.

    Args:
        sequences: non-empty list (or array) of sequences. When the first
            element is an ``np.ndarray`` its trailing dimensions
            (``shape[1:]``) are kept for every row; otherwise each element is
            treated as a flat sequence of scalars.

    Returns:
        Array of shape ``(len(sequences), maxlen) + trailing`` created with
        ``np.zeros`` (so it has NumPy's default float dtype); positions past a
        sequence's own length stay zero.

    Raises:
        ValueError: if ``sequences`` is empty (raised by ``max``).
    """
    maxlen = max(len(seq) for seq in sequences)
    first = sequences[0]
    trailing = first.shape[1:] if isinstance(first, np.ndarray) else ()
    # Allocate batch-major directly instead of filling time-major and
    # transposing afterwards; the resulting values are the same.
    padded = np.zeros((len(sequences), maxlen) + trailing)
    for row, seq in enumerate(sequences):
        padded[row, :len(seq)] = seq
    return padded


def get_batch(data, batch_size, shuffle=True):
    """Yield padded mini-batches from a sequence of 3-item samples.

    Args:
        data: indexable collection of samples; ``sample[0]``, ``sample[1]`` and
            ``sample[2]`` are padded independently and ``len(sample[0])`` is
            reported as the sequence length.
        batch_size: positive number of samples per batch (the last batch may
            be smaller).
        shuffle: when true, visit samples in a random order drawn from
            ``np.random.permutation``.

    Yields:
        ``(embed, batch_pos, batch_ner, seq_len)`` where the first three are
        the padded arrays for item 0, 1 and 2 and ``seq_len`` is a list of the
        unpadded lengths.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')
    order = np.arange(0, len(data))
    if shuffle:
        order = np.random.permutation(order)
    for start in range(0, len(order), batch_size):
        # Index the samples one by one rather than via np.array(data): packing
        # ragged (array, list, list) tuples into an ndarray raises on current
        # NumPy and mis-broadcasts on older versions when lengths coincide.
        batch = [data[j] for j in order[start:start + batch_size]]
        embed = pad_sequences([sample[0] for sample in batch])
        batch_pos = pad_sequences([sample[1] for sample in batch])
        batch_ner = pad_sequences([sample[2] for sample in batch])
        seq_len = [len(sample[0]) for sample in batch]
        yield embed, batch_pos, batch_ner, seq_len

In [None]:
# One (embedding, pos ids, ner ids) tuple per key of the embedding dicts,
# in the dicts' iteration order.
train_data = [
    (embedding, train_pos[key], train_ner[key])
    for key, embedding in train_embed.items()
]
test_data = [
    (embedding, test_pos[key], test_ner[key])
    for key, embedding in test_embed.items()
]

In [None]:
# Start from an empty graph so re-running this cell does not accumulate ops.
tf.reset_default_graph()
session = tf.Session()

# Placeholders fed by the training / evaluation loops.
x = tf.placeholder(dtype=tf.float32, shape=(None, None, 1024, 4), name='x')
y = tf.placeholder(dtype=tf.int32, shape=(None, None), name='y')
seq_len = tf.placeholder(dtype=tf.int32, shape=(None,), name='seq_len')
drop_i = tf.placeholder(dtype=tf.float32, shape=(), name='drop_i')
drop_o = tf.placeholder(dtype=tf.float32, shape=(), name='drop_o')

elmo, elmo_l2_reg = elmo_embedding(x, seq_len, layer_norm=True)

# The loops below feed NER ids into `y`, so the output layer and CRF must be
# sized from the NER vocabulary (it was previously sized from pos2idx).
# max + 1 covers every id stored in ner2idx as well as the 0 that
# pad_sequences writes into padded positions.
n_ner_classes = max(ner2idx.values()) + 1
outputs, loss, transition_params, decode_tags, best_scores, acc = tagger(elmo, y, seq_len, 200, n_ner_classes, drop_i, drop_o)

# Adam on (tagger loss + ELMo regulariser) with global-norm gradient clipping.
optimizer = tf.train.AdamOptimizer(1e-3)
grads, _vars = zip(*optimizer.compute_gradients(loss + elmo_l2_reg))
grads, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)
global_step = tf.Variable(0, name="global_step", trainable=False)
train_op = optimizer.apply_gradients(
    zip(grads, _vars),
    global_step=global_step
)
session.run(tf.global_variables_initializer())

In [None]:
# Ten passes over the training set in shuffled batches of 128; dropout rate
# 0.5 is fed for both the input and output side of the tagger.
for epoch in range(10):
    for em, _, ner, sl in get_batch(train_data, 128):
        train_feed = {x: em, y: ner, seq_len: sl, drop_i: 0.5, drop_o: 0.5}
        _, s, l, a = session.run([train_op, global_step, loss, acc], feed_dict=train_feed)
        print('Step {}: loss {}, acc {}'.format(s, l, a))

In [None]:
# Evaluate on the test set in file order with dropout disabled.
total_loss = 0.0
total_acc = 0.0
sum_sl = 0      # number of real (unpadded) tokens seen
num_seqs = 0    # number of sequences seen
gen = get_batch(test_data, 128, False)
for em, _, ner, sl in gen:
    l, a = session.run([loss, acc], feed_dict={
        x: em, y: ner, seq_len: sl, drop_i: 0.0, drop_o: 0.0
    })
    n_tokens = sum(sl)
    # `acc` is a per-token average, so weight it by the batch's token count.
    total_acc += a * n_tokens
    # `loss` is the mean negative log-likelihood per sequence, so weight it by
    # the number of sequences in the batch rather than by tokens.
    total_loss += l * len(sl)
    sum_sl += n_tokens
    num_seqs += len(sl)
    print('Loss {}, acc {}'.format(l, a))
if num_seqs == 0 or sum_sl == 0:
    # Nothing to average over; avoid a ZeroDivisionError.
    print('No evaluation data')
else:
    print('Total loss {}, total acc {}'.format(total_loss / num_seqs, total_acc / sum_sl))

In [None]:
# Checkpoint every global variable. The step suffix is read from the
# `global_step` variable itself rather than from `s`, a Python value left over
# from the last training iteration (undefined if no batch ran, and possibly
# the pre-increment value since it was fetched together with train_op).
saver = tf.train.Saver(tf.global_variables())
saver.save(session, './ner_tagger_2.cpkt', global_step=global_step)