In [1]:
# !wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train
# !wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa

In [2]:
def parse(file):
    with open(file) as fopen:
        texts = fopen.read().split('\n')
    left, right = [], []
    for text in texts:
        if '-DOCSTART-' in text or not len(text):
            continue
        splitted = text.split()
        left.append(splitted[0])
        right.append(splitted[-1])
    return left, right

In [3]:
left_train, right_train = parse('eng.train')
left_test, right_test = parse('eng.testa')

In [4]:
import re
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [5]:
def process_string(string):
    string = re.sub('[^A-Za-z0-9\-\/ ]+', ' ', string).split()
    return ' '.join([to_title(y.strip()) for y in string])

def to_title(string):
    if string.isupper():
        string = string.title()
    return string

In [6]:
np.unique(right_train,return_counts=True)

(array(['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER',
        'O'], dtype='<U6'),
 array([    11,     37,     24,   8286,   4556,  10001,  11128, 169578]))

In [7]:
word2idx = {'PAD': 0,'NUM':1,'UNK':2}
tag2idx = {'PAD': 0}
char2idx = {'PAD': 0}
word_idx = 3
tag_idx = 1
char_idx = 1

def parse_XY(texts, labels):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    X, Y = [], []
    for no, text in enumerate(texts):
        text = text.lower()
        tag = labels[no]
        for c in text:
            if c not in char2idx:
                char2idx[c] = char_idx
                char_idx += 1
        if tag not in tag2idx:
            tag2idx[tag] = tag_idx
            tag_idx += 1
        Y.append(tag2idx[tag])
        if text not in word2idx:
            word2idx[text] = word_idx
            word_idx += 1
        X.append(word2idx[text])
    return X, np.array(Y)

In [8]:
train_X, train_Y = parse_XY(left_train, right_train)
test_X, test_Y = parse_XY(left_test, right_test)

In [9]:
idx2word = {idx: tag for tag, idx in word2idx.items()}
idx2tag = {i: w for w, i in tag2idx.items()}

In [10]:
seq_len = 50
def iter_seq(x):
    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

def generate_char_seq(batch):
    x = [[len(idx2word[i]) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((batch.shape[0],batch.shape[1],maxlen),dtype=np.int32)
    for i in range(batch.shape[0]):
        for k in range(batch.shape[1]):
            for no, c in enumerate(idx2word[batch[i,k]]):
                temp[i,k,-1-no] = char2idx[c]
    return temp

In [11]:
X_seq, Y_seq = to_train_seq(train_X, train_Y)
X_char_seq = generate_char_seq(X_seq)
X_seq.shape

(203571, 50)

In [12]:
X_seq_test, Y_seq_test = to_train_seq(test_X, test_Y)
X_char_seq_test = generate_char_seq(X_seq_test)
X_seq_test.shape

(51312, 50)

In [13]:
train_X, train_Y, train_char = X_seq, Y_seq, X_char_seq
test_X, test_Y, test_char = X_seq_test, Y_seq_test, X_char_seq_test

In [16]:
class Model:
    def __init__(
        self,
        dim_word,
        dim_char,
        dropout,
        learning_rate,
        hidden_size_char,
        hidden_size_word,
        num_layers,
    ):
        def cells(size, reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                output_keep_prob = dropout,
            )
        
        self.word_ids = tf.placeholder(tf.int32, shape = [None, None])
        self.char_ids = tf.placeholder(tf.int32, shape = [None, None, None])
        self.labels = tf.placeholder(tf.int32, shape = [None, None])
        self.maxlen = tf.shape(self.word_ids)[1]
        self.lengths = tf.count_nonzero(self.word_ids, 1, dtype = tf.int32)

        self.word_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(word2idx), dim_word], stddev = 1.0 / np.sqrt(dim_word)
            )
        )
        self.char_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(char2idx), dim_char], stddev = 1.0 / np.sqrt(dim_char)
            )
        )

        word_embedded = tf.nn.embedding_lookup(
            self.word_embeddings, self.word_ids
        )
        char_embedded = tf.nn.embedding_lookup(
            self.char_embeddings, self.char_ids
        )
        s = tf.shape(char_embedded)
        char_embedded = tf.reshape(
            char_embedded, shape = [s[0] * s[1], s[-2], dim_char]
        )
        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_char),
                cell_bw = cells(hidden_size_char),
                inputs = char_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_char_%d' % (n),
            )
            char_embedded = tf.concat((out_fw, out_bw), 2)
        output = tf.reshape(
            char_embedded[:, -1], shape = [s[0], s[1], 2 * hidden_size_char]
        )
        word_embedded = tf.concat([word_embedded, output], axis = -1)

        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_word),
                cell_bw = cells(hidden_size_word),
                inputs = word_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_word_%d' % (n),
            )
            word_embedded = tf.concat((out_fw, out_bw), 2)
        logits = tf.layers.dense(word_embedded, len(idx2tag))
        y_t = self.labels
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_t, self.lengths
        )
        self.cost = tf.reduce_mean(-log_likelihood)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        self.tags_seq, tags_score = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        self.tags_seq = tf.identity(self.tags_seq, name = 'logits')

        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(y_t, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

dim_word = 64
dim_char = 128
dropout = 0.8
learning_rate = 1e-3
hidden_size_char = 128
hidden_size_word = 128
num_layers = 2
batch_size = 128

model = Model(dim_word,dim_char,dropout,learning_rate,
              hidden_size_char,hidden_size_word,num_layers)
sess.run(tf.global_variables_initializer())



Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [18]:
import time

for e in range(3):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_char = train_char[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y
            },
        )
        assert not np.isnan(cost)
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (e, train_loss, train_acc, test_loss, test_acc)
    )

train minibatch loop: 100%|██████████| 1591/1591 [15:04<00:00,  1.90it/s, accuracy=0.989, cost=5.3] 
test minibatch loop: 100%|██████████| 401/401 [01:18<00:00,  5.20it/s, accuracy=0.932, cost=7]   
train minibatch loop:   0%|          | 0/1591 [00:00<?, ?it/s]

time taken: 982.7449724674225
epoch: 0, training loss: 18.019671, training acc: 0.878658, valid loss: 11.073292, valid acc: 0.921732



train minibatch loop: 100%|██████████| 1591/1591 [15:02<00:00,  1.91it/s, accuracy=0.993, cost=1.47] 
test minibatch loop: 100%|██████████| 401/401 [01:17<00:00,  5.13it/s, accuracy=0.952, cost=4.36] 
train minibatch loop:   0%|          | 0/1591 [00:00<?, ?it/s]

time taken: 980.822240114212
epoch: 1, training loss: 6.994036, training acc: 0.947728, valid loss: 6.815563, valid acc: 0.952570



train minibatch loop: 100%|██████████| 1591/1591 [15:03<00:00,  1.89it/s, accuracy=0.997, cost=0.862]
test minibatch loop: 100%|██████████| 401/401 [01:17<00:00,  5.14it/s, accuracy=0.936, cost=4.52] 

time taken: 981.0697901248932
epoch: 2, training loss: 3.482352, training acc: 0.975290, valid loss: 6.376965, valid acc: 0.955703






In [19]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p])
        out.append(out_i)
    return out

In [20]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    predicted = pred2label(sess.run(model.tags_seq,
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
            },
    ))
    real = pred2label(batch_y)
    predict_Y.extend(predicted)
    real_Y.extend(real)

validation minibatch loop: 100%|██████████| 401/401 [01:19<00:00,  5.10it/s]


In [21]:
from sklearn.metrics import classification_report
print(classification_report(np.array(real_Y).ravel(), np.array(predict_Y).ravel()))

             precision    recall  f1-score   support

     B-MISC       0.00      0.00      0.00       200
      I-LOC       0.92      0.86      0.89    104662
     I-MISC       0.69      0.79      0.74     63129
      I-ORG       0.67      0.73      0.70    104387
      I-PER       0.96      0.84      0.90    157385
          O       0.98      0.98      0.98   2135837

avg / total       0.96      0.96      0.96   2565600



  'precision', 'predicted', average, warn_for)
