In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import sys
from datetime import datetime
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from tensorflow.contrib.rnn import static_rnn as get_rnn_output
from tensorflow.contrib.rnn import BasicRNNCell, GRUCell

In [2]:
def get_data(split_sequences = False):
    word2idx = {}
    tag2idx = {}
    word_idx = 1
    tag_idx = 1
    Xtrain = []
    Ytrain = []
    currentX = []
    currentY = []
    for line in open('ner.txt'):
        line = line.rstrip()
        if line:
            r = line.split()
            word, tag = r
            word = word.lower()
            if word not in word2idx:
                word2idx[word] = word_idx
                word_idx += 1 
            currentX.append(word2idx[word])
            
            if tag not in word2idx:
                tag2idx[tag] = tag_idx
                tag_idx += 1 
            currentY.append(tag2idx[tag])
        
        elif split_sequences:
            Xtrain.append(currentX)
            Ytrain.append(currentY)
            currentX = []
            currentY = []
    
    if not split_sequences:
        Xtrain = currentX
        Ytrain = currentY
        
    print("number of samples:", len(Xtrain))
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ntest = int(0.3*len(Xtrain))
    Xtest = Xtrain[:Ntest]
    Ytest = Ytrain[:Ntest]
    Xtrain = Xtrain[Ntest:]
    Ytrain = Ytrain[Ntest:]
    print("number of classes:", len(tag2idx))
    return Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx

In [3]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [4]:
Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx = get_data(split_sequences = True)
V = len(word2idx) + 2 # +1 for unknown, +1 for padding 0
K = len(set(flatten(Ytrain)) | set(flatten(Ytest))) + 1 # +1 for padding 0

number of samples: 2394
number of classes: 21


In [None]:
epochs = 4
learning_rate = 1e-2
mu = 0.99
batch_size = 32
hidden_layer_size = 10
embedding_dim = 10
sequence_length = max(len(x) for x in Xtrain + Xtest)

In [None]:
Xtrain = tf.keras.preprocessing.sequence.pad_sequences(Xtrain, maxlen = sequence_length)
Ytrain = tf.keras.preprocessing.sequence.pad_sequences(Ytrain, maxlen = sequence_length)
Xtest = tf.keras.preprocessing.sequence.pad_sequences(Xtest, maxlen = sequence_length)
Ytest = tf.keras.preprocessing.sequence.pad_sequences(Ytest, maxlen = sequence_length)
print("Xtrain.shape:", Xtrain.shape)
print("Ytrain.shape:", Ytrain.shape)

Xtrain.shape: (1676, 39)
Ytrain.shape: (1676, 39)


In [None]:
inputs = tf.placeholder(tf.int32, shape = (None, sequence_length))
targets = tf.placeholder(tf.int32, shape = (None, sequence_length))
num_samples = tf.shape(inputs)[0]

In [None]:
We = np.random.randn(V, embedding_dim).astype(np.float32)
Wo = np.random.randn(hidden_layer_size, K)/np.sqrt(hidden_layer_size + K)
Wo = Wo.astype(np.float32)
bo = np.zeros(K).astype(np.float32)
tfWe = tf.Variable(We)
tfWo = tf.Variable(Wo)
tfbo = tf.Variable(bo)

In [None]:
rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu)
x = tf.nn.embedding_lookup(tfWe, inputs)
x = tf.unstack(x, sequence_length, 1)
outputs, states = get_rnn_output(rnn_unit, x, dtype=tf.float32)
outputs = tf.transpose(outputs, (1, 0, 2))
outputs = tf.reshape(outputs, (sequence_length*num_samples, hidden_layer_size)) # NT x M
logits = tf.matmul(outputs, tfWo) + tfbo # NT x K
predictions = tf.argmax(logits, 1)
predict_op = tf.reshape(predictions, (num_samples, sequence_length)) # N x T
labels_flat = tf.reshape(targets, [-1]) # NT

In [None]:
cost_op = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits = logits,
        labels = labels_flat
    )
)
train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost_op)

In [None]:
sess = tf.InteractiveSession()
init = tf.global_variables_initializer()
sess.run(init)

In [None]:
costs = []
n_batches = len(Ytrain) // batch_size
for i in range(epochs):
    n_total = 0 
    n_correct = 0
    
    t0 = datetime.now()
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    cost = 0 
    
    for j in range(n_batches):
        x = Xtrain[j*batch_size:(j+1)*batch_size]
        y = Ytrain[j*batch_size:(j+1)*batch_size]
        
        c, p, _ = sess.run(
            (cost_op, predict_op, train_op),
            feed_dict = {inputs: x, targets: y}
        ) # p: N x T   y: N x T
        cost += c
        
        for yi, pi in zip(y, p):
            yii = yi[yi > 0]
            pii = pi[yi > 0]
            n_correct += np.sum(yii == pii)
            n_total += len(yii)
        
        if j % 10 == 0:
            sys.stdout.write(
                "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                (j, n_batches, float(n_correct)/n_total, cost)
            )
            sys.stdout.flush()
        
    p = sess.run(predict_op, feed_dict = {inputs: Xtest, targets: Ytest})
    n_test_correct = 0 
    n_test_total = 0 
    for yi, pi in zip(Ytest, p):
        yii = yi[yi > 0]
        pii = pi[yi > 0]
        n_test_correct += np.sum(yii == pii)
        n_test_total += len(yii)
    test_acc = float(n_test_correct) / n_test_total
    
    print(
        "i:", i, "cost:", "%.4f" % cost,
        "train acc:", "%.4f" % (float(n_correct)/n_total),
        "test acc:", "%.4f" % test_acc,
        "time for epoch:", (datetime.now() - t0)
    )
    costs.append(cost)

plt.plot(costs)
plt.show()

i: 0 cost: 346.3824 train acc: 0.0000 test acc: 0.0000 time for epoch: 0:01:54.228401
i: 1 cost: 274.0212 train acc: 0.0000 test acc: 0.0000 time for epoch: 0:01:46.477832
i: 2 cost: 263.3712 train acc: 0.0000 test acc: 0.0000 time for epoch: 0:01:50.897552
j/N: 50/52 correct rate so far: 0.000189, cost so far: 241.236607