# import libs

In [1]:
from __future__ import division
import tensorflow as tf
import numpy as np
import re
import string
import time

# Define Some Functions

## Load Data & Generate Data Functions

In [2]:
# load the wordvectors and the words
def load_word_vec(vec_path='data/ner/wordVectors.txt', vocab_path='data/ner/vocab.txt'):
    """Load the word-vector table and the vocabulary list.

    Args:
        vec_path: path of a whitespace-separated numeric text file, read
            with ``np.loadtxt``. Defaults to the notebook's original path.
        vocab_path: path of a text file with one word per line. Defaults to
            the notebook's original path.

    Returns:
        A tuple ``(words_dict, word2vec)``: ``words_dict`` maps the
        zero-based line number of each vocabulary entry to the stripped
        word, and ``word2vec`` is the array produced by ``np.loadtxt``.

    Note:
        Presumably row i of the vector file belongs to line i of the
        vocabulary file; this is not checked here -- TODO confirm.
    """
    word2vec = np.loadtxt(vec_path)
    with open(vocab_path) as fd:
        words = [line.strip() for line in fd]
    words_dict = dict(enumerate(words))
    return words_dict, word2vec

In [3]:
# the style of words_dict is : {index :word}
# after inverse, get {word : index}
def invert_dict(dictionary):
    """Return a new dict mapping each value of ``dictionary`` to its key.

    If several keys share a value, the key met first while iterating over
    ``dictionary`` is kept and the later ones are ignored.
    """
    inverted = {}
    for original_key in dictionary:
        original_value = dictionary[original_key]
        if original_value not in inverted:
            inverted[original_value] = original_key
    return inverted

In [4]:
## read the documents, 
## generate the dataset List[word, class]
def generate_dataset(filename):
    """Read a tab-separated token file into a flat list of entries.

    Each line of ``filename`` is handled as follows:
      * a blank (whitespace-only) line becomes the marker entry ``['<s>']``;
      * a non-blank line matching ``-DOCSTART-.+`` is dropped;
      * any other line is stripped and split once on the first tab, which
        gives ``[word, tag]`` when the line contains a tab.

    Returns:
        The list of entries in file order.
    """
    entries = []
    with open(filename) as fd:
        for raw_line in fd:
            stripped = raw_line.strip()
            if not stripped:
                # Blank line: emit a fresh marker list.
                entries.append(['<s>'])
                continue
            if re.match(r"-DOCSTART-.+", raw_line):
                # Document header line: carries no token, skip it.
                continue
            entries.append(stripped.split('\t', 1))
    return entries

In [5]:
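## generate word context windows (default window size = 3) from docs
## note: the string '<s>' is a placeholder for "no word" at a sentence boundary
## example : [['<s>'], ['EU', 'ORG'], ['rejects', 'O'], ['<s>']]
##        -> words [['<s>', 'EU', 'rejects'], ['EU', 'rejects', '<s>']], tags ['ORG', 'O']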
def docs_to_windows(docs, word_dict, tag_dict, window_size = 3):
    """Build a fixed-size word window and a tag for every token entry.

    Args:
        docs: list of entries as produced by ``generate_dataset``: either
            the marker ``['<s>']`` or ``[word, tag]``.
        word_dict: unused; kept so existing callers keep working.
        tag_dict: unused; kept so existing callers keep working.
        window_size: total number of words per window (the centre word plus
            the same number of neighbours on each side). Must be a positive
            odd integer.

    Returns:
        ``(words, tags)`` where ``words[k]`` is the list of ``window_size``
        words centred on the k-th token entry and ``tags[k]`` is that
        token's tag. Marker entries produce no window of their own but do
        appear as ``'<s>'`` inside neighbouring windows.

    Raises:
        ValueError: if ``window_size`` is not a positive odd integer.

    Positions that fall before the first or after the last entry of
    ``docs`` are filled with ``'<s>'``. Previously the left neighbour of
    the first entry wrapped around to the last entry, and a token in the
    last position raised IndexError. Windows are taken over the flat entry
    list, so for sizes above 3 they can reach past a ``'<s>'`` marker.
    """
    if window_size < 1 or window_size % 2 == 0:
        raise ValueError('window_size must be a positive odd integer, got %r' % (window_size,))

    boundary = '<s>'
    half = window_size // 2
    total = len(docs)

    words = []
    tags = []
    for index, entry in enumerate(docs):
        if entry == [boundary]:
            continue
        window = [docs[pos][0] if 0 <= pos < total else boundary
                  for pos in range(index - half, index + half + 1)]
        words.append(window)
        tags.append(entry[1])
    return words, tags

In [6]:
## lookup the inv_words_dict, 
## change the word to index 
def word_to_index(words, inv_words_dict):
    """Convert windows of words into windows of vocabulary indices.

    Args:
        words: iterable of windows, each an iterable of word strings.
        inv_words_dict: mapping ``{word: index}``. It must contain the key
            ``'UUUNKKK'`` whenever some word is missing from it.

    Returns:
        A list with one list of indices per window. Each word is
        lower-cased before lookup; words absent from ``inv_words_dict`` get
        the index of the unknown-word token ``'UUUNKKK'``.

    Raises:
        KeyError: if a word is missing and ``'UUUNKKK'`` is missing too.
    """
    indices = []
    for item in words:
        item_indices = []
        for word in item:
            key = word.lower()
            # `in` replaces dict.has_key, which only exists in Python 2.
            if key not in inv_words_dict:
                # Out-of-vocabulary word: fall back to the unknown token.
                key = 'UUUNKKK'
            item_indices.append(inv_words_dict[key])
        indices.append(item_indices)
    return indices

In [7]:
## lookup the inv_tag_dict
## change the tag to index
def tag_to_index(tags, inv_tag_dict):
    """Translate each tag name in ``tags`` to its index via ``inv_tag_dict``.

    Returns a list in the same order as ``tags``; a tag that is not a key
    of ``inv_tag_dict`` raises KeyError.
    """
    tag_ids = []
    for tag_name in tags:
        tag_ids.append(inv_tag_dict[tag_name])
    return tag_ids

In [8]:
## shuffle the data 
## shuffle the words and the tags at the same time
def shuffle_data(matrix_A, matrix_B):
    """Reorder two arrays with one shared random permutation.

    A permutation of ``range(len(matrix_A))`` is drawn from NumPy's global
    random state and used to index both inputs along their first axis, so
    rows that were paired before the call stay paired afterwards.

    Returns:
        The tuple ``(shuffled_A, shuffled_B)``.
    """
    order = np.random.permutation(len(matrix_A))
    shuffled_A = matrix_A[order]
    shuffled_B = matrix_B[order]
    return shuffled_A, shuffled_B

## Build Graph Functions

In [9]:
## add the placeholders
def add_placeholders(window_size=3, num_classes=5):
    """Create the input and label placeholders of the graph.

    Args:
        window_size: number of word indices per example (second dimension
            of the input placeholder). Defaults to 3.
        num_classes: width of a label row (second dimension of the label
            placeholder). Defaults to 5.

    Returns:
        ``(input_placeholder, label_placeholder)``: an int32 placeholder of
        shape ``[None, window_size]`` and a float32 placeholder of shape
        ``[None, num_classes]``. The first dimension is left open so any
        batch size can be fed.
    """
    #input
    input_placeholder = tf.placeholder(dtype=tf.int32, shape=[None, window_size])
    #labels
    label_placeholder = tf.placeholder(dtype=tf.float32, shape=[None, num_classes])
    return input_placeholder, label_placeholder

In [10]:
## create the feed dict
def create_feed_dict(input_placeholder, input_batch, label_placeholder, label_batch):
    """Pair each placeholder with the batch that should be fed into it.

    Returns a new dict with two entries: ``input_placeholder`` mapped to
    ``input_batch`` and ``label_placeholder`` mapped to ``label_batch``.
    """
    feed = {}
    feed[input_placeholder] = input_batch
    feed[label_placeholder] = label_batch
    return feed

In [11]:
def evaluation(y_pred, labels):
    """Build an op that counts the correctly classified rows of a batch.

    Args:
        y_pred: tensor of per-class scores, one row per example.
        labels: tensor of the same layout holding the reference labels.

    Returns:
        An int32 scalar tensor: the number of rows whose arg-max along
        axis 1 is the same in ``y_pred`` and ``labels``.
    """
    # The axis is passed positionally: the keyword was named `dimension` in
    # early TensorFlow releases and `axis` later, and only the positional
    # form is accepted by both.
    label_right = tf.argmax(labels, 1)
    label_pred = tf.argmax(y_pred, 1)
    matches = tf.equal(label_right, label_pred)
    correct_pred_num = tf.reduce_sum(tf.cast(matches, tf.int32))
    return correct_pred_num

In [30]:
def do_eval(sess, eval_correct, words_indices,
            batch_size, input_placeholder, label_placeholder,tags_matrix, loss_op=None):
    """Evaluate the model on a data set and print mean loss and accuracy.

    The data is processed in consecutive full batches of ``batch_size``
    rows; a trailing partial batch is ignored.

    Args:
        sess: session used to run the ops.
        eval_correct: op returning the number of correct predictions in a batch.
        words_indices: 2-D array of input rows, sliced along its first axis.
        batch_size: number of rows per evaluation batch.
        input_placeholder: placeholder that receives the input batch.
        label_placeholder: placeholder that receives the label batch.
        tags_matrix: 2-D array of label rows aligned with ``words_indices``.
        loss_op: op returning the batch loss. When omitted, the module-level
            name ``loss`` is used, as the original notebook did.

    Raises:
        ValueError: if ``tags_matrix`` holds fewer than ``batch_size`` rows,
            so that not even one full batch can be evaluated.
    """
    if loss_op is None:
        # Backward compatibility: fall back to the global defined by the
        # graph-building cell.
        loss_op = loss

    # Size the loop from the data actually passed in, not from the
    # unrelated global `tags_test`.
    steps_per_epoch = len(tags_matrix) // batch_size
    if steps_per_epoch == 0:
        raise ValueError('need at least batch_size=%d examples to evaluate, got %d'
                         % (batch_size, len(tags_matrix)))
    num_examples = steps_per_epoch * batch_size

    true_count = 0
    valid_loss = []
    for step in range(steps_per_epoch):
        start = step * batch_size
        stop = start + batch_size
        input_batch = words_indices[start:stop, :]
        label_batch = tags_matrix[start:stop, :]
        feed_dict = create_feed_dict(input_placeholder, input_batch, label_placeholder, label_batch)
        count_step, loss_step = sess.run([eval_correct, loss_op], feed_dict)
        true_count += count_step
        valid_loss.append(loss_step)
    mean_loss = np.mean(valid_loss)
    accuracy = true_count / num_examples
    print('Validation Stage: Loss : %f, Validation Accuracy : %f' %(mean_loss, accuracy))

### start 

In [13]:
# Wall-clock timestamp (seconds since the epoch) taken before the data is
# loaded; presumably used later to report the total running time.
time_start = time.time()

### define some parameters

In [14]:
# Hyper-parameters are registered on TensorFlow's flags module and read back
# through the shared FLAGS object (e.g. FLAGS.batch_size).
flags = tf.app.flags
flags.DEFINE_integer('batch_size', 64, 'the size of the batch')
flags.DEFINE_integer('max_epoch', 1, 'the max times to rerun the training progress')
FLAGS = flags.FLAGS

# Load data

In [15]:
# words_dict maps index -> word; word2vec is the numeric table read by
# load_word_vec.
words_dict, word2vec = load_word_vec()
# Cast the loaded table to float32.
word2vec = word2vec.astype(np.float32)

In [16]:
# Reverse lookup {word: index}, needed to turn tokens into vocabulary indices.
inv_words_dict = invert_dict(words_dict)

In [17]:
# The label set; a tag's position in this list is its class index.
tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
# index -> tag name
tag_dict = {tag_id: tag_name for tag_id, tag_name in enumerate(tagnames)}
# tag name -> index
inv_tag_dict = invert_dict(tag_dict)
print(inv_tag_dict)

{'LOC': 1, 'MISC': 2, 'PER': 4, 'O': 0, 'ORG': 3}


### generate the training_set

In [18]:
# Training split: read the token file, build word windows, map words and
# tags to integer ids.
docs = generate_dataset('data/ner/train')
words, tags = docs_to_windows(docs, words_dict, tag_dict)
words_indices = np.array(word_to_index(words, inv_words_dict))
tags_indices = np.array(tag_to_index(tags, inv_tag_dict))
# One-hot encode the tag ids: row k gets a 1 in column tags_indices[k].
tags_matrix = np.zeros([len(tags_indices), 5])
tags_matrix[np.arange(len(tags_indices)), tags_indices.astype(int)] = 1

### generate the validation_set

In [19]:
# Validation split: same pipeline as the training split, applied to the dev file.
docs_test = generate_dataset('data/ner/dev')
words_test, tags_test = docs_to_windows(docs_test, words_dict, tag_dict)
words_indices_test = np.array(word_to_index(words_test, inv_words_dict))
tags_indices_test = np.array(tag_to_index(tags_test, inv_tag_dict))
# One-hot encode the tag ids: row k gets a 1 in column tags_indices_test[k].
tags_matrix_test = np.zeros([len(tags_indices_test), 5])
tags_matrix_test[np.arange(len(tags_indices_test)), tags_indices_test.astype(int)] = 1

In [20]:
# Number of full training batches per epoch; the floor division means a
# trailing partial batch is never used.
max_step = len(tags_indices) // FLAGS.batch_size

In [21]:
### a simple majority-class baseline
### if we always guess the tag 'O', what fraction of the tokens do we get right?

In [22]:
# Share of training tokens tagged 'O'.
count = sum(1 for tag in tags if tag == 'O')
print(count/len(tags))

# Share of validation tokens tagged 'O'.
count = sum(1 for tag in tags_test if tag == 'O')
print(count/len(tags_test))

0.832811939829
0.832502628402


# build the graph

In [31]:
# Build the window classifier (embedding -> tanh hidden layer -> softmax) and
# train it for FLAGS.max_epoch epochs, validating after every epoch.
with tf.Graph().as_default(), tf.device('/cpu:0'):
    #add placeholders
    input_placeholder, label_placeholder = add_placeholders()
    # Dropout keep-probability. It defaults to 1.0 (dropout off), so any run
    # that does not feed it -- in particular validation in do_eval -- sees the
    # deterministic network. The training loop below feeds 0.9 explicitly.
    # Previously the constant 0.9 was baked into the graph and dropout stayed
    # active during validation as well.
    keep_prob = tf.placeholder_with_default(1.0, shape=[])

    #add the embed layer (the enclosing device scope already pins it to the CPU)
    # NOTE(review): only the row count of word2vec is used here; the loaded
    # vectors themselves do not initialise the embedding -- confirm whether
    # that is intended.
    embedding = tf.get_variable('Embedding', [len(word2vec), 50])
    embeds = tf.nn.embedding_lookup(embedding, input_placeholder)
    # Concatenate the 3 looked-up vectors of width 50 into one row of 150.
    embeds = tf.reshape(embeds, [-1, 150])

    #for the hidden unit
    # Uniform initialisation in +/- sqrt(6 / (fan_in + fan_out)).
    W = tf.Variable(tf.random_uniform([150, 100],
                                      minval=-np.sqrt(6.0/(150+100)), maxval = np.sqrt(6.0/(150+100))))
    b1 = tf.Variable(tf.zeros([100]))
    h = tf.nn.tanh(tf.matmul(embeds, W) + b1)
    U = tf.Variable(tf.random_uniform([100, 5],
                                      minval=-np.sqrt(6.0/(100+5)), maxval = np.sqrt(6.0/(100+5))))
    b2 = tf.Variable(tf.zeros([5]))
    y = tf.matmul(h, U) + b2
    y = tf.nn.dropout(y, keep_prob)
    pred = tf.nn.softmax(y)

    eval_correct = evaluation(pred, label_placeholder)

    # Cross-entropy on the clipped softmax output (the clip avoids log(0)),
    # plus L2 regularisation on both weight matrices.
    # Alternative kept for reference:
    #loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, label_placeholder)) + 0.5*0.001*tf.nn.l2_loss(W)+ 0.5*0.001*tf.nn.l2_loss(U)
    loss = tf.reduce_mean(-tf.reduce_sum(label_placeholder*tf.log(tf.clip_by_value(pred,1e-7,1.0)),
                                         reduction_indices=[1]))+ 0.5*0.001*tf.nn.l2_loss(W)+ 0.5*0.001*tf.nn.l2_loss(U)
    train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

    # NOTE(review): the session is deliberately left open, as in the original
    # cell, in case later cells use `sess`; call sess.close() when done.
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    for epoch in range(FLAGS.max_epoch):

        #shuffle the data
        words_indices, tags_matrix = shuffle_data(words_indices, tags_matrix)
        words_indices_test, tags_matrix_test = shuffle_data(words_indices_test, tags_matrix_test)
        correct_num = []
        print('epoch %d , Trainging' % (epoch))
        for step in range(max_step):
            input_batch = words_indices[step*FLAGS.batch_size: (step+1)*FLAGS.batch_size, :]
            label_batch = tags_matrix[step*FLAGS.batch_size: (step+1)*FLAGS.batch_size, :]

            feed_dict = create_feed_dict(input_placeholder, input_batch, label_placeholder, label_batch)
            # Enable dropout for training steps only.
            feed_dict[keep_prob] = 0.9
            _, loss_step, correct_step = sess.run([train_op, loss, eval_correct], feed_dict)
            correct_num.append(correct_step)
            if step % 100 == 0:
                print('step %d / %d: loss %f' %(step,  max_step,loss_step))
        print('epoch %d  Training Accuracy : %f \n \n' % (epoch, np.sum(correct_num)/ (max_step*FLAGS.batch_size)))
        print('epoch %d, Validating' % (epoch))
        do_eval(sess, eval_correct,  words_indices_test,
                FLAGS.batch_size, input_placeholder, label_placeholder, tags_matrix_test)

epoch 0 , trainging
step 0 / 3181: loss 1.641708
step 100 / 3181: loss 0.336718
step 200 / 3181: loss 0.636345
step 300 / 3181: loss 0.293760
step 400 / 3181: loss 0.513436
step 500 / 3181: loss 0.445194
step 600 / 3181: loss 0.313875
step 700 / 3181: loss 0.416534
step 800 / 3181: loss 0.343436
step 900 / 3181: loss 0.359057
step 1000 / 3181: loss 0.283767
step 1100 / 3181: loss 0.231637
step 1200 / 3181: loss 0.313768
step 1300 / 3181: loss 0.291815
step 1400 / 3181: loss 0.287816
step 1500 / 3181: loss 0.248984
step 1600 / 3181: loss 0.255572
step 1700 / 3181: loss 0.253633
step 1800 / 3181: loss 0.222316
step 1900 / 3181: loss 0.318004
step 2000 / 3181: loss 0.301382
step 2100 / 3181: loss 0.160079
step 2200 / 3181: loss 0.261188
step 2300 / 3181: loss 0.133320
step 2400 / 3181: loss 0.338615
step 2500 / 3181: loss 0.360968
step 2600 / 3181: loss 0.212219
step 2700 / 3181: loss 0.358713
step 2800 / 3181: loss 0.225175
step 2900 / 3181: loss 0.183463
step 3000 / 3181: loss 0.385651


running time:  1236.321323 seconds
