# import libs

In [1]:
from __future__ import division
import tensorflow as tf
import numpy as np
import re
import string

# Load data

## load vocabulary and wordvectors 

In [2]:
def load_word_vec():
    word2vec = np.loadtxt('data/ner/wordVectors.txt')
    with open('data/ner/vocab.txt') as fd:
       words = [line.strip() for line in fd]
    words_dict = dict(enumerate(words))
    return words_dict, word2vec

In [3]:
def invert_dict(dictionary):
    inv_dict = {}
    for key, value in dictionary.items():
        inv_dict.setdefault(value, key)
    return inv_dict

In [4]:
words_dict, word2vec = load_word_vec()
word2vec = word2vec.astype(np.float32)

In [5]:
inv_words_dict = invert_dict(words_dict)

In [6]:
print(word2vec.shape)

(100232, 50)


## Load and Generate the train_set satisfy the requirement

In [7]:
def generate_dataset(filename):
    #load the set
    docs = []
    cur_line = []
    
    with open(filename) as fd:
        for line in fd:
            #begin of th doc 
            if re.match(r"-DOCSTART-.+", line) or (len(line.strip())==0):
                if(len(line.strip())==0):
                    #[] denote the begining or end of the sentence 
                    cur_line = ['<s>']
                    docs.append(cur_line)
            else:
                cur_line = line.strip().split('\t', 1)
                docs.append(cur_line)
                #print(docs[0])
    return docs

In [8]:
#generate windows(default size = 3) from docs
def docs_to_windows(docs, word_dict, tag_dict, window_size = 3):
    #from words to indices
    #at the begin and the end add the paddings
    
    words = []
    tags = []
    for index in range(len(docs)):
        if docs[index] == ['<s>']:
            continue;
        else:
            item = list([docs[index-1][0], docs[index][0],docs[index+1][0]])
            words.append(item)
            tags.append(docs[index][1])
    return words, tags

In [9]:
def word_to_index(words, inv_words_dict):
    indices = []
    for item in words:
        item_indices = []
        for word in item:
            if inv_words_dict.has_key(word.lower()):
                item_indices.append(inv_words_dict[word.lower()])
            else:
                #for word not in the vacabulary ,use unknown word'UUUNKKK' denote
                word = 'UUUNKKK'
                item_indices.append(inv_words_dict[word])
        
        indices.append(item_indices)
    return indices

def tag_to_index(tags, inv_tag_dict):
    return [inv_tag_dict[tag] for tag in tags]

In [10]:
tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
tag_dict = dict(enumerate(tagnames))
inv_tag_dict = invert_dict(tag_dict)
print(inv_tag_dict)

{'LOC': 1, 'MISC': 2, 'PER': 4, 'O': 0, 'ORG': 3}


In [11]:
docs = generate_dataset('data/ner/train')
words, tags = docs_to_windows(docs, words_dict, tag_dict)
words_indices = np.array(word_to_index(words, inv_words_dict))
tags_indices = np.array(tag_to_index(tags, inv_tag_dict))

In [12]:
tags_matrix = np.array(np.zeros([len(tags_indices), 5]))
for i in range(len(tags_indices)):
    index = tags_indices[i]
    tags_matrix[i, index] = 1

In [13]:
help(tf.add_to_collection)

Help on function add_to_collection in module tensorflow.python.framework.ops:

add_to_collection(name, value)
    Wrapper for `Graph.add_to_collection()` using the default graph.
    
    See [`Graph.add_to_collection()`](../../api_docs/python/framework.md#Graph.add_to_collection)
    for more details.
    
    Args:
      name: The key for the collection. For example, the `GraphKeys` class
        contains many standard names for collections.
      value: The value to add to the collection.



In [14]:
docs_test = generate_dataset('data/ner/dev')
words_test, tags_test = docs_to_windows(docs_test, words_dict, tag_dict)
words_indices_test = np.array(word_to_index(words_test, inv_words_dict))
tags_indices_test = np.array(tag_to_index(tags_test, inv_tag_dict))
tags_matrix_test = np.array(np.zeros([len(tags_indices_test), 5]))
for i in range(len(tags_indices_test)):
    index = tags_indices_test[i]
    tags_matrix_test[i, index] = 1

In [15]:
print(tags_matrix_test.shape)

(51362, 5)


# shuffle the data

In [16]:
def shuffle_data(matrix_A, matrix_B):
    indices = np.random.permutation(len(matrix_A))
    return matrix_A[indices], matrix_B[indices]

In [17]:
A = words_indices[22:32]
B = tags_matrix[22:32]
print(A)
print(B)
print(shuffle_data(A,B))

[[   17   445  3510]
 [  445  3510     9]
 [ 3510     9  7037]
 [    9  7037     9]
 [ 7037     9 26237]
 [    9 26237   192]
 [26237   192  5288]
 [  192  5288   127]
 [ 5288   127  3179]
 [  127  3179  2544]]
[[ 0.  0.  1.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]]
(array([[  192,  5288,   127],
       [ 7037,     9, 26237],
       [   17,   445,  3510],
       [ 3510,     9,  7037],
       [  445,  3510,     9],
       [  127,  3179,  2544],
       [    9,  7037,     9],
       [ 5288,   127,  3179],
       [26237,   192,  5288],
       [    9, 26237,   192]]), array([[ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,

In [18]:
help(np.random.permutation)

Help on built-in function permutation:

permutation(...)
    permutation(x)
    
    Randomly permute a sequence, or return a permuted range.
    
    If `x` is a multi-dimensional array, it is only shuffled along its
    first index.
    
    Parameters
    ----------
    x : int or array_like
        If `x` is an integer, randomly permute ``np.arange(x)``.
        If `x` is an array, make a copy and shuffle the elements
        randomly.
    
    Returns
    -------
    out : ndarray
        Permuted sequence or array range.
    
    Examples
    --------
    >>> np.random.permutation(10)
    array([1, 7, 4, 3, 0, 9, 2, 5, 8, 6])
    
    >>> np.random.permutation([1, 4, 9, 12, 15])
    array([15,  1,  9,  4, 12])
    
    >>> arr = np.arange(9).reshape((3, 3))
    >>> np.random.permutation(arr)
    array([[6, 7, 8],
           [0, 1, 2],
           [3, 4, 5]])



In [19]:
count = 0
for i in range(len(tags)):
    if tags[i] == 'O':
        count += 1
print(count/len(tags))

count = 0
for i in range(len(tags_test)):
    if tags_test[i] == 'O':
        count += 1
print(count/len(tags_test))

print(len(tags_test))
print(len(tags))
print(tags_test[:10])

0.832811939829
0.832502628402
51362
203621
['O', 'O', 'ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [20]:
'''
print(docs[1][0])
print(words_indices[0:23])
for i in range(words_indices.shape[0]):
    if words_indices[i, 2] == 30:
        #print('1')
        words_indices[i, 2] = 31
for i in range(words_indices_test.shape[0]):
    if words_indices_test[i, 2] == 30:
        words_indices_test[i, 2] = 31
'''

"\nprint(docs[1][0])\nprint(words_indices[0:23])\nfor i in range(words_indices.shape[0]):\n    if words_indices[i, 2] == 30:\n        #print('1')\n        words_indices[i, 2] = 31\nfor i in range(words_indices_test.shape[0]):\n    if words_indices_test[i, 2] == 30:\n        words_indices_test[i, 2] = 31\n"

In [21]:
print(words_indices.shape[1])

3


# build the graph

In [22]:
batch_size = 64
max_step = len(tags_indices) // batch_size
max_epoch = 24

In [23]:
def add_placeholders():
    #input
    input_placeholder = tf.placeholder(dtype=tf.int32, shape=[None, 3])
    #labels
    label_placeholder = tf.placeholder(dtype=tf.float32, shape=[None, 5])
    return input_placeholder, label_placeholder

def create_feed_dict(input_placeholder, input_batch, label_placeholder, label_batch):
    feed_dict = {input_placeholder: input_batch,
                label_placeholder:label_batch}
    return feed_dict

In [24]:
#generate the batch 
def indices_to_vec(words_indices, word2vec):
    vec = np.ndarray([batch_size, 150])
    for i in range(batch_size):
        #print(array(words_indices[i]))
        result = word2vec[np.array(words_indices[i]), :]
        result = np.reshape(result,[150])
        vec[i, :] = result
        #print(result.shape)
    return vec

def add_embed_layer(word2vec, input_placeholder):
    return tf.nn.embedding_lookup(word2vec, input_placeholder)

In [25]:
def evaluation(y_pred, labels):
    label_right = tf.argmax(labels, dimension=1)
    label_pred = tf.argmax(y_pred, dimension=1)
    correct_pred_num =  tf.reduce_sum(tf.cast(tf.equal(label_right, label_pred), tf.int32))
    return correct_pred_num

def do_eval(sess, eval_correct, words_indices,
            batch_size, input_placeholder, label_placeholder,tags_matrix):
    true_count = 0
    steps_per_epoch = len(tags_test) // batch_size
    
    num_examples = steps_per_epoch * batch_size
    print(num_examples)
    test_loss = []
    for step in range(steps_per_epoch):
        input_batch = words_indices[step*batch_size: (step+1)*batch_size, :]
        label_batch = tags_matrix[step*batch_size: (step+1)*batch_size, :]
        feed_dict = create_feed_dict(input_placeholder, input_batch, label_placeholder, label_batch)
        tmp_count,tmp_loss  = sess.run([eval_correct, loss], feed_dict)
        true_count += tmp_count
        test_loss.append(tmp_loss)
    print(true_count)
    mean_loss = np.mean(test_loss)
    precision = true_count / num_examples
    print('Validation Loss: %f' %(mean_loss))
    print('Validation Precision: %f' % (precision))

In [34]:
with tf.Graph().as_default():
    #add placeholders
    input_placeholder, label_placeholder = add_placeholders()
    #add the embed layer
    embedding = tf.get_variable('Embedding', [len(word2vec), 50]) 
    embeds = tf.nn.embedding_lookup(embedding, input_placeholder)
    embeds = tf.reshape(embeds, [-1, 150])
    #for the hidden unit
    W = tf.Variable(tf.random_uniform([150, 100], 
                                      minval=-np.sqrt(6.0/(150+100)), maxval = np.sqrt(6.0/(150+100))))
    b1 = tf.Variable(tf.zeros([100]))
    h = tf.nn.tanh(tf.matmul(embeds, W) + b1)
    U = tf.Variable(tf.random_uniform([100, 5], 
                                      minval=-np.sqrt(6.0/(100+5)), maxval = np.sqrt(6.0/(100+5))))
    b2 = tf.Variable(tf.zeros([5]))
    y = tf.matmul(h, U) + b2
    y = tf.nn.dropout(y, 0.9)
    pred = tf.nn.softmax(y)
    #
    eval_correct = evaluation(pred, label_placeholder)
    ##regulurization
    #L2 = tf.reduce_sum(tf.square(W)) + tf.reduce_sum(tf.square(U))
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, label_placeholder)) + 0.5*0.0001*tf.nn.l2_loss(W)+ 0.5*0.0001*tf.nn.l2_loss(U)
    train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    for epoch in range(max_epoch):
        
        #shuffle the data
        words_indices, tags_matrix = shuffle_data(words_indices, tags_matrix)
        words_indices_test, tags_matrix_test = shuffle_data(words_indices_test, tags_matrix_test)
        correct_num = []
        
        for step in range(max_step):
            input_batch = words_indices[step*batch_size: (step+1)*batch_size, :]
            label_batch = tags_matrix[step*batch_size: (step+1)*batch_size, :]
            
            #input_batch = indices_to_vec(input_indices_batch, word2vec)
            ##shuffle the data
            feed_dict = create_feed_dict(input_placeholder, input_batch, label_placeholder, label_batch)
            _, loss_iter, correct_tmp = sess.run([train_op, loss, eval_correct], feed_dict)
            correct_num.append(correct_tmp)
            if step % 200 == 0:
                print(loss_iter)
        print(np.sum(correct_num)/ (max_step*batch_size))
        do_eval(sess, eval_correct,  words_indices_test,
                batch_size, input_placeholder, label_placeholder, tags_matrix_test)

1.6122
0.436686
0.324668
0.458925
0.403082
0.200905
0.16913
0.22696
0.073411
0.297769
0.307932
0.215146
0.181093
0.163965
0.201217
0.151164
0.9259765011
51328
48428
Validation Loss: 0.224921
Validation Precision: 0.943501
0.184523
0.0617676
0.185237
0.0957405
0.104207
0.114061
0.221457
0.100315
0.100898
0.126498
0.113688
0.134258
0.134043
0.0410077
0.136602
0.0914294
0.963214201509
51328
48439
Validation Loss: 0.223082
Validation Precision: 0.943715
0.182362
0.0411393
0.0780863
0.226395
0.0837828
0.230036
0.134901
0.136362
0.0657355
0.16822
0.382148
0.137836
0.180086
0.220435
0.188398
0.114687
0.968587904747
51328
48397
Validation Loss: 0.229408
Validation Precision: 0.942897
0.123968
0.0632047
0.29622
0.0472185
0.312508
0.0493164
0.129145
0.0993374
0.0911122
0.214584
0.123246
0.0416243
0.130712
0.205632
0.118174
0.0498126
0.970960389815
51328
48467
Validation Loss: 0.230543
Validation Precision: 0.944260
0.167575
0.104121
0.184485
0.119779
0.135298
0.193462
0.118292
0.0761743
0.109811

KeyboardInterrupt: 

In [29]:
Validation Precision: 0.832567
add L2Validation Precision: 0.832450
add xavier initialization Validation Precision: 0.832450

SyntaxError: invalid syntax (<ipython-input-29-3829de0ae4d6>, line 1)

In [None]:
help(tf.nn.softmax)

In [None]:
help(tf.nn.dropout)

In [35]:
48777 

0.9502026184538653

In [36]:
48896/51328

0.9526184538653366