# Bi-LSTM for NER 

使用预训练好的词向量。

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from gensim.models import KeyedVectors
import pickle


# Load the pre-trained embedding matrix together with the two lookup tables
# stored after it (presumably id->token and token->id; verify against the
# script that wrote the file). The three objects were pickled back to back,
# so they are read in the same order.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('../data/embedding_data.pkl', 'rb') as inp:
    W_embedding, sr_id2word, sr_word2id = (pickle.load(inp) for _ in range(3))

In [2]:
import tensorflow as tf
# Create the session with allow_growth enabled so GPU memory is claimed on
# demand instead of being reserved up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
from tensorflow.contrib import rnn

# NOTE(review): the string below mentions word segmentation although this
# notebook is titled "Bi-LSTM for NER"; presumably left over from an earlier
# notebook. It is a bare string expression with no runtime effect.
'''
For Chinese word segmentation.
'''
# ##################### config ######################
timestep_size = max_len = 32           # sentence length (number of time steps)
vocab_size = 5159    # number of distinct characters in the samples + 1 (padding 0); obtained during data preprocessing
input_size = embedding_size = 64       # character-embedding dimension
class_num = 6        # number of output classes (width of the logits)
hidden_size = 128    # number of hidden units per LSTM cell
layer_num = 2        # number of stacked bi-lstm layers
max_grad_norm = 5.0  # maximum global gradient norm (larger gradients are clipped)

# Values supplied per sess.run call through feed_dict.
lr = tf.placeholder(tf.float32)
keep_prob = tf.placeholder(tf.float32)
batch_size = tf.placeholder(tf.int32)  # note: the dtype must be tf.int32


def weight_variable(shape):
    """Return a tf.Variable of `shape` initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))


def bias_variable(shape):
    """Return a tf.Variable of `shape` with every element initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))


# Integer inputs and per-time-step integer labels, both declared with shape
# [None, timestep_size]; the leading dimension is left open for the batch.
X_inputs = tf.placeholder(tf.int32, [None, timestep_size], name='X_input')
y_inputs = tf.placeholder(tf.int32, [None, timestep_size], name='y_input')

def lstm_cell():
    """Create one LSTM cell of `hidden_size` units wrapped with output dropout.

    The cell takes the reuse flag of the current variable scope, and the
    dropout keep probability is read from the `keep_prob` placeholder.
    """
    reuse_flag = tf.get_variable_scope().reuse
    base_cell = rnn.LSTMCell(hidden_size, reuse=reuse_flag)
    return rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)
    
def bi_lstm(X_inputs):
    """Build the stacked bidirectional LSTM tagger and return its logits.

    Args:
        X_inputs: int32 tensor of ids with shape [batch_size, timestep_size].

    Returns:
        Float tensor of unnormalised class scores with shape
        [batch_size * timestep_size, class_num]. Rows are ordered sentence by
        sentence (all time steps of sample 0, then sample 1, ...), which is
        the same order as tf.reshape(y_inputs, [-1]).
    """
    # ** 0. Embedding: a frozen lookup table initialised from the pre-trained
    # matrix W_embedding (trainable=False, so it is not updated by training).
    embedding = tf.get_variable(name="embedding", shape=W_embedding.shape,
                                initializer=tf.constant_initializer(W_embedding), trainable=False)
    # [batch_size, timestep_size] -> [batch_size, timestep_size, embedding_dim]
    inputs = tf.nn.embedding_lookup(embedding, X_inputs)

    # ** 1. Forward and backward multi-layer LSTM stacks.
    cell_fw = rnn.MultiRNNCell([lstm_cell() for _ in range(layer_num)], state_is_tuple=True)
    cell_bw = rnn.MultiRNNCell([lstm_cell() for _ in range(layer_num)], state_is_tuple=True)

    # ** 2. Zero initial states, sized by the batch_size placeholder.
    initial_state_fw = cell_fw.zero_state(batch_size, tf.float32)
    initial_state_bw = cell_bw.zero_state(batch_size, tf.float32)

    # ** 3. static_bidirectional_rnn expects a length-T list of tensors shaped
    # [batch_size, input_size], so split the time axis into a Python list.
    inputs = tf.unstack(inputs, timestep_size, 1)

    # Build the recurrent graph exactly once. The previous try/except retried
    # the call after *any* exception, which hid the real error and created
    # the RNN ops a second time.
    rnn_result = rnn.static_bidirectional_rnn(
        cell_fw, cell_bw, inputs,
        initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
        dtype=tf.float32)
    # Current TensorFlow returns (outputs, state_fw, state_bw); very old
    # releases returned only the list of outputs.
    returns_states = (isinstance(rnn_result, tuple) and len(rnn_result) == 3
                      and isinstance(rnn_result[0], (list, tuple)))
    outputs = rnn_result[0] if returns_states else rnn_result

    # Each output is [batch_size, 2 * hidden_size]. Concatenating along axis 1
    # and reshaping yields one row per (sample, time step) in sample-major order.
    output = tf.reshape(tf.concat(outputs, 1), [-1, hidden_size * 2])

    # Linear projection to class scores; softmax is applied inside the loss.
    softmax_w = weight_variable([hidden_size * 2, class_num])
    softmax_b = bias_variable([class_num])
    logits = tf.matmul(output, softmax_w) + softmax_b
    return logits


# Logits for every token position: [batch_size * timestep_size, class_num].
y_pred = bi_lstm(X_inputs)

# Extra statistics to monitor. y_inputs is [batch_size, timestep_size]; it is
# flattened so that every label lines up with its row of logits.
flat_labels = tf.reshape(y_inputs, [-1])
predicted_ids = tf.cast(tf.argmax(y_pred, 1), tf.int32)
correct_prediction = tf.equal(predicted_ids, flat_labels)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=flat_labels, logits=y_pred)
cost = tf.reduce_mean(token_losses)

# ***** Optimisation *****
# All trainable parameters of the model.
tvars = tf.trainable_variables()
# Gradients of the loss w.r.t. each parameter, clipped by global norm.
raw_grads = tf.gradients(cost, tvars)
grads, _ = tf.clip_by_global_norm(raw_grads, max_grad_norm)
# Adam with the learning rate supplied through the `lr` placeholder.
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
# One update step; it also increments the global step counter.
train_op = optimizer.apply_gradients(
    zip(grads, tvars),
    global_step=tf.contrib.framework.get_or_create_global_step())
print('Finished creating the bi-lstm model.')

Finished creating the bi-lstm model.


## 导入数据

In [3]:
import time
import sys
sys.path.append('..')
from dataset import BatchGenerator

# Read the two arrays stored back to back in the dataset pickle: first X,
# then y, in the order they were dumped.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('../data/dataset.pkl', 'rb') as inp:
    X, y = (pickle.load(inp) for _ in range(2))

In [4]:
# Split into train / valid (10%) / test (20%).
sample_num = X.shape[0]
assert y.shape[0] == sample_num, 'X and y must contain the same number of samples'
valid_num = int(sample_num * 0.1)
test_num = int(sample_num * 0.2)

np.random.seed(13)  # fixed seed so the split is reproducible
new_index = np.random.permutation(sample_num)
# Shuffle into new names instead of overwriting X / y. Overwriting made this
# cell non-idempotent: re-running it permuted the already-permuted arrays and
# silently produced a different split.
X_shuffled = X[new_index]
y_shuffled = y[new_index]

test_end = valid_num + test_num
X_valid, y_valid = X_shuffled[:valid_num], y_shuffled[:valid_num]
X_test, y_test = X_shuffled[valid_num:test_end], y_shuffled[valid_num:test_end]
X_train, y_train = X_shuffled[test_end:], y_shuffled[test_end:]
print('train_num=%d, valid_num=%d, test_num=%d' % (X_train.shape[0], X_valid.shape[0], X_test.shape[0]))

# Build the batch generators; only the training data is shuffled.
data_train = BatchGenerator(X_train, y_train, shuffle=True)
data_valid = BatchGenerator(X_valid, y_valid, shuffle=False)
data_test = BatchGenerator(X_test, y_test, shuffle=False)

train_num=13819, valid_num=1973, test_num=3947


In [5]:
import os

ckpt_dir = '../ckpt/pretrain/'  # directory where checkpoints are written
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)
# Path prefix handed to the Saver; it appends the step number itself.
model_save_path = ckpt_dir + 'bi-lstm.ckpt'

In [6]:
def test_epoch(dataset):
    """Testing or valid."""
    _batch_size = 980
    fetches = [accuracy, cost]
    _y = dataset.y
    data_size = _y.shape[0]
    batch_num = int(data_size / _batch_size)
    start_time = time.time()
    _costs = 0.0
    _accs = 0.0
    for i in xrange(batch_num):
        X_batch, y_batch = dataset.next_batch(_batch_size)
        feed_dict = {X_inputs:X_batch, y_inputs:y_batch, lr:1e-5, batch_size:_batch_size, keep_prob:1.0}
        _acc, _cost = sess.run(fetches, feed_dict)
        _accs += _acc
        _costs += _cost    
    mean_acc= _accs / batch_num     
    mean_cost = _costs / batch_num
    return mean_acc, mean_cost


# Training schedule: the learning rate stays at 1e-4 while epoch <= max_epoch
# and is then multiplied by `decay` once for every further epoch.
decay = 0.92
max_epoch = 10
max_max_epoch = 50  # in this example training has basically converged after 50 epochs
tr_batch_size = 128
display_num = 1  # number of progress reports per epoch
sess.run(tf.global_variables_initializer())
tr_batch_num = int(data_train.y.shape[0] / tr_batch_size)  # number of batches per epoch
display_batch = int(tr_batch_num / display_num)  # report once every display_batch training batches
saver = tf.train.Saver(max_to_keep=10)  # maximum number of checkpoints to keep
for epoch in xrange(max_max_epoch):
    _lr = 1e-4
    if epoch > max_epoch:
        _lr = _lr * ((decay) ** (epoch - max_epoch))
    print 'EPOCH %d， lr=%g' % (epoch+1, _lr)
    start_time = time.time()
    # _accs/_costs accumulate over the whole epoch; show_accs/show_costs are
    # reset after every progress report.
    _costs = 0.0
    _accs = 0.0
    show_accs = 0.0
    show_costs = 0.0
    for batch in xrange(tr_batch_num):
        fetches = [accuracy, cost, train_op]
        X_batch, y_batch = data_train.next_batch(tr_batch_size)
        # Training uses dropout with keep probability 0.5.
        feed_dict = {X_inputs:X_batch, y_inputs:y_batch, lr:_lr, batch_size:tr_batch_size, keep_prob:0.5}
        _acc, _cost, _ = sess.run(fetches, feed_dict) # the cost is the mean cost of one batch
        _accs += _acc
        _costs += _cost
        show_accs += _acc
        show_costs += _cost
        if (batch + 1) % display_batch == 0:
            valid_acc, valid_cost = test_epoch(data_valid)  # valid
            print '\ttraining acc=%g, cost=%g;  valid acc= %g, cost=%g ' % (show_accs / display_batch,
                                                show_costs / display_batch, valid_acc, valid_cost)
            show_accs = 0.0
            show_costs = 0.0
    mean_acc = _accs / tr_batch_num
    mean_cost = _costs / tr_batch_num
    if (epoch + 1) % 10 == 0:  # save a checkpoint every 10 epochs
        save_path = saver.save(sess, model_save_path, global_step=(epoch+1))
        print 'the save path is ', save_path
#     print '\ttraining %d, acc=%g, cost=%g ' % (data_train.y.shape[0], mean_acc, mean_cost)
    print 'Epoch training %d, acc=%g, cost=%g, speed=%g s/epoch' % (data_train.y.shape[0], mean_acc, mean_cost, time.time()-start_time)
# testing
print '**TEST RESULT:'
test_acc, test_cost = test_epoch(data_test)
print '**Test %d, acc=%g, cost=%g' % (data_test.y.shape[0], test_acc, test_cost)

EPOCH 1， lr=0.0001
	training acc=0.890317, cost=0.54758;  valid acc= 0.944819, cost=0.257498 
Epoch training 13819, acc=0.890317, cost=0.54758, speed=9.9482 s/epoch
EPOCH 2， lr=0.0001
	training acc=0.941504, cost=0.260413;  valid acc= 0.94678, cost=0.227145 
Epoch training 13819, acc=0.941504, cost=0.260413, speed=8.37079 s/epoch
EPOCH 3， lr=0.0001
	training acc=0.943473, cost=0.235491;  valid acc= 0.947147, cost=0.205897 
Epoch training 13819, acc=0.943473, cost=0.235491, speed=8.12263 s/epoch
EPOCH 4， lr=0.0001
	training acc=0.944242, cost=0.21233;  valid acc= 0.947545, cost=0.181164 
Epoch training 13819, acc=0.944242, cost=0.21233, speed=8.13089 s/epoch
EPOCH 5， lr=0.0001
	training acc=0.945744, cost=0.184518;  valid acc= 0.949841, cost=0.150531 
Epoch training 13819, acc=0.945744, cost=0.184518, speed=8.10326 s/epoch
EPOCH 6， lr=0.0001
	training acc=0.950862, cost=0.155467;  valid acc= 0.95802, cost=0.123583 
Epoch training 13819, acc=0.950862, cost=0.155467, speed=8.19468 s/epoch

## 预测结果

In [7]:
# Run the whole test set through the network in one pass with dropout off.
# Despite its name, pre_softmax holds the raw logits returned by y_pred (the
# graph applies softmax only inside the loss), one row per token position.
predict_feed = {X_inputs: X_test, batch_size: X_test.shape[0], keep_prob: 1.0}
pre_softmax = sess.run(y_pred, feed_dict=predict_feed)
pre_label = pre_softmax.argmax(axis=1)
# Distribution of the predicted class ids.
sr_pre = pd.Series(pre_label)
sr_pre.value_counts()

0    73087
5    47732
1     2737
2     1285
4     1214
3      249
dtype: int64

In [8]:
# Flatten the gold labels to one id per token position so they line up with
# pre_label, then show how often each class occurs.
true_label = y_test.flatten()
sr_label= pd.Series(true_label)
sr_label.value_counts()

0    72593
5    47888
1     2840
4     1339
2     1333
3      311
dtype: int64

## 混淆矩阵与分类结果分析

In [9]:
def my_confusion_matrix(y_true, y_pred):
    """Print a tab-separated confusion matrix of y_true (rows) vs y_pred (columns).

    Rows and columns cover every label that occurs in either sequence, in
    ascending order, and each row is headed by its actual label.

    Args:
        y_true: iterable of gold label ids.
        y_pred: iterable of predicted label ids, same length as y_true.
    """
    from sklearn.metrics import confusion_matrix
    # A bare set() has no guaranteed order, and using only y_true would drop
    # predictions of a class that never occurs in the gold labels.
    labels = sorted(set(y_true) | set(y_pred))
    conf_mat = confusion_matrix(y_true, y_pred, labels=labels)
    print("confusion_matrix(left labels: y_true, up labels: y_pred):")
    # Lines are assembled as single strings so the output is the same under
    # Python 2 and Python 3.
    print("labels\t" + "".join("%s \t" % (label,) for label in labels))
    for label, row in zip(labels, conf_mat):
        # Head the row with the label itself, not its position in the list.
        print("%s \t" % (label,) + "".join("%s \t" % (count,) for count in row))
    print("")

def my_classification_report(y_true, y_pred):
    """Print a heading followed by scikit-learn's classification report for the given labels."""
    from sklearn.metrics import classification_report
    print("classification_report(left: labels):")
    report_text = classification_report(y_true, y_pred)
    print(report_text)

In [10]:
# Token-level confusion matrix on the test set (rows: gold, columns: predicted).
my_confusion_matrix(true_label, pre_label)

confusion_matrix(left labels: y_true, up labels: y_pred):
labels	0 	1 	2 	3 	4 	5 	
0 	71812 	238 	84 	33 	140 	286 	
1 	351 	2302 	133 	5 	49 	0 	
2 	121 	155 	1050 	7 	0 	0 	
3 	76 	1 	13 	192 	29 	0 	
4 	285 	41 	5 	12 	996 	0 	
5 	442 	0 	0 	0 	0 	47446 	



In [11]:
# Per-class precision / recall / F1 on the test set.
my_classification_report(true_label, pre_label)

classification_report(left: labels):
             precision    recall  f1-score   support

          0       0.98      0.99      0.99     72593
          1       0.84      0.81      0.83      2840
          2       0.82      0.79      0.80      1333
          3       0.77      0.62      0.69       311
          4       0.82      0.74      0.78      1339
          5       0.99      0.99      0.99     47888

avg / total       0.98      0.98      0.98    126304



- 和之前没有使用预训练的词向量相比较，结果要好很多。
- 使用预训练词向量在一定程度上解决了数据不均衡的问题。
- 填充 UNKNOWN 对分类结果没有太大的影响。从混淆矩阵上看，所有的实体词都没有被分到 UNKNOWN 类。
- 尽管在整个训练语料中，样本数量比较小，而且类别非常不均衡，但是最后结果显示模型还是有较好的识别能力。特别是在单词视角上，准确率达到 0.83。但是这还需要进一步改进。如果在后面加上 HMM 模型，实体提取的准确率和召回率很大程度上依赖于视角首词（也就是类别 2）的识别性能。