# 【NLP序列标注】Keras-BiLSTM-NER | Keras实现中文命名实体识别

In [3]:
import pdb
import platform
import numpy
import pickle
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping,CSVLogger
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM,Bidirectional
from keras_contrib.layers import CRF

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## 数据读取及预处理

- 数据读取，分割每句话在一个子列表中
- 生成训练数据的对应词表，频次目前取>2的
- 对文本x和标签y进行padding


In [4]:
tag_split_text = " " # 文本与标签的分隔符空格或者\t或者其他

def read_data(fh):
    #  in windows the new line is '\r\n\r\n' the space is '\r\n' . so if you use windows system,
    #  you have to use recorsponding instructions

    if platform.system() == 'Windows':
        split_text = '\r\n'
    else:
        split_text = '\n'
        
    # 以字符串读取，全文都在一个字符串上“'从 O\n效益 O\n上 O\n来 O\n看 O\n...”
    string = fh.read().decode('utf-8')
    rows_data = [row.replace("#","") for row in string.strip().split(split_text)]
    
    # 分割句子
    sentence_data = []
    sentence_tmp= []
    for row in rows_data:
        if row.strip():
            sentence_tmp.append(row)
        else:
            sentence_data.append(sentence_tmp)
            sentence_tmp = []
            
    fh.close()
    return sentence_data

def load_data():

    train = read_data(open('../../data/industry/train.txt', 'rb'))
    test = read_data(open('../../data/industry/dev.txt', 'rb'))
    
    word_counts = Counter(word.split(" ")[0].lower() for sentence in train for word in sentence)
    
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    tags = ['O', 'B-industry', 'I-industry']

    # save initial config data
    with open('config.pkl', 'wb') as outp:
        pickle.dump((vocab, tags), outp)

    train = _process_data(train, vocab, tags)
    test = _process_data(test, vocab, tags)
    return train, test, (vocab, tags)

def _process_data(data, vocab, tags, maxlen=100, onehot=False):

    word2idx = dict((w, i) for i, w in enumerate(vocab))
    idx2word = dict((i, w) for i, w in enumerate(vocab))
    
    # set to <unk> (index end) if not in vocab
    UNK_index = len(vocab)
    x = [[word2idx.get(row.split(tag_split_text)[0],UNK_index) for row in s]for s in data]
    y = [[tags.index(row.split(tag_split_text)[1]) for row in s] for s in data]

    x = pad_sequences(x, maxlen,padding="post",truncating="post")  # Right padding
    y = pad_sequences(y, maxlen,padding="post",truncating="post",value=-1)

    if onehot:
        y = numpy.eye(len(tags), dtype='float32')[y]
    else:
        y = numpy.expand_dims(y, 2)
    return x, y

## 模型构建及训练
- 模型是最简单的Embedding+BiLSTM+CRF

In [None]:
def create_model(input_dim,embed_dim,birnn_units,crf_classify_num):
    model = Sequential()
    model.add(Embedding(input_dim, embed_dim, mask_zero=True))  # Random embedding
    # 双向RNN包装器
    model.add(Bidirectional(LSTM(birnn_units // 2, return_sequences=True)))
    crf = CRF(crf_classify_num, sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model

def run_train():
    EPOCHS = 10
    (train_x, train_y), (test_x, test_y), (vocab, tags) = load_data()
    model = create_model(input_dim = len(vocab),embed_dim=200,birnn_units = 200,crf_classify_num=len(tags))
    model.fit(train_x, train_y,batch_size=16,epochs=EPOCHS,validation_data=[test_x, test_y])
    model.save('crf_test.h5')
    
run_train()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 200)         5490000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 200)         240800    
_________________________________________________________________
crf_1 (CRF)                  (None, None, 3)           618       
Total params: 5,731,418
Trainable params: 5,731,418
Non-trainable params: 0
_________________________________________________________________




Train on 48274 samples, validate on 5373 samples
Epoch 1/10
 5312/48274 [==>...........................] - ETA: 24:33 - loss: nan - crf_viterbi_accuracy: 0.9228