## 建立 BiLSTM + CRF 模型訓練命名實體識別

In [1]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

#let the gpu allocates memory space dynamically
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

  from ._conv import register_converters as _register_converters


In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, Dense, TimeDistributed, Dropout
from keras_contrib.layers.crf import CRF
from keras_contrib.utils import save_load_utils
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import optimizers

Using TensorFlow backend.


In [3]:
training_data = pd.read_csv('04_NER_training_data.csv')
training_data['question'] = training_data['question'].apply(lambda x : eval(x))
training_data['tag'] = training_data['tag'].apply(lambda x : eval(x))
training_data['purpose'] = training_data['purpose'].apply(lambda x : eval(x))


testing_data = pd.read_csv('04_NER_testing_data.csv')
testing_data = testing_data.apply(lambda x : x.apply(lambda y : eval(y)))

In [4]:
all_data = pd.concat([training_data['question'], testing_data['question']])
tokenizer_X = Tokenizer()
tokenizer_X.fit_on_texts(all_data)
word_index_X = tokenizer_X.word_index
print('Found %s unique tokens.' % len(word_index_X))

Found 4057 unique tokens.


In [5]:
MAX_SEQUENCE_LENGTH = all_data.apply(lambda x : len(x)).max()
X = tokenizer_X.texts_to_sequences(training_data['question'])
X = pad_sequences(X, maxlen = MAX_SEQUENCE_LENGTH)

In [6]:
tokenizer_Y = Tokenizer()
tokenizer_Y.fit_on_texts(training_data['tag'])
word_index_Y = tokenizer_Y.word_index
print('Found %s unique tokens.' % len(word_index_Y))
Y = tokenizer_Y.texts_to_sequences(training_data['tag'])
Y = pad_sequences(Y, maxlen = MAX_SEQUENCE_LENGTH)
Y = np.expand_dims(Y,2)

Found 3 unique tokens.


#### 建立兩層的 BiLSTM ，最後再接上 CRF Layer

In [7]:
model = Sequential()
model.add(Embedding(4057, 100, input_length=X.shape[1], trainable=True))

model.add(Bidirectional(LSTM(50, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(Dropout(0.1))
model.add(Bidirectional(LSTM(50, return_sequences=True)))
model.add(Dropout(0.1))
crf_layer = CRF(4, sparse_target=True)
model.add(crf_layer)
optimizer = optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0001, amsgrad=False)
model.compile(loss = crf_layer.loss_function, optimizer = optimizer, metrics=[crf_layer.accuracy])
print(model.summary())



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 58, 100)           361400    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 58, 100)           60400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 58, 100)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 58, 100)           60400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 58, 100)           0         
_________________________________________________________________
crf_1 (CRF)                  (None, 58, 4)             428       
Total params: 482,628
Trainable params: 482,628
Non-trainable params: 0
_________________________________________________________________
None

In [8]:
'''

epochs = 50

batch_size = 100

weight_save = ModelCheckpoint('04_NER_weight.hdf5', save_best_only=True, monitor='val_loss', mode='min')

model.fit(X, Y, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience = 2, min_delta=0.0001), weight_save])

'''

"\n\nepochs = 50\n\nbatch_size = 100\n\nweight_save = ModelCheckpoint('04_NER_weight.hdf5', save_best_only=True, monitor='val_loss', mode='min')\n\nmodel.fit(X, Y, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience = 2, min_delta=0.0001), weight_save])\n\n"

In [9]:
seq = tokenizer_X.texts_to_sequences(testing_data['question'])
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)

model.load_weights('04_NER_weight.hdf5')
pred = model.predict(padded)

In [10]:
Entity = []
Purpose = []

for i in range(len(testing_data['question'])) :
    
    prediction = np.argmax(pred[i], axis = 1)[-len(testing_data['question'][i]):]
    
    Entity.append(''.join(np.array(testing_data['question'][i])[prediction >= 2].tolist()))
    
    Purpose.append(np.array(testing_data['question'][i])[prediction == 1].tolist())
    
    
testing_data['entity'] = Entity
testing_data['purpose'] = Purpose

In [11]:
testing_data.head()

Unnamed: 0,question,entity,purpose
0,"[你, 知, 道, 计, 算, 机, 应, 用, 基, 础, 这, 本, 书, 的, 作, ...",计算机应用基础,"[你, 知, 道, 这, 本, 书, 的, 作, 者, 是, 谁, 吗, ？]"
1,"[计, 算, 机, 应, 用, 基, 础, 这, 本, 书, 的, 出, 版, 社, 是, ...",计算机应用基础,"[这, 本, 书, 的, 出, 版, 社, 是, 那, 个, ？]"
2,"[告, 诉, 我, 高, 等, 数, 学, 的, 出, 版, 时, 间, 是, 什, 么, ...",高等数学,"[告, 诉, 我, 的, 出, 版, 时, 间, 是, 什, 么, 时, 候, ？]"
3,"[我, 想, 知, 道, 戴, 维, 斯, 是, 什, 么, 国, 家, 的, 人, ？]",戴维斯,"[我, 想, 知, 道, 是, 什, 么, 国, 家, 的, 人, ？]"
4,"[你, 知, 道, 高, 等, 数, 学, 的, i, s, b, n, 吗, ？]",高等数学,"[你, 知, 道, 的, i, s, b, n, 吗, ？]"


In [12]:
testing_data.to_csv('04_Entity_testing_data.csv', index = False, encoding='utf_8_sig')