# Sequence Labeling using LSTM+CRF - Part 3

### ---- Chinese NER tagging

In [1]:
import tensorflow as tf
import numpy as np

## Create Dataset Iterator

In [5]:
# Tag inventory (outside, person, location, organisation with B-/I- prefixes).
# A tag's position in the list is its integer id, so "O" is 0.
tag2index = {
    tag: idx
    for idx, tag in enumerate(
        ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
    )
}

In [6]:
# Special vocabulary symbols used by the token pre-processing in this notebook.
PAD = "$PAD$"  # padding symbol; load_word2index assigns it index 0
NUM = "$NUM$"  # substituted for tokens where str.isdigit() is true
ENG = "$ENG$"  # substituted for Latin-letter tokens
UNK = "$UNK$"  # fallback for tokens that are not in the vocabulary

In [7]:
def load_word2index(file_name, encoding='utf-8'):
    """Build a token -> index mapping from a vocabulary file (one token per line).

    Index 0 is reserved for the padding symbol ``PAD``; the token on the n-th
    line (0-based) receives index ``n + 1``.

    Args:
        file_name: path to the vocabulary file.
        encoding: text encoding of the file. Defaults to UTF-8 so the Chinese
            vocabulary is decoded the same way on every platform instead of
            depending on the locale's default encoding (the ``Dataset`` reader
            in this notebook also reads UTF-8).

    Returns:
        dict mapping each token (surrounding whitespace stripped) to its index.
        If a token occurs on several lines, the last occurrence wins.
    """
    token2idx = {PAD: 0}
    with open(file_name, encoding=encoding) as f:
        # Line numbering starts at 1 because index 0 belongs to PAD.
        for idx, token in enumerate(f, start=1):
            token2idx[token.strip()] = idx
    return token2idx

In [8]:
# Vocabulary (token -> index) for the Chinese NER data; index 0 is PAD.
word2index = load_word2index(file_name='data/ch_word_vocab.txt')

In [11]:
def get_processing_token(token2index):
    """Create a function that maps a raw token to its vocabulary index.

    The returned function normalises the token before the lookup:
      * tokens for which ``str.isdigit()`` is true are replaced by ``NUM``;
      * non-empty tokens consisting only of ASCII letters (A-Z, a-z) are
        replaced by ``ENG``.
    Anything that is then not found in ``token2index`` maps to the index of
    ``UNK``.

    Args:
        token2index: dict mapping token -> index. It must contain ``UNK`` for
            the unknown-token fallback to work (``KeyError`` otherwise).

    Returns:
        A callable ``f(token)`` returning the index for ``token``.
    """

    def is_ascii_letters(text):
        # Check every character. Comparing the whole string against
        # 'A'..'Z' / 'a'..'z' is lexicographic and misclassifies
        # multi-character tokens ("zoo" > "z" was rejected, "A1" accepted).
        return bool(text) and all(
            ('A' <= ch <= 'Z') or ('a' <= ch <= 'z') for ch in text
        )

    def f(token):
        if token.isdigit():
            token = NUM
        elif is_ascii_letters(token):
            token = ENG

        # UNK is looked up lazily so that a vocabulary without UNK only fails
        # when an unknown token is actually encountered.
        if token in token2index:
            return token2index[token]
        return token2index[UNK]

    return f

In [12]:
def get_processing_tag(token2index):
    """Create a function that converts a tag into its integer id.

    Args:
        token2index: mapping from tag to id.

    Returns:
        A callable taking one tag and returning ``token2index[tag]``; a tag
        missing from the mapping raises ``KeyError``.
    """

    def to_index(token):
        tag_id = token2index[token]
        return tag_id

    return to_index

In [13]:
# Converters handed to Dataset: raw word -> vocabulary index, raw tag -> tag id.
process_word_f = get_processing_token(token2index=word2index)
process_tag_f = get_processing_tag(token2index=tag2index)

In [60]:
class Dataset(object):
    """Iterable over the sentences of a token-per-line tagging file.

    Each non-blank line is split on whitespace; the first field is the word and
    the last field is the NER tag. Blank (or whitespace-only) lines separate
    sentences. Iterating yields one ``(words, ner_tags)`` pair of equal-length
    lists per sentence, and the file is re-read on every pass.

    Args:
        file_name: path to the UTF-8 encoded data file.
        processing_word: optional callable applied to every word; ``None``
            keeps the raw string.
        processing_tag: optional callable applied to every tag; ``None`` keeps
            the raw string.
        max_iter: optional cap on the number of sentences yielded per pass.
    """

    def __init__(self, file_name, processing_word, processing_tag, max_iter=None):
        self.file_name = file_name
        self.processing_word = processing_word
        self.processing_tag = processing_tag
        self.max_iter = max_iter

    def __iter__(self):
        words, ner_tags = [], []
        niter = 0  # number of sentences yielded so far
        with open(self.file_name, encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Sentence boundary. Whitespace-only lines count as blank
                    # too; they used to fall through and fail on fields[0].
                    if words:
                        if self.max_iter is not None and niter >= self.max_iter:
                            return
                        niter += 1
                        yield (words, ner_tags)
                        words, ner_tags = [], []
                    continue

                word, ner_tag = fields[0], fields[-1]
                if self.processing_word is not None:
                    word = self.processing_word(word)
                if self.processing_tag is not None:
                    ner_tag = self.processing_tag(ner_tag)
                words.append(word)
                ner_tags.append(ner_tag)

        # Flush the last sentence when the file does not end with a blank
        # line; previously it was silently dropped.
        if words and (self.max_iter is None or niter < self.max_iter):
            yield (words, ner_tags)

## Create Embedding Matrix

In [93]:
def export_trimmed_glove_vectors(word2index, filename_glove, filename_trimmed, dim):
    """Save the GloVe vectors of the vocabulary words as a compressed .npz file.

    Row ``word2index[word]`` of the saved matrix holds the vector read for
    ``word``; rows of vocabulary entries that never appear in the GloVe file
    stay all-zero. The matrix is stored under the key ``"embeddings"``.

    Args:
        word2index: dictionary word2index[word] = row index.
        filename_glove: path to a GloVe text file (``word v1 v2 ...`` per line,
            space separated).
        filename_trimmed: path where the matrix is stored (npz).
        dim: (int) dimension of the embeddings.

    Raises:
        ValueError: if a line for a vocabulary word does not carry exactly
            ``dim`` values, or a value cannot be parsed as a float.
    """
    word_num = len(word2index)
    embedding_matrix = np.zeros((word_num, dim))
    # Decode as UTF-8 explicitly instead of relying on the locale default.
    with open(filename_glove, encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split(' ')
            word = fields[0]
            if word not in word2index:
                continue
            values = fields[1:]
            # Without this check a single-value line would be silently
            # broadcast across the whole row.
            if len(values) != dim:
                raise ValueError(
                    'expected {} values for word {!r}, got {}'.format(
                        dim, word, len(values)))
            # Parse to float here rather than relying on implicit
            # string-to-float casting during assignment.
            embedding_matrix[word2index[word]] = np.asarray(
                values, dtype=embedding_matrix.dtype)

    np.savez_compressed(filename_trimmed, embeddings=embedding_matrix)
    print('embedding matrix with shape {} saved'.format(embedding_matrix.shape))

In [19]:
# Embedding dimensionality: selects the glove.6B.<dim>d files used for the
# embedding matrix and is passed to the model as params['vector_dim'].
dim = 50

In [94]:
# Source GloVe text file and destination .npz, both selected by `dim`.
filename_glove = "data/glove.6B/glove.6B.{}d.txt".format(dim)
filename_trimmed = "data/glove.6B/glove.6B.{}d.trimmed.npz".format(dim)
# NOTE(review): word2index comes from the Chinese vocabulary file; confirm that
# this GloVe file actually contains those tokens, otherwise most rows of the
# exported matrix stay all-zero.
export_trimmed_glove_vectors(
    word2index=word2index,
    filename_glove=filename_glove,
    filename_trimmed=filename_trimmed,
    dim=dim,
)

embedding matrix with shape (4812, 50) saved


In [93]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Train Model

In [113]:
from BiLSTM_CRF import BiLSTM_CRF
from EarlyStoppingCheckPoint import EarlyStoppingCheckPoint

In [114]:
# Sizes derived from the vocabularies built earlier in the notebook.
word_num = len(word2index)
tag_num = len(tag2index)

# Tunable hyper-parameters.
hidden_size_lstm = 300
kr = 0.7   # keep rate
lr = 0.01  # learning rate

# Configuration dict consumed by BiLSTM_CRF.build().
params = {
    'learning_rate': lr,
    'keep_dropout_rate': kr,
    'word_number': word_num,
    'vector_dim': dim,
    'hidden_size_lstm': hidden_size_lstm,
    'tag_number': tag_num,
}

In [96]:
# Reset TensorFlow's default graph before building the model, presumably so
# that re-running the build cells does not add to a previously built graph.
tf.reset_default_graph()

In [97]:
# Instantiate the project-local BiLSTM_CRF model; it is configured by the
# build(params) call that follows.
bilstm_model = BiLSTM_CRF()

In [98]:
# Build the model from the hyper-parameter dict `params` defined above.
bilstm_model.build(params)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [99]:
train_file = 'data/ch_train.txt'
test_file = 'data/ch_test.txt'

# Both datasets convert words and tags to integer ids while iterating.
# NOTE(review): valid_dataset reads the *test* file. It is the set passed to
# fit() together with the early-stopping checkpoint and is also the set scored
# in the "Test Model" section, so the reported accuracy is not measured on
# data held out from model selection. Consider a separate dev split.
train_dataset = Dataset(
    train_file,
    processing_word=process_word_f,
    processing_tag=process_tag_f,
)
valid_dataset = Dataset(
    test_file,
    processing_word=process_word_f,
    processing_tag=process_tag_f,
)

In [102]:
# Checkpoint location; the "Test Model" section restores the model from this
# same path.
model_file_path = "checkpoints/ch/ner.ckpt"
# Project-local early-stopping / checkpoint helper monitoring 'acc'.
# NOTE(review): the unit of `patience` (epochs vs. validation runs) is defined
# in EarlyStoppingCheckPoint and not visible here — confirm.
earlyStoppingCheckPoint = EarlyStoppingCheckPoint(file_path=model_file_path, monitor='acc', patience=7)

In [104]:
# Training run: number of epochs and mini-batch size handed to fit().
nepochs         = 2
batch_size      = 32
# The captured output below shows the loss every 10 iterations and an
# accuracy line every 50 iterations.
bilstm_model.fit(train_dataset, valid_dataset, nepochs, batch_size, earlyStoppingCheckPoint)

ep: 0 iter: 10 loss: 16.264107
ep: 0 iter: 20 loss: 13.770277
ep: 0 iter: 30 loss: 12.8786
ep: 0 iter: 40 loss: 12.4181185
ep: 0 iter: 50 loss: 11.577864
accuracy 0.8850470159500814
ep: 0 iter: 60 loss: 11.083816
ep: 0 iter: 70 loss: 10.71337
ep: 0 iter: 80 loss: 10.765925
ep: 0 iter: 90 loss: 10.676321
ep: 0 iter: 100 loss: 10.373866
accuracy 0.8897051581392924
ep: 0 iter: 110 loss: 10.273533
ep: 0 iter: 120 loss: 9.992379
ep: 0 iter: 130 loss: 9.64046
ep: 0 iter: 140 loss: 9.567707
ep: 0 iter: 150 loss: 9.477866
accuracy 0.8970515813929235
ep: 0 iter: 160 loss: 9.285762
ep: 0 iter: 170 loss: 9.136686
ep: 0 iter: 180 loss: 8.9825945
ep: 0 iter: 190 loss: 8.800861
ep: 0 iter: 200 loss: 8.739961
accuracy 0.9235867694856924
ep: 0 iter: 210 loss: 8.586854
ep: 0 iter: 220 loss: 8.542168
ep: 0 iter: 230 loss: 8.416842
ep: 0 iter: 240 loss: 8.214833
ep: 0 iter: 250 loss: 8.047419
accuracy 0.9206088029617442
ep: 0 iter: 260 loss: 7.8742385
ep: 0 iter: 270 loss: 7.730769
ep: 0 iter: 280 loss: 

ep: 1 iter: 640 loss: 1.8184664
ep: 1 iter: 650 loss: 1.8085765
accuracy 0.9622481909143052
ep: 1 iter: 660 loss: 1.8037119
ep: 1 iter: 670 loss: 1.807525
ep: 1 iter: 680 loss: 1.7961609
ep: 1 iter: 690 loss: 1.7981856
ep: 1 iter: 700 loss: 1.7929238
accuracy 0.962555257501405
ep: 1 iter: 710 loss: 1.7907134
ep: 1 iter: 720 loss: 1.7903755
ep: 1 iter: 730 loss: 1.7849901
ep: 1 iter: 740 loss: 1.7822084
ep: 1 iter: 750 loss: 1.7925477
accuracy 0.9678970573751021
ep: 1 iter: 760 loss: 1.7930641
ep: 1 iter: 770 loss: 1.7995198
ep: 1 iter: 780 loss: 1.7947171
ep: 1 iter: 790 loss: 1.787004
ep: 1 iter: 800 loss: 1.7828858
accuracy 0.9626363694300728
ep: 1 iter: 810 loss: 1.783238
ep: 1 iter: 820 loss: 1.7791986
ep: 1 iter: 830 loss: 1.7790332
ep: 1 iter: 840 loss: 1.7751092
ep: 1 iter: 850 loss: 1.7846925
accuracy 0.9643339262229071
ep: 1 iter: 860 loss: 1.796004
ep: 1 iter: 870 loss: 1.7971581
ep: 1 iter: 880 loss: 1.7947258
ep: 1 iter: 890 loss: 1.7938842
ep: 1 iter: 900 loss: 1.7913607
a

## Test Model

In [115]:
# Reset the default graph again before rebuilding the model for evaluation.
tf.reset_default_graph()

In [116]:
# Fresh model instance for evaluation; weights are restored from the
# checkpoint a few cells below.
bilstm_model = BiLSTM_CRF()

In [117]:
# Rebuild with the same `params` used for training; presumably required so
# the checkpoint's variables match the rebuilt model — confirm in BiLSTM_CRF.
bilstm_model.build(params)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [118]:
# Restore the trained parameters from the checkpoint written during training.
bilstm_model.load_model(model_file_path)

INFO:tensorflow:Restoring parameters from checkpoints/ch/ner.ckpt


In [119]:
# Score the restored model on valid_dataset (built from data/ch_test.txt).
# `ret` items are unpacked further down as
# (word ids, gold tag ids, predicted tag ids, per-token correctness flags).
# NOTE(review): 32 is presumably the batch size (fit() was given batch_size = 32)
# — confirm against BiLSTM_CRF.run_validation.
overall_acc, ret = bilstm_model.run_validation(valid_dataset, 32)

In [120]:
# Overall accuracy returned by run_validation above.
print(overall_acc)

0.9661415634903622


In [126]:
# Inverse lookups (id -> token / id -> tag) for turning model output back into
# readable sequences.
index2word = dict(zip(word2index.values(), word2index.keys()))
index2tag = dict(zip(tag2index.values(), tag2index.keys()))

In [129]:
def getSequence(indices, index2token):
    """Translate a sequence of indices into the tokens they stand for.

    Args:
        indices: iterable of indices.
        index2token: lookup supporting ``index2token[idx]``.

    Returns:
        A new list with ``index2token[idx]`` for every ``idx``, in order.
    """
    return [index2token[idx] for idx in indices]

In [134]:
# Show the first three evaluated sentences: the characters, the gold tags
# ('t lbl'), the predicted tags ('p lbl') and the per-token correctness flags.
for words_idx, labels_idx, pred_labels_idx, acc in ret[:3]:
    print("---------------------------------------------")
    # Map ids back to characters / tag names for readability.
    print('sentence', getSequence(words_idx, index2word))
    print('t lbl', getSequence(labels_idx, index2tag))
    print('p lbl', getSequence(pred_labels_idx, index2tag))
    print('acc', acc)

---------------------------------------------
sentence ['中', '共', '中', '央', '致', '中', '国', '致', '公', '党', '十', '一', '大', '的', '贺', '词']
t lbl ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O']
p lbl ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O']
acc [True, True, True, True, False, False, True, True, True, True, True, True, True, True, True, True]
---------------------------------------------
sentence ['各', '位', '代', '表', '、', '各', '位', '同', '志', '：']
t lbl ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
p lbl ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
acc [True, True, True, True, True, True, True, True, True, True]
---------------------------------------------
sentence ['在', '中', '国', '致', '公', '党', '第', '十', '一', '次', '全', '国', '代', '表', '大', '会', '隆', '重', '召', '开', '之', '际', '，', '中', '国', '共', '产', '党', '中', '