# 参考方法

In [4]:
#https://www.cnblogs.com/vipyoumay/p/ner-chinese-keras.html
import numpy
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
import pickle
import platform

def load_data():
    train = _parse_data(open('./train_data.data', 'rb'))
    test = _parse_data(open('./test_data.data', 'rb'))
#     print(train)

    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"]

    # save initial config data
    with open('model/config.pkl', 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)

    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    return train, test, (vocab, chunk_tags)


def _parse_data(fh):
    #  in windows the new line is '\r\n\r\n' the space is '\r\n' . so if you use windows system,
    #  you have to use recorsponding instructions

    if platform.system() == 'Windows':
        split_text = '\r\n'
    else:
        split_text = '\n'


    #样本是\n\n分割
    string = fh.read().decode('utf-8')
    print(string)
    data = [[row.split() for row in sample.split(split_text)] for
            sample in
            string.strip().split(split_text + split_text)]
    fh.close()
    return data


def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]  # set to <unk> (index 1) if not in vocab

    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding

    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_chunk = numpy.expand_dims(y_chunk, 2) #扩展维度
    return x, y_chunk


def process_data(data, vocab, maxlen=100):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length


In [7]:
# train, test, (vocab, chunk_tags) = load_data()

train = _parse_data(open('./train_data.data', 'rb'))


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [11]:
print(train[0])
print('\n')
print(train[0][0])

[['当', 'O'], ['希', 'O'], ['望', 'O'], ['工', 'O'], ['程', 'O'], ['救', 'O'], ['助', 'O'], ['的', 'O'], ['百', 'O'], ['万', 'O'], ['儿', 'O'], ['童', 'O'], ['成', 'O'], ['长', 'O'], ['起', 'O'], ['来', 'O'], ['，', 'O'], ['科', 'O'], ['教', 'O'], ['兴', 'O'], ['国', 'O'], ['蔚', 'O'], ['然', 'O'], ['成', 'O'], ['风', 'O'], ['时', 'O'], ['，', 'O'], ['今', 'O'], ['天', 'O'], ['有', 'O'], ['收', 'O'], ['藏', 'O'], ['价', 'O'], ['值', 'O'], ['的', 'O'], ['书', 'O'], ['你', 'O'], ['没', 'O'], ['买', 'O'], ['，', 'O'], ['明', 'O'], ['日', 'O'], ['就', 'O'], ['叫', 'O'], ['你', 'O'], ['悔', 'O'], ['不', 'O'], ['当', 'O'], ['初', 'O'], ['！', 'O']]


['当', 'O']


# load data

In [1]:
# %%pixie_debuggerger
def parse_file(file_name):
    with open(file_name, 'rb') as file:
        content = file.read().decode("utf-8").strip()
        samples = content.split('\n\n')
        sample_rows = [sample.split('\n') for sample in samples] 
        data = [[row.split() for row  in sample ] for sample in sample_rows]
        file.close()
        return data
    return []

def process(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(sample) for sample in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]  # set to <unk> (index 1) if not in vocab

    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding

    y_chunk = pad_sequences(y_chunk, maxlen, value=-1) #https://www.twblogs.net/a/5c113708bd9eee5e40bb23af

    if onehot:
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_chunk = numpy.expand_dims(y_chunk, 2) #扩展维度
    return x, y_chunk
    
train = parse_file('train_data.data')
test = parse_file('test_data.data')
# print(train_data[0])
# print('\n')
# print(test_data[0][0])
# test_data = parse_file('test_data.data')

In [41]:
# row[0].lower for sample in train for row in sample

# words = ((row[0].lower() for row in sample) for sample in train)
# # print(words)
# words_counts = Counter(words)
# print(words_counts)

import pickle

words = (row[0].lower()for sample in train for row in sample)
print(words)
words_counts = Counter(words)

vocab = [w for w, f in iter(words_counts.items()) if f > 2 ]

chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"]


with open('config.pkl', 'wb') as outp:
      pickle.dump((vocab, chunk_tags), outp)


# print(words_counts)
train_X, train_y = process(train, vocab, chunk_tags)
test_X, test_y = process(test, vocab, chunk_tags)
# return train, test, (vocab, chunk_tags)

<generator object <genexpr> at 0x153a12c78>


# preprocess data

# define model

In [42]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Bidirectional
from keras_contrib.layers import CRF

from keras.layers.recurrent import LSTM

EMBED_DIM = 200
BiRNN_UNITS = 200

model = Sequential()
model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
#双斜杠（//）表示地板除，即先做除法（/），然后向下取整（floor)
model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
crf = CRF(len(chunk_tags), sparse_target=True)
model.add(crf)
model.summary()
model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 200)         787400    
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 200)         240800    
_________________________________________________________________
crf_3 (CRF)                  (None, None, 7)           1470      
Total params: 1,029,670
Trainable params: 1,029,670
Non-trainable params: 0
_________________________________________________________________




# train model

In [46]:
model.fit(train_X, train_y, batch_size=16,epochs=5, validation_data=[test_X, test_y], verbose=2)

Instructions for updating:
Use tf.cast instead.
Train on 50658 samples, validate on 4631 samples
Epoch 1/5
 - 787s - loss: 4.0529 - crf_viterbi_accuracy: 0.9547 - val_loss: 7.8834 - val_crf_viterbi_accuracy: 0.9698
Epoch 2/5
 - 630s - loss: 3.9702 - crf_viterbi_accuracy: 0.9785 - val_loss: 7.8646 - val_crf_viterbi_accuracy: 0.9749
Epoch 3/5
 - 558s - loss: 3.9572 - crf_viterbi_accuracy: 0.9846 - val_loss: 7.8613 - val_crf_viterbi_accuracy: 0.9766
Epoch 4/5
 - 541s - loss: 3.9511 - crf_viterbi_accuracy: 0.9883 - val_loss: 7.8606 - val_crf_viterbi_accuracy: 0.9781
Epoch 5/5
 - 539s - loss: 3.9469 - crf_viterbi_accuracy: 0.9913 - val_loss: 7.8592 - val_crf_viterbi_accuracy: 0.9788


<keras.callbacks.History at 0x1414f8908>

# predict

In [61]:
def process_predict_data(data, vocab, maxlen=100):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length

# predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下，连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
# predict_text = '香港是中华人民共和国两个特别行政区之一，位于南海北岸、珠江口东侧，北接广东省深圳市，西面与邻近的澳门特别行政区相距63公里，其余两面与南海邻接'
predict_text = '今日，米哈伊尔-普罗霍夫宣布将篮网和巴克莱中心卖给蔡崇信。'
sstr, length = process_predict_data(predict_text, vocab)


raw = model.predict(sstr)
# print(raw)
raw1 = raw[0][-length:]
print("raw1------")
print(raw1)

result = [np.argmax(row) for row in raw1] #获取最大值对应的位置索引

print("result------")
print(result)
result_tags = [chunk_tags[i] for i in result]
print("result_tags------")
print(result_tags)
per, loc, org = '', '', ''

for s, t in zip(predict_text, result_tags):
    if t in ('B-PER', 'I-PER'):
        per += ' ' + s if (t == 'B-PER') else s
    if t in ('B-ORG', 'I-ORG'):
        org += ' ' + s if (t == 'B-ORG') else s
    if t in ('B-LOC', 'I-LOC'):
        loc += ' ' + s if (t == 'B-LOC') else s

print(['person:' + per, 'location:' + loc, 'organzation:' + org])

raw1------
[[1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]]
result------
[0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 0, 0, 1, 2, 2, 0]
result_tags------
['O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', '