In [1]:
from kashgari.corpus import ChineseDailyNerCorpus

W0423 12:24:23.275775 140670426830656 macros.py:36] CUDA GPU available, you can set `kashgari.config.use_cudnn_cell = True` to use CuDNNCell. This will speed up the training, but will make model incompatible with CPU device.


In [2]:
train_x, train_y = ChineseDailyNerCorpus.load_data('train')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('validate')
test_x, test_y  = ChineseDailyNerCorpus.load_data('test')

In [3]:
print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
print(f"test data count: {len(test_x)}")

train data count: 20864
validate data count: 2318
test data count: 4636


In [4]:
import tensorflow as tf
print(tf.test.is_gpu_available())

True


In [5]:
import kashgari
from kashgari.embeddings import BERTEmbedding

bert_embed = BERTEmbedding('chinese_L-12_H-768_A-12',task=kashgari.LABELING,sequence_length=100)

W0423 12:24:42.686179 140670426830656 bert_embedding.py:126] seq_len: 100


In [6]:
from kashgari.tasks.labeling import BiLSTM_CRF_Model

model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x,train_y,x_validate=valid_x,y_validate=valid_y,epochs=20,batch_size=256)
model.save('ner.h5')

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 100)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 100, 768), ( 16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 100, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [7]:
model.evaluate(test_x, test_y)

           precision    recall  f1-score   support

      PER     0.9678    0.9711    0.9694      1797
      LOC     0.9373    0.9321    0.9347      3431
      ORG     0.8553    0.9054    0.8796      2147

micro avg     0.9197    0.9338    0.9267      7375
macro avg     0.9208    0.9338    0.9271      7375



'           precision    recall  f1-score   support\n\n      PER     0.9678    0.9711    0.9694      1797\n      LOC     0.9373    0.9321    0.9347      3431\n      ORG     0.8553    0.9054    0.8796      2147\n\nmicro avg     0.9197    0.9338    0.9267      7375\nmacro avg     0.9208    0.9338    0.9271      7375\n'

In [8]:
import kashgari
import re

loaded_model = kashgari.utils.load_model('ner.h5')

def cut_text(text, lenth):
    textArr = re.findall('.{' + str(lenth) + '}', text)
    textArr.append(text[(len(textArr) * lenth):])
    return textArr

def extract_labels(text, ners):
    ner_reg_list = []
    if ners:
        new_ners = []
        for ner in ners:
            new_ners += ner;
        for word, tag in zip([char for char in text], new_ners):
            if tag != 'O':
                ner_reg_list.append((word, tag))
    labels = {}
    if ner_reg_list:
        for i, item in enumerate(ner_reg_list):
            if item[1].startswith('B'):
                label = ""
                end = i + 1
                while end <= len(ner_reg_list) - 1 and ner_reg_list[end][1].startswith('I'):
                    end += 1
                ner_type = item[1].split('-')[1]
                if ner_type not in labels.keys():
                    labels[ner_type] = []
                label += ''.join([item[0] for item in ner_reg_list[i:end]])
                labels[ner_type].append(label)             
    return labels


while True:
    text_input = "代伟站在天安门广场前对着国旗敬礼。"
    texts = cut_text(text_input, 100)
    ners = loaded_model.predict([[char for char in text] for text in texts])
    print(ners)
    labels = extract_labels(text_input, ners)
    print(labels)

W0423 13:10:03.966809 140670426830656 base_embedding.py:126] Sequence length will auto set at 95% of sequence length


[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 

[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 

[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
{'PER': ['代伟'], 'LOC': ['天安门广场']}
[['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 

KeyboardInterrupt: 