In [1]:
import re
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import joblib
import yaml
import warnings

warnings.filterwarnings('ignore')

## 通用函数

In [2]:
def load_data(data_path):
    data = list()
    data_sent_with_label = list()
    with open(data_path, mode='r', encoding="utf-8") as f:
        for line in f:
            if line.strip() == "":
                data.append(data_sent_with_label.copy())
                data_sent_with_label.clear()
            else:
                data_sent_with_label.append(tuple(line.strip().split(" ")))
    return data

In [3]:
tmp=load_data('data/SIGHAN.NER.train')
# tmp[0]

In [4]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word': word,
        'word.isdigit()': word.isdigit(),
        # 'word.isdigit()': word.isalpha(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        words = word1 + word
        features.update({
            '-1:word': word1,
            '-1:words': words,
            '-1:word.isdigit()': word1.isdigit(),
        })
    else:
        features['BOS'] = True

    if i > 1:
        word2 = sent[i-2][0]
        word1 = sent[i-1][0]
        words = word1 + word2 + word
        features.update({
            '-2:word': word2,
            '-2:words': words,
            '-3:word.isdigit()': word1.isdigit(),
        })

    if i > 2:
        word3 = sent[i - 3][0]
        word2 = sent[i - 2][0]
        word1 = sent[i - 1][0]
        words = word1 + word2 + word3 + word
        features.update({
            '-3:word': word3,
            '-3:words': words,
            '-3:word.isdigit()': word1.isdigit(),
        })

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        words = word1 + word
        features.update({
            '+1:word': word1,
            '+1:words': words,
            '+1:word.isdigit()': word1.isdigit(),
        })
    else:
        features['EOS'] = True

    if i < len(sent)-2:
        word2 = sent[i + 2][0]
        word1 = sent[i + 1][0]
        words = word + word1 + word2
        features.update({
            '+2:word': word2,
            '+2:words': words,
            '+2:word.isdigit()': word2.isdigit(),
        })

    if i < len(sent)-3:
        word3 = sent[i + 3][0]
        word2 = sent[i + 2][0]
        word1 = sent[i + 1][0]
        words = word + word1 + word2 + word3
        features.update({
            '+3:word': word3,
            '+3:words': words,
            '+3:word.isdigit()': word3.isdigit(),
        })

    return features

In [5]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [ele[-1] for ele in sent]


In [6]:
train=load_data('data/SIGHAN.NER.train')
valid=load_data('data/SIGHAN.NER.vali')
test=load_data('data/SIGHAN.NER.test')
print(len(train),len(valid),len(test))

sample_text=''.join([c[0] for c in train[0]])
sample_tags=[c[1] for c in train[0]]
print(sample_text)
print(sample_tags)

18682 4499 4636
在一审中苗英毫称厂里为苗英毫垫付过几万的医疗费，实际上医疗费是孟凡荣垫付的，与厂里没有关系藏书本来就是所有传统收藏门类中的第一大户，只是我们结束温饱的时间太短而已。
['O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [7]:


X_train = [sent2features(s) for s in train]
y_train = [sent2labels(s) for s in train]

X_dev = [sent2features(s) for s in valid]
y_dev = [sent2labels(s) for s in valid]

In [9]:
# X_train[0]

In [10]:
# **表示该位置接受任意多个关键字（keyword）参数，在函数**位置上转化为词典 [key:value, key:value ]
crf_model = sklearn_crfsuite.CRF(algorithm='lbfgs',c1=0.25,c2=0.018,max_iterations=100,
                                 all_possible_transitions=True,verbose=True)
crf_model.fit(X_train, y_train)

loading training data to CRFsuite: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18682/18682 [00:25<00:00, 735.99it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 2779254
Seconds required: 10.108

L-BFGS optimization
c1: 0.250000
c2: 0.018000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=2.39  loss=785007.90 active=2775185 feature_norm=1.00
Iter 2   time=1.19  loss=673832.02 active=2762400 feature_norm=4.11
Iter 3   time=1.20  loss=577572.95 active=2764398 feature_norm=3.56
Iter 4   time=5.88  loss=382113.39 active=856593 feature_norm=2.11
Iter 5   time=2.55  loss=357613.86 active=889839 feature_norm=2.53
Iter 6   time=1.32  loss=329249.06 active=680760 feature_norm=2.84
Iter 7   time=3.67  loss=298188.85 active=677882 feature_norm=3.83
Iter 8   time=1.34  loss=276799.06 active=1125106 feature_norm=4.15
Iter 9   time=1.35  loss=261736.49 active=1121850 featur

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.25, c2=0.018,
    keep_tempfiles=None, max_iterations=100, verbose=True)

In [11]:
labels=list(crf_model.classes_)
labels.remove("O")
y_pred = crf_model.predict(X_dev)
metrics.flat_f1_score(y_dev, y_pred,
                      average='weighted', labels=labels)
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.909     0.850     0.879      2860
       I-LOC      0.894     0.825     0.858      3904
       B-ORG      0.832     0.739     0.783      1682
       I-ORG      0.886     0.805     0.844      7409
       B-PER      0.958     0.774     0.856      1826
       I-PER      0.949     0.823     0.882      3216

   micro avg      0.902     0.810     0.853     20897
   macro avg      0.905     0.803     0.850     20897
weighted avg      0.903     0.810     0.853     20897



In [12]:
import joblib
joblib.dump(crf_model, "./chinese_crf_model.joblib")

['./chinese_crf_model.joblib']

In [13]:
text = '澳大利亚总理宣布对俄罗斯采取制裁措施。'

NER_tagger = joblib.load('./chinese_crf_model.joblib')
list_result = []
new_sents = re.split(u'(。|！|\!|？|\?)', text)
sents_feature = [sent2features(sent) for sent in new_sents]
y_pred = NER_tagger.predict(sents_feature)
for sent, ner_tag in zip(new_sents, y_pred):
    for word, tag in zip(sent, ner_tag):
        list_result.append((word,tag))
list_result    

[('澳', 'B-LOC'),
 ('大', 'I-LOC'),
 ('利', 'I-LOC'),
 ('亚', 'I-LOC'),
 ('总', 'O'),
 ('理', 'O'),
 ('宣', 'O'),
 ('布', 'O'),
 ('对', 'O'),
 ('俄', 'B-LOC'),
 ('罗', 'I-LOC'),
 ('斯', 'I-LOC'),
 ('采', 'O'),
 ('取', 'O'),
 ('制', 'O'),
 ('裁', 'O'),
 ('措', 'O'),
 ('施', 'O'),
 ('。', 'O')]

In [12]:
import sklearn

In [13]:
sklearn.__version__

'0.23.0'

In [15]:
text = '新加坡3月5日宣布对俄罗斯实施金融制裁，并禁止出口电子产品、电脑和军用物品，以回应莫斯科在乌克兰发起“特别军事行动”'

NER_tagger = joblib.load('./chinese_crf_model.joblib')
list_result = []
new_sents = re.split(u'(。|！|\!|？|\?)', text)
sents_feature = [sent2features(sent) for sent in new_sents]
y_pred = NER_tagger.predict(sents_feature)
for sent, ner_tag in zip(new_sents, y_pred):
    for word, tag in zip(sent, ner_tag):
        list_result.append((word,tag))
list_result    

[('新', 'B-LOC'),
 ('加', 'I-LOC'),
 ('坡', 'I-LOC'),
 ('3', 'O'),
 ('月', 'O'),
 ('5', 'O'),
 ('日', 'O'),
 ('宣', 'O'),
 ('布', 'O'),
 ('对', 'O'),
 ('俄', 'B-LOC'),
 ('罗', 'I-LOC'),
 ('斯', 'I-LOC'),
 ('实', 'O'),
 ('施', 'O'),
 ('金', 'O'),
 ('融', 'O'),
 ('制', 'O'),
 ('裁', 'O'),
 ('，', 'O'),
 ('并', 'O'),
 ('禁', 'O'),
 ('止', 'O'),
 ('出', 'O'),
 ('口', 'O'),
 ('电', 'O'),
 ('子', 'O'),
 ('产', 'O'),
 ('品', 'O'),
 ('、', 'O'),
 ('电', 'O'),
 ('脑', 'O'),
 ('和', 'O'),
 ('军', 'O'),
 ('用', 'O'),
 ('物', 'O'),
 ('品', 'O'),
 ('，', 'O'),
 ('以', 'O'),
 ('回', 'O'),
 ('应', 'O'),
 ('莫', 'B-LOC'),
 ('斯', 'I-LOC'),
 ('科', 'I-LOC'),
 ('在', 'O'),
 ('乌', 'B-LOC'),
 ('克', 'I-LOC'),
 ('兰', 'I-LOC'),
 ('发', 'O'),
 ('起', 'O'),
 ('“', 'O'),
 ('特', 'O'),
 ('别', 'O'),
 ('军', 'O'),
 ('事', 'O'),
 ('行', 'O'),
 ('动', 'O'),
 ('”', 'O')]

In [20]:
text=[c[0] for c in list_result]
tag_pre=[c[1] for c in list_result]
# text,tag_pre

In [16]:
def _bulid_result_line(sentence, tag_pred):
    result_list = []
    for index, tag in zip(range(len(tag_pred)), tag_pred):
        if tag[0] == 'B':
            start = index
            end = index
            label_type = tag[2:]
            if end != len(tag_pred) - 1:
                while tag_pred[end + 1][0] == 'I' and tag_pred[end + 1][2:] == label_type:
                    end += 1
                    if end == len(tag_pred) - 1:
                        break
            result_list.append({'start': start,
                                'end': end,
                                'lable_type': label_type

                                })
    nouns = []
    line = ''.join(sentence)
    if len(result_list) != 0:
        for index, item in enumerate(result_list):
            nouns.append(''.join(sentence[result_list[index]['start']:result_list[index]['end'] + 1]))
    return nouns


In [21]:
_bulid_result_line(text,tag_pre)

['新加坡', '俄罗斯', '莫斯科', '乌克兰']