In [1]:
import re
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import joblib
import yaml
import warnings

warnings.filterwarnings('ignore')

## 通用函数

In [2]:
def load_data(data_path):
    data = list()
    data_sent_with_label = list()
    with open(data_path, mode='r', encoding="utf-8") as f:
        for line in f:
            if line.strip() == "":
                data.append(data_sent_with_label.copy())
                data_sent_with_label.clear()
            else:
                data_sent_with_label.append(tuple(line.strip().split(" ")))
    return data

In [3]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word': word,
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        words = word1 + word
        features.update({
            '-1:word': word1,
            '-1:words': words,
            '-1:word.isdigit()': word1.isdigit(),
        })
    else:
        features['BOS'] = True

    if i > 1:
        word2 = sent[i-2][0]
        word1 = sent[i-1][0]
        words = word1 + word2 + word
        features.update({
            '-2:word': word2,
            '-2:words': words,
            '-3:word.isdigit()': word1.isdigit(),
        })

    if i > 2:
        word3 = sent[i - 3][0]
        word2 = sent[i - 2][0]
        word1 = sent[i - 1][0]
        words = word1 + word2 + word3 + word
        features.update({
            '-3:word': word3,
            '-3:words': words,
            '-3:word.isdigit()': word1.isdigit(),
        })

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        words = word1 + word
        features.update({
            '+1:word': word1,
            '+1:words': words,
            '+1:word.isdigit()': word1.isdigit(),
        })
    else:
        features['EOS'] = True

    if i < len(sent)-2:
        word2 = sent[i + 2][0]
        word1 = sent[i + 1][0]
        words = word + word1 + word2
        features.update({
            '+2:word': word2,
            '+2:words': words,
            '+2:word.isdigit()': word2.isdigit(),
        })

    if i < len(sent)-3:
        word3 = sent[i + 3][0]
        word2 = sent[i + 2][0]
        word1 = sent[i + 1][0]
        words = word + word1 + word2 + word3
        features.update({
            '+3:word': word3,
            '+3:words': words,
            '+3:word.isdigit()': word3.isdigit(),
        })

    return features

In [4]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [ele[-1] for ele in sent]


In [5]:
train=load_data('data/train.txt')
valid=load_data('data/train.txt')
test=load_data('data/train.txt')
print(len(train),len(valid),len(test))

sample_text=''.join([c[0] for c in train[0]])
sample_tags=[c[1] for c in train[0]]
print(sample_text)
print(sample_tags)

5021 5021 5021
(002399)调研地点
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NOUN', 'I-NOUN', 'I-NOUN', 'I-NOUN']


In [6]:


X_train = [sent2features(s) for s in train]
y_train = [sent2labels(s) for s in train]

X_dev = [sent2features(s) for s in valid]
y_dev = [sent2labels(s) for s in valid]

In [7]:
# **表示该位置接受任意多个关键字（keyword）参数，在函数**位置上转化为词典 [key:value, key:value ]
crf_model = sklearn_crfsuite.CRF(algorithm='lbfgs',c1=0.25,c2=0.018,max_iterations=100,
                                 all_possible_transitions=True,verbose=True)
crf_model.fit(X_train, y_train)

loading training data to CRFsuite: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5021/5021 [00:00<00:00, 5499.55it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 136892
Seconds required: 0.360

L-BFGS optimization
c1: 0.250000
c2: 0.018000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.05  loss=42874.41 active=134915 feature_norm=1.00
Iter 2   time=0.03  loss=35034.49 active=132963 feature_norm=1.91
Iter 3   time=0.04  loss=30110.05 active=134396 feature_norm=2.76
Iter 4   time=0.03  loss=25346.04 active=133673 feature_norm=3.33
Iter 5   time=0.03  loss=21812.82 active=133580 feature_norm=4.33
Iter 6   time=0.03  loss=19847.97 active=134380 feature_norm=5.54
Iter 7   time=0.03  loss=18043.07 active=134437 feature_norm=6.32
Iter 8   time=0.03  loss=16596.34 active=133137 feature_norm=7.65
Iter 9   time=0.03  loss=15358.26 active=132126 feature_norm=9.14
Iter

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.25, c2=0.018,
    keep_tempfiles=None, max_iterations=100, verbose=True)

In [8]:
labels=list(crf_model.classes_)
labels.remove("O")
y_pred = crf_model.predict(X_dev)
metrics.flat_f1_score(y_dev, y_pred,
                      average='weighted', labels=labels)
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

      B-NOUN      0.993     0.992     0.993      7496
      I-NOUN      0.993     0.997     0.995     21526

   micro avg      0.993     0.996     0.995     29022
   macro avg      0.993     0.995     0.994     29022
weighted avg      0.993     0.996     0.995     29022



In [9]:
import joblib
joblib.dump(crf_model, "./chinese_crf_model.joblib")

['./chinese_crf_model.joblib']

In [33]:
text = '有哪些公司有哪些上市公司持有石药集团的股份'

NER_tagger = joblib.load('./chinese_crf_model.joblib')
list_result = []
new_sents = re.split(u'(。|！|\!|？|\?)', text)
sents_feature = [sent2features(sent) for sent in new_sents]
y_pred = NER_tagger.predict(sents_feature)
for sent, ner_tag in zip(new_sents, y_pred):
    for word, tag in zip(sent, ner_tag):
        list_result.append((word,tag))
list_result    

[('有', 'O'),
 ('哪', 'O'),
 ('些', 'O'),
 ('公', 'B-NOUN'),
 ('司', 'I-NOUN'),
 ('有', 'O'),
 ('哪', 'O'),
 ('些', 'O'),
 ('上', 'B-NOUN'),
 ('市', 'I-NOUN'),
 ('公', 'I-NOUN'),
 ('司', 'I-NOUN'),
 ('持', 'O'),
 ('有', 'O'),
 ('石', 'B-NOUN'),
 ('药', 'I-NOUN'),
 ('集', 'I-NOUN'),
 ('团', 'I-NOUN'),
 ('的', 'O'),
 ('股', 'B-NOUN'),
 ('份', 'I-NOUN')]

In [11]:
import sklearn

In [12]:
sklearn.__version__

'0.23.0'

In [53]:
def _bulid_result_line(sentence, tag_pred):
    result_list = []
    for index, tag in zip(range(len(tag_pred)), tag_pred):
        if tag[0] == 'B':
            start = index
            end = index
            label_type = tag[2:]
            if end != len(tag_pred) - 1:
                while tag_pred[end + 1][0] == 'I' and tag_pred[end + 1][2:] == label_type:
                    end += 1
                    if end == len(tag_pred) - 1:
                        break
            result_list.append({'start': start,
                                'end': end,
                                'lable_type': label_type

                                })
    nouns = []
    line = ''.join(sentence)
    if len(result_list) != 0:
        for index, item in enumerate(result_list):
            nouns.append(''.join(sentence[result_list[index]['start']:result_list[index]['end'] + 1]))
    return nouns

In [15]:
import pandas as pd
test = pd.read_table('data/test_a/span_extrace_test_A.txt', sep='\t')

In [16]:
test

Unnamed: 0,用户问句
0,缩倍量阴是什么情况
1,缩量高换手是什么意思
2,缩量和放量是什么意思
3,缩量十字星说明什么
4,缩量涨停意义
...,...
1150,有疑似庄股的个股
1151,有异动语音播报吗
1152,有游资的股票和庄股有什么区别
1153,有有哪些股票是庄股


In [58]:
# text = '有哪些公司有哪些上市公司持有石药集团的股份'

NER_tagger = joblib.load('./chinese_crf_model.joblib')
list_result = []
# new_sents = re.split(u'(。|！|\!|？|\?)', text)
res_tags=[]
for text in test['用户问句']:
    new_sents = re.split(u'(。|！|\!|？|\?)', text)
    sents_feature = [sent2features(sent) for sent in new_sents]
    y_pred = NER_tagger.predict(sents_feature)
    res_tags.append(y_pred[0])
# for sent, ner_tag in zip(new_sents, y_pred):
#     for word, tag in zip(sent, ner_tag):
#         list_result.append((word,tag))
# list_result    

In [59]:
_bulid_result_line(test['用户问句'][0],res_tags[0])

['缩倍量阴']

In [60]:
test['用户问句'][0]

'缩倍量阴是什么情况'

In [61]:
res_tags[0]

['B-NOUN', 'I-NOUN', 'I-NOUN', 'I-NOUN', 'O', 'O', 'O', 'O', 'O']

In [64]:
results=[]


for text,tags in zip(test['用户问句'],res_tags):
    results.append('_|_'.join(_bulid_result_line(text,tags)))

In [66]:
test['名词短语']=results

In [67]:
test.to_csv('result.txt', sep='\t', index=None)