In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [80]:
def load_pre_data(file):
    """Read a whitespace-segmented corpus file and return BMES-tagged sentences.

    Every line of ``file`` is treated as one paragraph. A paragraph is cut into
    sentences at the full stop '。' (the full stop itself is discarded, so it
    never appears in the returned data). Each sentence is split into words on
    whitespace and every character of every word is paired with the tag that
    ``get_tag`` assigns to it.

    Args:
        file: Path of a UTF-8 encoded text file whose words are separated by
            whitespace. A leading byte-order mark, if present, is dropped.

    Returns:
        A list of sentences. Each sentence is a list of ``(character, tag)``
        tuples, e.g. ``[('我', 'B'), ('爱', 'M'), ('你', 'E')]``. Sentences that
        contain no words are skipped.
    """
    sent_lags = []  # one entry per sentence: its list of (character, tag) pairs
    # Text mode with 'utf-8-sig' decodes exactly like UTF-8 but strips a BOM, so
    # the first word of a BOM-prefixed file does not start with '\ufeff'.
    with open(file, encoding='utf-8-sig') as f:
        for line in f:
            for sent in line.split('。'):
                # split() with no argument splits on any run of whitespace and
                # drops empty pieces, so irregular spacing (one space, three
                # spaces, tabs, full-width spaces) can no longer leave a
                # whitespace character inside a word where it would be tagged
                # as if it were a real character.
                wordslist = sent.split()
                if not wordslist:
                    continue
                _sent_lags = []
                for word in wordslist:  # e.g. word == "我爱你"
                    # zip pairs each character with its tag:
                    # [('我', 'B'), ('爱', 'M'), ('你', 'E')]
                    _sent_lags.extend(zip(word, get_tag(word)))
                sent_lags.append(_sent_lags)
    return sent_lags

def get_tag(word):
    """Return the per-character BMES segmentation tags of a word.

    A single-character word is tagged ``S``. A longer word gets ``B`` on its
    first character, ``E`` on its last character and ``M`` on every character
    in between.

    Examples:
        get_tag("我")     -> ['S']
        get_tag("我们")   -> ['B', 'E']
        get_tag("我爱你") -> ['B', 'M', 'E']

    Args:
        word: The word to tag (any sequence with a length).

    Returns:
        A list with exactly one tag per character of ``word``; an empty list
        for an empty word.
    """
    word_len = len(word)
    if word_len == 0:
        # An empty word has no characters, so it must produce no tags
        # (previously this fell through to the B...E branch and returned two).
        return []
    if word_len == 1:
        return ['S']
    # For a two-character word the middle part is empty, giving ['B', 'E'].
    return ['B'] + ['M'] * (word_len - 2) + ['E']

# def pre_data(data,ifsent=True):
#     """数据预处理函数，得到句子和标签"""
#     sent_lags=[] #用于临时存放一个中文句子,时存放一个句子对应的标注
#     for sentence in data:
#         sentence = sentence.strip()
#         if not sentence:
#             continue
        
#         words = sentence.split("  ")
#         _sent_lags = []
#         for word in words:
#             sent = list(word)
#             tags = get_tag(word)#获得标注结果
#             _sent_lags.extend(list(zip(sent,tags)))
#         sent_lags.append(_sent_lags)
#     return sent_lags

def word2features(sent,i):
    """Build the feature dict for the character at position ``i`` of ``sent``.

    ``sent`` is indexed as ``sent[k][0]`` to obtain the character at position
    ``k``. For the target character and for up to three neighbours on each
    side, the dict records the character itself, whether it is a digit, and
    the n-gram running between the neighbour and the target. ``'BOS'`` is set
    when there is no previous character and ``'EOS'`` when there is no next
    character.

    The key strings (including the trailing space in ``'-3:0 word '`` and the
    plural in ``'0:3 words'``) are feature names; they are kept exactly as they
    are so that features keep the names a previously trained model was fitted
    with.

    Args:
        sent: The sentence, a sequence whose items expose the character at
            index 0.
        i: Position of the target character in ``sent``.

    Returns:
        A dict mapping feature names to feature values.
    """
    target = sent[i][0]

    features = {}
    features['bias'] = 1.0
    features['word'] = target
    features['word.isdigit()'] = target.isdigit()

    # ---- left context: up to three preceding characters ----
    if i <= 0:
        features['BOS'] = True
    else:
        left1 = sent[i - 1][0]
        features['-1 word'] = left1
        features['-1:0 word'] = left1 + target
        features['-1 word.isdigit()'] = left1.isdigit()
        if i >= 2:
            left2 = sent[i - 2][0]
            features['-2 word'] = left2
            features['-2:0 word'] = left2 + left1 + target
            features['-2 word.isdigit()'] = left2.isdigit()
            if i >= 3:
                left3 = sent[i - 3][0]
                features['-3 word'] = left3
                features['-3:0 word '] = left3 + left2 + left1 + target
                features['-3 word.isdigit()'] = left3.isdigit()

    # ---- right context: up to three following characters ----
    following = len(sent) - 1 - i  # how many characters come after position i
    if following < 1:
        features['EOS'] = True
    else:
        right1 = sent[i + 1][0]
        features['1 word'] = right1
        features['0:1 word'] = target + right1
        features['1 word.isdigit()'] = right1.isdigit()
        if following >= 2:
            right2 = sent[i + 2][0]
            features['2 word'] = right2
            features['0:2 word'] = target + right1 + right2
            features['2 word.isdigit()'] = right2.isdigit()
            if following >= 3:
                right3 = sent[i + 3][0]
                features['3 word'] = right3
                features['0:3 words'] = target + right1 + right2 + right3
                features['3 word.isdigit()'] = right3.isdigit()

    return features

def sent2features(sent):
    """Return the list of feature dicts of ``sent``, one per position, in order."""
    per_position = []
    for position in range(len(sent)):
        per_position.append(word2features(sent, position))
    return per_position

def sent2labels(sent):
    """Return the last element of every item of ``sent`` (the tag of each pair)."""
    labels_out = []
    for item in sent:
        labels_out.append(item[-1])
    return labels_out

def split_train_test(data_X,data_y,p=0.7):
    """Split features and labels into a leading training part and a trailing test part.

    The cut position is ``int(len(data_X) * p)``; everything before it goes to
    the training part, everything from it onwards to the test part. The same
    position is used for ``data_y``. No shuffling is performed.

    Args:
        data_X: Sliceable sequence of samples.
        data_y: Sliceable sequence of labels, aligned with ``data_X``.
        p: Fraction of ``data_X`` that goes to the training part.

    Returns:
        The tuple ``(train_X, train_y, test_X, test_y)``.
    """
    cut = int(len(data_X) * p)
    head, tail = slice(None, cut), slice(cut, None)
    X_head, X_tail = data_X[head], data_X[tail]
    y_head, y_tail = data_y[head], data_y[tail]
    return X_head, y_head, X_tail, y_tail

In [59]:
# Read the training file into sentences of (character, tag) pairs.
datatrain_sent_lags = load_pre_data("E:\\corpus\\icwb2-data\\training\\pku_training.utf8")
# Per sentence: one feature dict per character, and the matching tag sequence.
X_train = list(map(sent2features, datatrain_sent_lags))
y_train = list(map(sent2labels, datatrain_sent_lags))

In [60]:
# Read the gold-standard test file into sentences of (character, tag) pairs.
datatest_sent_lags = load_pre_data("E:\\corpus\\icwb2-data\\gold\\pku_test_gold.utf8")
# Per sentence: one feature dict per character, and the matching tag sequence.
X_test = list(map(sent2features, datatest_sent_lags))
y_test = list(map(sent2labels, datatest_sent_lags))

In [None]:
# Build the CRF estimator: 'l2sgd' training algorithm, at most 100 iterations,
# c2=0.1 (per the sklearn-crfsuite docs, the L2 regularization coefficient),
# all_possible_transitions enabled, and verbose training output.
crf_model = sklearn_crfsuite.CRF(algorithm='l2sgd',max_iterations=100,c2=0.1,
                                 all_possible_transitions=True,verbose=True)
# Fit on the per-sentence feature dicts and tag sequences built from the training file.
crf_model.fit(X_train, y_train)

In [62]:
# Predict one tag sequence per test sentence.
y_pred = crf_model.predict(X_test)
# The labels known to the fitted model, used below to order the report rows.
labels = list(crf_model.classes_) 

In [63]:
# Order labels by everything after their first character, then by the first
# character. The tags produced by get_tag are single characters (B/M/E/S), so
# name[1:] is empty and the order is simply by the character itself.
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
# Per-label precision/recall/F1 computed over all characters of all test sentences.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

           B      0.955     0.954     0.955     56883
           E      0.952     0.951     0.952     56883
           M      0.740     0.891     0.808     11480
           S      0.953     0.904     0.928     44063

    accuracy                          0.936    169309
   macro avg      0.900     0.925     0.911    169309
weighted avg      0.939     0.936     0.937    169309



In [64]:
# NOTE(review): joblib is imported here rather than with the imports in the first cell.
import joblib
# Persist the fitted model; as the last expression of the cell, the return
# value of dump (the list of written file names) is displayed as the output.
joblib.dump(crf_model, "E:\\corpus\\crf_model.pkl")

['E:\\corpus\\crf_model.pkl']

In [102]:
# NOTE(review): re is not used by any of the cells visible here.
import re

# Raw, unsegmented sample sentence to run through the trained model.
text = "人要自强不息，每天都要有进步。"
# Inputs for prediction: one whitespace-stripped string per sentence.
new_sents = [text.strip()]

In [106]:
# Reload the model from the file written by the joblib.dump cell.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
NER_tagger = joblib.load('E:\\corpus\\crf_model.pkl')
# Each element of new_sents is a plain string; word2features reads sent[i][0],
# which for a string is the character at position i itself.
sents_feature = [sent2features(sent) for sent in new_sents]
# NOTE(review): this rebinds y_pred, replacing the test-set predictions that
# the classification-report cell reads; re-running that cell afterwards would
# use these predictions instead.
y_pred = NER_tagger.predict(sents_feature)

In [109]:
# Flatten the predictions into one list of (character, predicted tag) pairs
# covering every sentence in new_sents, in order.
list_result = []
for sent, ner_tag in zip(new_sents, y_pred):
    # sent is a string, so zipping it with its tag sequence pairs each
    # character with the tag predicted for it.
    for word, tag in zip(sent, ner_tag):
        list_result.append((word,tag))

In [110]:
# Display the (character, predicted tag) pairs as the cell output.
list_result

[('人', 'S'),
 ('要', 'S'),
 ('自', 'B'),
 ('强', 'M'),
 ('不', 'M'),
 ('息', 'E'),
 ('，', 'S'),
 ('每', 'B'),
 ('天', 'E'),
 ('都', 'S'),
 ('要', 'S'),
 ('有', 'S'),
 ('进', 'B'),
 ('步', 'E'),
 ('。', 'S')]