In [1]:
# Notebook configuration: pick which language's dataset the cells below use.
# NOTE(review): the star import hides the origin of helpers such as
# get_train_data / get_data_from_file (presumably defined in get_data) —
# consider importing them by name.
from get_data import *
language = "Chinese"  # interpolated into the data paths and the model file name
# Previously recorded scores per language (the Chinese value matches the
# micro-avg F1 logged in the evaluation cell; presumably the same metric for
# English — TODO confirm):
# English 0.8702
# Chinese  0.9521


In [2]:
import pycrfsuite
from sklearn_crfsuite import CRF


# sklearn-crfsuite estimator configured for L-BFGS training.
# NOTE(review): this object is not referenced by the cells visible here — the
# training cell builds its own pycrfsuite.Trainer with different settings
# (c1=1.0, max_iterations=50). Confirm whether it is still needed.
crfmodel = CRF(algorithm='lbfgs',
               c1=0.1,
               c2=1e-3,
               max_iterations=100,
               all_possible_transitions=False,
               keep_tempfiles=True)

# Define the feature function


def word2features(sent, i):
    """Return the list of string features for the token at position ``i``.

    ``sent`` is a sequence whose items carry the token text at index 0.
    The returned list holds the context-template features (``U00`` .. ``U09``)
    first, followed by surface features of the token itself and of its
    immediate left and right neighbours. ``'BOS'`` and ``'EOS'`` stand in for
    positions before the start and after the end of the sentence.
    """
    token = sent[i][0]

    # Surface features of the current token.
    surface = [
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
    ]

    def token_at(offset):
        # Text of the token `offset` positions away, or a boundary marker.
        position = i + offset
        if position < 0:
            return 'BOS'
        if position >= len(sent):
            return 'EOS'
        return sent[position][0]

    # Single-token templates; U01-U03 (offsets -1, 0, +1) are currently disabled.
    context = [
        "U00:{}".format(token_at(-2)),
        "U04:{}".format(token_at(2)),
    ]

    # Token-pair templates as (left offset, right offset), in emission order.
    pair_offsets = (
        (-2, 1),
        (-1, 0),
        (-1, 1),
        (0, 1),
        (1, 2),
    )
    left, right = pair_offsets[0]
    context.append("U05:{}/{}".format(token_at(left), token_at(right)))
    left, right = pair_offsets[1]
    context.append("U06:{}/{}".format(token_at(left), token_at(right)))
    left, right = pair_offsets[2]
    context.append("U07:{}/{}".format(token_at(left), token_at(right)))
    left, right = pair_offsets[3]
    context.append("U08:{}/{}".format(token_at(left), token_at(right)))
    left, right = pair_offsets[4]
    context.append("U09:{}/{}".format(token_at(left), token_at(right)))

    # Left neighbour, or a sentence-start marker for the first token.
    if i <= 0:
        surface.append('BOS')
    else:
        previous = sent[i - 1][0]
        surface += [
            '-1:word.lower=' + previous.lower(),
            '-1:word.istitle=%s' % previous.istitle(),
            '-1:word.isupper=%s' % previous.isupper(),
        ]

    # Right neighbour, or a sentence-end marker for the last token.
    if i >= len(sent) - 1:
        surface.append('EOS')
    else:
        following = sent[i + 1][0]
        surface += [
            '+1:word.lower=' + following.lower(),
            '+1:word.istitle=%s' % following.istitle(),
            '+1:word.isupper=%s' % following.isupper(),
        ]

    return context + surface

# Define the label accessor


def word2label(sent, i):
    """Return the label stored at index 1 of the ``i``-th item of ``sent``."""
    item = sent[i]
    return item[1]

# Extract features and labels


def sent2features(sent):
    """Return one feature list per token of ``sent``, in sentence order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token


# Load the training sentences for the selected language.
train_data = get_train_data(language)

# Preview the features generated for the first training sentence
# (the last expression of the cell is what the notebook displays).
sent2features(train_data[0])


[['U00:BOS',
  'U04:上',
  'U05:BOS/任',
  'U06:BOS/现',
  'U07:BOS/任',
  'U08:现/任',
  'U09:任/上',
  'word.lower=现',
  'word[-3:]=现',
  'word[-2:]=现',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'BOS',
  '+1:word.lower=任',
  '+1:word.istitle=False',
  '+1:word.isupper=False'],
 ['U00:BOS',
  'U04:海',
  'U05:BOS/上',
  'U06:现/任',
  'U07:现/上',
  'U08:任/上',
  'U09:上/海',
  'word.lower=任',
  'word[-3:]=任',
  'word[-2:]=任',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  '-1:word.lower=现',
  '-1:word.istitle=False',
  '-1:word.isupper=False',
  '+1:word.lower=上',
  '+1:word.istitle=False',
  '+1:word.isupper=False'],
 ['U00:现',
  'U04:大',
  'U05:现/海',
  'U06:任/上',
  'U07:任/海',
  'U08:上/海',
  'U09:海/大',
  'word.lower=上',
  'word[-3:]=上',
  'word[-2:]=上',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  '-1:word.lower=任',
  '-1:word.istitle=False',
  '-1:word.isupper=False',
  '+1:word.lower=海',
  '+1:word.istitle=Fals

In [3]:

def sent2labels(sent):
    """Return the label of every token of ``sent``, in sentence order."""
    labels = []
    for position in range(len(sent)):
        labels.append(word2label(sent, position))
    return labels


# Create the CRF trainer (L-BFGS algorithm, verbose training log)
trainer = pycrfsuite.Trainer(algorithm='lbfgs', verbose=True)

# Add the data: one (feature sequence, label sequence) pair per training sentence
for sent in train_data:
    trainer.append(sent2features(sent), sent2labels(sent))

# Set the training parameters
# NOTE(review): these differ from the sklearn `crfmodel` settings defined in
# the earlier cell (c1=0.1, max_iterations=100) — confirm which are intended.
trainer.set_params({
    'c1': 1.0,
    'c2': 1e-3,
    'max_iterations': 50,
    'feature.possible_states': True,
    'feature.possible_transitions': True
})

# Train the model; it is written to a file named after the selected language
model_file = '{}_ner_model.crfsuite'.format(language)
trainer.train(model_file)


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 1
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 3595984
Seconds required: 15.020

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 286086.760208
Feature norm: 1.000000
Error norm: 97189.210788
Active features: 205178
Line search trials: 1
Line search step: 0.000007
Seconds required for this iteration: 0.589

***** Iteration #2 *****
Loss: 231523.726951
Feature norm: 3.291965
Error norm: 112726.534441
Active features: 189770
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.302

***** Iteration #3 *****
Loss: 223017.730955
Feature norm: 3.316730
Error norm: 193417.072288
Active features: 194656
Line search trials: 1
Line search step: 1.000000
Seconds re

In [4]:
# Load the model
from check import check


def evluate(language, example_path, model_file):
    """Tag every sentence of ``example_path`` and report the results.

    Loads the CRF model stored in ``model_file``, predicts a label sequence
    for each sentence returned by ``get_data_from_file(example_path)`` and
    writes the predictions to ``example_data/my_path_crf_<language>.txt`` as
    one ``token label`` pair per line, with a blank line after each sentence.
    It then prints the fraction of sentences whose whole label sequence was
    predicted exactly and calls ``check`` to compare the prediction file with
    the gold file.

    Args:
        language: Language name; used in the prediction file name and passed
            on to ``check``.
        example_path: Path of the labelled file to tag; also used as the gold
            file handed to ``check``.
        model_file: Path of the trained CRFsuite model to load.

    Returns:
        None. Results are printed and written to the prediction file.
    """
    pred_path = "example_data/my_path_crf_{}.txt".format(language)
    # Read the data before opening the model so a read failure cannot leave
    # the tagger open.
    valid_data = get_data_from_file(example_path)

    tagger = pycrfsuite.Tagger()
    tagger.open(model_file)
    exact_matches = 0
    try:
        # NOTE(review): the file is written with the platform default encoding;
        # left unchanged because the reader inside check() is not visible here.
        with open(pred_path, "w") as f:
            for test in valid_data:
                pred = tagger.tag(sent2features(test))
                # A sentence counts only when every label is right.
                if list(pred) == sent2labels(test):
                    exact_matches += 1
                for i in range(len(test)):
                    f.write(test[i][0] + ' ' + pred[i] + '\n')
                f.write('\n')
    finally:
        # Release the model even if tagging or writing fails.
        tagger.close()

    total = len(valid_data)
    # Report 0.0 for an empty dataset instead of raising ZeroDivisionError.
    print(exact_matches / total if total else 0.0)
    # Compare against the file that was actually tagged; the previous code
    # ignored example_path and always used "<language>/validation.txt".
    check(language, example_path, pred_path)


# Recorded validation results: the sentence-level exact-match ratio printed by
# evluate(), followed by the average rows of the check() report.
# Chinese
# 0.8506493506493507
#   micro avg     0.9415    0.9525    0.9470      8437
#   macro avg     0.7222    0.7329    0.7270      8437
#   weighted avg     0.9418    0.9525    0.9470      8437
# After adding the -2 / +2 features
# 0.8593073593073594
#   micro avg     0.9515    0.9528    0.9521      8437
#   macro avg     0.7217    0.7350    0.7277      8437
#   weighted avg     0.9517    0.9528    0.9521      8437
evluate(language, "{}/validation.txt".format(language), model_file)


0.8593073593073594
              precision    recall  f1-score   support

      B-NAME     0.9902    0.9902    0.9902       102
      M-NAME     0.9610    0.9867    0.9737        75
      E-NAME     0.9804    0.9804    0.9804       102
      S-NAME     1.0000    1.0000    1.0000         8
      B-CONT     1.0000    1.0000    1.0000        33
      M-CONT     1.0000    1.0000    1.0000        64
      E-CONT     1.0000    1.0000    1.0000        33
      S-CONT     0.0000    0.0000    0.0000         0
       B-EDU     0.9907    1.0000    0.9953       106
       M-EDU     0.9779    1.0000    0.9888       177
       E-EDU     0.9720    0.9811    0.9765       106
       S-EDU     0.0000    0.0000    0.0000         0
     B-TITLE     0.9297    0.9216    0.9257       689
     M-TITLE     0.9105    0.9283    0.9193      1479
     E-TITLE     0.9941    0.9855    0.9898       689
     S-TITLE     0.0000    0.0000    0.0000         0
       B-ORG     0.9665    0.9406    0.9534       522
       M