In [5]:
# Named entity recognition (NER) with a CRF
# Load data
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
# Read the 'esp.train' / 'esp.testb' files of the CoNLL-2002 corpus through NLTK.
# Each sentence is a list of (token, POS tag, IOB entity label) triples, as the
# printed samples below show. The lazy corpus readers are materialised into
# lists so the sentences can be indexed and iterated more than once.
# (The former bare `nltk.corpus.conll2002.fileids()` call was dropped: its
# return value was discarded and never displayed, so it had no effect.)
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

# Show the first sentence of each split as a quick sanity check.
print("sample sents: ")
print("train sents: ", train_sents[0])
print("test sents: ", test_sents[0])

sample sents: 
train sents:  [('Melbourne', 'NP', 'B-LOC'), ('(', 'Fpa', 'O'), ('Australia', 'NP', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('25', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFE', 'NC', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]
test sents:  [('La', 'DA', 'B-LOC'), ('Coruña', 'NC', 'I-LOC'), (',', 'Fc', 'O'), ('23', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFECOM', 'NP', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]


In [6]:
# Extract per-token features from each sentence
def word2features(sent, i):
    """
    Build the feature dict describing the word at position ``i`` of ``sent``.

    :param sent: sentence, a sequence of items whose first two fields are the
        word and its POS tag (further fields are not read here)
    :param i: index of the word inside ``sent``
    :return: dict of feature name -> value for that word; it also carries
        features of the previous and next word, or the ``BOS`` / ``EOS``
        flag when the word has no previous / next neighbour
    """
    current = sent[i]
    token, tag = current[0], current[1]

    # Features of the word itself.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = tag
    feats['postag[:2]'] = tag[:2]

    # Left context: previous word, or a beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        before = sent[i - 1]
        prev_token, prev_tag = before[0], before[1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context: next word, or an end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        after = sent[i + 1]
        next_token, next_tag = after[0], after[1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats


def sent2features(sent):
    """Return one feature dict per word of ``sent``, in sentence order."""
    per_word = []
    for position in range(len(sent)):
        per_word.append(word2features(sent, position))
    return per_word


def sent2labels(sent):
    """Return the label (third field) of every (token, postag, label) triple in ``sent``."""
    labels = []
    for _token, _postag, tag_label in sent:
        labels.append(tag_label)
    return labels


def sent2tokens(sent):
    """Return the token (first field) of every (token, postag, label) triple in ``sent``."""
    tokens = []
    for word, _postag, _label in sent:
        tokens.append(word)
    return tokens


# Convert every sentence into its per-word feature dicts (X) and its label
# sequence (y), separately for the training and the test split.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

# Show the feature dict of the first word of the first sentence in each split.
print("sample some train data: ")
print("a word of X_train one sent : ", X_train[0][0])
print("a word of X_test one sent: ", X_test[0][0])

sample some train data: 
a word of X_train one sent :  {'bias': 1.0, 'word.lower()': 'melbourne', 'word[-3:]': 'rne', 'word[-2:]': 'ne', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'NP', 'postag[:2]': 'NP', 'BOS': True, '+1:word.lower()': '(', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'Fpa', '+1:postag[:2]': 'Fp'}
a word of X_test one sent:  {'bias': 1.0, 'word.lower()': 'la', 'word[-3:]': 'La', 'word[-2:]': 'La', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'DA', 'postag[:2]': 'DA', 'BOS': True, '+1:word.lower()': 'coruña', '+1:word.istitle()': True, '+1:word.isupper()': False, '+1:postag': 'NC', '+1:postag[:2]': 'NC'}


In [7]:
# Train a CRF model for named entity recognition on the training features/labels.
crf = sklearn_crfsuite.CRF(
    # Training algorithm name passed to sklearn-crfsuite.
    algorithm='lbfgs',
    # c1 / c2: regularization coefficients (L1 / L2 according to the
    # sklearn-crfsuite API reference — confirm there before tuning).
    c1=0.1,
    c2=0.1,
    # Upper bound on the number of optimizer iterations.
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# Predict one label sequence per test sentence and print per-label
# precision / recall / F1 with three decimals.
# NOTE(review): in the recorded output the 'O' label accounts for 45355 of the
# 51533 tokens, so the "avg / total" row is dominated by 'O' and overstates
# entity-level quality; read the per-entity rows instead.
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred,  digits=3
))


             precision    recall  f1-score   support

      B-LOC      0.810     0.784     0.797      1084
     B-MISC      0.731     0.569     0.640       339
      B-ORG      0.807     0.832     0.820      1400
      B-PER      0.850     0.884     0.867       735
      I-LOC      0.690     0.637     0.662       325
     I-MISC      0.699     0.589     0.639       557
      I-ORG      0.852     0.786     0.818      1104
      I-PER      0.893     0.943     0.917       634
          O      0.992     0.997     0.994     45355

avg / total      0.970     0.971     0.971     51533



In [8]:
# Evaluation: predict labels for the test sentences and print the per-label
# precision / recall / F1 report with three decimals.
# NOTE(review): these statements repeat the prediction and report already run
# at the end of the training cell, so this cell prints the same table again.
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred,  digits=3
))



             precision    recall  f1-score   support

      B-LOC      0.810     0.784     0.797      1084
     B-MISC      0.731     0.569     0.640       339
      B-ORG      0.807     0.832     0.820      1400
      B-PER      0.850     0.884     0.867       735
      I-LOC      0.690     0.637     0.662       325
     I-MISC      0.699     0.589     0.639       557
      I-ORG      0.852     0.786     0.818      1104
      I-PER      0.893     0.943     0.917       634
          O      0.992     0.997     0.994     45355

avg / total      0.970     0.971     0.971     51533

