In [3]:
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [6]:
# Download the example dataset (the 'conll2002' package) through NLTK
nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to
[nltk_data]     /Users/zebulonzhang/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [8]:
# Set up the training and test samples.
# Sentences are read from the 'esp.train' and 'esp.testb' files of the
# conll2002 corpus and materialized into lists.
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

In [68]:
# Keep the first training sentence as a sample
sent = train_sents[0]

In [9]:
def word2feature(sent, i):
    """Build the feature dictionary for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token records; element 0 of a record is the word
            and element 1 is its POS tag.
        i: Index of the token to describe.

    Returns:
        dict: Features of the token itself (constant bias, lower-cased form,
        last three / last two characters, upper-case / title-case / digit
        flags, POS tag and its first two characters), then the features of
        the previous token or a ``'BOS'`` flag when ``i`` is not greater than
        zero, then the features of the next token or an ``'EOS'`` flag when
        ``i`` is the last index.
    """
    token = sent[i][0]
    tag = sent[i][1]

    # Features describing the token itself.
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context: mark the beginning of the sentence (BOS) when there is
    # no previous token, otherwise describe the previous token.
    if i <= 0:
        feats['BOS'] = True
    else:
        left_token = sent[i - 1][0]
        left_tag = sent[i - 1][1]
        feats['-1:word.lower()'] = left_token.lower()
        feats['-1:word.istitle()'] = left_token.istitle()
        feats['-1:word.isupper()'] = left_token.isupper()
        feats['-1:postag'] = left_tag
        feats['-1:postag[:2]'] = left_tag[:2]

    # Right context: mark the end of the sentence (EOS) when there is no
    # next token, otherwise describe the next token.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right_token = sent[i + 1][0]
        right_tag = sent[i + 1][1]
        feats['+1:word.lower()'] = right_token.lower()
        feats['+1:word.istitle()'] = right_token.istitle()
        feats['+1:word.isupper()'] = right_token.isupper()
        feats['+1:postag'] = right_tag
        feats['+1:postag[:2]'] = right_tag[:2]

    return feats
def sent2features(sent):
    """Return one ``word2feature`` dict per position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2feature(sent, position))
    return per_token
def sent2labels(sent):
    """Return the third element (the label) of every 3-item record in ``sent``."""
    collected = []
    for _token, _postag, label in sent:
        collected.append(label)
    return collected
def sent2tokens(sent):
    """Return the first element (the token) of every 3-item record in ``sent``."""
    collected = []
    for token, _postag, _label in sent:
        collected.append(token)
    return collected

In [69]:
# Show the sample sentence as the cell output
sent

[('Melbourne', 'NP', 'B-LOC'),
 ('(', 'Fpa', 'O'),
 ('Australia', 'NP', 'B-LOC'),
 (')', 'Fpt', 'O'),
 (',', 'Fc', 'O'),
 ('25', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFE', 'NC', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

In [10]:
# Build the training set and the test set:
# X_* holds one list of per-token feature dicts per sentence,
# y_* holds the matching list of labels per sentence.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [102]:
# Show the feature dicts built for the first training sentence
X_train[0]

[{'bias': 1.0,
  'word.lower()': 'melbourne',
  'word[-3:]': 'rne',
  'word[-2:]': 'ne',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'NP',
  'postag[:2]': 'NP',
  'BOS': True,
  '+1:word.lower()': '(',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'Fpa',
  '+1:postag[:2]': 'Fp'},
 {'bias': 1.0,
  'word.lower()': '(',
  'word[-3:]': '(',
  'word[-2:]': '(',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'Fpa',
  'postag[:2]': 'Fp',
  '-1:word.lower()': 'melbourne',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:postag': 'NP',
  '-1:postag[:2]': 'NP',
  '+1:word.lower()': 'australia',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:postag': 'NP',
  '+1:postag[:2]': 'NP'},
 {'bias': 1.0,
  'word.lower()': 'australia',
  'word[-3:]': 'lia',
  'word[-2:]': 'ia',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': F

In [11]:
# Create the CRF model instance
# (per the sklearn-crfsuite docs, c1 / c2 are the L1 / L2 regularization coefficients)
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
# Train on the training features and labels
crf.fit(X_train,y_train)
# Class labels seen by the fitted model
labels = list(crf.classes_)
labels.remove('O') # We do not care about predictions for the 'O' label, so drop it
print(labels)
# Model prediction on the test set
y_pred = crf.predict(X_test)
# Compute the F1 score (weighted average, restricted to the labels kept above);
# as the last expression of the cell, its value is the cell output
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']


0.7964686316443963

In [30]:
# Print the model results grouped by B- and I- labels.
# The sort key orders labels by everything after the first character
# (e.g. '-LOC') and then by the first character, so the 'B-X' and 'I-X'
# labels of one entity type end up next to each other.
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
# NOTE(review): the per-label report below is commented out, so this cell
# currently prints nothing — reason not visible here; confirm before re-enabling.
# print(metrics.flat_classification_report(
#     y_test, y_pred,labels = sorted_labels,digits = 3
# ))