In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [3]:
nltk.download('conll2002')
nltk.corpus.conll2007.fileids()

[nltk_data] Downloading package conll2002 to /home/zhouh/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


['esp.test', 'esp.train', 'eus.test', 'eus.train']

## Let's use CoNLL 2002 data to build a NER system

CoNLL2002 corpus is available in NLTK. We use Spanish data.

In [4]:
nltk.corpus.conll2002.fileids()

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

In [5]:
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

In [6]:
train_sents[0]

[('Melbourne', 'NP', 'B-LOC'),
 ('(', 'Fpa', 'O'),
 ('Australia', 'NP', 'B-LOC'),
 (')', 'Fpt', 'O'),
 (',', 'Fc', 'O'),
 ('25', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFE', 'NC', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

## Features

Next, define some features. In this example we use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used. 

This makes a simple baseline, but you certainly can add and remove some features to get (much?) better results - experiment with it.

sklearn-crfsuite (and python-crfsuite) supports several feature formats; here we use feature dicts.

In [7]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,  # proportion of label
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [13]:
word_to_ix = {}
for sentence in train_sents:
    for token, postag, label in sentence:
        if token not in word_to_ix:
            word_to_ix[token] = len(word_to_ix)
            
word_to_ix

{'Melbourne': 0,
 '(': 1,
 'Australia': 2,
 ')': 3,
 ',': 4,
 '25': 5,
 'may': 6,
 'EFE': 7,
 '.': 8,
 '-': 9,
 'El': 10,
 'Abogado': 11,
 'General': 12,
 'del': 13,
 'Estado': 14,
 'Daryl': 15,
 'Williams': 16,
 'subrayó': 17,
 'hoy': 18,
 'la': 19,
 'necesidad': 20,
 'de': 21,
 'tomar': 22,
 'medidas': 23,
 'para': 24,
 'proteger': 25,
 'al': 26,
 'sistema': 27,
 'judicial': 28,
 'australiano': 29,
 'frente': 30,
 'a': 31,
 'una': 32,
 'página': 33,
 'internet': 34,
 'que': 35,
 'imposibilita': 36,
 'el': 37,
 'cumplimiento': 38,
 'los': 39,
 'principios': 40,
 'básicos': 41,
 'Ley': 42,
 'La': 43,
 'petición': 44,
 'tiene': 45,
 'lugar': 46,
 'después': 47,
 'un': 48,
 'juez': 49,
 'Tribunal': 50,
 'Supremo': 51,
 'estado': 52,
 'Victoria': 53,
 'se': 54,
 'viera': 55,
 'forzado': 56,
 'disolver': 57,
 'jurado': 58,
 'popular': 59,
 'y': 60,
 'suspender': 61,
 'proceso': 62,
 'ante': 63,
 'argumento': 64,
 'defensa': 65,
 'las': 66,
 'personas': 67,
 'lo': 68,
 'componían': 69,
 'po

In [20]:
sent2tokens(train_sents[0])

['Melbourne', '(', 'Australia', ')', ',', '25', 'may', '(', 'EFE', ')', '.']

This is what word2features extracts:

In [15]:
sent2features(train_sents[0])[0]

{'bias': 1.0,
 'word.lower()': 'melbourne',
 'word[-3:]': 'rne',
 'word[-2:]': 'ne',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'postag': 'NP',
 'postag[:2]': 'NP',
 'BOS': True,
 '+1:word.lower()': '(',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'Fpa',
 '+1:postag[:2]': 'Fp'}

Extract features from the data:

In [16]:

X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [19]:
y_train[0]

['B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O']

## Training

To see all possible CRF parameters check its docstring. Here we are useing L-BFGS training algorithm (it is default) with Elastic Net (L1 + L2) regularization.

In [21]:
?sklearn_crfsuite.CRF

In [22]:
# %%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)

crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [23]:
crf.attributes_

['bias',
 'word.lower():melbourne',
 'word[-3:]:rne',
 'word[-2:]:ne',
 'word.isupper()',
 'word.istitle()',
 'word.isdigit()',
 'postag:NP',
 'postag[:2]:NP',
 'BOS',
 '+1:word.lower():(',
 '+1:word.istitle()',
 '+1:word.isupper()',
 '+1:postag:Fpa',
 '+1:postag[:2]:Fp',
 'word.lower():(',
 'word[-3:]:(',
 'word[-2:]:(',
 'postag:Fpa',
 'postag[:2]:Fp',
 '-1:word.istitle()',
 '-1:word.isupper()',
 '-1:postag:NP',
 '-1:postag[:2]:NP',
 '+1:postag:NP',
 '+1:postag[:2]:NP',
 'word.lower():australia',
 'word[-3:]:lia',
 'word[-2:]:ia',
 '-1:word.lower():(',
 '-1:postag:Fpa',
 '-1:postag[:2]:Fp',
 '+1:word.lower():)',
 '+1:postag:Fpt',
 'word.lower():)',
 'word[-3:]:)',
 'word[-2:]:)',
 'postag:Fpt',
 '+1:word.lower():,',
 '+1:postag:Fc',
 '+1:postag[:2]:Fc',
 'word.lower():,',
 'word[-3:]:,',
 'word[-2:]:,',
 'postag:Fc',
 'postag[:2]:Fc',
 '-1:word.lower():)',
 '-1:postag:Fpt',
 '+1:word.lower():25',
 '+1:postag:Z',
 '+1:postag[:2]:Z',
 'word.lower():25',
 'word[-3:]:25',
 'word[-2:]:25'

## Evaluation

There is much more O entities in data set, but we're more interested in other entities. To account for this we'll use averaged F1 score computed for all labels except for O. ``sklearn-crfsuite.metrics`` package provides some useful metrics for sequence classification task, including this one.

In [25]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']

In [26]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

0.7964686316443963

Inspect per-class results in more detail:

In [27]:
# group B and I results
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.810     0.784     0.797      1084
       I-LOC      0.690     0.637     0.662       325
      B-MISC      0.731     0.569     0.640       339
      I-MISC      0.699     0.589     0.639       557
       B-ORG      0.807     0.832     0.820      1400
       I-ORG      0.852     0.786     0.818      1104
       B-PER      0.850     0.884     0.867       735
       I-PER      0.893     0.943     0.917       634

   micro avg      0.813     0.787     0.799      6178
   macro avg      0.791     0.753     0.770      6178
weighted avg      0.809     0.787     0.796      6178



## Hyperparameter Optimization

To improve quality try to select regularization parameters using randomized search and 3-fold cross-validation.

I takes quite a lot of CPU time and RAM (we're fitting a model ``50 * 3 = 150`` times), so grab a tea and be patient, or reduce n_iter in RandomizedSearchCV, or fit model only on a subset of training data.

In [28]:
## %%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3,   for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 41.9min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=None, c2=None,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None,...
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f97b78d0898>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f97b78d0a90>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_sco

Best result:

In [29]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.1626142476992158, 'c2': 0.00992258748099264}
best CV score: 0.749065332909574
model size: 1.15M


In [36]:
[s.parameters['c1'] for s in rs.]

TypeError: '_PredictScorer' object is not iterable

### Check parameter space

A chart which shows which ``c1`` and ``c2`` values have RandomizedSearchCV checked. Red color means better results, blue means worse.

In [30]:
_x = [s.parameters['c1'] for s in rs.]
_y = [s.parameters['c2'] for s in rs.grid_scores_]
_c = [s.mean_validation_score for s in rs.grid_scores_]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

AttributeError: 'RandomizedSearchCV' object has no attribute 'grid_scores_'

## Check best estimator on our test data

As you can see, quality is improved.

In [52]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

0.7944628836021157

In [37]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.800     0.785     0.792      1084
       I-LOC      0.685     0.662     0.673       325
      B-MISC      0.709     0.560     0.626       339
      I-MISC      0.698     0.592     0.641       557
       B-ORG      0.810     0.829     0.820      1400
       I-ORG      0.857     0.778     0.816      1104
       B-PER      0.845     0.880     0.862       735
       I-PER      0.887     0.943     0.914       634

   micro avg      0.809     0.785     0.797      6178
   macro avg      0.786     0.754     0.768      6178
weighted avg      0.806     0.785     0.794      6178



## Let's check what classifier learned

In [24]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-ORG  -> I-ORG   7.500912
I-ORG  -> I-ORG   7.206322
B-MISC -> I-MISC  6.833142
I-MISC -> I-MISC  6.753222
B-PER  -> I-PER   6.404557
B-LOC  -> I-LOC   5.696274
I-LOC  -> I-LOC   4.877422
I-PER  -> I-PER   4.709231
O      -> O       3.784430
O      -> B-ORG   2.754974
O      -> B-PER   2.549453
O      -> B-LOC   1.846099
O      -> B-MISC  1.804584
B-LOC  -> B-LOC   0.578393
B-ORG  -> O       0.325175
I-PER  -> B-LOC   0.300667
B-MISC -> B-ORG   0.298525
B-ORG  -> B-LOC   0.266688
B-LOC  -> B-PER   -0.046324
B-MISC -> O       -0.143646

Top unlikely transitions:
I-LOC  -> B-MISC  -1.976574
I-MISC -> I-PER   -2.008671
B-ORG  -> B-ORG   -2.107974
I-ORG  -> B-LOC   -2.199630
I-MISC -> B-LOC   -2.240108
I-ORG  -> I-PER   -2.272384
B-PER  -> B-MISC  -2.325289
I-PER  -> I-LOC   -2.455352
I-ORG  -> B-MISC  -2.486495
I-PER  -> B-ORG   -2.512129
I-ORG  -> I-LOC   -2.536158
I-MISC -> I-LOC   -2.557052
B-ORG  -> B-MISC  -2.581202
B-PER  -> B-PER   -2.825385
I-PER  -> B-MIS

We can see that, for example, it is very likely that the beginning of an organization name (B-ORG) will be followed by a token inside organization name (I-ORG), but transitions to I-ORG from tokens with other labels are penalized.

Check the state features:

In [41]:
crf.state_features_

{('bias', 'B-LOC'): -0.632968,
 ('bias', 'O'): 4.05394,
 ('bias', 'B-ORG'): -1.246552,
 ('bias', 'B-PER'): -0.596076,
 ('bias', 'I-PER'): -1.591598,
 ('bias', 'B-MISC'): -0.369917,
 ('bias', 'I-ORG'): -1.627245,
 ('bias', 'I-LOC'): -0.262979,
 ('bias', 'I-MISC'): -0.391216,
 ('word.lower():melbourne', 'B-LOC'): 1.685295,
 ('word.lower():melbourne', 'I-MISC'): 0.54326,
 ('word[-3:]:rne', 'B-LOC'): 0.551362,
 ('word[-3:]:rne', 'I-MISC'): 0.503875,
 ('word[-2:]:ne', 'B-LOC'): 0.008109,
 ('word[-2:]:ne', 'O'): 0.833204,
 ('word[-2:]:ne', 'B-ORG'): 0.06339,
 ('word[-2:]:ne', 'B-PER'): 1.048272,
 ('word.isupper()', 'B-LOC'): 1.00164,
 ('word.isupper()', 'O'): -6.248543,
 ('word.isupper()', 'B-ORG'): 5.178607,
 ('word.isupper()', 'B-PER'): 1.165861,
 ('word.isupper()', 'I-PER'): 0.274261,
 ('word.isupper()', 'B-MISC'): 3.698361,
 ('word.isupper()', 'I-ORG'): 3.294302,
 ('word.isupper()', 'I-LOC'): 0.34496,
 ('word.isupper()', 'I-MISC'): -0.078701,
 ('word.istitle()', 'B-LOC'): 2.538079,
 ('wo

In [25]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
9.810583 B-ORG    word.lower():efe-cantabria
8.587255 B-ORG    word.lower():psoe-progresistas
6.026318 I-ORG    -1:word.lower():l
4.902771 B-ORG    word.lower():xfera
4.896558 B-LOC    -1:word.lower():cantabria
4.867565 O        BOS
4.810829 B-LOC    word.lower():líbano
4.760313 B-ORG    word.lower():telefónica
4.723549 B-MISC   word.lower():justicia
4.674730 B-ORG    word[-2:]:-e
4.597672 B-MISC   word.lower():competencia
4.582394 O        word.lower():r.
4.582394 O        word[-3:]:R.
4.545455 B-MISC   word.lower():diversia
4.409233 B-ORG    word.lower():petrobras
4.277603 B-ORG    word.lower():coag-extremadura
4.261705 B-PER    -1:word.lower():según
4.229368 I-LOC    -1:word.lower():calle
4.223481 B-ORG    word.isupper()
4.189817 B-ORG    word.lower():esquerra
4.188726 B-PER    word.lower():valedor
4.156011 O        word.lower():b
4.156011 O        word[-3:]:B
4.156011 O        word[-2:]:B
4.150794 B-ORG    word.lower():terra
4.121745 B-ORG    -1:word.lower():distancia



Some observations:

   * **9.385823 B-ORG word.lower():psoe-progresistas** - the model remembered names of some entities - maybe it is overfit, or maybe our features are not adequate, or maybe remembering is indeed helpful;
   * **4.636151 I-LOC -1:word.lower():calle:** "calle" is a street in Spanish; model learns that if a previous word was "calle" then the token is likely a part of location;
   * **-5.632036 O word.isupper()**, **-8.215073 O word.istitle()** : UPPERCASED or TitleCased words are likely entities of some kind;
   * **-2.097561 O postag:NP** - proper nouns (NP is a proper noun in the Spanish tagset) are often entities.

What to do next

    * Load 'testa' Spanish data.
    * Use it to develop better features and to find best model parameters.
    * Apply the model to 'testb' data again.

The model in this notebook is just a starting point; you certainly can do better!

