In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('ner_dataset.csv', encoding = "ISO-8859-1")
df = df[:100000]
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
df.isnull().sum()

Sentence #    95456
Word              0
POS               0
Tag               0
dtype: int64

In [4]:
df = df.fillna(method='ffill')

We have 4,544 sentences that contain 10,922 unique words and tagged by 17 tags.

In [5]:
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

(4544, 10922, 17)

In [10]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [11]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-art,75
1,B-eve,53
2,B-geo,3303
3,B-gpe,1740
4,B-nat,30
5,B-org,1876
6,B-per,1668
7,B-tim,1823
8,I-art,43
9,I-eve,47


In [6]:
X = df.drop('Tag', axis=1)
X.head()

Unnamed: 0,Sentence #,Word,POS
0,Sentence: 1,Thousands,NNS
1,Sentence: 1,of,IN
2,Sentence: 1,demonstrators,NNS
3,Sentence: 1,have,VBP
4,Sentence: 1,marched,VBN


In [7]:
X.columns

Index(['Sentence #', 'Word', 'POS'], dtype='object')

In [8]:
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
X.shape

(100000, 15507)

In [9]:
y = df.Tag.values

In [10]:
classes = np.unique(y)

In [11]:
classes = classes.tolist()
classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O']

In [12]:
X.shape, y.shape

((100000, 15507), (100000,))

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

In [14]:
X_train.shape, y_train.shape

((67000, 15507), (67000,))

### Perceptron

In [15]:
new_classes = classes.copy()
new_classes.pop()
new_classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim']

In [16]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

-- Epoch 1-- Epoch 1-- Epoch 1
-- Epoch 1


Norm: 11.09, NNZs: 103, Bias: -3.000000, T: 67000, Avg. loss: 0.001134Norm: 50.70, NNZs: 1350, Bias: -4.000000, T: 67000, Avg. loss: 0.014970Norm: 68.07, NNZs: 2574, Bias: -3.000000, T: 67000, Avg. loss: 0.042104
Total training time: 2.52 seconds.

Total training time: 2.50 seconds.

Total training time: 2.49 seconds.
Norm: 13.53, NNZs: 159, Bias: -3.000000, T: 67000, Avg. loss: 0.001701
Total training time: 2.53 seconds.
-- Epoch 1-- Epoch 1
-- Epoch 1

-- Epoch 1
Norm: 8.37, NNZs: 59, Bias: -2.000000, T: 67000, Avg. loss: 0.000522
Total training time: 1.77 seconds.
-- Epoch 1
Norm: 45.00, NNZs: 1164, Bias: -3.000000, T: 67000, Avg. loss: 0.017567
Total training time: 1.79 seconds.
-- Epoch 1
Norm: 48.33, NNZs: 1679, Bias: -4.000000, T: 67000, Avg. loss: 0.022507
Total training time: 1.79 seconds.
-- Epoch 1
Norm: 57.04, NNZs: 2028, Bias: -5.000000, T: 67000, Avg. loss: 0.034493
Total training time: 1.85 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.5s


Norm: 10.15, NNZs: 83, Bias: -3.000000, T: 67000, Avg. loss: 0.000806
Total training time: 1.68 seconds.
-- Epoch 1
Norm: 10.30, NNZs: 92, Bias: -2.000000, T: 67000, Avg. loss: 0.001030
Total training time: 1.69 seconds.
-- Epoch 1
Norm: 34.35, NNZs: 811, Bias: -4.000000, T: 67000, Avg. loss: 0.011851
Total training time: 1.71 seconds.
-- Epoch 1
Norm: 10.72, NNZs: 93, Bias: -3.000000, T: 67000, Avg. loss: 0.001194
Total training time: 1.68 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  12 out of  17 | elapsed:    6.2s remaining:    2.5s


Norm: 6.40, NNZs: 33, Bias: -3.000000, T: 67000, Avg. loss: 0.000194
Total training time: 1.94 seconds.
-- Epoch 1
Norm: 52.48, NNZs: 1692, Bias: -4.000000, T: 67000, Avg. loss: 0.025776
Total training time: 2.02 seconds.
Norm: 31.94, NNZs: 698, Bias: -4.000000, T: 67000, Avg. loss: 0.011791
Total training time: 2.01 seconds.
Norm: 60.29, NNZs: 2085, Bias: -5.000000, T: 67000, Avg. loss: 0.026746
Total training time: 2.09 seconds.


[Parallel(n_jobs=-1)]: Done  14 out of  17 | elapsed:    8.2s remaining:    1.7s


Norm: 73.60, NNZs: 2820, Bias: 3.000000, T: 67000, Avg. loss: 0.048672
Total training time: 1.48 seconds.


[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:    9.6s finished


Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      max_iter=5, n_iter=None, n_jobs=-1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=10, warm_start=False)

In [17]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))

             precision    recall  f1-score   support

      B-art       0.15      0.12      0.14        24
      B-eve       0.46      0.32      0.37        19
      B-geo       0.42      0.91      0.57      1085
      B-gpe       0.89      0.78      0.83       556
      B-nat       0.11      0.25      0.15        12
      B-org       0.55      0.35      0.43       589
      B-per       0.72      0.43      0.53       564
      B-tim       0.65      0.78      0.71       611
      I-art       0.02      0.08      0.03        12
      I-eve       0.00      0.00      0.00        18
      I-geo       0.81      0.32      0.46       230
      I-gpe       0.00      0.00      0.00        14
      I-nat       0.50      0.50      0.50         2
      I-org       0.71      0.41      0.52       445
      I-per       0.76      0.20      0.32       591
      I-tim       0.26      0.05      0.09       194

avg / total       0.62      0.55      0.53      4966



  'precision', 'predicted', average, warn_for)


### Linear classifiers with SGD training

In [18]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [19]:
print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))

             precision    recall  f1-score   support

      B-art       0.33      0.12      0.18        24
      B-eve       0.67      0.11      0.18        19
      B-geo       0.76      0.66      0.71      1085
      B-gpe       0.86      0.63      0.73       556
      B-nat       0.67      0.33      0.44        12
      B-org       0.63      0.42      0.50       589
      B-per       0.61      0.55      0.58       564
      B-tim       0.79      0.62      0.70       611
      I-art       1.00      0.08      0.15        12
      I-eve       0.00      0.00      0.00        18
      I-geo       0.82      0.39      0.53       230
      I-gpe       0.50      0.07      0.12        14
      I-nat       0.00      0.00      0.00         2
      I-org       0.36      0.68      0.47       445
      I-per       0.59      0.67      0.63       591
      I-tim       1.00      0.01      0.02       194

avg / total       0.69      0.56      0.59      4966



  'precision', 'predicted', average, warn_for)


### Naive Bayes classifier for multinomial models

In [20]:
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [21]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = new_classes))

             precision    recall  f1-score   support

      B-art       0.06      0.17      0.09        24
      B-eve       0.33      0.37      0.35        19
      B-geo       0.70      0.63      0.66      1085
      B-gpe       0.70      0.83      0.76       556
      B-nat       0.35      0.50      0.41        12
      B-org       0.41      0.44      0.43       589
      B-per       0.44      0.47      0.46       564
      B-tim       0.56      0.61      0.59       611
      I-art       0.07      0.08      0.08        12
      I-eve       0.46      0.33      0.39        18
      I-geo       0.40      0.52      0.46       230
      I-gpe       0.13      0.14      0.14        14
      I-nat       0.00      0.00      0.00         2
      I-org       0.50      0.51      0.51       445
      I-per       0.53      0.50      0.51       591
      I-tim       0.17      0.27      0.21       194

avg / total       0.54      0.56      0.54      4966



### Passive Aggressive Classifier

In [22]:
pa =PassiveAggressiveClassifier()
pa.partial_fit(X_train, y_train, classes)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              fit_intercept=True, loss='hinge', max_iter=None, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=None,
              verbose=0, warm_start=False)

In [23]:
print(classification_report(y_pred=pa.predict(X_test), y_true=y_test, labels=new_classes))

             precision    recall  f1-score   support

      B-art       0.20      0.04      0.07        24
      B-eve       0.36      0.26      0.30        19
      B-geo       0.70      0.65      0.67      1085
      B-gpe       0.60      0.85      0.70       556
      B-nat       0.20      0.67      0.31        12
      B-org       0.70      0.32      0.44       589
      B-per       0.57      0.56      0.57       564
      B-tim       0.84      0.62      0.71       611
      I-art       0.02      0.50      0.04        12
      I-eve       0.83      0.28      0.42        18
      I-geo       0.50      0.62      0.55       230
      I-gpe       0.40      0.43      0.41        14
      I-nat       0.20      0.50      0.29         2
      I-org       0.78      0.28      0.41       445
      I-per       0.63      0.64      0.63       591
      I-tim       0.21      0.32      0.25       194

avg / total       0.65      0.56      0.58      4966



### Conditional Random Fields (CRFs)

In [24]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

#### Get sentences

In [25]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [26]:
getter = SentenceGetter(df)

In [28]:
sent = getter.get_next()
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [29]:
sentences = getter.sentences

#### Features extraction

Next, we extract more features (word parts, simplified POS tags, lower/title/upper flags, features of nearby words) and convert them to sklear-crfsuite format - each sentence should be converted to a list of dicts.

In [31]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

The above code were taken from sklearn-crfsuite official site.

Split train and test sets.

In [32]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [34]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [35]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)

  'precision', 'predicted', average, warn_for)


0.7842087494747214

In [36]:
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

      B-art       1.00      0.03      0.07        29
      B-eve       0.86      0.25      0.39        24
      B-geo       0.75      0.88      0.81      1043
      B-gpe       0.89      0.78      0.83       588
      B-nat       0.67      0.20      0.31        10
      B-org       0.75      0.64      0.69       649
      B-per       0.81      0.81      0.81       546
      B-tim       0.90      0.85      0.87       589
      I-art       0.00      0.00      0.00         7
      I-eve       0.57      0.22      0.32        18
      I-geo       0.71      0.71      0.71       204
      I-gpe       0.47      0.53      0.50        17
      I-nat       1.00      0.50      0.67         2
      I-org       0.78      0.73      0.76       545
      I-per       0.80      0.90      0.85       574
      I-tim       0.79      0.68      0.73       185

avg / total       0.80      0.78      0.78      5030



In [40]:
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=new_classes)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)



Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 17.0min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
          fit_params={}, iid=True, n_iter=50, n_jobs=-1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000022C766AD048>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000022C76751978>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim']),
   

In [41]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.0036898984638244928, 'c2': 0.11585183551331574}
best CV score: 0.7737211773297741
model size: 1.30M


In [45]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=new_classes))

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

      B-art       1.00      0.03      0.07        29
      B-eve       0.83      0.21      0.33        24
      B-geo       0.75      0.87      0.81      1043
      B-gpe       0.88      0.78      0.83       588
      B-nat       0.67      0.20      0.31        10
      B-org       0.74      0.63      0.68       649
      B-per       0.81      0.80      0.81       546
      B-tim       0.90      0.84      0.87       589
      I-art       0.00      0.00      0.00         7
      I-eve       0.67      0.22      0.33        18
      I-geo       0.67      0.71      0.69       204
      I-gpe       0.39      0.53      0.45        17
      I-nat       1.00      0.50      0.67         2
      I-org       0.78      0.72      0.75       545
      I-per       0.81      0.89      0.85       574
      I-tim       0.79      0.66      0.72       185

avg / total       0.80      0.78      0.78      5030



In [46]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-geo  -> I-geo   6.007604
I-geo  -> I-geo   5.296245
B-art  -> I-art   4.951198
B-eve  -> I-eve   4.847021
I-tim  -> I-tim   4.789188
B-per  -> I-per   4.711716
I-art  -> I-art   4.664539
B-tim  -> I-tim   4.575079
B-org  -> I-org   4.456466
I-org  -> I-org   4.320635
I-per  -> I-per   4.039724
I-gpe  -> I-gpe   3.969627
I-eve  -> I-eve   3.968368
B-gpe  -> I-gpe   3.919860
O      -> O       3.465068
B-nat  -> I-nat   3.208265
O      -> B-per   2.057576
B-org  -> B-art   2.001540
I-nat  -> I-nat   1.919624
B-geo  -> B-tim   1.688412

Top unlikely transitions:
B-gpe  -> I-org   -1.848015
O      -> I-gpe   -1.856660
B-geo  -> I-gpe   -1.880598
I-per  -> I-org   -1.889957
B-geo  -> I-org   -1.947059
O      -> I-eve   -2.033728
B-gpe  -> I-geo   -2.151673
I-org  -> B-org   -2.177301
B-org  -> B-org   -2.258343
O      -> I-art   -2.325744
B-org  -> I-per   -2.332204
B-tim  -> B-tim   -2.447829
I-org  -> I-per   -2.455738
I-per  -> B-per   -3.094530
O      -> I-per  

It is very likely that the beginning of a geographical entity (B-geo) will be followed by a token inside geographical entity (I-geo), but transitions to inside of an organization name (I-org) from tokens with other labels are penalized hugely.

In [47]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
5.183603 B-tim    word[-3:]:day
4.699027 O        BOS
3.761687 O        bias
3.754395 I-tim    word[-3:]:day
3.593121 O        word.lower():kurdish
3.584948 O        word.lower():jewish
3.370614 B-per    word.lower():president
3.338913 B-org    word.lower():al-qaida
3.326234 B-tim    word.lower():thanksgiving
3.269326 B-tim    word[-2:]:ay
3.225759 O        word[-2:]:N1
3.171786 B-tim    +1:word.lower():year
3.119587 B-tim    word.lower():afternoon
3.118231 O        postag[:2]:VB
3.090609 B-org    -1:word.lower():telephoned
3.081398 B-org    word.lower():hamas
3.050390 B-gpe    word.istitle()
3.037740 B-tim    word[-2:]:0s
3.023566 B-gpe    word.lower():nepal
3.003322 B-gpe    word[-3:]:pal
2.998838 B-org    +1:word.lower():fought
2.997746 I-geo    +1:word.lower():town
2.995321 B-per    word.lower():obama
2.980474 B-geo    word.lower():mid-september
2.929354 B-geo    -1:word.lower():serb
2.924037 I-geo    +1:word.lower():achieved
2.921243 B-per    BOS
2.915479 B-tim    +1

Observations: 

1). __```5.183603 B-tim word[-3]:day```__
The model learns that if a nearby word was “day” then the token is likely a part of a Time indicator.

2). __```3.370614 B-per word.lower():president```__
The model learns that token "president" is likely to be at the beginning of a person name.

3). __```-3.521244 O postag:NNP```__
The model learns that proper nouns are often entities.

4). __```-3.087828 O word.isdigit()```__
Digits are likely entities.

5). __```-3.233526 O word.istitle()```__
TitleCased words are likely entities.

### ELI5

ELI5 is a Python package which helps to debug machine learning classifiers and explain their predictions. ELI5 allows to check weights of sklearn_crfsuite.CRF models.

In [52]:
import eli5

eli5.show_weights(crf, top=10)

From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,3.465,0.477,-2.326,0.973,-2.034,0.919,-4.235,0.506,-1.857,0.049,-1.256,0.794,-4.544,2.058,-3.123,1.417,-4.153
B-art,-0.876,-0.023,4.951,-0.003,-0.101,-0.373,-0.232,-0.373,-0.251,-0.008,-0.08,0.606,-0.601,-0.816,-0.784,-0.669,-0.324
I-art,-0.986,-0.279,4.665,-0.014,-0.086,0.336,-0.262,-0.272,-0.089,-0.008,-0.066,-0.44,-0.52,-0.747,-0.563,0.093,-0.399
B-eve,-0.533,-0.006,-0.077,-0.022,4.847,-0.234,-0.219,-0.328,-0.177,0.0,-0.04,-0.479,-0.504,-0.844,-0.409,-0.656,-0.515
I-eve,-0.333,0.0,-0.034,-0.653,3.968,-0.257,-0.193,-0.105,-0.059,-0.01,-0.009,-0.233,-0.272,-0.351,-0.387,-0.384,-0.177
B-geo,0.216,1.413,-1.024,-0.136,-0.695,-1.541,6.008,1.1,-1.881,-0.05,-0.502,-1.03,-1.947,-0.966,-1.813,1.688,-1.373
I-geo,-0.034,-0.048,-0.417,-0.029,-0.256,-1.011,5.296,-0.468,-0.719,-0.009,-0.147,-0.786,-1.018,-0.791,-0.642,1.238,-0.928
B-gpe,0.62,-0.255,-0.858,-0.278,-0.661,-0.184,-2.152,-3.169,3.92,-0.049,-0.296,0.951,-1.848,0.572,-1.357,-0.347,-0.987
I-gpe,-0.656,-0.163,-0.082,-0.01,-0.031,-0.007,-0.61,-0.624,3.97,0.0,-0.024,-0.377,-0.622,-0.619,-0.441,-0.684,-0.247
B-nat,-0.405,-0.001,-0.055,0.0,-0.042,-0.254,-0.109,-0.182,-0.068,-0.005,3.208,-0.255,-0.334,-0.55,-0.394,-0.231,-0.078

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16
+4.699,BOS,,,,,,,,,,,,,,,
+3.762,bias,,,,,,,,,,,,,,,
+3.593,word.lower():kurdish,,,,,,,,,,,,,,,
+3.585,word.lower():jewish,,,,,,,,,,,,,,,
+3.226,word[-2:]:N1,,,,,,,,,,,,,,,
+3.118,postag[:2]:VB,,,,,,,,,,,,,,,
… 6280 more positive …,… 6280 more positive …,,,,,,,,,,,,,,,
… 1789 more negative …,… 1789 more negative …,,,,,,,,,,,,,,,
-3.088,word.isdigit(),,,,,,,,,,,,,,,
-3.234,word.istitle(),,,,,,,,,,,,,,,

Weight?,Feature
+4.699,BOS
+3.762,bias
+3.593,word.lower():kurdish
+3.585,word.lower():jewish
+3.226,word[-2:]:N1
+3.118,postag[:2]:VB
… 6280 more positive …,… 6280 more positive …
… 1789 more negative …,… 1789 more negative …
-3.088,word.isdigit()
-3.234,word.istitle()

Weight?,Feature
+2.250,word.lower():twitter
+2.203,word.lower():english
+2.038,-1:word.lower():tamilnet
+1.649,word.lower():dodge
+1.623,word.lower():jeep
+1.562,-1:word.lower():newspaper
+1.536,-1:word.lower():unlike
+1.528,word[-3:]:eep
+1.515,word[-2:]:ep
+1.511,-1:word.lower():either

Weight?,Feature
+1.221,word.lower():constitution
+1.220,+1:word.lower():airport
+1.076,-1:word.lower():magazine
+1.075,word[-2:]:Us
+1.075,word[-3:]:Us
+1.048,word.lower():us
+1.028,+1:word.lower():newspaper
+0.988,word[-2:]:le
+0.961,+1:word.lower():would
+0.951,word.lower():simple

Weight?,Feature
+2.018,-1:word.lower():war
+1.604,-1:word.lower():first
+1.530,-1:word.lower():celebrated
+1.523,word.lower():christmas
+1.456,+1:word.lower():get
+1.429,word.lower():games
+1.366,word[-3:]:mas
+1.347,word.lower():ii
+1.347,word[-3:]:II
+1.344,word[-2:]:II

Weight?,Feature
+1.133,word.lower():peace
+1.132,postag:NNPS
+1.122,-1:word.lower():korean
+1.111,word[-2:]:up
+1.030,word.lower():cup
+1.030,word[-3:]:Cup
+1.018,word[-3:]:ace
+0.991,word[-3:]:pen
+0.988,word.lower():open
+0.979,word[-2:]:rs

Weight?,Feature
+2.980,word.lower():mid-september
+2.929,-1:word.lower():serb
+2.882,word.lower():aswat
+2.623,word.lower():washington
+2.616,word.lower():china
+2.531,+1:word.lower():palestinian
+2.503,word.lower():zahedan
+2.451,word.lower():beijing
+2.441,word[-3:]:the
… 1699 more positive …,… 1699 more positive …

Weight?,Feature
+2.998,+1:word.lower():town
+2.924,+1:word.lower():achieved
+2.472,+1:word.lower():block
+2.221,+1:word.lower():produced
+2.129,-1:word.lower():tulkarem
+2.113,word.lower():settlement
+1.879,-1:word.lower():western
+1.805,+1:word.lower():regional
+1.781,+1:word.lower():about
+1.779,-1:word.lower():eastern

Weight?,Feature
+3.050,word.istitle()
+3.024,word.lower():nepal
+3.003,word[-3:]:pal
+2.784,+1:word.lower():mayor
+2.632,-1:word.lower():behind
+2.612,postag:NNS
+2.587,+1:word.lower():representative
+2.584,word[-3:]:ans
+2.560,+1:word.lower():unemployment
+2.502,+1:word.lower():if

Weight?,Feature
+2.781,+1:word.lower():began
+2.258,-1:word.lower():soviet
+1.976,+1:word.lower():health
+1.876,+1:word.lower():returned
+1.786,+1:word.lower():that
+1.785,word[-3:]:can
+1.519,+1:word.istitle()
+1.473,-1:word.lower():bosnian
+1.465,-1:word.lower():democratic
+1.394,+1:word.lower():countries

Weight?,Feature
+1.646,word.isupper()
+1.553,word.lower():h5n1
+1.553,word[-3:]:5N1
+1.532,+1:word.lower():toll
+1.522,word.lower():katrina
+1.492,word[-2:]:N1
+1.449,word.lower():marburg
+1.409,word[-3:]:urg
+1.408,+1:word.lower():form
+1.398,+1:word.lower():katrina

Weight?,Feature
+1.275,-1:word.lower():hurricane
+1.275,word.lower():katrina
+1.218,+1:word.lower():outbreak
+1.070,word[-3:]:ina
+1.055,word[-2:]:na
+0.841,-1:word.lower():jing
+0.826,word.lower():jing
+0.795,-1:postag:NNP
+0.763,word[-2:]:me
+0.726,-1:word.istitle()

Weight?,Feature
+3.339,word.lower():al-qaida
+3.091,-1:word.lower():telephoned
+3.081,word.lower():hamas
+2.999,+1:word.lower():fought
+2.899,-1:word.lower():brunei
+2.874,word.lower():parliament
+2.750,word[-3:]:ban
+2.677,+1:word.lower():recognizes
+2.663,+1:word.lower():influence
+2.640,+1:word.lower():assistant

Weight?,Feature
+2.207,+1:word.lower():mr.
+1.890,-1:word.lower():mediterranean
+1.766,+1:word.lower():will
+1.703,+1:word.lower():yorker
+1.692,word.lower():ministry
+1.685,-1:word.lower():munich
+1.656,word[-3:]:ate
+1.621,+1:word.lower():in
+1.580,word.lower():nations
… 1511 more positive …,… 1511 more positive …

Weight?,Feature
+3.371,word.lower():president
+2.995,word.lower():obama
+2.921,BOS
+2.818,word.lower():jupiter
+2.791,word.lower():prime
+2.687,word.lower():gotovina
+2.661,+1:word.lower():vladimir
+2.563,word.lower():bolton
+2.525,-1:word.lower():under
+2.415,+1:word.lower():president

Weight?,Feature
+1.742,+1:word.lower():david
+1.518,-1:postag:NN
+1.455,+1:word.lower():saad
+1.335,-1:word.lower():masjid
+1.283,word[-3:]:aad
… 1936 more positive …,… 1936 more positive …
… 263 more negative …,… 263 more negative …
-1.300,+1:word.lower():on
-1.310,-1:word.lower():sri
-1.454,word[-2:]:ka

Weight?,Feature
+5.184,word[-3:]:day
+3.326,word.lower():thanksgiving
+3.269,word[-2:]:ay
+3.172,+1:word.lower():year
+3.120,word.lower():afternoon
+3.038,word[-2:]:0s
+2.915,+1:word.lower():czech
+2.824,word[-3:]:ber
+2.819,word.lower():august
+2.809,+1:word.lower():years

Weight?,Feature
+3.754,word[-3:]:day
+2.461,word[-2:]:ay
+2.334,-1:word.lower():ceremonies
+2.279,+1:word.lower():moscow
+1.895,-1:word.lower():march
+1.894,-1:word.lower():anniversary
+1.853,word.lower():decades
+1.800,-1:word.lower():june
+1.769,+1:word.lower():rebel
+1.715,word.lower():quarter


It does make sense that I-entity must follow B-entity, such as I-geo follows B-geo, I-org follows B-org, I-per follows B-per, and so on. 

We can also see that it is not common in this dataset to have a person right after an organization name (B-org -> I-per has a large negative weight).

If we regularize CRF more, we can expect that only features which are generic will remain, and memoized tokens will go. Let’s check what effect does regularization have on CRF weights:

In [54]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=200,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)
eli5.show_weights(crf, top=10)

From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,1.782,0.0,0.0,0.0,0.0,1.456,0.0,0.303,0.0,0.0,0.0,0.791,0.0,0.062,0.0,1.709,0.0
B-art,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-art,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-eve,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-eve,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-geo,0.23,0.0,0.0,0.0,0.0,0.0,2.704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-geo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-gpe,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-gpe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-nat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
+3.691,bias,,,,,,,,,,,,,,,
+1.774,BOS,,,,,,,,,,,,,,,
+0.984,-1:postag[:2]:NN,,,,,,,,,,,,,,,
+0.435,postag[:2]:VB,,,,,,,,,,,,,,,
+0.218,EOS,,,,,,,,,,,,,,,
… 10 more positive …,… 10 more positive …,,,,,,,,,,,,,,,
… 1 more negative …,… 1 more negative …,,,,,,,,,,,,,,,
-0.467,postag:CD,,,,,,,,,,,,,,,
-0.467,postag[:2]:CD,,,,,,,,,,,,,,,
-1.472,word.isdigit(),,,,,,,,,,,,,,,

Weight?,Feature
+3.691,bias
+1.774,BOS
+0.984,-1:postag[:2]:NN
+0.435,postag[:2]:VB
+0.218,EOS
… 10 more positive …,… 10 more positive …
… 1 more negative …,… 1 more negative …
-0.467,postag:CD
-0.467,postag[:2]:CD
-1.472,word.isdigit()

Weight?,Feature
1.052,postag:NNP
0.534,word.istitle()
0.218,-1:postag:IN
0.218,-1:postag[:2]:IN
0.125,-1:word.lower():in
-0.289,-1:postag[:2]:NN

Weight?,Feature
0.267,-1:postag:NNP

Weight?,Feature
1.533,postag:JJ
1.506,postag[:2]:JJ
1.139,word.istitle()
0.549,word[-2:]:an
-0.033,postag:NNP

Weight?,Feature
0.806,postag:NNP
0.565,postag[:2]:NN
0.23,-1:postag[:2]:DT
0.23,-1:postag:DT
0.004,word.isupper()

Weight?,Feature
0.496,-1:postag:NNP
0.377,-1:word.istitle()
0.225,-1:postag[:2]:NN

Weight?,Feature
0.51,postag:NNP
0.438,+1:postag:NNP
0.308,+1:word.istitle()
0.075,postag[:2]:NN
0.022,word.istitle()
0.002,+1:postag[:2]:NN

Weight?,Feature
0.881,-1:postag:NNP
0.48,-1:postag[:2]:NN
0.404,-1:word.istitle()
0.196,postag:NNP

Weight?,Feature
1.74,word[-2:]:ay
1.657,word[-3:]:day
0.204,postag[:2]:CD
0.204,postag:CD
0.096,bias
0.033,-1:postag[:2]:IN
0.033,-1:postag:IN


In [55]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train);
eli5.show_weights(crf, top=5, show=['transition_features'])

From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,3.561,0.747,-2.344,0.928,-1.847,1.1,-4.541,0.523,-1.684,0.239,-1.025,0.795,-4.779,1.64,-3.017,1.39,-4.202
B-art,-0.556,0.0,5.467,0.0,0.0,-0.021,-0.085,-0.145,0.0,0.0,0.0,0.478,-0.328,-0.735,-0.394,-0.41,0.0
I-art,-0.74,0.0,5.438,0.0,0.0,0.509,-0.075,-0.079,0.0,0.0,0.0,-0.091,-0.315,-0.777,-0.246,0.19,-0.062
B-eve,-0.213,0.0,0.0,0.0,5.415,0.0,0.0,-0.12,0.0,0.0,0.0,-0.123,-0.319,-0.662,-0.397,-0.512,-0.193
I-eve,-0.239,0.0,0.0,-0.279,4.838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.013,-0.215,-0.002,-0.236,0.0
B-geo,0.31,1.357,-0.737,0.0,-0.498,-1.395,5.227,0.733,-1.392,0.0,-0.233,-0.973,-1.993,-1.367,-1.873,1.648,-1.25
I-geo,-0.01,0.0,-0.123,0.0,0.0,-0.88,4.457,-0.486,-0.606,0.0,0.0,-0.766,-0.944,-1.214,-0.579,1.036,-0.96
B-gpe,0.623,0.0,-0.581,-0.029,-0.427,-0.116,-2.327,-3.359,4.986,0.0,0.0,0.901,-1.945,0.294,-1.231,-0.431,-0.876
I-gpe,-0.21,0.0,0.0,0.0,0.0,0.24,-0.291,-0.233,4.878,0.0,0.0,-0.101,-0.352,-0.385,-0.263,-0.379,0.0
B-nat,-0.402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.694,0.0,-0.016,-0.48,-0.194,0.0,0.0


The model learned large negative weights for impossible transitions like O -> I-geo, O -> I-org and O -> I-tim, and so on.

In order to easy to read, we can check only a subset of tags.

In [58]:
eli5.show_weights(crf, top=10, targets=['O', 'B-org', 'I-per'])

From \ To,O,B-org,I-per
O,3.561,0.795,-3.017
B-org,0.086,-2.285,-2.309
I-per,-0.066,-1.147,3.898

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+4.720,BOS,
+4.236,bias,
+4.046,word.lower():jewish,
+3.494,word.lower():kurdish,
+3.435,word[-2:]:N1,
+3.031,+1:word.lower():minister,
… 1685 more positive …,… 1685 more positive …,
… 983 more negative …,… 983 more negative …,
-3.080,+1:word.lower():last,
-3.387,word.istitle(),

Weight?,Feature
+4.720,BOS
+4.236,bias
+4.046,word.lower():jewish
+3.494,word.lower():kurdish
+3.435,word[-2:]:N1
+3.031,+1:word.lower():minister
… 1685 more positive …,… 1685 more positive …
… 983 more negative …,… 983 more negative …
-3.080,+1:word.lower():last
-3.387,word.istitle()

Weight?,Feature
+3.603,word.lower():al-qaida
+3.340,word.lower():hamas
+3.270,word.lower():parliament
+3.041,-1:word.lower():telephoned
+2.991,-1:word.lower():brunei
+2.966,+1:word.lower():fought
+2.894,word[-3:]:ban
+2.754,-1:word.lower():extremist
+2.654,+1:word.lower():influence
+2.630,word.lower():westerners

Weight?,Feature
+1.788,+1:word.lower():david
+1.709,+1:word.lower():saad
+1.659,+1:word.lower():reports
+1.519,+1:word.lower():clinton
+1.491,-1:postag:NN
+1.482,word.lower():rice
+1.385,-1:word.lower():masjid
… 858 more positive …,… 858 more positive …
… 182 more negative …,… 182 more negative …
-1.365,bias


Or check only some of the features for all tags.

In [59]:
eli5.show_weights(crf, top=10, feature_re='^word\.is',
                  horizontal_layout=False, show=['targets'])

Weight?,Feature
-2.186,word.isupper()
-2.72,word.isdigit()
-3.387,word.istitle()

Weight?,Feature
0.151,word.istitle()
-0.214,word.isupper()

Weight?,Feature
0.607,word.istitle()
0.597,word.isdigit()

Weight?,Feature
1.185,word.isupper()
0.391,word.isdigit()
-0.2,word.istitle()

Weight?,Feature
0.919,word.isupper()
0.069,word.istitle()

Weight?,Feature
1.264,word.istitle()
-0.046,word.isupper()
-0.728,word.isdigit()

Weight?,Feature
0.726,word.istitle()
0.534,word.isdigit()
-0.0,word.isupper()

Weight?,Feature
2.97,word.istitle()
1.333,word.isupper()

Weight?,Feature
0.21,word.istitle()
-0.167,word.isupper()

Weight?,Feature
1.622,word.isupper()
-0.252,word.istitle()

Weight?,Feature
0.003,word.istitle()

Weight?,Feature
1.978,word.isupper()
0.0,word.istitle()
-0.804,word.isdigit()

Weight?,Feature
0.366,word.istitle()
0.021,word.isupper()
-0.443,word.isdigit()

Weight?,Feature
0.146,word.istitle()
-0.098,word.isdigit()
-1.003,word.isupper()

Weight?,Feature
0.208,word.istitle()
-0.02,word.isdigit()
-0.391,word.isupper()

Weight?,Feature
2.573,word.isdigit()
-0.435,word.istitle()
-1.133,word.isupper()

Weight?,Feature
1.978,word.isdigit()
-0.286,word.isupper()
-1.304,word.istitle()
