In [1]:
import extract_features as ef

import sklearn_crfsuite

from collections import Counter
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

The code in this notebook is based on: https://github.com/TeamHG-Memex/sklearn-crfsuite/blob/master/docs/CoNLL2002.ipynb

In [2]:
def _token_shape(token, is_first):
    """Return the capitalisation-shape label of ``token``.

    ``is_first`` tells whether the token sits on the first line of its
    sentence block; it only matters for title-cased tokens.
    """
    if token.istitle():
        return 'upcase_BOS' if is_first else 'upcase_IN'
    if token.islower():
        return 'lowcase'
    if token.isupper():
        return 'all_caps'
    return 'other'


def crf_feat(infilepath):
    """
    Read a space-separated, blank-line-delimited annotation file and build
    per-token feature dicts plus the matching NER tags.

    The file is split into sentence blocks on blank lines. A block is skipped
    entirely when it equals the ``-DOCSTART- -X- O O`` marker or contains a
    double-quote character. Inside a block, every line with at least four
    space-separated fields yields one token: field 0 is the token and field 3
    is the raw tag, which is passed through ``ef.clean_ner_tags``. Lines with
    fewer fields are ignored.

    Each token is described by a dict with two keys:
      * ``'lemma'`` - result of ``ef.lemmatizer(token)``
      * ``'shape'`` - one of ``'upcase_BOS'``, ``'upcase_IN'``, ``'lowcase'``,
        ``'all_caps'`` or ``'other'``

    Blocks that produce no token at all (for example the empty block caused by
    a trailing newline) are not emitted, so the returned lists never contain
    empty sentences.

    Parameters
    ----------
    infilepath : str
        Path of the annotation file to read.

    Returns
    -------
    tuple
        ``(sentences, ner_tags)`` where ``sentences`` is a list of lists of
        feature dicts and ``ner_tags`` is a parallel list of lists of tags.
    """

    with open(infilepath, 'r') as f:
        text = f.read()

    sentences = list()
    ner_tags = list()

    for block in text.split('\n\n'):
        if block == '-DOCSTART- -X- O O' or '"' in block:
            continue

        sentence = list()
        sent_ner_tags = list()

        # idx counts every line of the block (including malformed ones), so
        # only a token on the very first line can be tagged 'upcase_BOS'.
        for idx, item in enumerate(block.split('\n')):
            fields = item.split(' ')
            if len(fields) < 4:
                continue

            token = fields[0]
            ner_tag = ef.clean_ner_tags(fields[3])
            lemma = ef.lemmatizer(token)

            sentence.append({
                'lemma': lemma,
                'shape': _token_shape(token, idx == 0),
            })
            sent_ner_tags.append(ner_tag)

        # Drop blocks without any usable token instead of returning empty
        # sentences that every caller would have to filter out again.
        if sentence:
            sentences.append(sentence)
            ner_tags.append(sent_ner_tags)

    return sentences, ner_tags

In [3]:
# prep train and test data

train_sents, train_tags = crf_feat('data/train_reuters.en')

test_sents, test_tags = crf_feat('data/test.conll')

# Remove empty sentences together with their tag lists.
# The lists are rebuilt rather than popped from during iteration: popping
# while enumerating shifts the remaining items left, so the element right
# after every removed one is never inspected and consecutive empty
# sentences would survive.

train_pairs = [(sent, tags) for sent, tags in zip(train_sents, train_tags) if sent]
train_sents = [sent for sent, _ in train_pairs]
train_tags = [tags for _, tags in train_pairs]

test_pairs = [(sent, tags) for sent, tags in zip(test_sents, test_tags) if sent]
test_sents = [sent for sent, _ in test_pairs]
test_tags = [tags for _, tags in test_pairs]

In [6]:
# Expose the prepared data under the conventional X / y names.
# The format needs to be as follows:
#   X - list of lists of dicts   (one dict per token, one inner list per sentence)
#   y - list of lists of strings (one tag per token, parallel to X)

X_train, y_train = train_sents, train_tags

X_test, y_test = test_sents, test_tags

In [7]:
# train CRF model

# Hyper-parameters are collected in one place so they are easy to tweak.
# Per the sklearn-crfsuite documentation, c1 / c2 are the L1 / L2
# regularisation coefficients used by the L-BFGS trainer.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)

crf = sklearn_crfsuite.CRF(**crf_params)

# Left as the last expression so the fitted estimator's repr is displayed.
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [8]:
# training F1-score
# NOTE: no `labels=` filter is passed, so the weighted average is taken over
# every tag that occurs, including 'O'.

y_pred_crf_train = crf.predict(X_train)

train_f1 = metrics.flat_f1_score(y_train, y_pred_crf_train, average='weighted')
print(f"Training F1-score: {train_f1.round(2)}")

Training F1-score: 0.99


In [9]:
# testing F1-score
# NOTE: no `labels=` filter is passed, so the weighted average is taken over
# every tag that occurs, including 'O'.

y_pred_crf = crf.predict(X_test)

test_f1 = metrics.flat_f1_score(y_test, y_pred_crf, average='weighted')
print(f"Test F1-score: {test_f1.round(2)}")

Test F1-score: 0.97


In [10]:
# Per-label precision, recall and F1-score on the test set,
# printed with three decimal places.

test_report = metrics.flat_classification_report(y_test, y_pred_crf, digits=3)
print(test_report)

              precision    recall  f1-score   support

         LOC      0.930     0.835     0.880      1961
        MISC      0.924     0.792     0.853      1111
           O      0.988     0.993     0.990     34736
         ORG      0.807     0.784     0.796      1868
         PER      0.863     0.928     0.894      2810

   micro avg      0.967     0.967     0.967     42486
   macro avg      0.902     0.866     0.883     42486
weighted avg      0.967     0.967     0.967     42486



In [11]:
# show the transitions that the classifier learned

def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` row per transition.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs; rows are printed in the order given, with the weight shown to six
    decimal places.
    """
    row_template = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        source_label, target_label = transition
        print(row_template % (source_label, target_label, weight))

# Rank all learned transitions once, from highest to lowest weight,
# then show both ends of the ranking.
ranked_transitions = Counter(crf.transition_features_).most_common()

print("Top likely transitions:")
print_transitions(ranked_transitions[:20])

print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-20:])

Top likely transitions:
PER    -> PER     4.193654
MISC   -> MISC    3.971928
ORG    -> ORG     3.735996
ORG    -> O       2.152950
O      -> O       1.613371
MISC   -> O       1.496939
LOC    -> LOC     1.349455
PER    -> O       1.241341
LOC    -> O       1.120530
O      -> PER     0.825543
O      -> MISC    0.825199
MISC   -> PER     0.333931
O      -> LOC     -0.304484
O      -> ORG     -0.590443
ORG    -> MISC    -0.737057
LOC    -> MISC    -0.942692
ORG    -> PER     -1.333312
MISC   -> ORG     -1.787679
PER    -> MISC    -2.751714
MISC   -> LOC     -2.808083

Top unlikely transitions:
MISC   -> O       1.496939
LOC    -> LOC     1.349455
PER    -> O       1.241341
LOC    -> O       1.120530
O      -> PER     0.825543
O      -> MISC    0.825199
MISC   -> PER     0.333931
O      -> LOC     -0.304484
O      -> ORG     -0.590443
ORG    -> MISC    -0.737057
LOC    -> MISC    -0.942692
ORG    -> PER     -1.333312
MISC   -> ORG     -1.787679
PER    -> MISC    -2.751714
MISC   -> LOC   