In [1]:
# ! mkdir -p conll03
# ! wget -q -O conll03/train.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/train.txt
# ! wget -q -O conll03/valid.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/valid.txt
# ! wget -q -O conll03/test.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/test.txt

In [2]:
! head -n 20 conll03/train.conll

-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
to TO B-VP O
boycott VB I-VP O
British JJ B-NP B-MISC
lamb NN I-NP O
. . O O

Peter NNP B-NP B-PER
Blackburn NNP I-NP I-PER

BRUSSELS NNP B-NP B-LOC
1996-08-22 CD I-NP O

The DT B-NP O
European NNP I-NP B-ORG


In [3]:
from spacy_crfsuite import CRFExtractor

# Configuration handed to the CRF extractor.
# "features" holds three feature lists; presumably one per position in a
# previous / current / next token window -- the -1 / 0 / 1 prefixes in the
# explain() output are consistent with that (confirm against the
# spacy_crfsuite documentation).
# "c1" / "c2" are presumably the L1 / L2 regularisation weights -- TODO confirm.
component_config = dict(
    features=[
        ["low", "title", "upper", "pos", "pos2"],
        [
            "low", "bias", "prefix5", "prefix2", "suffix5", "suffix3",
            "suffix2", "upper", "title", "digit", "pos", "pos2",
        ],
        ["low", "title", "upper", "pos", "pos2"],
    ],
    c1=0.03,
    c2=0.06,
)

# Build the extractor from the config; the bare name on the last line makes
# the notebook display its repr.
crf_extractor = CRFExtractor(component_config=component_config)
crf_extractor

<spacy_crfsuite.crf_extractor.CRFExtractor at 0x10f842518>

In [4]:
from tqdm.notebook import tqdm_notebook
from spacy_crfsuite import read_file, prepare_example

def read_examples(file, limit=None):
    """Read a file with ``read_file`` and convert each raw example for the CRF.

    Args:
        file: Path passed to ``read_file``; also used as the progress-bar
            description.
        limit: Optional cap on the number of raw examples to convert.
            ``None`` (the default) converts everything; ``0`` yields an
            empty list.

    Returns:
        list: One ``prepare_example`` result per raw example, in the order
        returned by ``read_file``.

    Note:
        Relies on the notebook-level ``crf_extractor`` created in an earlier
        cell.
    """
    raw_examples = read_file(file)
    # Compare against None explicitly: a plain truthiness test would treat
    # limit=0 as "no limit" and silently load the whole file.
    if limit is not None:
        raw_examples = raw_examples[:limit]
    return [
        prepare_example(raw_example, crf_extractor=crf_extractor)
        for raw_example in tqdm_notebook(raw_examples, desc=file)
    ]


# NOTE: `dev_examples` holds the *test* split (conll03/test.conll); the name is
# kept because the evaluation cell further down reads `dev_examples`.
train_examples = read_examples("conll03/train.conll")
dev_examples = read_examples("conll03/test.conll")

# Report the split sizes; the cell's recorded output shows these two lines,
# but no statement in the cell produced them.
print("train examples:", len(train_examples))
print("test examples:", len(dev_examples))

HBox(children=(FloatProgress(value=0.0, description='conll03/train.conll', max=14041.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='conll03/test.conll', max=3453.0, style=ProgressStyle(desc…


train examples: 14041
test examples: 3453


In [5]:
# Fit the CRF on the prepared training examples.
crf_extractor.train(train_examples)

# Print the model summary returned by explain(); in the recorded output it
# lists the most likely tag transitions and the top positive features.
print(crf_extractor.explain())

Most likely transitions:
U-ORG      -> U-ORG      4.760297
U-LOC      -> U-LOC      4.300355
U-MISC     -> U-MISC     4.216579
U-PER      -> U-PER      3.858093
O          -> O          2.211699
O          -> U-LOC      0.306021
U-PER      -> O          0.290876
U-ORG      -> O          0.235541
O          -> U-PER      0.210910
O          -> U-MISC     0.202401

Positive features:
6.972993 U-ORG      1:low:inc.
6.506540 O          0:prefix2:W1
6.305071 U-ORG      -1:low:v
5.906445 U-PER      0:prefix2:Mc
5.841444 O          -1:low:stadler
5.683671 U-ORG      1:low:arose
5.536084 U-ORG      -1:low:bj
5.160226 O          0:suffix3:day
5.067074 U-LOC      -1:low:lord
4.912943 U-MISC     1:low:vermonter


In [17]:
# Inspect one prepared training example (displayed as a list of CRFToken).
# NOTE(review): in the recorded output, multi-token entities show up as
# consecutive U-* tags (e.g. 'European' and 'Commission' are both U-ORG)
# rather than one span -- confirm this tag conversion is intended.
train_examples[3]

[CRFToken(text='The', tag='DT', entity='O', pattern={}, dense_features=[]),
 CRFToken(text='European', tag='NNP', entity='U-ORG', pattern={}, dense_features=[]),
 CRFToken(text='Commission', tag='NNP', entity='U-ORG', pattern={}, dense_features=[]),
 CRFToken(text='said', tag='VBD', entity='O', pattern={}, dense_features=[]),
 CRFToken(text='on', tag='IN', entity='O', pattern={}, dense_features=[]),
 CRFToken(text='Thursday', tag='NNP', entity='O', pattern={}, dense_features=[]),
 CRFToken(text='it', tag='PRP', entity='O', pattern={}, dense_features=[]),
 CRFToken(text='disagreed', tag='VBD', entity='O', pattern={}, dense_features=[]),
 CRFToken(text='with', tag='IN', entity='O', pattern={}, dense_features=[]),
 CRFToken(text='German', tag='JJ', entity='U-MISC', pattern={}, dense_features=[]),
 CRFToken(text='advice', tag='NN', entity='O', pattern={}, dense_features=[]),
 CRFToken(text='to', tag='TO', entity='O', pattern={}, dense_features=[]),
 CRFToken(text='consumers', tag='NNS', en

In [6]:
# Evaluate on `dev_examples` (loaded from conll03/test.conll above); eval()
# returns a pair that is unpacked into a score and a text report.
f1_score, classification_report = crf_extractor.eval(dev_examples)

# Show the per-label precision / recall / F1 table.
print(classification_report)



              precision    recall  f1-score   support

       U-LOC      0.848     0.875     0.861      1853
      U-MISC      0.776     0.762     0.769       905
       U-ORG      0.795     0.761     0.777      2462
       U-PER      0.880     0.891     0.886      2709

   micro avg      0.835     0.832     0.834      7929
   macro avg      0.825     0.822     0.823      7929
weighted avg      0.834     0.832     0.833      7929



In [32]:
import spacy

from spacy_crfsuite import CRFEntityExtractor

# Create an empty English pipeline and add the trained CRF as a component.
# NOTE(review): spacy.blank("en") has no tagger, while the CRF config lists
# "pos" / "pos2" features -- presumably those features are unavailable at
# inference time here; confirm, or use a pipeline that provides POS tags.
nlp = spacy.blank("en")
pipe = CRFEntityExtractor(nlp, crf_extractor=crf_extractor)
nlp.add_pipe(pipe)

# Run the pipeline on a sample sentence.
doc = nlp(
    "George Walker Bush (born July 6, 1946) is an American politician and businessman "
    "who served as the 43rd president of the United States from 2001 to 2009.")

# Print each predicted entity with its label.
# NOTE(review): the recorded output lists 'George', 'Walker', 'Bush' and
# 'United', 'States' as separate single-token entities -- verify that this
# is expected.
for ent in doc.ents:
    print(ent, ent.label_)

George PER
Walker PER
Bush PER
American MISC
United LOC
States LOC
