In [1]:
# Uncomment to download the CoNLL 2003 dataset
# 
# ! mkdir -p conll03
# ! wget -q -O conll03/train.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/train.txt
# ! wget -q -O conll03/valid.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/valid.txt
# ! wget -q -O conll03/test.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/test.txt

In [2]:
# Peek at the first 20 lines of the training split to check the file layout:
# one token per line with four space-separated columns (the last column holds
# the entity tag, e.g. B-ORG / I-PER / O), and blank lines between sentences.
! head -n 20 conll03/train.conll

-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
to TO B-VP O
boycott VB I-VP O
British JJ B-NP B-MISC
lamb NN I-NP O
. . O O

Peter NNP B-NP B-PER
Blackburn NNP I-NP I-PER

BRUSSELS NNP B-NP B-LOC
1996-08-22 CD I-NP O

The DT B-NP O
European NNP I-NP B-ORG


In [3]:
from spacy_crfsuite import CRFExtractor

# Feature templates for a three-token window. The outer "features" list holds
# one template per window position; the trained model's feature names are
# prefixed with -1: / 0: / 1: accordingly (previous, current, next token).
neighbour_features = ["low", "title", "upper", "pos", "pos2"]
current_features = [
    "low",
    "bias",
    "prefix5",
    "prefix2",
    "suffix5",
    "suffix3",
    "suffix2",
    "upper",
    "title",
    "digit",
    "pos",
    "pos2",
]

component_config = dict(
    # Flipped to True later, right before the extractor is wrapped for spaCy.
    BILOU_flag=False,
    # Independent copies per slot, so the three templates never alias each other.
    features=[
        list(neighbour_features),
        list(current_features),
        list(neighbour_features),
    ],
    # NOTE(review): presumably the CRF's L1 / L2 regularisation strengths —
    # confirm against the spacy_crfsuite / CRFsuite documentation.
    c1=0.03,
    c2=0.06,
)

crf_extractor = CRFExtractor(component_config=component_config)
crf_extractor

<spacy_crfsuite.crf_extractor.CRFExtractor at 0x10b0eb390>

In [4]:
from tqdm.notebook import tqdm_notebook
from spacy_crfsuite import read_file, prepare_example

def read_examples(file, limit=None):
    """Read a CoNLL file and convert each raw example for the CRF extractor.

    Args:
        file: Path of the file handed to ``read_file``; it is also used as the
            label of the progress bar.
        limit: Maximum number of raw examples to convert. ``None`` (default)
            converts everything; ``0`` produces an empty list.

    Returns:
        list: One ``prepare_example`` result per raw example, in the order
        returned by ``read_file``.

    Note:
        Uses the notebook-level ``crf_extractor``, so the cell that creates it
        must have been run first.
    """
    raw_examples = read_file(file)
    # Test against None instead of truthiness: with `if limit`, limit=0 was
    # treated as "no limit" and silently loaded the whole file.
    if limit is not None:
        raw_examples = raw_examples[:limit]
    return [
        prepare_example(raw_example, crf_extractor=crf_extractor)
        for raw_example in tqdm_notebook(raw_examples, desc=file)
    ]

# Load both splits in full (limit=None means "no cap").
train_examples = read_examples(file="conll03/train.conll", limit=None)
# NOTE(review): despite its name, dev_examples is read from the *test* split;
# the valid.conll file from the download cell is never used — confirm intent.
dev_examples = read_examples(file="conll03/test.conll", limit=None)

HBox(children=(FloatProgress(value=0.0, description='conll03/train.conll', max=14041.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='conll03/test.conll', max=3453.0, style=ProgressStyle(desc…




In [5]:
# Fit the CRF on the prepared training examples; the extractor object itself
# carries the learned model afterwards.
crf_extractor.train(train_examples)

# Text summary of the learned model (top transitions and feature weights).
model_explanation = crf_extractor.explain()
print(model_explanation)

Most likely transitions:
B-ORG      -> I-ORG      8.325343
B-PER      -> I-PER      8.274836
I-ORG      -> I-ORG      7.803342
I-MISC     -> I-MISC     7.773912
B-LOC      -> I-LOC      7.627518
B-MISC     -> I-MISC     7.384203
I-LOC      -> I-LOC      6.976467
I-PER      -> I-PER      5.985890
O          -> B-PER      4.049560
O          -> O          3.103193

Positive features:
6.239672 O          0:bias:bias
5.356058 O          0:suffix3:day
5.193756 B-ORG      -1:low:v
5.035596 B-ORG      1:low:arose
4.698496 B-PER      BOS
4.621891 O          0:prefix2:W1
4.572906 I-ORG      -1:low:bj
4.381915 O          BOS
4.320171 B-PER      0:prefix2:Mc
4.158817 I-ORG      -1:low:v


In [6]:
# eval() yields a pair: an overall F1 value and a per-label text report.
# Only the report is displayed below; f1_score stays available in the namespace.
evaluation = crf_extractor.eval(dev_examples)
f1_score, classification_report = evaluation

print(classification_report)



              precision    recall  f1-score   support

       B-LOC      0.853     0.880     0.867      1615
       I-LOC      0.757     0.706     0.730       238
      B-MISC      0.808     0.759     0.783       698
      I-MISC      0.652     0.662     0.657       207
       B-ORG      0.806     0.725     0.763      1644
       I-ORG      0.717     0.740     0.728       818
       B-PER      0.847     0.861     0.854      1592
       I-PER      0.886     0.955     0.919      1117

   micro avg      0.822     0.819     0.820      7929
   macro avg      0.791     0.786     0.788      7929
weighted avg      0.821     0.819     0.819      7929



In [7]:
import spacy

from spacy_crfsuite import CRFEntityExtractor

# Demo: plug the trained extractor into a blank English spaCy pipeline.
nlp = spacy.blank("en")

# Enable BILOU handling before wrapping the extractor. Per the original note
# this also suits IOB-tagged models; without it entities come out word-level.
# NOTE(review): this mutates the already-trained extractor, which was trained
# with BILOU_flag=False — the "Inconsistent BILOU tagging" messages in the
# output below come from this run.
crf_extractor.component_config["BILOU_flag"] = True

pipe = CRFEntityExtractor(nlp, crf_extractor=crf_extractor)
nlp.add_pipe(pipe)

sentence = (
    "George Walker Bush (born July 6, 1946) is an American politician and businessman "
    "who served as the 43rd president of the United States from 2001 to 2009."
)
doc = nlp(sentence)

# One line per detected entity: "<text> - <label>".
for ent in doc.ents:
    print(f"{ent} - {ent.label_}")

[38;5;4mℹ Inconsistent BILOU tagging found, B- tag, L- tag pair encloses
multiple entity classes.i.e. [B-a, I-b, L-a] instead of [B-a, I-a, L-a].
Assuming B- class is correct.[0m
[38;5;4mℹ Inconsistent BILOU tagging found, B- tag not closed by L- tag, i.e
[B-a, I-a, O] instead of [B-a, L-a, O]. Assuming last tag is L-[0m
[38;5;4mℹ Inconsistent BILOU tagging found, B- tag, L- tag pair encloses
multiple entity classes.i.e. [B-a, I-b, L-a] instead of [B-a, I-a, L-a].
Assuming B- class is correct.[0m
[38;5;4mℹ Inconsistent BILOU tagging found, B- tag not closed by L- tag, i.e
[B-a, I-a, O] instead of [B-a, L-a, O]. Assuming last tag is L-[0m
[38;5;4mℹ Inconsistent BILOU tagging found, B- tag, L- tag pair encloses
multiple entity classes.i.e. [B-a, I-b, L-a] instead of [B-a, I-a, L-a].
Assuming B- class is correct.[0m
[38;5;4mℹ Inconsistent BILOU tagging found, B- tag not closed by L- tag, i.e
[B-a, I-a, O] instead of [B-a, L-a, O]. Assuming last tag is L-[0m
George Walker Bush 

In [8]:
# Persist the trained extractor to a file in the current working directory.
# NOTE(review): BILOU_flag was switched to True on this same object in the
# previous cell — confirm whether that setting is meant to be saved with it.
crf_extractor.to_disk("spacy_crfsuite_conll03.bz2")