In [1]:
# Uncomment to download the CoNLL 2003 dataset
# 
# ! mkdir -p conll03
# ! wget -q -O conll03/train.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/train.txt
# ! wget -q -O conll03/valid.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/valid.txt
# ! wget -q -O conll03/test.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/test.txt

In [2]:
# Peek at the first 20 lines of the training file to see its layout: one token
# per line followed by three tag columns, with the entity tag (e.g. B-ORG, O)
# in the last column, and blank lines separating groups of tokens.
! head -n 20 conll03/train.conll

-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
to TO B-VP O
boycott VB I-VP O
British JJ B-NP B-MISC
lamb NN I-NP O
. . O O

Peter NNP B-NP B-PER
Blackburn NNP I-NP I-PER

BRUSSELS NNP B-NP B-LOC
1996-08-22 CD I-NP O

The DT B-NP O
European NNP I-NP B-ORG


In [3]:
from spacy_crfsuite import CRFExtractor

# Configuration handed to the CRF extractor.
# "features" holds three feature lists. Going by the "-1:", "0:" and "1:"
# prefixes that explain() prints later in this notebook, they presumably
# describe the previous, current and next token respectively
# (TODO confirm against the spacy_crfsuite documentation).
component_config = dict(
    BILOU_flag=False,
    features=[
        ["low", "title", "upper", "pos", "pos2"],
        [
            "low", "bias",
            "prefix5", "prefix2",
            "suffix5", "suffix3", "suffix2",
            "upper", "title", "digit",
            "pos", "pos2",
        ],
        ["low", "title", "upper", "pos", "pos2"],
    ],
    # Starting values for c1/c2; a later cell overwrites them with tuned values.
    c1=0.03,
    c2=0.06,
)

crf_extractor = CRFExtractor(component_config=component_config)
# Bare last expression so the notebook displays the extractor's repr.
crf_extractor

<spacy_crfsuite.crf_extractor.CRFExtractor at 0x10475c390>

In [4]:
from tqdm.notebook import tqdm_notebook
from spacy_crfsuite import read_file, prepare_example

def read_examples(file, limit=None):
    """Read a CoNLL file and prepare each raw example for the CRF extractor.

    Args:
        file: Path of the file passed to ``read_file``; it is also used as the
            label of the progress bar.
        limit: Maximum number of raw examples to prepare. ``None`` (the
            default) prepares everything; ``0`` returns an empty list.

    Returns:
        A list with one ``prepare_example`` result per raw example, in file
        order.

    Note:
        Preparation uses the notebook-level ``crf_extractor`` object, so that
        variable must be defined before this function is called.
    """
    raw_examples = read_file(file)
    # Compare against None explicitly: a plain truthiness check would treat
    # limit=0 as "no limit" and silently process the whole file.
    if limit is not None:
        raw_examples = raw_examples[:limit]
    return [
        prepare_example(raw_example, crf_extractor=crf_extractor)
        for raw_example in tqdm_notebook(raw_examples, desc=file)
    ]

In [5]:
# Hyper-parameter search on the validation split.
# this is going to take a while, so you might need a coffee break ...
# (the recorded run below reports 30 fits finishing in about 4.4 minutes)
val_examples = read_examples("conll03/valid.conll")

# `rs` is the search result returned by fine_tune(); its best_params_ attribute
# holds the selected c1/c2 values, which are printed for manual inspection.
rs = crf_extractor.fine_tune(val_examples)
print(rs.best_params_)

HBox(children=(FloatProgress(value=0.0, description='conll03/valid.conll', max=3250.0, style=ProgressStyle(des…


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  4.4min finished


{'c1': 0.0009937734676826594, 'c2': 0.024229930809336817}


In [8]:
# Apply the tuned c1/c2 values: these are the best_params_ printed by the
# fine-tuning cell (c1 ~ 0.00099, c2 ~ 0.0242), rounded by hand.
# NOTE(review): the execution counter jumps from 5 to 8 here, so some cells
# were run or removed in between — re-run from a fresh kernel to confirm the
# notebook still reproduces.
crf_extractor.component_config["c1"] = 0.001
crf_extractor.component_config["c2"] = 0.02

In [9]:
# Prepare the training split and fit the CRF extractor on it.
train_examples = read_examples("conll03/train.conll")
crf_extractor.train(train_examples)

# explain() returns a printable summary of the fitted model; in the recorded
# output it lists the most likely transitions and the top positive features.
print(crf_extractor.explain())

HBox(children=(FloatProgress(value=0.0, description='conll03/train.conll', max=14041.0, style=ProgressStyle(de…


Most likely transitions:
B-ORG      -> I-ORG      9.590496
B-PER      -> I-PER      9.054956
I-MISC     -> I-MISC     8.828105
B-LOC      -> I-LOC      8.738556
I-ORG      -> I-ORG      8.707403
B-MISC     -> I-MISC     8.471613
I-LOC      -> I-LOC      7.761805
I-PER      -> I-PER      6.444599
O          -> B-PER      4.446363
O          -> O          3.553140

Positive features:
6.676161 B-ORG      1:low:arose
6.660508 O          0:bias:bias
6.481304 B-LOC      1:low:21,240
6.402033 I-ORG      -1:low:bj
5.906739 B-ORG      -1:low:v
5.843368 I-ORG      -1:low:v
5.653799 O          0:suffix3:day
5.575185 B-LOC      1:low:raged
5.573339 I-LOC      1:low:five
5.146548 B-PER      BOS


In [10]:
# Evaluate the trained extractor on the test split.
test_examples = read_examples("conll03/test.conll")
# eval() returns a pair, unpacked here as an F1 score and a printable
# per-label precision/recall/F1 report.
f1_score, classification_report = crf_extractor.eval(test_examples)

print(classification_report)

HBox(children=(FloatProgress(value=0.0, description='conll03/test.conll', max=3453.0, style=ProgressStyle(desc…






              precision    recall  f1-score   support

       B-LOC      0.854     0.880     0.867      1615
       I-LOC      0.769     0.714     0.741       238
      B-MISC      0.790     0.758     0.773       698
      I-MISC      0.643     0.652     0.647       207
       B-ORG      0.797     0.729     0.761      1644
       I-ORG      0.720     0.753     0.736       818
       B-PER      0.859     0.856     0.857      1592
       I-PER      0.900     0.951     0.925      1117

   micro avg      0.823     0.819     0.821      7929
   macro avg      0.791     0.787     0.789      7929
weighted avg      0.822     0.819     0.820      7929



In [11]:
# Example of using the trained CRF extractor inside a spaCy pipeline
import spacy
from spacy_crfsuite import CRFEntityExtractor

# Enable the BILOU flag (this also works for IOB-tagged data) so that
# multi-token entities are produced; otherwise entities will be word-level.
# NOTE(review): the model above was trained with BILOU_flag set to False; the
# "Inconsistent BILOU tagging" messages in the recorded output presumably come
# from flipping the flag only at prediction time — confirm.
crf_extractor.component_config["BILOU_flag"] = True


# Create a blank English model
nlp = spacy.blank("en")

# Add our CRF tagger to the pipeline
# NOTE(review): passing a component instance to add_pipe looks like the
# spaCy 2.x calling convention — confirm against the installed spaCy version.
pipe = CRFEntityExtractor(nlp, crf_extractor=crf_extractor)
nlp.add_pipe(pipe)

# Run the pipeline on raw text; the CRF component fills doc.ents
doc = nlp(
    "George Walker Bush (born July 6, 1946) is an American politician and businessman "
    "who served as the 43rd president of the United States from 2001 to 2009.")

# Print each extracted entity span together with its label
for ent in doc.ents:
    print(ent, "-", ent.label_)

[38;5;4mℹ Inconsistent BILOU tagging found, B- tag, L- tag pair encloses
multiple entity classes.i.e. [B-a, I-b, L-a] instead of [B-a, I-a, L-a].
Assuming B- class is correct.[0m
[38;5;4mℹ Inconsistent BILOU tagging found, B- tag not closed by L- tag, i.e
[B-a, I-a, O] instead of [B-a, L-a, O]. Assuming last tag is L-[0m
[38;5;4mℹ Inconsistent BILOU tagging found, B- tag, L- tag pair encloses
multiple entity classes.i.e. [B-a, I-b, L-a] instead of [B-a, I-a, L-a].
Assuming B- class is correct.[0m
[38;5;4mℹ Inconsistent BILOU tagging found, B- tag not closed by L- tag, i.e
[B-a, I-a, O] instead of [B-a, L-a, O]. Assuming last tag is L-[0m
[38;5;4mℹ Inconsistent BILOU tagging found, B- tag, L- tag pair encloses
multiple entity classes.i.e. [B-a, I-b, L-a] instead of [B-a, I-a, L-a].
Assuming B- class is correct.[0m
[38;5;4mℹ Inconsistent BILOU tagging found, B- tag not closed by L- tag, i.e
[B-a, I-a, O] instead of [B-a, L-a, O]. Assuming last tag is L-[0m
George Walker Bush 

In [None]:
# Save the trained CRF extractor to disk
# (the .bz2 suffix suggests a bzip2-compressed file — TODO confirm).
crf_extractor.to_disk("spacy_crfsuite_conll03.bz2")