In [1]:
! mkdir -p conll03
! wget -q -O conll03/train.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/train.txt
! wget -q -O conll03/valid.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/valid.txt
! wget -q -O conll03/test.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/test.txt

In [2]:
# Preview the first 20 lines of the training split to inspect the file layout.
! head -n 20 conll03/train.conll

-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
to TO B-VP O
boycott VB I-VP O
British JJ B-NP B-MISC
lamb NN I-NP O
. . O O

Peter NNP B-NP B-PER
Blackburn NNP I-NP I-PER

BRUSSELS NNP B-NP B-LOC
1996-08-22 CD I-NP O

The DT B-NP O
European NNP I-NP B-ORG


In [3]:
from spacy_crfsuite import CRFExtractor

# Feature names shared by the first and third feature windows.
# NOTE(review): the windows presumably map to (previous, current, next) token —
# confirm against the spacy_crfsuite documentation.
_context_features = ["low", "title", "upper", "pos", "pos2"]

# Feature names for the middle window.
_current_features = [
    "low",
    "bias",
    "prefix5",
    "prefix2",
    "suffix5",
    "suffix3",
    "suffix2",
    "upper",
    "title",
    "digit",
    "pos",
    "pos2",
]

component_config = dict(
    # Independent copies so the two outer windows never alias the same list.
    features=[
        list(_context_features),
        _current_features,
        list(_context_features),
    ],
    # c1 / c2 are passed through to the extractor as-is
    # (presumably the CRF regularisation coefficients — TODO confirm).
    c1=0.01,
    c2=0.22,
)

# Build the extractor from the config; the bare name on the last line shows its repr.
crf_extractor = CRFExtractor(component_config=component_config)
crf_extractor

<spacy_crfsuite.crf_extractor.CRFExtractor at 0x10d393e80>

In [4]:
import spacy

# Ask the extractor whether its configuration relies on dense features, then
# load the matching spaCy model: "md" when it does, "sm" otherwise.
use_dense_features = crf_extractor.use_dense_features()

spacy_model_name = "en_core_web_md" if use_dense_features else "en_core_web_sm"
nlp = spacy.load(spacy_model_name)



In [5]:
from tqdm.notebook import tqdm_notebook

from spacy_crfsuite import read_file
from spacy_crfsuite.train import gold_example_to_crf_tokens
from spacy_crfsuite.tokenizer import SpacyTokenizer

def read_examples(file, tokenizer, use_dense_features=False, limit=None):
    """Read a dataset file and convert every example into CRF tokens.

    Args:
        file: Path handed to ``read_file``; also used as the progress-bar label.
        tokenizer: Tokenizer forwarded to ``gold_example_to_crf_tokens``.
        use_dense_features: Forwarded to ``gold_example_to_crf_tokens``.
        limit: Maximum number of examples to convert. ``None`` (the default)
            converts all of them; ``0`` yields an empty list.

    Returns:
        A list holding one converted example per raw example, in file order.
    """
    raw_examples = read_file(file)
    # Compare against None rather than relying on truthiness, so that an
    # explicit limit=0 is honoured instead of silently meaning "no limit".
    if limit is not None:
        raw_examples = raw_examples[:limit]
    return [
        gold_example_to_crf_tokens(
            raw_example,
            tokenizer=tokenizer,
            use_dense_features=use_dense_features,
            bilou=False,
        )
        for raw_example in tqdm_notebook(raw_examples, desc=file)
    ]

# Spacy tokenizer
tokenizer = SpacyTokenizer(nlp)

# OPTIONAL: fine-tune hyper-params on the validation split.
# This is going to take a while, so you might need a coffee break ...
# Set the flag to True to enable tuning; the default skips it, so
# dev_examples stays None and the extractor keeps its configured parameters.
FINE_TUNE_HYPER_PARAMS = False

dev_examples = (
    read_examples("conll03/valid.conll", tokenizer, use_dense_features=use_dense_features)
    if FINE_TUNE_HYPER_PARAMS
    else None
)

if dev_examples:
    # Search over hyper-parameters, report the best result and write the
    # winning parameters back into the extractor's configuration.
    rs = crf_extractor.fine_tune(dev_examples, cv=5, n_iter=30, random_state=42)
    print("best params:", rs.best_params_, ", score:", rs.best_score_)
    crf_extractor.component_config.update(rs.best_params_)

In [6]:
# Convert the full training split into CRF tokens.
train_examples = read_examples(
    "conll03/train.conll",
    tokenizer=tokenizer,
    use_dense_features=use_dense_features,
)

# Fit the CRF; dev_examples is whatever the optional tuning step left behind.
crf_extractor.train(train_examples, dev_samples=dev_examples)

# Print the extractor's textual explanation of the fitted model.
explanation = crf_extractor.explain()
print(explanation)

HBox(children=(FloatProgress(value=0.0, description='conll03/train.conll', max=14041.0, style=ProgressStyle(de…


Most likely transitions:
B-ORG      -> I-ORG      7.260039
B-PER      -> I-PER      7.016621
I-ORG      -> I-ORG      6.787570
I-MISC     -> I-MISC     6.301491
B-LOC      -> I-LOC      6.284100
B-MISC     -> I-MISC     6.052984
I-LOC      -> I-LOC      5.508632
I-PER      -> I-PER      4.808802
O          -> B-PER      3.496929
O          -> O          2.778873

Positive features:
5.259367 O          0:bias:bias
4.243087 O          0:suffix3:day
3.915058 B-ORG      -1:low:v
3.763559 B-PER      BOS
3.526967 O          BOS
3.258426 B-PER      0:prefix2:Mc
3.239694 O          0:prefix2:W1
3.103391 B-LOC      BOS
3.099422 B-ORG      BOS
2.878765 B-ORG      0:suffix5:shire


In [7]:
# Convert the test split with the same tokenizer and feature settings as training.
test_examples = read_examples(
    "conll03/test.conll",
    tokenizer=tokenizer,
    use_dense_features=use_dense_features,
)

# eval() hands back two values: the F1 score and a printable per-label report.
evaluation = crf_extractor.eval(test_examples)
f1_score, classification_report = evaluation
print(classification_report)

HBox(children=(FloatProgress(value=0.0, description='conll03/test.conll', max=3453.0, style=ProgressStyle(desc…






              precision    recall  f1-score   support

       B-LOC      0.855     0.880     0.867      1615
       I-LOC      0.742     0.714     0.728       238
      B-MISC      0.804     0.759     0.781       698
      I-MISC      0.665     0.671     0.668       207
       B-ORG      0.816     0.729     0.770      1644
       I-ORG      0.727     0.746     0.736       818
       B-PER      0.855     0.867     0.861      1592
       I-PER      0.887     0.960     0.922      1117

   micro avg      0.826     0.822     0.824      7929
   macro avg      0.794     0.791     0.792      7929
weighted avg      0.825     0.822     0.823      7929



In [8]:
# Example of a spaCy pipeline
from spacy_crfsuite import CRFEntityExtractor

# Add our CRF component to pipeline.
# Use the same model choice as for training ("md" when the extractor uses dense
# features, "sm" otherwise) so the pipeline produces the features the CRF was
# fitted on. The built-in "ner" component is disabled so the CRF component
# added below provides the entities.
pipeline_model = "en_core_web_md" if use_dense_features else "en_core_web_sm"
nlp = spacy.load(pipeline_model, disable=["ner"])
pipe = CRFEntityExtractor(nlp, crf_extractor=crf_extractor)
nlp.add_pipe(pipe)

# And use natively ..
doc = nlp(
    "George Walker Bush (born July 6, 1946) is an American politician and businessman "
    "who served as the 43rd president of the United States from 2001 to 2009.")

# Print every entity found in the document together with its label.
for ent in doc.ents:
    print(ent, "-", ent.label_)

George Walker Bush - PER
American - MISC
United States - LOC


Assuming B- class is correct.
  "Inconsistent BILOU tagging found, B- tag, L- "
Assuming last tag is L-
  "Inconsistent BILOU tagging found, B- tag not "


In [9]:
# Save model to disk ..
# The file name embeds the spaCy model's language and name, read through the
# public ``nlp.meta`` property instead of the private ``_meta`` attribute.
model_name = f"conll03_{nlp.meta['lang']}_{nlp.meta['name']}.bz2"
crf_extractor.to_disk(model_name)