In [1]:
# One-time setup (left disabled): uncomment the lines below to create the
# conll03/ directory and download the CoNLL-2003 train/valid/test splits,
# saving each upstream *.txt file under a *.conll name.
# ! mkdir -p conll03
# ! wget -q -O conll03/train.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/train.txt
# ! wget -q -O conll03/valid.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/valid.txt
# ! wget -q -O conll03/test.conll https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/test.txt

In [2]:
# Peek at the first lines of the training file to check its layout.
# NOTE(review): each row looks like "token POS chunk NER-tag", with blank
# lines between sentences — confirm against the dataset description.
! head conll03/train.conll

-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
to TO B-VP O
boycott VB I-VP O
British JJ B-NP B-MISC
lamb NN I-NP O


In [3]:
from spacy_crfsuite.utils import read_examples

# Parse the three CoNLL files (train, valid, test — in that order) into
# lists of example dicts, one list per split.
train_data, valid_data, test_data = map(
    read_examples,
    ("conll03/train.conll", "conll03/valid.conll", "conll03/test.conll"),
)

# Report the size of each split as a quick sanity check.
print("train examples:", len(train_data))
print("valid examples:", len(valid_data))
print("test examples:", len(test_data))

train examples: 14041
valid examples: 3250
test examples: 3453


In [4]:
# Show the first parsed training example: a dict holding the sentence
# 'text', its 'tokens', and the 'entities' with character start/end offsets.
train_data[0]

{'text': 'EU rejects German call to boycott British lamb .',
 'tokens': [<spacy_crfsuite.tokenizer.Token at 0x12c563dd8>,
  <spacy_crfsuite.tokenizer.Token at 0x12c563e48>,
  <spacy_crfsuite.tokenizer.Token at 0x12c563e80>,
  <spacy_crfsuite.tokenizer.Token at 0x12c563ef0>,
  <spacy_crfsuite.tokenizer.Token at 0x12c563f28>,
  <spacy_crfsuite.tokenizer.Token at 0x12c563f60>,
  <spacy_crfsuite.tokenizer.Token at 0x12c563f98>,
  <spacy_crfsuite.tokenizer.Token at 0x12c551048>,
  <spacy_crfsuite.tokenizer.Token at 0x12c551080>],
 'entities': [{'value': 'EU', 'entity': 'ORG', 'start': 0, 'end': 2},
  {'value': 'German', 'entity': 'MISC', 'start': 11, 'end': 17},
  {'value': 'British', 'entity': 'MISC', 'start': 34, 'end': 41}]}

In [5]:
from spacy_crfsuite import CRFExtractor, prepare_example

# Build the extractor with an explicit component configuration.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation
# coefficients forwarded to CRFsuite — confirm against the library docs.
crf_extractor = CRFExtractor(
    component_config={
        "c1": 0.03,
        "c2": 0.06,
    }
)
crf_extractor

<spacy_crfsuite.crf_extractor.CRFExtractor at 0x10fe7f6a0>

In [6]:
from tqdm.notebook import tqdm_notebook

# Run prepare_example over the first 3000 raw training examples, producing
# the list later handed to crf_extractor.train(); tqdm_notebook draws a
# progress bar while the slice is consumed.
# NOTE(review): only a 3000-example prefix of train_data is used —
# presumably to keep the demo fast; confirm before quoting the scores.
train_examples = [
    prepare_example(raw_example, crf_extractor=crf_extractor)
    for raw_example in tqdm_notebook(train_data[:3000])
]

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [8]:
# Fit the CRF on the prepared training examples. The displayed result is the
# same CRFExtractor object created earlier, i.e. train() returns the extractor.
crf_extractor.train(train_examples)

<spacy_crfsuite.crf_extractor.CRFExtractor at 0x10fe7f6a0>

In [9]:
# Print the model's textual explanation: the highest-weighted label
# transitions followed by the most positive state features.
print(crf_extractor.explain())

Most likely transitions:
U-ORG      -> U-ORG      4.326296
U-PER      -> U-PER      4.266279
U-MISC     -> U-MISC     4.109938
U-LOC      -> U-LOC      3.222692
O          -> O          2.544722
O          -> U-LOC      1.032091
U-MISC     -> O          0.864462
U-ORG      -> O          0.855243
O          -> U-PER      0.812733
O          -> U-MISC     0.725287

Positive features:
6.413900 O          0:prefix2:W1
5.075359 O          0:bias:bias
4.251132 U-ORG      1:low:yr
4.195013 U-ORG      0:prefix2:x-
3.889309 U-LOC      -1:low:at
3.882487 O          0:suffix3:day
3.768032 U-PER      0:title
3.724738 U-ORG      1:low:0
3.657899 U-ORG      -1:low:12
3.650809 U-ORG      -1:low:v


In [10]:
# Prepare the evaluation set the same way as the training set, using the
# first 1000 examples of the test split.
# NOTE(review): this list is named "dev" but is drawn from test_data, while
# valid_data is loaded and never used — confirm which split was intended.
dev_examples = [
    prepare_example(raw_example, crf_extractor=crf_extractor)
    for raw_example in tqdm_notebook(test_data[:1000])
]

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [11]:
# Evaluate the trained extractor on the prepared examples; eval() yields a
# pair that is unpacked into an F1 score and a text classification report.
# Only the report is printed here — f1_score is kept but not displayed.
f1_score, classification_report = crf_extractor.eval(dev_examples)

print(classification_report)

              precision    recall  f1-score   support

       U-LOC      0.731     0.804     0.766       480
      U-MISC      0.757     0.726     0.741       274
       U-ORG      0.764     0.491     0.598       731
       U-PER      0.832     0.863     0.847      1130

   micro avg      0.789     0.734     0.760      2615
   macro avg      0.771     0.721     0.738      2615
weighted avg      0.786     0.734     0.751      2615



