In [None]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm
import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score

In [None]:

def convert_dataturks_to_spacy(filename):
    """Load annotated examples from a JSON file into spaCy-style training tuples.

    The file must contain a JSON array of objects, each with a 'content'
    entry (the text) and an 'entities' entry (a list of entity annotations).
    Each entity annotation is converted to a tuple; presumably these are
    (start, end, label) triples -- verify against the data file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A list of ``(content, {'entities': [tuple, ...]})`` pairs, in the
        same order as the records in the file.

    Raises:
        KeyError: If a record has no 'content' or no 'entities' entry.
    """
    # Local import keeps this block self-contained; io.open accepts `encoding`
    # on both Python 2 and Python 3 (the file opts into Python 2 compatibility
    # through its __future__ imports).
    import io

    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which would garble non-ASCII text on some systems.
    with io.open(filename, encoding='utf-8') as train_data:
        train = json.load(train_data)

    TRAIN_DATA = []
    for data in train:
        # JSON arrays are parsed as lists; store each entity as a tuple.
        ents = [tuple(entity) for entity in data['entities']]
        TRAIN_DATA.append((data['content'], {'entities': ents}))

    return TRAIN_DATA

In [39]:
# Load the annotated examples and keep all but the last record.
TRAIN_DATA = convert_dataturks_to_spacy(
    'data/data_base/loss_runs_NER_data.json'
)[:-1]

In [40]:
## Hyperparameters
# Number of passes over the training data.
n_iter = 100
# Where the trained pipeline is written once training finishes.
output_dir = Path("./data/results/models")
# Existing spaCy model to start from; None means start from a blank 'en' pipeline.
model = None

## Load model

# Start from a blank English pipeline unless an existing model was requested.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

# Ensure the pipeline contains an NER component and keep a handle on it:
# reuse the existing one if present, otherwise create it and append it last.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

## Register entity labels
# Every label that occurs in the annotations is added to the NER component
# before training starts; the label is read from index 2 of each entity.
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities', []):
        ner.add_label(ent[2])

## Train NER with every other pipeline component disabled
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    # begin_training() initialises the component weights. That is required for
    # a blank pipeline or a freshly created NER component, but it would throw
    # away the trained weights of an NER component that came with a loaded
    # model, so in that case only a new optimizer is created.
    # NOTE(review): spaCy v2 marks an uninitialised component with
    # `model is True` -- confirm for the installed spaCy version.
    if model is None or ner.model is True:
        optimizer = nlp.begin_training()
    else:
        optimizer = ner.create_optimizer()

    for itn in range(n_iter):
        # Re-shuffle (in place) each iteration so examples are seen in a new order.
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            # One example per update; `losses` is passed in so it can be
            # printed after each pass.
            nlp.update(
                [text],
                [annotations],
                drop=0.5,
                sgd=optimizer,
                losses=losses)
        print(losses)

Created blank 'en' model
100%|██████████| 33/33 [00:00<00:00, 33.93it/s]
 12%|█▏        | 4/33 [00:00<00:00, 39.55it/s]{'ner': 229.87807888115984}
100%|██████████| 33/33 [00:00<00:00, 36.34it/s]
 12%|█▏        | 4/33 [00:00<00:00, 34.93it/s]{'ner': 60.652146172854465}
100%|██████████| 33/33 [00:00<00:00, 36.96it/s]
 12%|█▏        | 4/33 [00:00<00:00, 29.27it/s]{'ner': 54.93458172057083}
100%|██████████| 33/33 [00:00<00:00, 33.95it/s]
 12%|█▏        | 4/33 [00:00<00:00, 33.02it/s]{'ner': 55.709184480990416}
100%|██████████| 33/33 [00:00<00:00, 35.70it/s]
  9%|▉         | 3/33 [00:00<00:01, 23.99it/s]{'ner': 65.12310708623669}
100%|██████████| 33/33 [00:00<00:00, 35.49it/s]
  9%|▉         | 3/33 [00:00<00:01, 29.23it/s]{'ner': 62.9775108882305}
100%|██████████| 33/33 [00:01<00:00, 32.44it/s]
 12%|█▏        | 4/33 [00:00<00:00, 37.15it/s]{'ner': 66.18811029796615}
100%|██████████| 33/33 [00:00<00:00, 35.19it/s]
 12%|█▏        | 4/33 [00:00<00:00, 30.97it/s]{'ner': 154.4288963001102}
100%|

## Save the re-trained model ##

In [41]:
# Write the trained pipeline to disk (skipped when no output directory is set).
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        # The target is a nested path, so create missing parent directories
        # too; a plain mkdir() raises when an intermediate directory is absent.
        output_dir.mkdir(parents=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to data/results/models
