In [None]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm
import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score

In [None]:

def convert_dataturks_to_spacy(filename):

    with open(filename) as train_data:
	
        train = json.load(train_data)

    TRAIN_DATA = []

    for data in train:

	    ents = [tuple(entity) for entity in data['entities']]

	    TRAIN_DATA.append((data['content'],{'entities':ents}))
        
    return TRAIN_DATA

In [68]:
TRAIN_DATA = convert_dataturks_to_spacy('data/data_base/NER_DATA_TEST_FROM_SFILTER.json')
TRAIN_DATA = TRAIN_DATA[:][:-1]

In [None]:
## Ignore this until GPU is supported

#import thinc_gpu_ops
#thinc_gpu_ops.AVAILABLE

#import spacy 
#spacy.prefer_gpu()r
#spacy.require_gpu()


## First NER model ##

In [74]:
# First ner mdoel

import time
# check process time 
start_time = time.time()
# Optimal values: n_iter = 10, drop = 0.01

## Hyperparameters
model = None
output_dir=Path("./data/results/models")
n_iter= 100

## Load model

#load the model
if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

#set up the pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

## Disable PIPELINE
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        #print(ent[2])
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  
                [annotations],  
                drop=0.01,  
                sgd=optimizer,
                losses=losses)
        print(losses)
print('.'*50)
print("--- %s seconds ---" % (time.time() - start_time))
print('.'*50)

Created blank 'en' model
100%|██████████| 8/8 [00:00<00:00, 32.30it/s]
 38%|███▊      | 3/8 [00:00<00:00, 29.77it/s]{'ner': 104.04842398688197}
100%|██████████| 8/8 [00:00<00:00, 32.07it/s]
 50%|█████     | 4/8 [00:00<00:00, 33.33it/s]{'ner': 14.623817185150756}
100%|██████████| 8/8 [00:00<00:00, 30.31it/s]
 50%|█████     | 4/8 [00:00<00:00, 33.47it/s]{'ner': 62.12394527208744}
100%|██████████| 8/8 [00:00<00:00, 33.45it/s]
 50%|█████     | 4/8 [00:00<00:00, 37.73it/s]{'ner': 51.87308599215612}
100%|██████████| 8/8 [00:00<00:00, 33.49it/s]
 50%|█████     | 4/8 [00:00<00:00, 31.99it/s]{'ner': 32.73802209679796}
100%|██████████| 8/8 [00:00<00:00, 32.37it/s]
 50%|█████     | 4/8 [00:00<00:00, 31.48it/s]{'ner': 23.985594821206675}
100%|██████████| 8/8 [00:00<00:00, 33.72it/s]
 38%|███▊      | 3/8 [00:00<00:00, 29.93it/s]{'ner': 19.463509851037646}
100%|██████████| 8/8 [00:00<00:00, 30.80it/s]
 38%|███▊      | 3/8 [00:00<00:00, 29.45it/s]{'ner': 35.53549256370686}
100%|██████████| 8/8 [00:00

## Second NER model ##

In [69]:
import random
# make a blank model
start_time = time.time()

nlp = spacy.blank('en')

# Create a new NER and added to pipeline
ner = nlp.create_pipe('ner')

# Add the new NEW to pipeline
nlp.add_pipe(ner)

# Add the new label
ner.add_label('ORG')
ner.add_label('REPORT_DATE')
ner.add_label('POLICY_NUM')
ner.add_label('NAME')
ner.add_label('STATUS')
ner.add_label('MONEY')


# hyperparameters 
iterations = 1 # epocs

# Begin training
for ith in range(iterations):
    # shuffle the train data
    random.shuffle(TRAIN_DATA)
    optimizer = nlp.begin_training()
    for batch in spacy.util.minibatch(TRAIN_DATA,size=2):
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            try:
                nlp.update(
                    [text],  
                    [annotations],  
                    drop=0.01,  
                    sgd=optimizer,
                    losses=losses)
            except:
                pass
        print(losses)
print('.'*50)
print("--- %s seconds ---" % (time.time() - start_time))
print('.'*50)

100%|██████████| 8/8 [00:00<00:00, 801.32it/s]
100%|██████████| 8/8 [00:00<00:00, 2118.34it/s]
100%|██████████| 8/8 [00:00<00:00, 1858.56it/s]
100%|██████████| 8/8 [00:00<00:00, 2020.62it/s]{'ner': 0.0}
{'ner': 0.0}
{'ner': 0.0}
{'ner': 0.0}
..................................................
--- 0.4221510887145996 seconds ---
..................................................



## Save the re-trained model ##

In [73]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to data/results/models
