In [9]:
!pwd


/bin/bash: line 0: cd: too many arguments
/content


In [11]:
cd drive/

/content/drive


In [12]:
cd MyDrive/

/content/drive/MyDrive


In [13]:
cd ner/ner_TF/

/content/drive/MyDrive/ner/ner_TF


In [17]:
import sys
!{sys.executable} -m pip install spacy # !{sys.executable} ensures package installation in conda env




In [14]:
import spacy
import random
import time
import numpy as np
from spacy.util import minibatch, compounding


In [None]:
# from os import path, mkdir
# if not path.isdir("data/"):
#     mkdir("data/")
# if not path.isdir("models/"):
#     mkdir("models/")
!mkdir data
!mkdir models


In [None]:
!ls

data  models


In [None]:
!curl https://groups.csail.mit.edu/sls/downloads/movie/engtest.bio -o data/test.txt
!curl https://groups.csail.mit.edu/sls/downloads/movie/engtrain.bio -o data/train.txt


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  246k  100  246k    0     0   454k      0 --:--:-- --:--:-- --:--:--  453k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  989k  100  989k    0     0  2844k      0 --:--:-- --:--:-- --:--:-- 2844k


In [None]:
!pwd

/content/drive/My Drive/ner/ner_TF


In [15]:
def load_data_spacy(file_path):
    ''' Converts data from:
    label \t word \n label \t word \n \n label \t word
    to: sentence, {entities : [(start, end, label), (stard, end, label)]}
    '''
    file = open(file_path, 'r')
    training_data, entities, sentence, unique_labels = [], [], [], []
    current_annotation = None
    end = 0 # initialize counter to keep track of start and end characters
    for line in file:
        line = line.strip("\n").split("\t")
        # lines with len > 1 are words
        if len(line) > 1:
            label = line[0][2:]
            label_type = line[0][0] # beginning of annotations - "B", intermediate - "I"
            word = line[1]
            sentence.append(word)
            end += (len(word) + 1)
            if label_type != 'I' and current_annotation:
                entities.append((start, end - 2 - len(word), current_annotation))
                current_annotation = None
            if label_type == 'B':
                start = end - len(word) - 1
                current_annotation = label
            if label_type == 'I':
                current_annotation = label
            if label != 'O' and label not in unique_labels:
                unique_labels.append(label)
            # lines with len == 1 are breaks between sentences
        if len(line) == 1:
            if current_annotation:
                entities.append((start, end - 1, current_annotation))
            sentence = " ".join(sentence)
            training_data.append([sentence, {'entities' : entities}])
            # reset the counters and temporary lists
            end = 0 
            entities, sentence = [], []
            current_annotation = None
    file.close()
    return training_data, unique_labels
TRAIN_DATA, LABELS = load_data_spacy("data/train.txt")


In [18]:
[x[0] for x in TRAIN_DATA[1:10]]


['show me films with drew barrymore from the 1980s',
 'what movies starred both al pacino and robert deniro',
 'find me all of the movies that starred harold ramis and bill murray',
 'find me a movie with a quote about baseball in it',
 'what movies have mississippi in the title',
 'show me science fiction films directed by steven spielberg',
 'do you have any thrillers directed by sofia coppola',
 'what leonard cohen songs have been used in a movie',
 'show me films elvis films set in hawaii']

In [19]:
[x[1] for x in TRAIN_DATA[1:10]]


[{'entities': [(19, 33, 'ACTOR'), (43, 48, 'YEAR')]},
 {'entities': [(25, 34, 'ACTOR'), (39, 52, 'ACTOR')]},
 {'entities': [(39, 51, 'ACTOR'), (56, 67, 'ACTOR')]},
 {'entities': []},
 {'entities': [(17, 28, 'TITLE')]},
 {'entities': [(8, 29, 'GENRE'), (42, 58, 'DIRECTOR')]},
 {'entities': [(16, 25, 'GENRE'), (38, 51, 'DIRECTOR')]},
 {'entities': [(5, 24, 'SONG')]},
 {'entities': [(14, 19, 'ACTOR'), (26, 39, 'PLOT')]}]

In [20]:
!{sys.executable} -m spacy download en


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [21]:
from spacy import displacy
import warnings
warnings.filterwarnings("ignore")
nlp = spacy.load('en')
TEST_DATA, _ = load_data_spacy("data/test.txt")

test_sentences = [x[0] for x in TEST_DATA[0:15]] # extract the sentences from [sentence, entity]
for x in test_sentences:
    doc = nlp(x)
    displacy.render(doc, jupyter = True, style = "ent")
warnings.filterwarnings("default")


In [None]:
# A simple decorator to log function processing time
def timer(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        print("Completed in {} seconds".format(int(te - ts)))
        return result
    return timed

# Data must be of the form (sentence, {entities: [start, end, label]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data
    train_data : training data in the format of (sentence, {entities: [(start, end, label)]})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
# Add entity labels to the NER pipeline
    for i in labels:
        ner.add_label(i)
# Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data) # shuffle the training data before each iteration
            losses = {}
            batches = minibatch(train_data, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop = dropout,
                    sgd = optimizer,
                    losses = losses)
            if itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp
# Train (and save) the NER model
ner = train_spacy(TRAIN_DATA, LABELS,6)
ner.to_disk("models/spacy_example")


  return f(*args, **kwds)


Iteration 1 Loss: {'ner': 19285.5924703256}
Iteration 2 Loss: {'ner': 12826.147116060138}
Iteration 3 Loss: {'ner': 10983.750106795245}
Iteration 4 Loss: {'ner': 9883.371633674325}
Iteration 5 Loss: {'ner': 8970.071292094592}
Iteration 6 Loss: {'ner': 8208.749495199832}
Completed in 218 seconds


  srsly.json_dumps(self.meta)
  writer(path / key)


In [None]:
# A simple decorator to log function processing time
def timer(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        print("Completed in {} seconds".format(int(te - ts)))
        return result
    return timed

# Data must be of the form (sentence, {entities: [start, end, label]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data
    train_data : training data in the format of (sentence, {entities: [(start, end, label)]})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
# Add entity labels to the NER pipeline
    for i in labels:
        ner.add_label(i)
# Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data) # shuffle the training data before each iteration
            losses = {}
            batches = minibatch(train_data, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop = dropout,
                    sgd = optimizer,
                    losses = losses)
            if itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp
# Train (and save) the NER model
ner = train_spacy(TRAIN_DATA, LABELS,10)
ner.to_disk("models/spacy_example")


Iteration 1 Loss: {'ner': 19363.052856104143}
Iteration 2 Loss: {'ner': 13082.70383554919}
Iteration 3 Loss: {'ner': 11044.4054218838}
Iteration 4 Loss: {'ner': 9987.992103624289}
Iteration 5 Loss: {'ner': 9134.67087454532}
Iteration 6 Loss: {'ner': 8392.598757864178}
Iteration 7 Loss: {'ner': 7923.6016581037675}
Iteration 8 Loss: {'ner': 7473.660795767361}
Iteration 9 Loss: {'ner': 7098.115221611118}
Iteration 10 Loss: {'ner': 6619.416343232932}
Completed in 430 seconds


  srsly.json_dumps(self.meta)
  writer(path / key)


In [None]:
# A simple decorator to log function processing time
def timer(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        print("Completed in {} seconds".format(int(te - ts)))
        return result
    return timed

# Data must be of the form (sentence, {entities: [start, end, label]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data
    train_data : training data in the format of (sentence, {entities: [(start, end, label)]})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
# Add entity labels to the NER pipeline
    for i in labels:
        ner.add_label(i)
# Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data) # shuffle the training data before each iteration
            losses = {}
            batches = minibatch(train_data, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop = dropout,
                    sgd = optimizer,
                    losses = losses)
            if itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp
# Train (and save) the NER model
ner = train_spacy(TRAIN_DATA, LABELS,15)
ner.to_disk("models/spacy_example")


Iteration 1 Loss: {'ner': 19340.28966159749}
Iteration 2 Loss: {'ner': 13021.786018995565}
Iteration 3 Loss: {'ner': 10953.720307043908}
Iteration 4 Loss: {'ner': 9803.261234055592}
Iteration 5 Loss: {'ner': 8962.081964163184}
Iteration 6 Loss: {'ner': 8535.459550399994}
Iteration 7 Loss: {'ner': 7998.21778409236}
Iteration 8 Loss: {'ner': 7678.102943739372}
Iteration 9 Loss: {'ner': 7051.078658205442}
Iteration 10 Loss: {'ner': 6788.574800990848}
Iteration 11 Loss: {'ner': 6482.951724743434}
Iteration 12 Loss: {'ner': 6135.833851709095}
Iteration 13 Loss: {'ner': 5899.169452024936}
Iteration 14 Loss: {'ner': 5645.812932558561}
Iteration 15 Loss: {'ner': 5457.107700761301}
Completed in 692 seconds


  srsly.json_dumps(self.meta)
  writer(path / key)


In [22]:
# A simple decorator to log function processing time
def timer(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        print("Completed in {} seconds".format(int(te - ts)))
        return result
    return timed

# Data must be of the form (sentence, {entities: [start, end, label]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data
    train_data : training data in the format of (sentence, {entities: [(start, end, label)]})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
# Add entity labels to the NER pipeline
    for i in labels:
        ner.add_label(i)
# Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data) # shuffle the training data before each iteration
            losses = {}
            batches = minibatch(train_data, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop = dropout,
                    sgd = optimizer,
                    losses = losses)
            if itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp
# Train (and save) the NER model
ner = train_spacy(TRAIN_DATA, LABELS,30)
ner.to_disk("models/spacy_example")


  return f(*args, **kwds)


Iteration 1 Loss: {'ner': 19019.74426940676}
Iteration 2 Loss: {'ner': 12830.695900784089}
Iteration 3 Loss: {'ner': 10989.96456429876}
Iteration 4 Loss: {'ner': 9813.775539953618}
Iteration 5 Loss: {'ner': 9180.638777810615}
Iteration 6 Loss: {'ner': 8394.474227124892}
Iteration 7 Loss: {'ner': 7935.246571758179}
Iteration 8 Loss: {'ner': 7283.91281054702}
Iteration 9 Loss: {'ner': 7124.204717916468}
Iteration 10 Loss: {'ner': 6781.481537387383}
Iteration 11 Loss: {'ner': 6423.513041305701}
Iteration 12 Loss: {'ner': 6053.377239212816}
Iteration 13 Loss: {'ner': 5878.882489675567}
Iteration 14 Loss: {'ner': 5730.652088310121}
Iteration 15 Loss: {'ner': 5328.182437542447}
Iteration 16 Loss: {'ner': 5309.032357399481}
Iteration 17 Loss: {'ner': 5262.976354904392}
Iteration 18 Loss: {'ner': 4915.629261033916}
Iteration 19 Loss: {'ner': 4823.157442005399}
Iteration 20 Loss: {'ner': 4820.599999608763}
Iteration 21 Loss: {'ner': 4569.896051017202}
Iteration 22 Loss: {'ner': 4342.47403748193

  srsly.json_dumps(self.meta)
  writer(path / key)


In [24]:
# A simple decorator to log function processing time
def timer(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        print("Completed in {} seconds".format(int(te - ts)))
        return result
    return timed

# Data must be of the form (sentence, {entities: [start, end, label]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data
    train_data : training data in the format of (sentence, {entities: [(start, end, label)]})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
# Add entity labels to the NER pipeline
    for i in labels:
        ner.add_label(i)
# Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data) # shuffle the training data before each iteration
            losses = {}
            batches = minibatch(train_data, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop = dropout,
                    sgd = optimizer,
                    losses = losses)
            if itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp
# Train (and save) the NER model
ner = train_spacy(TRAIN_DATA, LABELS,60)
ner.to_disk("models/spacy_example")


Iteration 1 Loss: {'ner': 19336.53553482932}
Iteration 2 Loss: {'ner': 12948.264283357496}
Iteration 3 Loss: {'ner': 10847.168732393659}
Iteration 4 Loss: {'ner': 9866.058514049537}
Iteration 5 Loss: {'ner': 9173.150289795385}
Iteration 6 Loss: {'ner': 8366.55339299964}
Iteration 7 Loss: {'ner': 7834.466614185692}
Iteration 8 Loss: {'ner': 7407.275133760962}
Iteration 9 Loss: {'ner': 6994.706559609894}
Iteration 10 Loss: {'ner': 6734.476595958864}
Iteration 11 Loss: {'ner': 6150.517926633854}
Iteration 12 Loss: {'ner': 6174.099823646037}
Iteration 13 Loss: {'ner': 5891.533834329663}
Iteration 14 Loss: {'ner': 5739.600926223964}
Iteration 15 Loss: {'ner': 5390.210577435957}
Iteration 16 Loss: {'ner': 5283.6722016839585}
Iteration 17 Loss: {'ner': 5122.564868160487}
Iteration 18 Loss: {'ner': 4998.307690284629}
Iteration 19 Loss: {'ner': 4882.688914907015}
Iteration 20 Loss: {'ner': 4602.38563583102}
Iteration 21 Loss: {'ner': 4409.292741963467}
Iteration 22 Loss: {'ner': 4414.4721403122

  srsly.json_dumps(self.meta)
  writer(path / key)


In [25]:
# A simple decorator to log function processing time
def timer(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        print("Completed in {} seconds".format(int(te - ts)))
        return result
    return timed

# Data must be of the form (sentence, {entities: [start, end, label]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data
    train_data : training data in the format of (sentence, {entities: [(start, end, label)]})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
# Add entity labels to the NER pipeline
    for i in labels:
        ner.add_label(i)
# Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data) # shuffle the training data before each iteration
            losses = {}
            batches = minibatch(train_data, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop = dropout,
                    sgd = optimizer,
                    losses = losses)
            if itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp
# Train (and save) the NER model
ner = train_spacy(TRAIN_DATA, LABELS,100)
ner.to_disk("models/spacy_example")


Iteration 1 Loss: {'ner': 19530.17756882755}
Iteration 2 Loss: {'ner': 13005.028980016074}
Iteration 3 Loss: {'ner': 11075.012673848662}
Iteration 4 Loss: {'ner': 9833.993687054897}
Iteration 5 Loss: {'ner': 8965.239768946476}
Iteration 6 Loss: {'ner': 8350.109319847616}
Iteration 7 Loss: {'ner': 7829.894639746408}
Iteration 8 Loss: {'ner': 7469.593691022965}
Iteration 9 Loss: {'ner': 7019.4537318757175}
Iteration 10 Loss: {'ner': 6828.800997936601}
Iteration 11 Loss: {'ner': 6464.9850941327195}
Iteration 12 Loss: {'ner': 6156.509570491942}
Iteration 13 Loss: {'ner': 5875.95028815788}
Iteration 14 Loss: {'ner': 5673.428318202797}
Iteration 15 Loss: {'ner': 5476.864998847019}
Iteration 16 Loss: {'ner': 5349.528187562326}
Iteration 17 Loss: {'ner': 5240.335472280065}
Iteration 18 Loss: {'ner': 4833.666835736649}
Iteration 19 Loss: {'ner': 4828.189895744993}
Iteration 20 Loss: {'ner': 4664.655413110397}
Iteration 21 Loss: {'ner': 4475.131395639426}
Iteration 22 Loss: {'ner': 4479.18348230

  srsly.json_dumps(self.meta)
  writer(path / key)


In [38]:
# A simple decorator to log function processing time
def timer(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        print("Completed in {} seconds".format(int(te - ts)))
        return result
    return timed

# Data must be of the form (sentence, {entities: [start, end, label]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data
    train_data : training data in the format of (sentence, {entities: [(start, end, label)]})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
# Add entity labels to the NER pipeline
    for i in labels:
        ner.add_label(i)
# Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data) # shuffle the training data before each iteration
            losses = {}
            batches = minibatch(train_data, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop = dropout,
                    sgd = optimizer,
                    losses = losses)
            if itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp
# Train (and save) the NER model
ner = train_spacy(TRAIN_DATA, LABELS,200)
ner.to_disk("models/spacy_example1")


Iteration 1 Loss: {'ner': 19076.38994675261}
Iteration 2 Loss: {'ner': 12897.316180109501}
Iteration 3 Loss: {'ner': 10812.334467485482}
Iteration 4 Loss: {'ner': 9878.649360656258}
Iteration 5 Loss: {'ner': 8987.8496806427}
Iteration 6 Loss: {'ner': 8343.796290468683}
Iteration 7 Loss: {'ner': 7799.15513548945}
Iteration 8 Loss: {'ner': 7459.061170550009}
Iteration 9 Loss: {'ner': 7171.142972891082}
Iteration 10 Loss: {'ner': 6727.372489516799}
Iteration 11 Loss: {'ner': 6450.344118799424}
Iteration 12 Loss: {'ner': 6224.211203536101}
Iteration 13 Loss: {'ner': 5991.623590730215}
Iteration 14 Loss: {'ner': 5767.829354063505}
Iteration 15 Loss: {'ner': 5556.949960908982}
Iteration 16 Loss: {'ner': 5384.634951897915}
Iteration 17 Loss: {'ner': 5152.585385215953}
Iteration 18 Loss: {'ner': 5047.356772117138}
Iteration 19 Loss: {'ner': 4919.51008677528}
Iteration 20 Loss: {'ner': 4805.203350892454}
Iteration 21 Loss: {'ner': 4650.888518856238}
Iteration 22 Loss: {'ner': 4516.028444899915}

  srsly.json_dumps(self.meta)
  writer(path / key)


In [40]:
from spacy import displacy

def load_model(model_path):
    ''' Loads a pre-trained model for prediction on new test sentences
    model_path : directory of model saved by spacy.to_disk
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    ner = nlp.from_disk(model_path)
    return ner
ner = load_model("models/spacy_example1")

TEST_DATA, _ = load_data_spacy("data/test.txt")

test_sentences = [x[0] for x in TEST_DATA[0:15]] # extract the sentences from [sentence, entity]
for x in test_sentences:
    doc = ner(x)
    displacy.render(doc, jupyter = True, style = "ent")


In [41]:
def calc_precision(pred, true):
    precision = len([x for x in pred if x in true]) / (len(pred) + 1e-20) # true positives / total pred
    return precision

def calc_recall(pred, true):
    recall = len([x for x in true if x in pred]) / (len(true) + 1e-20)# true positives / total test
    return recall

def calc_f1(precision, recall):
    f1 = 2 * ((precision * recall) / (precision + recall + 1e-20))
    return f1


In [42]:
from itertools import chain

# run the predictions on each sentence in the test dataset, and return the spacy object
preds = [ner(x[0]) for x in TEST_DATA]

precisions, recalls, f1s = [], [], []

# iterate over predictions and test data and calculate precision, recall, and F1-score
for pred, true in zip(preds, TEST_DATA):
    true = [x[2] for x in list(chain.from_iterable(true[1].values()))] # x[2] = annotation, true[1] = (start, end, annot)
    pred = [i.label_ for i in pred.ents] # i.label_ = annotation label, pred.ents = list of annotations
    precision = calc_precision(true, pred)
    precisions.append(precision)
    recall = calc_recall(true, pred)
    recalls.append(recall)
    f1s.append(calc_f1(precision, recall))

print("Precision: {} \nRecall: {} \nF1-score: {}".format(np.around(np.mean(precisions), 3),np.around(np.mean(recalls), 3),np.around(np.mean(f1s), 3)))


Precision: 0.892 
Recall: 0.889 
F1-score: 0.882
