#### Training custom SpaCy NER Model:  
Based upon the NER example on AI Hub (https://aihub.cloud.google.com/p/products%2F2290fc65-0041-4c87-a898-0289f59aa8ba)

In [1]:
'''CUDA version must match the Spacy to use GPU'''
# !nvcc --version
# pip install spacy[cuda110]

'CUDA version must match the Spacy to use GPU'

In [2]:
# !pip install spacy-lookups-data

In [3]:
import sys
import os
import requests

import random
import time
import numpy as np
from spacy.util import minibatch, compounding
from spacy.training.example import Example

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
import spacy 
from spacy import displacy
spacy.prefer_gpu()
# spacy.require_gpu()

print(spacy.__version__)

3.3.1


#### Copy files to local FS from GCP bucket

In [5]:
path_ner = '/home/jupyter/data/ner'
path_spacy = '/home/jupyter/data/spacy'

os.makedirs(path_ner, exist_ok=True)
os.makedirs(path_spacy, exist_ok=True)

In [6]:
def get_gcs_data (bucket_name, folder_name, file_name, path_local):
    url = 'https://storage.googleapis.com/' + bucket_name + '/' + folder_name + '/' + file_name
    r = requests.get(url)
    open(path_local + '/' + file_name , 'wb').write(r.content)

In [7]:
bucket_name = 'msca-bdp-data-open'
folder_name = 'ner'
file_name = ['train.txt', 'test.txt']
path_local = path_ner

os.makedirs(path_local, exist_ok=True)

for file in file_name:
    get_gcs_data (bucket_name = bucket_name,
                 folder_name = folder_name,
                 file_name = file,
                 path_local = path_local)
    print('Downloaded: ' + file)

Downloaded: train.txt
Downloaded: test.txt


#### Review the first N records

In [9]:
N = 10
train = path_ner + '/train.txt'
test = path_ner + '/test.txt'

with open(train, "r") as file:
    for i in range(N):
        line = next(file).strip()
        print(line)

O	what
O	movies
O	star
B-ACTOR	bruce
I-ACTOR	willis

O	show
O	me
O	films
O	with


#### Prepare the training data

In [10]:
'''Converts data from:
label \t word \n label \t word \n \n label \t word
to: sentence, {entities : [(start, end, label), (stard, end, label)]
'''
def load_data_spacy(file_path):
    file = open(file_path, 'r')
    training_data, entities, sentence, unique_labels = [], [], [], []
    current_annotation = None
    end = 0 #initialize counter
    for line in file:
        line = line.strip("\n").split("\t")
        
        #if lines are len > 1 are words
        if len(line) > 1:
            label = line [0][2:]
            label_type = line [0][0]
            word = line[1]
            sentence.append(word)
            end += (len(word) + 1)
            
            if label_type != 'I' and current_annotation:
                entities.append((start, end - 2 - len(word), current_annotation))
                current_annotation = None
            
            if label_type == 'B':
                start = end - len(word) -1
                current_annotation = label
                
            if label_type == 'I':
                current_annotation = label
                
            if label != 'O' and label not in unique_labels:
                unique_labels.append(label)
                
        if len(line) == 1:
            if current_annotation:
                entities.append((start, end - 1, current_annotation))
            sentence = " ".join(sentence)
            training_data.append([sentence, {'entities': entities}])
            
            end = 0
            entities, sentence = [], []
            current_annotation = None

    file.close()
    return training_data, unique_labels

TRAIN_DATA, LABELS = load_data_spacy(train) 

In [11]:
[x[0] for x in TRAIN_DATA[1:10]]

['show me films with drew barrymore from the 1980s',
 'what movies starred both al pacino and robert deniro',
 'find me all of the movies that starred harold ramis and bill murray',
 'find me a movie with a quote about baseball in it',
 'what movies have mississippi in the title',
 'show me science fiction films directed by steven spielberg',
 'do you have any thrillers directed by sofia coppola',
 'what leonard cohen songs have been used in a movie',
 'show me films elvis films set in hawaii']

In [12]:
[x[1] for x in TRAIN_DATA[1:10]]

[{'entities': [(19, 33, 'ACTOR'), (43, 48, 'YEAR')]},
 {'entities': [(25, 34, 'ACTOR'), (39, 52, 'ACTOR')]},
 {'entities': [(39, 51, 'ACTOR'), (56, 67, 'ACTOR')]},
 {'entities': []},
 {'entities': [(17, 28, 'TITLE')]},
 {'entities': [(8, 29, 'GENRE'), (42, 58, 'DIRECTOR')]},
 {'entities': [(16, 25, 'GENRE'), (38, 51, 'DIRECTOR')]},
 {'entities': [(5, 24, 'SONG')]},
 {'entities': [(14, 19, 'ACTOR'), (26, 39, 'PLOT')]}]

#### Load pre-trained SpaCy Model

In [13]:
nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_md")
# nlp = spacy.load("en_core_web_lg")

TEST_DATA, _ = load_data_spacy(test)

warnings.filterwarnings("ignore")

test_sentences = [x[0] for x in TEST_DATA[0:15]]
for x in test_sentences:
    doc = nlp(x)
    displacy.render(doc, jupyter = True, style = "ent")
    
warnings.filterwarnings("default")
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [14]:
#function for logging processing time
def timer(method):
    def timed(*args, **kw):
        ts=time.time()
        result = method(*args, **kw)
        te = time.time()
        print("Completed in {} seconds.".format(int(te-ts)))
        return result
    return timed

#data must be in form (sentence, {entities: [start, end, label]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner')
    
    #add entity lables to NER pipeline
    for i in labels:
        ner.add_label(i)
    
    #disable other pipelines in spaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' #will get unnamed without
        optimizer = nlp.begin_training()
        for itr in range (iterations):
            random.shuffle(train_data) #will reshufflbe before each iteration
            losses = {}
            batches = minibatch(train_data, size = compounding(4., 32., 1.001))
            for batch in batches:
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update([example], drop=0.35, sgd=optimizer, losses=losses)
            if itr % display_freq == 0:
                print("iteration {} Loss: {}".format(itr + 1, losses))
    return nlp

#train and save the NER model
ner = train_spacy(TRAIN_DATA, LABELS, 20)
ner.to_disk(path_spacy)

[2022-10-26 16:29:11,682] [INFO] Created vocabulary
[2022-10-26 16:29:11,684] [INFO] Finished initializing nlp object


iteration 1 Loss: {'ner': 18644.77722940268}
iteration 2 Loss: {'ner': 13642.835281830614}
iteration 3 Loss: {'ner': 11910.215798068526}
iteration 4 Loss: {'ner': 11073.793708168065}
iteration 5 Loss: {'ner': 10426.40024635688}
iteration 6 Loss: {'ner': 10016.651539885306}
iteration 7 Loss: {'ner': 9854.75091597559}
iteration 8 Loss: {'ner': 9835.141665109659}
iteration 9 Loss: {'ner': 9872.575353464365}
iteration 10 Loss: {'ner': 9556.690664056558}
iteration 11 Loss: {'ner': 9646.410664100278}
iteration 12 Loss: {'ner': 9523.54937663881}
iteration 13 Loss: {'ner': 9530.134081332179}
iteration 14 Loss: {'ner': 9448.257371088373}
iteration 15 Loss: {'ner': 9213.428603502049}
iteration 16 Loss: {'ner': 9138.341782377911}
iteration 17 Loss: {'ner': 9327.298960355527}
iteration 18 Loss: {'ner': 9242.081441251212}
iteration 19 Loss: {'ner': 9361.240543902557}
iteration 20 Loss: {'ner': 9260.882032146072}
Completed in 6481 seconds.


In [15]:
# !gsutil -m cp -r -n '/home/jupyter/data/spacy/*' 'gs://msca-bdp-data-open/spacy/' 

In [16]:
def load_model(model_path):
    
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner')
    ner = nlp.from_disk(model_path)
    return ner

ner = load_model(path_spacy)

TEST_DATA, _ = load_data_spacy(path_ner + '/test.txt')

test_sentences = [x[0] for x in TEST_DATA[0:15]] #extract the sentences from [sentence, entity]

for x in test_sentences:
    doc = ner(x)
    displacy.render(doc, jupyter = True, style = "ent")



In [17]:
import datetime
import pytz

datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Wed, 26 October 2022 13:17:14'