# This notebook shows in detail how to build an NER model:
### 1. Importing the required libraries
### 2. Building a function to convert the data to spaCy format
### 3. Checking what a training data point looks like
### 4. Training a spaCy model
### 5. Loading the model and testing (inference)


## **1. Import Libraries**

In [2]:
#Import all required libraries
import spacy
import random
import time
import numpy as np
from spacy.util import minibatch, compounding
import sys
from spacy import displacy
from itertools import chain
import matplotlib.pyplot as plt 
from matplotlib.ticker import MaxNLocator

## **2. Building a function to convert data from a file into the format required by spaCy**

In [3]:
def conv_data_to_spacy(file_path):
    ''' Convert a tab-separated BIO-tagged file into spaCy training examples.

    Each token line is expected to be ``word<TAB>label``. A line without a tab
    (normally an empty line) marks the end of a sentence. Labels other than 'O'
    get the suffix "_Disease" appended, so 'B' becomes 'B_Disease' and 'I'
    becomes 'I_Disease'.

    Parameters
    ----------
    file_path : path of the BIO-tagged text file to read.

    Returns
    -------
    training_data : list of [sentence, {'entities': [(start, end, label), ...]}]
        One item per sentence that contains at least one entity; sentences
        without entities are skipped. ``start``/``end`` are character offsets
        into ``sentence`` (tokens joined by single spaces), ``end`` exclusive.
    unique_labels : list of the distinct non-'O' labels, in order of first
        appearance.
    '''
    training_data, unique_labels = [], []
    entities, words = [], []
    end = 0  # character offset just past the current word and its trailing space

    # The context manager guarantees the handle is closed even if parsing fails.
    with open(file_path, 'r') as bio_file:
        for raw_line in bio_file:
            fields = raw_line.strip("\n").split("\t")

            if len(fields) > 1:
                # Token line: column 0 is the word, column 1 is its BIO label.
                word, label = fields[0], fields[1]
                if label != 'O':
                    # Appending _Disease to B and I labels for better representation
                    label = label + "_Disease"

                words.append(word)
                start = end
                end += len(word) + 1  # length of the word + trailing space

                # Every B/I token becomes its own span; end - 1 drops the trailing space.
                if label in ('B_Disease', 'I_Disease'):
                    entities.append((start, end - 1, label))

                if label != 'O' and label not in unique_labels:
                    unique_labels.append(label)
            else:
                # Sentence break: keep the sentence only if it has annotations.
                if entities:
                    training_data.append([" ".join(words), {'entities': entities}])
                # reset the counter and temporary lists for the next sentence
                end = 0
                entities, words = [], []

    # A file that does not end with a blank line still has a pending sentence;
    # without this flush the last annotated sentence would be lost.
    if entities:
        training_data.append([" ".join(words), {'entities': entities}])

    return training_data, unique_labels


In [4]:
# Parse the three BIO-tagged splits. Only the label list derived from the
# training split is kept; the label lists of the other splits are discarded.
TRAIN_DATA, LABELS = conv_data_to_spacy("/content/train.tsv")
TEST_DATA, _ = conv_data_to_spacy("/content/test.tsv")
VALID_DATA, _ = conv_data_to_spacy("/content/train_dev.tsv")

# Report how many examples each split produced.
print('Train Data Len:',len(TRAIN_DATA))
print('Test Data Len:',len(TEST_DATA))
print('Validation Data Len:',len(VALID_DATA))

Train Data Len: 2658
Test Data Len: 2842
Validation Data Len: 5385


## **3. Checking what a training data point looks like**

In [5]:
# Show one converted example: [sentence text, {'entities': [(start, end, label), ...]}]
print(TRAIN_DATA[6])

['A lesser degree of orthostatic hypotension occurred with standing .', {'entities': [(19, 30, 'B_Disease'), (31, 42, 'I_Disease')]}]


In [None]:
!python -m spacy download en_core_web_md

**We use the pretrained English pipeline `en_core_web_md`, loaded in the cell below, as the base for our NER training. It is trained on English core vocabulary and is the medium-sized variant.**

In [7]:
# Load the en_core_web_md pipeline into a module-level `nlp` object.
# Note: training_spacy_model() calls en_core_web_md.load() itself, so this
# `nlp` is not the instance that gets trained later in the notebook.
import en_core_web_md
nlp = en_core_web_md.load()

In [8]:
# Display the distinct entity labels collected while converting the training file.
LABELS

['B_Disease', 'I_Disease']

## **4. Training a spaCy model**

In [9]:
from tqdm import tqdm
def training_spacy_model(train_data, labels, iterations):
    ''' Train the NER component of the en_core_web_md pipeline and return the pipeline.

    Parameters
    ----------
    train_data : list of [text, {'entities': [(start, end, label), ...]}] items.
        The list is not modified; shuffling is done on a private copy.
    labels : iterable of entity label strings to register with the NER component.
    iterations : number of passes over the training data.

    Returns
    -------
    The pipeline object (``nlp``) whose 'ner' component has been updated.

    Notes
    -----
    Uses the create_pipe / begin_training / update(texts, annotations) calls;
    presumably written against spaCy v2 -- TODO confirm the installed version.
    NOTE(review): begin_training() is called on a pretrained pipeline; confirm
    whether re-initialising the NER weights (rather than resuming training) is
    intended.
    '''
    nlp = en_core_web_md.load()
    #nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    # Add entity labels to the NER pipeline
    for label in labels:
        ner.add_label(label)

    # random.shuffle works in place; shuffling a shallow copy keeps the caller's
    # list (e.g. the notebook-level TRAIN_DATA) in its original order.
    examples = list(train_data)

    # Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        #nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in tqdm(range(iterations)):
            random.shuffle(examples)  # new example order for every iteration
            losses = {}  # filled in by nlp.update for this iteration
            # Batch size is drawn from compounding(16.0, 64.0, 1.5), restarted each iteration
            batches = minibatch(examples, size = compounding(16.0, 64.0, 1.5))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    sgd = optimizer,
                    losses = losses)
            print('=======================================')
            print('Interation = '+str(itr))
            print('Losses = '+str(losses))
    return nlp

In [10]:
# Train (and save) the NER model.
# Despite its name, `ner` is the whole pipeline object returned by
# training_spacy_model(), not only the NER component; to_disk() writes it
# to the given directory.
ner = training_spacy_model(TRAIN_DATA, LABELS,20)
ner.to_disk("/content/spacy_example")

  5%|▌         | 1/20 [00:09<03:04,  9.70s/it]

Interation = 0
Losses = {'ner': 9749.123747348785}


 10%|█         | 2/20 [00:19<02:53,  9.64s/it]

Interation = 1
Losses = {'ner': 4495.945324540138}


 15%|█▌        | 3/20 [00:28<02:44,  9.65s/it]

Interation = 2
Losses = {'ner': 2580.298750936985}


 20%|██        | 4/20 [00:38<02:33,  9.61s/it]

Interation = 3
Losses = {'ner': 1414.3167940936983}


 25%|██▌       | 5/20 [00:48<02:24,  9.62s/it]

Interation = 4
Losses = {'ner': 723.3792272889987}


 30%|███       | 6/20 [00:57<02:14,  9.61s/it]

Interation = 5
Losses = {'ner': 423.38887047418393}


 35%|███▌      | 7/20 [01:07<02:04,  9.61s/it]

Interation = 6
Losses = {'ner': 289.45202455000253}


 40%|████      | 8/20 [01:16<01:55,  9.62s/it]

Interation = 7
Losses = {'ner': 231.10180060937435}


 45%|████▌     | 9/20 [01:26<01:45,  9.60s/it]

Interation = 8
Losses = {'ner': 156.96646079645507}


 50%|█████     | 10/20 [01:36<01:35,  9.58s/it]

Interation = 9
Losses = {'ner': 97.82159840232464}


 55%|█████▌    | 11/20 [01:45<01:26,  9.61s/it]

Interation = 10
Losses = {'ner': 64.53473079473358}


 60%|██████    | 12/20 [01:55<01:16,  9.60s/it]

Interation = 11
Losses = {'ner': 87.70119817845813}


 65%|██████▌   | 13/20 [02:04<01:07,  9.60s/it]

Interation = 12
Losses = {'ner': 75.62512510835165}


 70%|███████   | 14/20 [02:14<00:57,  9.59s/it]

Interation = 13
Losses = {'ner': 109.66773063632965}


 75%|███████▌  | 15/20 [02:24<00:47,  9.58s/it]

Interation = 14
Losses = {'ner': 83.73778630346986}


 80%|████████  | 16/20 [02:33<00:38,  9.59s/it]

Interation = 15
Losses = {'ner': 64.833064871007}


 85%|████████▌ | 17/20 [02:43<00:28,  9.60s/it]

Interation = 16
Losses = {'ner': 48.68512595634855}


 90%|█████████ | 18/20 [02:52<00:19,  9.58s/it]

Interation = 17
Losses = {'ner': 37.1744223605074}


 95%|█████████▌| 19/20 [03:02<00:09,  9.58s/it]

Interation = 18
Losses = {'ner': 36.86741289510065}


100%|██████████| 20/20 [03:11<00:00,  9.59s/it]

Interation = 19
Losses = {'ner': 43.89871613674764}





## **5. Loading and Testing a spaCy model**

In [11]:
def load_model(model_path):
    ''' Restore a saved pipeline for prediction on new sentences.

    A blank English pipeline is created, an 'ner' component is added to it if
    it does not have one yet, and its state is then read from ``model_path``.

    model_path : directory of a model saved with to_disk
    Returns whatever from_disk returns for the prepared pipeline.
    '''
    pipeline = spacy.blank('en')
    has_ner = 'ner' in pipeline.pipe_names
    if not has_ner:
        # The component must exist in the pipeline before from_disk is called
        pipeline.add_pipe(pipeline.create_pipe('ner'))
    return pipeline.from_disk(model_path)

In [16]:
ner = load_model("/content/spacy_example")

# Each TEST_DATA item is [sentence, annotations]; only the sentence text of
# the first five items is needed for inference.
test_sentences = [example[0] for example in TEST_DATA[0:5]]
for sentence_text in test_sentences:
    doc = ner(sentence_text)
    # One line per predicted entity: text, start/end character offsets, label
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
    # Render the same predictions inline in the notebook
    displacy.render(doc,jupyter=True, style = "ent")

Torsade 0 7 B_Disease
ventricular 19 30 B_Disease
tachycardia 31 42 I_Disease
dilated 111 118 B_Disease
cardiomyopathy 119 133 I_Disease
heart 149 154 B_Disease
failure 155 162 I_Disease


heart 79 84 B_Disease
failure 85 92 I_Disease
dilated 106 113 B_Disease
cardiomyopathy 114 128 I_Disease
ventricular 156 167 B_Disease
arrhythmias 168 179 I_Disease
torsade 214 221 B_Disease
ventricular 233 244 B_Disease
tachycardia 245 256 I_Disease


torsade 15 22 B_Disease
ventricular 34 45 B_Disease
tachycardia 46 57 I_Disease
arrhythmias 138 149 B_Disease
rhythm 227 233 B_Disease
disturbances 234 246 I_Disease


dyspnea 109 116 B_Disease
loss 119 123 B_Disease
of 124 126 I_Disease
consciousness 127 140 I_Disease
rash 171 175 B_Disease
sensation 237 246 B_Disease
heat 250 254 I_Disease
pain 268 272 B_Disease


allergy 94 101 B_Disease


In [12]:
# Reload the saved pipeline and run it on a hand-written passage about
# selegiline-induced hypotension, then render the predicted entities inline.
ner = load_model("/content/spacy_example")
doc = ner("Selegiline - induced postural hypotension in Parkinson ' s disease : a longitudinal study on the effects of drug withdrawal.The aims of this study were to confirm our previous findings in a separate cohort of patients and to determine the time course of the cardiovascular consequences of stopping selegiline in the expectation that this might shed light on the mechanisms by which the drug causes orthostatic hypotension")
displacy.render(doc,jupyter=True, style = "ent")

In [13]:
# Reload the saved pipeline and run it on a hand-written passage about
# HIV / AIDS, then render the predicted entities inline.
ner = load_model("/content/spacy_example")
doc = ner("CD4 T cells are white blood cells that are specifically targeted and destroyed by HIV. Even if you have no symptoms, HIV infection progresses to AIDS when your CD4 T cell count dips below 200")
displacy.render(doc,jupyter=True, style = "ent")

In [14]:
# Reload the saved pipeline and run it on a hand-written passage about
# deep brain stimulation and Parkinson's disease, then render the predicted
# entities inline.
ner = load_model("/content/spacy_example")
doc = ner("Deep brain stimulation (DBS): A new surgical procedure that is very effective in treating Parkinson's disease. The surgery includes the implantation of permanent electrodes in various parts of the brain through which continuous pulses of electricity are given to control the symptoms of Parkinson's.")
displacy.render(doc,jupyter=True, style = "ent")