In [35]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "6"
import warnings
warnings.filterwarnings('ignore')
import spacy
import random
from spacy.util import minibatch, compounding
from spacy.training import Example

In [36]:
# Load the scibert model and create a blank NER model
nlp = spacy.load("en_core_sci_scibert")
ner_model = spacy.blank('en')

# Added NER to the blank NER model
ner = ner_model.add_pipe('ner', last=True)

# Combine the two models
ner_model.vocab.vectors = nlp.vocab.vectors

In [37]:
# Create Examples for training

# Load the training data
from spacy.tokens import DocBin

# Load the training data
TRAIN_DATA = [
    ("Homo sapiens is a species of primates.", {"entities": [(0, 11, "GENUS"), (12, 18, "SPECIES"), (32, 39, "FAMILY"), (40, 47, "ORDER")]}),
    ("Canis lupus is a species of canine.", {"entities": [(0, 5, "GENUS"), (6, 11, "SPECIES"), (25, 31, "FAMILY")]}),
    ("The Felidae family includes many species of cats.", {"entities": [(4, 11, "FAMILY")]}),
    ("The Panthera genus includes lions and tigers.", {"entities": [(4, 11, "GENUS"), (21, 26, "SPECIES"), (31, 37, "SPECIES")]}),
]

# Convert training data to Example objects
train_examples = []
labels = []
for text, anns in TRAIN_DATA:
    for start, end, label in anns['entities']:
        if label not in labels:
            labels.append(label)
    example = Example.from_dict(nlp.make_doc(text),anns)
    train_examples.append(example)

# Add the NER labels to the model
for label in labels:
    ner.add_label(label)

In [22]:
# Train the NER model
n_iter = 100
batch_size = 8
ner_model.begin_training()
for i in range(n_iter):
    random.shuffle(train_examples)
    batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
    losses = {}
    for batch in batches:
        ner_model.update(batch, losses=losses, drop=0.5)
    print(f"Iteration {i}: Loss={losses['ner']}")

# Save the models to the disk
ner_model.to_disk('ner_based_on_scibert')
nlp.to_disk('new_scibert')

Iteration 0: Loss=20.588891327381134
Iteration 1: Loss=20.19592845439911
Iteration 2: Loss=19.776997089385986
Iteration 3: Loss=19.32099425792694
Iteration 4: Loss=18.809664964675903
Iteration 5: Loss=18.144729018211365
Iteration 6: Loss=17.47956556081772
Iteration 7: Loss=16.77994340658188
Iteration 8: Loss=14.902659356594086
Iteration 9: Loss=13.470107197761536
Iteration 10: Loss=13.096961259841919
Iteration 11: Loss=11.247962027788162
Iteration 12: Loss=8.990310728549957
Iteration 13: Loss=7.360689893364906
Iteration 14: Loss=6.436672121286392
Iteration 15: Loss=6.242825888097286
Iteration 16: Loss=5.702791491523385
Iteration 17: Loss=5.21747977938503
Iteration 18: Loss=4.956456020125188
Iteration 19: Loss=4.72588619651151
Iteration 20: Loss=5.231876943318639
Iteration 21: Loss=5.668231579074927
Iteration 22: Loss=5.4027199214433494
Iteration 23: Loss=5.188111403083894
Iteration 24: Loss=4.983926041615632
Iteration 25: Loss=4.178911379058263
Iteration 26: Loss=5.127599981962703
Iter

In [31]:
labels

['GENUS', 'SPECIES', 'FAMILY', 'ORDER']

In [32]:
test_text, test_ann = TRAIN_DATA[2]

print(f'Test text: \n {test_text}')

Test text: 
 The Felidae family includes many species of cats.


In [33]:
# Test the trained model
doc_nlp = nlp(test_text)
print("Entities by SciBert", [(ent.text, ent.label_) for ent in doc_nlp.ents])

Entities by SciBert [('Felidae', 'ENTITY'), ('family', 'ENTITY'), ('species', 'ENTITY'), ('cats', 'ENTITY')]


In [34]:
doc_ner = ner_model(test_text)
print("Entities by trained NER", [(ent.text, ent.label_) for ent in doc_ner.ents])

Entities by trained NER [('Felidae', 'FAMILY')]
