In [3]:
import utils
# auto load the changes of referenced codes
%load_ext autoreload
%autoreload 2

# enable auto-completion (turns off the Jedi completer)
%config Completer.use_jedi = False

# if new methods are created in referenced codes, run the following code.
%reload_ext autoreload

In [6]:
import json
import spacy
import random
from spacy.training import Example
from spacy.scorer import Scorer
from sklearn.model_selection import train_test_split

def get_model():
    """Create a blank English pipeline containing a single "ner" component.

    The pipeline state is printed before and after the component is added,
    and the three entity labels used in this notebook are registered on it.

    Returns:
        The newly created pipeline object.
    """
    pipeline = spacy.blank("en")
    print(f'nlp.pipe_names={pipeline.pipe_names}')

    ner_component = pipeline.add_pipe("ner")
    print(f'nlp.pipe_names={pipeline.pipe_names}')
    print(f'nlp.meta={pipeline.meta}')

    # Entity types: pathogen, medical condition, medicine
    for entity_label in ('Pathogen', 'MedicalCondition', 'Medicine'):
        ner_component.add_label(entity_label)
    print(f'ner.labels={ner_component.labels}')

    return pipeline

# Section banner, then build the untrained pipeline.
print('-' * 25, '创建初始模型', '-' * 25)
nlp = get_model()

# Rendering options: one highlight colour per entity label.
options = {
    'colors': {
        'Pathogen': "#56D7C4",
        'MedicalCondition': "#92E0AA",
        'Medicine': "lightgreen",
    },
}

print('-'*25, '数据准备', '-'*25)

# Read the annotations explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding.
with open("data/corona.json", encoding="utf-8") as f:
    data = json.load(f)

# Each record is (text, annotation); rebuild the annotation so that every
# entity span is a (start, end, label) tuple under the "entities" key.
data_ = [
    (text, {"entities": [(st, end, label) for st, end, label in annot["entities"]]})
    for text, annot in data
]

# Show the first record as a quick sanity check of the loaded data.
print(data_[0])

# Build the training and test data
print('-' * 25, '生成训练和测试数据', '-' * 25)

# Pair each tokenised text with its gold annotations.
examples = [
    Example.from_dict(nlp.make_doc(text), annots)
    for text, annots in data_
]

# Fixed random_state keeps the 80/20 split the same on every run.
train_examples, test_examples = train_test_split(
    examples,
    test_size=0.2,
    random_state=202109,
)

for subset_name, subset in (
    ('examples', examples),
    ('train_examples', train_examples),
    ('test_examples', test_examples),
):
    print(f'len({subset_name})={len(subset)}')

------------------------- 创建初始模型 -------------------------
nlp.pipe_names=[]
nlp.pipe_names=['ner']
nlp.meta={'lang': 'en', 'name': 'pipeline', 'version': '0.0.0', 'spacy_version': '>=3.1.1,<3.2.0', 'description': '', 'author': '', 'email': '', 'url': '', 'license': '', 'spacy_git_version': 'ffaead8fe', 'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None}, 'labels': {'ner': []}, 'pipeline': ['ner'], 'components': ['ner'], 'disabled': [], '_sourced_vectors_hashes': {}}
ner.labels=('MedicalCondition', 'Medicine', 'Pathogen')
------------------------- 数据准备 -------------------------
('Diarrhea, also spelled diarrhoea, is the condition of having at least three loose, liquid, or watery bowel movements each day.[2] It often lasts for a few days and can result in dehydration due to fluid loss.[2] Signs of dehydration often begin with loss of the normal stretchiness of the skin and irritable behaviour.[2] This can progress to decreased urination, loss of skin color, a fast heart rate,

In [9]:
# Section banner for the training cell.
print('-' * 25, '训练', '-' * 25)

def get_scores(nlp, examples):
    """Evaluate ``nlp`` on ``examples`` and return the entity-level metrics.

    Args:
        nlp: object exposing ``evaluate(examples)`` that returns a mapping
            with the keys ``ents_p``, ``ents_r`` and ``ents_f``.
        examples: the examples passed through to ``nlp.evaluate``.

    Returns:
        Tuple ``(precision, recall, f_score)``, each rounded to 3 decimals.
    """
    metrics = nlp.evaluate(examples)

    # Original author's note: the following gives exactly the same result.
    #   scorer = Scorer()
    #   predicted_examples = [Example(nlp(example.predicted.text), example.reference)
    #                         for example in examples]
    #   metrics = scorer.score(predicted_examples)
    return tuple(round(metrics[key], 3) for key in ('ents_p', 'ents_r', 'ents_f'))

def train(nlp, train_examples, test_examples, epochs, best_model_path):
    """Train ``nlp`` and keep the checkpoint with the best test F-score on disk.

    Args:
        nlp: pipeline to train; it is initialised from the training examples
            at the start of every call.
        train_examples: examples used for the updates. The caller's sequence
            is not reordered; shuffling happens on a private copy.
        test_examples: examples scored after every epoch.
        epochs: number of passes over the training data.
        best_model_path: directory the best-scoring pipeline is saved to.

    Note:
        The checkpoint is selected on ``test_examples``, so the test score of
        the saved model is optimistic; a separate dev split would avoid that.
    """
    from pathlib import Path  # local import keeps this block self-contained

    # Make sure the parent directories exist before the first save, so a
    # fresh checkout without the output folder does not fail mid-training.
    Path(best_model_path).parent.mkdir(parents=True, exist_ok=True)

    # Work on a copy so random.shuffle does not reorder the caller's list.
    shuffled_examples = list(train_examples)
    optimizer = nlp.initialize(lambda: shuffled_examples)

    best_f1 = None
    for itn in range(epochs):
        losses = {}
        random.shuffle(shuffled_examples)
        for batch in spacy.util.minibatch(shuffled_examples, size=8):
            # Pass the optimizer returned by initialize() explicitly.
            nlp.update(batch, sgd=optimizer, losses=losses)

        train_scores = get_scores(nlp, shuffled_examples)
        test_scores = get_scores(nlp, test_examples)
        f1 = test_scores[2]

        # Stays defined even if no update recorded a loss under "ner".
        loss = losses.get('ner', 0.0)
        print((f"{itn+1:>2}/{epochs} "
               f"train=(loss:{round(loss, 3):>8}, p:{train_scores[0]:>5}, r:{train_scores[1]:>5}, f:{train_scores[2]:>5})"
               f" test=(p:{test_scores[0]:>5}, r:{test_scores[1]:>5}, f:{test_scores[2]:>5})"))

        # Save only when the test F-score strictly improves (or on the first epoch).
        if best_f1 is None or f1 > best_f1:
            best_f1 = f1
            nlp.to_disk(best_model_path)
              
best_model_path = 'output/first_ner/best'

# Seed the random number generators before training so that shuffling and
# weight initialisation are repeatable across runs (same value as the split).
spacy.util.fix_random_seed(202109)
train(nlp, train_examples, test_examples, epochs=40, best_model_path=best_model_path)

------------------------- 训练 -------------------------
 1/40 train=(loss: 1057.23, p:  0.0, r:  0.0, f:  0.0) test=(p:  0.0, r:  0.0, f:  0.0)
 2/40 train=(loss:1307.158, p:  0.0, r:  0.0, f:  0.0) test=(p:  0.0, r:  0.0, f:  0.0)
 3/40 train=(loss:1140.649, p:  0.0, r:  0.0, f:  0.0) test=(p:  0.0, r:  0.0, f:  0.0)
 4/40 train=(loss: 644.402, p:  0.0, r:  0.0, f:  0.0) test=(p:  0.0, r:  0.0, f:  0.0)
 5/40 train=(loss: 234.409, p:  0.0, r:  0.0, f:  0.0) test=(p:  0.0, r:  0.0, f:  0.0)
 6/40 train=(loss:  200.43, p:  0.0, r:  0.0, f:  0.0) test=(p:  0.0, r:  0.0, f:  0.0)
 7/40 train=(loss: 195.348, p:  0.0, r:  0.0, f:  0.0) test=(p:  0.0, r:  0.0, f:  0.0)
 8/40 train=(loss: 175.004, p:  0.0, r:  0.0, f:  0.0) test=(p:  0.0, r:  0.0, f:  0.0)
 9/40 train=(loss: 178.695, p:0.091, r:0.019, f:0.031) test=(p:  0.0, r:  0.0, f:  0.0)
10/40 train=(loss: 161.866, p:0.031, r: 0.01, f:0.015) test=(p:  0.0, r:  0.0, f:  0.0)
11/40 train=(loss: 161.612, p:0.188, r:0.057, f:0.088) test=(p:  

In [10]:
# Load the pipeline stored at best_model_path.
# Equivalent to the following statement:
# nlp = get_model().from_disk(best_model_path)
nlp = spacy.load(best_model_path)

train_scores = get_scores(nlp, train_examples)
test_scores = get_scores(nlp, test_examples)

# One summary line per split; the leading space keeps " test" aligned with "train".
for split_tag, (prec, rec, fscore) in (("train", train_scores), (" test", test_scores)):
    print(f"{split_tag}=(p:{prec:>5}, r:{rec:>5}, f:{fscore:>5})")

train=(p: 0.81, r:0.771, f: 0.79)
 test=(p:0.094, r:0.125, f:0.107)


In [12]:
def show_results(nlp, texts):
    """Process ``texts`` with ``nlp`` and render the entities of each document.

    Every document is preceded by an 80-character dashed separator. The
    entity colours come from the notebook-level ``options`` dict.
    """
    separator = '-' * 80
    # Process all texts first, then render them one by one.
    for parsed in [*nlp.pipe(texts)]:
        print(separator)
        spacy.displacy.render(parsed, style='ent', options=options)

# Hand-picked passages used to visually inspect the model's predictions.
texts = [
    "One of the bacterial diseases with the highest disease burden is tuberculosis, caused by Mycobacterium tuberculosis bacteria, which kills about 2 million people a year.",
    "Pathogenic bacteria contribute to other globally important diseases, such as pneumonia, which can be caused by bacteria such as Streptococcus and Pseudomonas, and foodborne illnesses, which can be caused by bacteria such as Shigella, Campylobacter, and Salmonella. Pathogenic bacteria also cause infections such as tetanus, typhoid fever, diphtheria, syphilis, and leprosy. Pathogenic bacteria are also the cause of high infant mortality rates in developing countries.",
]

show_results(nlp, texts)

--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


In [14]:
# Compare the model's prediction with the gold annotation for the first test example.
for example in test_examples[:1]:
    print('=' * 80)
    doc = nlp(example.predicted.text)

    # Nothing was predicted: fall back to printing the raw text.
    if not doc.ents:
        print(doc.text)
        continue

    print('-' * 30, 'predicted', '-' * 30)
    spacy.displacy.render(doc, style='ent', options=options)
    print('-' * 30, 'reference', '-' * 30)
    spacy.displacy.render(example.reference, style='ent', options=options)

------------------------------ predicted ------------------------------


------------------------------ reference ------------------------------
