In [59]:
# Name of the custom entity type; main() registers it on the NER component,
# and it must match the label string used in the TRAIN_DATA annotations.
LABEL = "CONCEPT"

In [60]:
# Training examples for the CONCEPT entity recognizer.
#
# Each item is (text, {'entities': [(start, end, label), ...]}) where start/end
# are character offsets into text and end is EXCLUSIVE, i.e. text[start:end]
# is exactly the annotated phrase. Offsets must fall on token boundaries;
# a span that stops one character short (e.g. "Bayesian networ") does not map
# onto whole tokens and cannot be used as a training annotation.
TRAIN_DATA = [
    ("A Bayesian network, Bayes network, belief network, Bayes(ian) model or probabilistic directed acyclic graphical "
     "model is a probabilistic graphical model (a type of statistical model) that represents a set of variables and "
     "their conditional dependencies via a directed acyclic graph (DAG).", {
        'entities': [(2, 18, 'CONCEPT'),      # Bayesian network
                     (20, 33, 'CONCEPT'),     # Bayes network
                     (35, 49, 'CONCEPT'),     # belief network
                     (51, 67, 'CONCEPT'),     # Bayes(ian) model
                     (71, 117, 'CONCEPT'),    # probabilistic directed acyclic graphical model
                     (123, 152, 'CONCEPT'),   # probabilistic graphical model
                     (164, 181, 'CONCEPT'),   # statistical model
                     (201, 204, 'CONCEPT'),   # set
                     (208, 217, 'CONCEPT'),   # variables
                     (228, 252, 'CONCEPT'),   # conditional dependencies
                     (259, 281, 'CONCEPT'),   # directed acyclic graph
                     (283, 286, 'CONCEPT')]   # DAG
    }),

    ("For example, a Bayesian network could represent the probabilistic relationships between diseases and symptoms. ", {
        'entities': [(15, 31, 'CONCEPT'),     # Bayesian network
                     (52, 79, 'CONCEPT')]     # probabilistic relationships
    }),

    ("Given symptoms, the network can be used to compute the probabilities of the presence of various diseases.", {
        'entities': [(20, 27, 'CONCEPT'),     # network
                     (55, 68, 'CONCEPT')]     # probabilities
    }),

    ("Formally, Bayesian networks are DAGs whose nodes represent variables in the Bayesian sense: they may be observable quantities, latent variables, unknown parameters or hypotheses.", {
        'entities': [(10, 27, 'CONCEPT'),     # Bayesian networks
                     (32, 36, 'CONCEPT'),     # DAGs
                     (43, 48, 'CONCEPT'),     # nodes
                     (59, 68, 'CONCEPT'),     # variables
                     (76, 84, 'CONCEPT'),     # Bayesian
                     (104, 125, 'CONCEPT'),   # observable quantities
                     (127, 143, 'CONCEPT'),   # latent variables
                     (145, 163, 'CONCEPT'),   # unknown parameters
                     (167, 177, 'CONCEPT')]   # hypotheses
    }),

    ("Edges represent conditional dependencies; nodes that are not connected (there is no path from one of the variables to the other in the Bayesian network) represent variables that are conditionally independent of each other.", {
        'entities': [(0, 5, 'CONCEPT'),       # Edges
                     (16, 40, 'CONCEPT'),     # conditional dependencies
                     (42, 47, 'CONCEPT'),     # nodes
                     (84, 88, 'CONCEPT'),     # path
                     (105, 114, 'CONCEPT'),   # variables
                     (135, 151, 'CONCEPT'),   # Bayesian network
                     (163, 172, 'CONCEPT'),   # variables
                     (182, 207, 'CONCEPT')]   # conditionally independent
    }),

    ("Each node is associated with a probability function that takes, as input, a particular set of values for the node's parent variables, and gives (as output) the probability (or probability distribution, if applicable) of the variable represented by the node.", {
        'entities': [(5, 9, 'CONCEPT'),       # node
                     (31, 51, 'CONCEPT'),     # probability function
                     (67, 72, 'CONCEPT'),     # input
                     (87, 90, 'CONCEPT'),     # set
                     (109, 113, 'CONCEPT'),   # node (of "node's")
                     (116, 132, 'CONCEPT'),   # parent variables
                     (160, 171, 'CONCEPT'),   # probability
                     (176, 200, 'CONCEPT'),   # probability distribution
                     (224, 232, 'CONCEPT'),   # variable
                     (252, 256, 'CONCEPT')]   # node
    })
]

In [77]:
# Model Building 

import random
from pathlib import Path

import spacy

def main(model = None, 
         new_model_name = 'concept',
         output_dir = None,
         n_iter = 20):
    """Train a spaCy NER component on TRAIN_DATA to recognise the LABEL entity.

    Uses the spaCy 2.x training API (create_pipe / begin_training / update).

    Parameters
    ----------
    model : str or None
        Name or path of an existing spaCy model to continue training. When
        None, a blank English pipeline is created.
    new_model_name : str
        Value stored in ``nlp.meta['name']`` before the model is saved.
    output_dir : str, path-like or None
        Directory the trained pipeline is written to. When None, nothing is
        saved and the reload check is skipped.
    n_iter : int
        Number of passes over the training examples.

    Returns
    -------
    None
        Results are reported via ``print`` (entities found in a fixed test
        sentence, plus save/load messages when ``output_dir`` is given).
    """
    
    # ============= Load or create a model ============= #
    
    if model is not None:
        nlp = spacy.load(model)
    else:
        nlp = spacy.blank('en')
        
    # ============= Add entity recognizer ============== #
    
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        # Reuse the existing component; the result must be bound to `ner`
        # because the label is added to it below.
        ner = nlp.get_pipe('ner')
        
    # ============= Add new entity label =============== #
    
    ner.add_label(LABEL)
    
    # ================ Add optimizer =================== #
    
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()
        
    # ============= Disable other pipes ================ #
    
    # Shuffle a private copy so repeated calls do not reorder the
    # module-level TRAIN_DATA list as a side effect.
    train_data = list(TRAIN_DATA)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for _ in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
    
    # ============== Test the Model ==================== #
    
    test_text = "Are Bayesian networks a concept? How about linear algebra? Are horses a concept?"
    
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
        
    # =============== Save the Model =================== #
    
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: nested targets work and an existing directory is
        # not an error.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

In [78]:
# Run with main()'s defaults: model is None, so a blank English pipeline is
# trained; output_dir is None, so nothing is saved to disk.
main()

Entities in 'Are Bayesian networks a concept? How about linear algebra? Are horses a concept?'
CONCEPT Bayesian networks
