In [1]:
import sys
import random
import os
from pathlib import Path
import shutil
import json

import argparse
import tqdm
import spacy
from spacy.gold import minibatch
from spacy.language import Language
from spacy import util

from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv
from scispacy.per_class_scorer import PerClassScorer
from scispacy.train_utils import evaluate_ner

import en_core_sci_sm


In [2]:
def train_ner(output_dir: str,
              train_data_path: str,
              dev_data_path: str,
              test_data_path: str,
              run_test: bool = None,
              model: str = None,
              n_iter: int = 10,
              meta_overrides: str = None):
    """Train an NER model, or only evaluate an existing one.

    Parameters
    ----------
    output_dir : directory that receives checkpoints and the
        ``dev_metrics.json`` / ``test_metrics.json`` dumps; created if missing.
    train_data_path, dev_data_path, test_data_path : files read with
        ``read_ner_from_tsv``.
    run_test : when truthy, skip training and evaluate ``model`` on the dev
        and test sets instead.
    model : name or path handed to ``spacy.load``. Required when ``run_test``
        is truthy; when training, ``None`` lets ``train`` pick its default.
    n_iter : number of training epochs (ignored when ``run_test`` is truthy).
    meta_overrides : optional path to a JSON file forwarded to ``train``
        (ignored when ``run_test`` is truthy).

    Raises
    ------
    ValueError
        If ``run_test`` is truthy but no ``model`` was given.
    """
    # Fail fast: evaluation-only mode has nothing to evaluate without a model,
    # and spacy.load(None) would otherwise fail later with an obscure error.
    if run_test and model is None:
        raise ValueError("run_test=True requires `model` to name or point to a trained model")

    util.fix_random_seed(util.env_opt("seed", 0))
    train_data = read_ner_from_tsv(train_data_path)
    dev_data = read_ner_from_tsv(dev_data_path)
    test_data = read_ner_from_tsv(test_data_path)
    os.makedirs(output_dir, exist_ok=True)

    if run_test:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        evaluate_ner(nlp, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(model, train_data, dev_data, test_data, output_dir, n_iter, meta_overrides)

In [3]:
def train(model, train_data, dev_data, test_data, output_dir, n_iter, meta_overrides):
    """Load the model, set up the pipeline and train the entity recognizer.

    After every epoch the pipeline is saved to ``<output_dir>/<epoch>``,
    reloaded from disk and scored on ``dev_data``. The epoch with the highest
    ``"f1-measure-overall"`` is copied to ``<output_dir>/best`` and evaluated
    on the dev and test sets, passing ``dev_metrics.json`` /
    ``test_metrics.json`` under ``output_dir`` as ``dump_path``.

    Parameters
    ----------
    model : name or path for ``spacy.load``; ``None`` loads the
        ``en_core_sci_sm`` package instead.
    train_data : iterable of ``(text_or_doc, annotations)`` pairs, where
        ``annotations`` is a dict whose ``'entities'`` items carry the label
        at index 2. The input is copied, so the caller's list is not shuffled.
    dev_data, test_data : evaluation sets handed to ``evaluate_ner``.
    output_dir : directory for per-epoch checkpoints, the ``best`` copy and
        the metrics files.
    n_iter : number of epochs; must be at least 1.
    meta_overrides : optional path to a JSON file merged into ``nlp.meta``.

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1 (there would be no checkpoint to
        promote to ``best``).
    """
    # Without at least one epoch no checkpoint exists; the original code would
    # then try to copy a missing (or stale, from a previous run) epoch-0 folder.
    if n_iter < 1:
        raise ValueError("n_iter must be at least 1, got %r" % (n_iter,))

    # Shuffle a private copy so the caller's list keeps its order.
    train_data = list(train_data)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        # NOTE: the message below says "blank", but this loads the
        # en_core_sci_sm package rather than a blank Language object.
        nlp = en_core_sci_sm.load()
        print("Created blank 'en_core_sci_sm' model")

    if meta_overrides is not None:
        # Context manager so the file handle is closed deterministically.
        with open(meta_overrides) as meta_file:
            nlp.meta.update(json.load(meta_file))

    # Reuse an existing NER pipe, otherwise create one and place it after the
    # parser, else after the tagger, else at the end of the pipeline.
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.create_pipe('ner')
        if "parser" in nlp.pipe_names:
            nlp.add_pipe(ner, after="parser")
        elif "tagger" in nlp.pipe_names:
            nlp.add_pipe(ner, after="tagger")
        else:
            nlp.add_pipe(ner, last=True)

    # Register every label seen in the training annotations; examples without
    # an 'entities' entry are tolerated.
    for _, annotations in train_data:
        for ent in annotations.get('entities') or []:
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    optimizer = nlp.begin_training()
    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):
        print(str(i)+'--'+str(n_iter))
        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)

        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               losses=losses, drop=next(dropout_rates))
                    pbar.update(len(batch))
                    # Report the accumulated loss every 100 batches.
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # Save this epoch's model; create missing parents so train() also
        # works when called directly with a not-yet-existing output_dir.
        output_dir_path = Path(output_dir) / str(i)
        output_dir_path.mkdir(parents=True, exist_ok=True)

        # Serialize with the optimizer's averaged parameters.
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir_path)
            print("Saved model to", output_dir_path)

        # Score the model exactly as it was written to disk.
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i

    # Promote the best epoch's checkpoint to <output_dir>/best, replacing any
    # copy left over from a previous run.
    best_model_path = Path(output_dir) / "best"
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)),
                    best_model_path)

    # Final evaluation of the promoted model on dev and test.
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)

    evaluate_ner(nlp2, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))

In [4]:
# Where the relation dataset lives and where trained models are written.
datapath = '/nfs/gns/literature/Santosh_Tirunagari/GitHub/Unsupervised-Protein-Genes-Diseases-Extraction/Datasets/relation_dataset/'
model_output_dir = '/nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/'

# The three dataset splits, all located directly under `datapath`.
train_data_path, dev_data_path, test_data_path = (
    os.path.join(datapath, split_file)
    for split_file in ('train.csv', 'devel.csv', 'test.csv')
)

# Run settings passed to train_ner.
run_test = False      # falsy -> train_ner trains instead of only evaluating
model_path = None     # None -> train() falls back to en_core_sci_sm
iterations = 10       # number of training epochs
meta_overrides = '/nfs/gns/literature/Santosh_Tirunagari/GitHub/scispacy/data/EPMC_ner.json'

In [5]:
# Kick off training with the settings from the configuration cell; keyword
# arguments make the mapping onto train_ner's parameters explicit.
train_ner(
    output_dir=model_output_dir,
    train_data_path=train_data_path,
    dev_data_path=dev_data_path,
    test_data_path=test_data_path,
    run_test=run_test,
    model=model_path,
    n_iter=iterations,
    meta_overrides=meta_overrides,
)

  0%|          | 2/488 [00:00<00:39, 12.38it/s]

Created blank 'en_core_sci_sm' model
0--10


 21%|██        | 103/488 [00:07<00:28, 13.50it/s]

sum loss: 1286.907295449678


 42%|████▏     | 203/488 [00:14<00:20, 14.06it/s]

sum loss: 2433.5036239369633


 62%|██████▏   | 303/488 [00:21<00:11, 15.58it/s]

sum loss: 3415.7817587864565


 83%|████████▎ | 403/488 [00:28<00:05, 15.38it/s]

sum loss: 4466.731679697009


100%|██████████| 488/488 [00:34<00:00, 14.33it/s]


Saved model to /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/0
Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/0


  8%|▊         | 5/61 [00:00<00:01, 44.17it/s]

Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 43.78it/s]
  0%|          | 2/488 [00:00<00:35, 13.77it/s]

precision-untyped: 		 0.6666666666666662
recall-untyped: 		 0.5079365079365077
f1-measure-untyped: 		 0.5765765765765272
precision-overall: 		 0.5972222222222218
recall-overall: 		 0.4550264550264548
f1-measure-overall: 		 0.5165165165164671
1--10


 21%|██        | 103/488 [00:07<00:27, 13.87it/s]

sum loss: 985.1858874339955


 42%|████▏     | 203/488 [00:15<00:24, 11.55it/s]

sum loss: 2058.910013603808


 82%|████████▏ | 401/488 [00:22<00:02, 30.76it/s]

sum loss: 4025.8034852732217


100%|██████████| 488/488 [00:24<00:00, 19.60it/s]


Saved model to /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/1
Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/1


  8%|▊         | 5/61 [00:00<00:01, 43.36it/s]

Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 51.60it/s]
  1%|          | 4/488 [00:00<00:14, 33.35it/s]

precision-untyped: 		 0.7382550335570465
recall-untyped: 		 0.5820105820105816
f1-measure-untyped: 		 0.6508875739644474
precision-overall: 		 0.6979865771812075
recall-overall: 		 0.5502645502645499
f1-measure-overall: 		 0.6153846153845658
2--10


 42%|████▏     | 206/488 [00:06<00:09, 28.45it/s]

sum loss: 1865.6044609369844


 83%|████████▎ | 404/488 [00:13<00:02, 30.18it/s]

sum loss: 3688.8786141548567


100%|██████████| 488/488 [00:16<00:00, 29.73it/s]


Saved model to /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/2
Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/2


  8%|▊         | 5/61 [00:00<00:01, 47.43it/s]

Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 43.00it/s]
  1%|          | 4/488 [00:00<00:17, 28.14it/s]

precision-untyped: 		 0.7251461988304089
recall-untyped: 		 0.6560846560846557
f1-measure-untyped: 		 0.6888888888888387
precision-overall: 		 0.6959064327485376
recall-overall: 		 0.6685393258426962
f1-measure-overall: 		 0.6819484240687176
3--10


 60%|██████    | 294/488 [00:07<00:04, 45.17it/s]

sum loss: 2494.9477137164777


100%|██████████| 488/488 [00:12<00:00, 39.62it/s]


Saved model to /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/3
Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/3


  8%|▊         | 5/61 [00:00<00:01, 49.40it/s]

Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 51.65it/s]
  1%|          | 6/488 [00:00<00:08, 54.26it/s]

precision-untyped: 		 0.7469879518072284
recall-untyped: 		 0.6560846560846557
f1-measure-untyped: 		 0.6985915492957244
precision-overall: 		 0.7289156626506019
recall-overall: 		 0.6402116402116398
f1-measure-overall: 		 0.6816901408450202
4--10


 64%|██████▍   | 312/488 [00:07<00:03, 45.74it/s]

sum loss: 2518.0633731764133


100%|██████████| 488/488 [00:10<00:00, 45.56it/s]


Saved model to /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/4
Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/4


  8%|▊         | 5/61 [00:00<00:01, 44.28it/s]

Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 43.49it/s]
  2%|▏         | 8/488 [00:00<00:10, 44.15it/s]

precision-untyped: 		 0.797546012269938
recall-untyped: 		 0.6878306878306875
f1-measure-untyped: 		 0.7386363636363135
precision-overall: 		 0.779141104294478
recall-overall: 		 0.6719576719576715
f1-measure-overall: 		 0.7215909090908589
5--10


 84%|████████▎ | 408/488 [00:08<00:01, 46.83it/s]

sum loss: 3271.3927103331544


100%|██████████| 488/488 [00:09<00:00, 49.30it/s]


Saved model to /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/5
Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/5


  8%|▊         | 5/61 [00:00<00:01, 44.20it/s]

Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 43.98it/s]
  2%|▏         | 8/488 [00:00<00:08, 55.83it/s]

precision-untyped: 		 0.7848837209302321
recall-untyped: 		 0.7142857142857139
f1-measure-untyped: 		 0.7479224376730799
precision-overall: 		 0.7558139534883715
recall-overall: 		 0.6878306878306875
f1-measure-overall: 		 0.720221606648149
6--10


 90%|█████████ | 441/488 [00:07<00:00, 72.96it/s]

sum loss: 3290.425525144984


100%|██████████| 488/488 [00:08<00:00, 60.38it/s]


Saved model to /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/6
Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/6


  8%|▊         | 5/61 [00:00<00:01, 44.43it/s]

Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 41.58it/s]
  1%|          | 5/488 [00:00<00:13, 35.20it/s]

precision-untyped: 		 0.7919075144508665
recall-untyped: 		 0.7248677248677244
f1-measure-untyped: 		 0.756906077348016
precision-overall: 		 0.7745664739884388
recall-overall: 		 0.7089947089947086
f1-measure-overall: 		 0.7403314917126568
7--10


100%|██████████| 488/488 [00:08<00:00, 58.02it/s]


Saved model to /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/7
Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/7


  8%|▊         | 5/61 [00:00<00:01, 44.09it/s]

Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 42.13it/s]
  2%|▏         | 10/488 [00:00<00:07, 61.06it/s]

precision-untyped: 		 0.7714285714285709
recall-untyped: 		 0.7142857142857139
f1-measure-untyped: 		 0.7417582417581913
precision-overall: 		 0.7599999999999995
recall-overall: 		 0.7037037037037033
f1-measure-overall: 		 0.7307692307691804
8--10


100%|██████████| 488/488 [00:07<00:00, 61.45it/s]


Saved model to /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/8
Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/8


  8%|▊         | 5/61 [00:00<00:01, 42.84it/s]

Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 44.23it/s]
  2%|▏         | 12/488 [00:00<00:06, 77.87it/s]

precision-untyped: 		 0.7643678160919535
recall-untyped: 		 0.7037037037037033
f1-measure-untyped: 		 0.7327823691459552
precision-overall: 		 0.7471264367816087
recall-overall: 		 0.6878306878306875
f1-measure-overall: 		 0.7162534435261205
9--10


100%|██████████| 488/488 [00:07<00:00, 68.17it/s]


Saved model to /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/9
Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/9


 10%|▉         | 6/61 [00:00<00:01, 53.29it/s]

Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 49.79it/s]


precision-untyped: 		 0.7836257309941516
recall-untyped: 		 0.7089947089947086
f1-measure-untyped: 		 0.7444444444443942
precision-overall: 		 0.771929824561403
recall-overall: 		 0.698412698412698
f1-measure-overall: 		 0.7333333333332832
Best Epoch: 6 of 10
Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/best


  8%|▊         | 5/61 [00:00<00:01, 44.83it/s]

Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 48.03it/s]
  8%|▊         | 5/61 [00:00<00:01, 47.24it/s]

precision-untyped: 		 0.7919075144508665
recall-untyped: 		 0.7248677248677244
f1-measure-untyped: 		 0.756906077348016
precision-overall: 		 0.7745664739884388
recall-overall: 		 0.7089947089947086
f1-measure-overall: 		 0.7403314917126568
Evaluating 61 rows


100%|██████████| 61/61 [00:01<00:00, 47.50it/s]


precision-untyped: 		 0.7387387387387384
recall-untyped: 		 0.7999999999999996
f1-measure-untyped: 		 0.7681498829039309
precision-overall: 		 0.6568047337278102
recall-overall: 		 0.7449664429530196
f1-measure-overall: 		 0.6981132075471196


In [7]:
# Reload the best checkpoint written by train() so later cells can run
# inference with it.
best_model_path = model_output_dir+'best/'

print("Loading from", best_model_path)
nlp2 = util.load_model_from_path(best_model_path)
# The former `doc = nlp2(text)` line was dropped: `text` is only assigned in a
# later cell, so on a fresh kernel (Restart & Run All) this cell raised
# NameError, and `doc` was never used afterwards.


Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/best/


In [16]:
import pandas as pd

# Read the expert-agreed sentences; column names are supplied explicitly.
df = pd.read_csv(
    '/nfs/gns/literature/Santosh_Tirunagari/GitHub/Unsupervised-Protein-Genes-Diseases-Extraction/Notebooks/SpaCy NER and TextCat/agreed_.csv',
    names=['expert', 'sentence', 'label'],
)
# Only the sentence text and its label are needed downstream.
agreed_df = df.loc[:, ['sentence', 'label']]


In [32]:
# Run the reloaded model on one sentence from the agreed set (row 30) and
# list each predicted entity with its character offsets and label.
text = agreed_df['sentence'].iloc[30]
sentence = nlp2(text)
print(text)
for ent in sentence.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

Urinary TGF-β1 excretion increased in patients with crescentic nephritis (CN) (n = 15 p < 0.01) and with various types of proliferative GN (vGN) (n = 12 p < 0.05) compared to healthy subjects (n = 10).
increased 25 34 POSITIVE_REGULATION


In [34]:
# Build an installable package skeleton from the `best/` model directory using
# spaCy's `package` CLI; the package is written under `relation_ner/model`.
!python -m spacy package /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/best/ /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/model

[38;5;2m✔ Loaded meta.json from file[0m
/nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/best/meta.json
[38;5;2m✔ Successfully created package 'en_ner_europepmc_md-0.2.5'[0m
/nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/relation_ner/model/en_ner_europepmc_md-0.2.5
To build the package, run `python setup.py sdist` in this directory.
