In [1]:
import sys
import random
import os
from pathlib import Path
import shutil
import json

import argparse
import tqdm
import spacy
from spacy.gold import minibatch
from spacy.language import Language
from spacy import util

In [7]:
from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv
from scispacy.per_class_scorer import PerClassScorer
from scispacy.train_utils import evaluate_ner

In [19]:
from spacy_transformers import TransformersLanguage, TransformersWordPiecer, TransformersTok2Vec

name = "scibert-scivocab-uncased"
path = "/nfs/gns/literature/Santosh_Tirunagari/pretrained_word_embeddings/scibert_scivocab_uncased"

nlp = TransformersLanguage(trf_name=name, meta={"lang": "en"})
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, path))
nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, path))



In [14]:
def train_ner(output_dir: str,
              train_data_path: str,
              dev_data_path: str,
              test_data_path: str,
              run_test: bool = None,
#               model: str = None,
              n_iter: int = 10,
              meta_overrides: str = None):

    util.fix_random_seed(util.env_opt("seed", 0))
    train_data = read_ner_from_tsv(train_data_path)
    dev_data = read_ner_from_tsv(dev_data_path)
    test_data = read_ner_from_tsv(test_data_path)
    os.makedirs(output_dir, exist_ok=True)
    if run_test:
#         nlp = spacy.load(model)
#         print("Loaded model '%s'" % model)
        evaluate_ner(nlp, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(train_data, dev_data, test_data, output_dir, n_iter)

In [15]:
def train(train_data, dev_data, test_data, output_dir, n_iter):
    """Load the model, set up the pipeline and train the entity recognizer."""
    
    original_tokenizer = nlp.tokenizer

#     nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names and "parser" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="parser")
    elif 'ner' not in nlp.pipe_names and "tagger" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="tagger")
    elif 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    optimizer = nlp.begin_training()
    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):
        print(str(i)+'--'+str(n_iter))
        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)

        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               losses=losses, drop=next(dropout_rates))
                    pbar.update(len(batch))
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # save model to output directory
        output_dir_path = Path(output_dir + "/" + str(i))
        if not output_dir_path.exists():
            output_dir_path.mkdir()

        with nlp.use_params(optimizer.averages):
            nlp.tokenizer = original_tokenizer
            nlp.to_disk(output_dir_path)
            print("Saved model to", output_dir_path)

        # test the saved model
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i
    # save model to output directory
    best_model_path = Path(output_dir + "/" + "best")
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)),
                    best_model_path)

    # test the saved model
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)

    evaluate_ner(nlp2, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))

In [16]:
model_output_dir = '/nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/scibert-scivocab-uncased/'
train_data_path = '/nfs/gns/literature/machine-learning/Datasets/NER_Datasets/EBI_standard-IOB/train.csv'
dev_data_path = '/nfs/gns/literature/machine-learning/Datasets/NER_Datasets/EBI_standard-IOB/dev.csv'
test_data_path = '/nfs/gns/literature/machine-learning/Datasets/NER_Datasets/EBI_standard-IOB/test.csv'
run_test = False
# model_path = '/nfs/gns/literature/Santosh_Tirunagari/pretrained_word_embeddings/en-europepmc-lg' # None #'en_core_sci_md'
iterations = 5
meta_overrides = '/nfs/gns/literature/Santosh_Tirunagari/GitHub/scispacy/data/EPMC_ner.json'

In [20]:
train_ner(model_output_dir,
              train_data_path,
              dev_data_path,
              test_data_path,
              run_test,
#               model_path,
              iterations,
            )

  0%|          | 0/79401 [00:00<?, ?it/s]

0--5





KeyError: "[E001] No component 'trf_tok2vec' found in pipeline. Available names: ['ner']"

In [None]:
## Test best model performance on test set
best_model_path = '/nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/en-europepmc-lg/best'

print("Loading from", best_model_path)
nlp2 = util.load_model_from_path(best_model_path)

# nlp2.tokenizer = WhitespaceTokenizer(nlp2.vocab)

In [None]:
# [[113, 119, 'nortia', 'GP'], 
# [179, 184, 'ZmES4', 'GP'], 
# [146, 149, 'evn', 'GP'], [121, 124, 'nta', 'GP'], 
# [140, 144, 'evan', 'GP'], [127, 132, 'turan', 'GP'], 
# [72, 79, 'feronia', 'GP'], [88, 91, 'fer', 'GP'], 
# [107, 110, 'lre', 'GP'], [98, 105, 'lorelei', 'GP'], 
# [80, 86, 'sirène', 'GP'], [156, 177, 'Zea mays embryo sac 4', 'GP'], 
# [92, 95, 'srn', 'GP'], [134, 137, 'tun', 'GP']]

text = 'Interspecific PT overgrowth phenocopies the female gametophytic mutants feronia/sirène (fer/srn), lorelei (lre), nortia (nta), turan (tun), evan (evn), and Zea mays embryo sac 4 (ZmES4) RNAi-lines1314151617181920, which are defective in the reception of intraspecific PTs. '
sentence = nlp2(text)

print(sentence)
for ent in sentence.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)


In [None]:
from nltk.tokenize import WordPunctTokenizer, wordpunct_tokenize


def convert2IOB(text_data, ner_tags):
    tokenizer = WordPunctTokenizer()

    tokens = []
    ners = []
    spans = []

    split_text = tokenizer.tokenize(text_data)
    span_text = list(tokenizer.span_tokenize(text_data))
    # for each word token append 'O'
    arr = ['O'] * len(split_text)

    if ner_tags:
        try:
            ner_tags = literal_eval(ner_tags)
        except:
            pass

    elif isinstance(ner_tags, float) or ner_tags is None:
        return zip(split_text, arr)
    else:
        return zip(split_text, arr)

    for each_tag in ner_tags:
        span_list = (each_tag[0], each_tag[1])
        token_list = wordpunct_tokenize(each_tag[2])
        ner_list = wordpunct_tokenize(each_tag[3])

        if (len(token_list) > len(ner_list)):
            ner_list = len(token_list) * ner_list
        for i in range(0, len(ner_list)):
            # The logic here is look for the first B-tag and then append I-tag next
            if (i == 0):
                ner_list[i] = 'B-' + ner_list[i]
            else:
                ner_list[i] = 'I-' + ner_list[i]

        tokens.append(token_list)
        ners.append(ner_list)
        spans.append(span_list)

    split_token_span_list = list(zip(split_text, span_text))
    span_ner_list = list(zip(spans, ners))

    sub_spans = []  # get sub spans from the full spans of the ner

    for each_span_ner_list in span_ner_list:
        # in full range ner e.g., [144, 150, 'GM-CSF', 'GP']
        count = 0
        # count is to keep track of the B, I, sub tags in the ner list
        for each_token in split_token_span_list:
            sub_spans_ = find_sub_span(each_token[1], each_span_ner_list[0])
            if sub_spans_:
                sub_spans.append([sub_spans_, each_span_ner_list[1][count]])
                count = count + 1

    for i, each_span_token in enumerate(split_token_span_list):
        for each_ner_span in sub_spans:
            if each_span_token[1] == each_ner_span[0]:
                arr[i] = ''.join(each_ner_span[1])

    return zip(split_text, arr)


In [None]:
import pandas as pd
from tqdm import tqdm
from ast import literal_eval
import csv


def find_sub_span(sub_span_range, full_spans_range):
    # if a sub span is present in full span return it
    if sub_span_range[0] in range(full_spans_range[0], full_spans_range[1]):
        return sub_span_range
    
    

test_set = '/nfs/gns/literature/machine-learning/evaluation/300articles/CSV formats/test.csv'

result_path = '/nfs/gns/literature/machine-learning/evaluation/300articles/ML-NER/en-europepmc-lg/'


df_45 = pd.read_csv(test_set, sep = '\t', names = ['pmcid', 'sentence','ner'])

with open(result_path + 'en-europepmc-lg_iob.csv', 'a', newline='\n') as f1:
    ml_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')
    
    for index, row in tqdm(df_45.iterrows(), total=df_45.shape[0]):
#         print(row['ner'])
        text = row['sentence'].encode('utf-8').decode('utf-8')
#         print(text)
        sentence = nlp2(text.strip())
        ml_ner =[]
        for ent in sentence.ents:
            ml_ner.append([ent.start_char, ent.end_char, ent.text, ent.label_])

        tagged_tokens = convert2IOB(text, ml_ner)

        for each_word in tagged_tokens:
            ml_writer.writerow(list(each_word))
        ml_writer.writerow('')


In [None]:
## Test best model performance on 2000 set



from ast import literal_eval

epmc_annotations_2000 = '/nfs/gns/literature/machine-learning/evaluation/2000articles/europePMC-NER/annotations_API/full_sentences/tagged_sentences/Europe_PMC_annotation.csv'

result_path = '/nfs/gns/literature/machine-learning/evaluation/2000articles/ML-NER/en-pubmed-pmc-lg/'


df_2000 = pd.read_csv(epmc_annotations_2000, sep = '\t', names = ['pmcid', 'section', 'sentence','ner'])


with open(result_path + 'en-pubmed-pmc-lg_2000_iob.csv', 'a', newline='\n') as f1:
    ml_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')
    
    for index, row in tqdm(df_2000.iterrows(), total=df_2000.shape[0]):
#         print(row['ner'])
        text = row['sentence'].encode('utf-8').decode('utf-8')
#         print(text)
        sentence = nlp2(text.strip())
        ml_ner =[]
        for ent in sentence.ents:
            ml_ner.append([ent.start_char, ent.end_char, ent.text, ent.label_])

        tagged_tokens = convert2IOB(text, ml_ner)

        for each_word in tagged_tokens:
            ml_writer.writerow(list(each_word))
        ml_writer.writerow('')
        
        
