In [18]:
# from flair.models import TextClassifier
# from flair.data import Sentence, Token
# from flair.models import SequenceTagger
# NOTE(review): the imports above are commented out, so SequenceTagger (and the
# Sentence / Token names used by later cells) must already exist in the kernel.

# Directory that contains one sub-folder per trained NER model.
flair_models = '/nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/'

# Taggers stored under the gene, disease and organisms model folders.
gene_model = SequenceTagger.load(f'{flair_models}multi_bio_ner_model_gene/v01/best-model.pt')
disease_model = SequenceTagger.load(f'{flair_models}multi_bio_ner_model_disease/best-model.pt')
organisms_model = SequenceTagger.load(f'{flair_models}multi_bio_ner_model_organisms/v01/best-model.pt')

# Tagger stored under the manually annotated dataset folder.
empc_model = SequenceTagger.load(f'{flair_models}manual_annotated_dataset/best-model.pt')


2019-11-25 09:21:08,479 loading file /nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model_gene/v01/best-model.pt
2019-11-25 09:21:17,415 loading file /nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model_organisms/v01/best-model.pt


In [6]:
import csv
# Tab-separated IOB file that load_IOBdataset reads (NCBI disease test split, per its path).
data_path = '/nfs/gns/literature/machine-learning/Datasets/NER_Datasets/NCBI-disease-IOB/test.tsv'
# Entity types to keep when loading; load_IOBdataset replaces tags of any other type with 'O'.
targets = ['Disease']
                
def load_IOBdataset(data_path,targets):
    """
    Load a tab-separated IOB dataset (one token per line, blank line between sentences).

    The first column of each line is taken as the token and the last column as
    its tag.

    :param data_path: path to the tab-separated IOB file
    :type data_path: str
    :param targets: entity types to keep (matched against the part of the tag
        after the last '-'); tags of any other type are replaced with 'O'.
        When empty or None every tag is kept unchanged.
    :type targets: List[str]
    :return: ``(X, y)`` where ``X`` holds each sentence as a single
        space-joined string and ``y`` holds the matching list of tags
    :rtype: Tuple[List[str], List[List[str]]]
    """
    X = []
    y = []

    # Build the lookup set once instead of once per token.
    wanted = set(targets) if targets else None

    X_sent = []
    y_sent = []
    # newline='' is the documented way to open a file for the csv module.
    with open(data_path, 'r', newline='') as f:
        # QUOTE_NONE: a '"' is an ordinary token in an IOB file. With the default
        # quoting the reader would treat it as the start of a quoted field and
        # swallow the following lines.
        csv_reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for line in csv_reader:
            if line:
                token, tag = line[0], line[-1]
                X_sent.append(token)
                if wanted is None or tag.split('-')[-1] in wanted:
                    y_sent.append(tag)
                else:
                    y_sent.append('O')
            else:
                # we reach the end of a sentence
                if X_sent:
                    X.append(' '.join(X_sent))
                    y.append(y_sent)
                X_sent = []
                y_sent = []

    # Flush the last sentence when the file does not end with a blank line;
    # previously that sentence was silently dropped.
    if X_sent:
        X.append(' '.join(X_sent))
        y.append(y_sent)
    return X, y

# NCBI_test: one space-joined string per sentence; y: the matching per-sentence tag lists.
NCBI_test,y = load_IOBdataset(data_path,targets)

In [131]:
from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
from typing import List
def custom_tokenizer(text: str) -> List[Token]:
    """
    Tokenize ``text`` with NLTK's ``WordPunctTokenizer`` and wrap every piece
    in a flair ``Token``.

    Each token records the character offset at which it starts in ``text``
    (``start_position``) and whether it is directly followed by whitespace
    (``whitespace_after``). Earlier revisions passed the token *index* as
    ``start_position`` and always ``whitespace_after=False``, which does not
    describe the original string.

    :param text: raw sentence text
    :return: tokens in reading order
    """
    tokens: List[Token] = []
    text_length = len(text)

    # span_tokenize yields (start, end) character spans for the same pieces
    # that tokenize() returns as strings.
    for start, end in WordPunctTokenizer().span_tokenize(text):
        tokens.append(
            Token(
                text=text[start:end],
                start_position=start,
                whitespace_after=end < text_length and text[end].isspace(),
            )
        )

    return tokens

# custom_tokenizer(text_temp_)

In [160]:
# from flair.data import segtok_tokenizer

tokenizer = WordPunctTokenizer()

text_temp = 'Although mutations of JMJD2B have been suggested to be responsible for neurodevelopmental disorders, the function of JMJD2B in the central nervous system (CNS) remains to be elucidated. '

# Every model gets its own Sentence object built from the same text, so the
# predictions of one tagger never end up on another tagger's sentence.
sentence_1, sentence_2, sentence_3, sentence_4 = (
    Sentence(text_temp, use_tokenizer=custom_tokenizer) for _copy in range(4)
)

empc_model.predict(sentence_1)
disease_model.predict(sentence_2)
gene_model.predict(sentence_3)
# Kept last so that its return value is what the cell displays.
organisms_model.predict(sentence_4)

[Sentence: "Although mutations of JMJD2B have been suggested to be responsible for neurodevelopmental disorders , the function of JMJD2B in the central nervous system ( CNS ) remains to be elucidated ." - 31 Tokens]

In [161]:
# Serialise the 'ner' tags of each tagged sentence, in the order
# empc_model, disease_model, gene_model, organisms_model.
manual_json, ncbi_json, bc2gm_json, linn_json = (
    tagged_sentence.to_dict(tag_type='ner')
    for tagged_sentence in (sentence_1, sentence_2, sentence_3, sentence_4)
)

In [162]:
# Display the entities predicted by empc_model (sentence_1).
manual_json

{'text': 'Although mutations of JMJD2B have been suggested to be responsible for neurodevelopmental disorders, the function of JMJD2B in the central nervous system (CNS) remains to be elucidated.',
 'labels': [],
 'entities': [{'text': 'JMJD2B',
   'start_pos': 22,
   'end_pos': 28,
   'type': 'GP',
   'confidence': 0.998449444770813},
  {'text': 'neurodevelopmental disorders',
   'start_pos': 71,
   'end_pos': 99,
   'type': 'DS',
   'confidence': 0.8485407531261444},
  {'text': 'JMJD2B',
   'start_pos': 117,
   'end_pos': 123,
   'type': 'GP',
   'confidence': 0.9984619617462158}]}

In [158]:
# Display the entities predicted by disease_model (sentence_2).
# NOTE(review): the saved output of this cell has execution count 158, earlier than
# the cell that builds ncbi_json (161), and shows a different sentence - it is stale.
ncbi_json

{'text': 'Provoked by public pressure and triggered by an increasing number of lethal lung diseases over the last few decades [1, 2], more and more studies in the field of inhalation toxicology now concentrate on the understanding of particle-lung interactions.',
 'labels': [],
 'entities': [{'text': 'lethal lung diseases',
   'start_pos': 69,
   'end_pos': 89,
   'type': 'DS',
   'confidence': 0.7132398088773092}]}

In [151]:
def extract_entities_from_json(man,ncbi,bc2gm,linn):
    """
    Merge the entities of four ``to_dict``-style results into one de-duplicated list.

    Entities from ``man`` keep their own ``'type'``; entities from ``ncbi``,
    ``bc2gm`` and ``linn`` are reported with the fixed types 'DS', 'GP' and
    'OG' respectively. The input dictionaries are not modified (earlier
    revisions overwrote ``'type'`` in place, which changed what the callers'
    variables displayed afterwards).

    :param man: result whose entity types are used as-is
    :param ncbi: result whose entities are all reported as 'DS'
    :param bc2gm: result whose entities are all reported as 'GP'
    :param linn: result whose entities are all reported as 'OG'
    :return: unique ``[start_pos, end_pos, text, type]`` lists, in first-seen
        order (``man``, then ``ncbi``, ``bc2gm``, ``linn``)
    :rtype: List[list]
    """
    # (result, type to report); None means "use the entity's own type".
    sources = ((man, None), (ncbi, 'DS'), (bc2gm, 'GP'), (linn, 'OG'))

    seen = set()
    unique_entities = []
    for annotations, forced_type in sources:
        for ea_an in annotations['entities']:
            entity_type = ea_an['type'] if forced_type is None else forced_type
            # Tuples are hashable, so they serve as the de-duplication key.
            key = (ea_an['start_pos'], ea_an['end_pos'], ea_an['text'], entity_type)
            if key not in seen:
                seen.add(key)
                unique_entities.append(list(key))
    return unique_entities

In [156]:
# Merge the four models' entities for the current sentence.
# NOTE(review): the saved output belongs to execution 156, i.e. an earlier run than
# the cell that builds the *_json variables (161), so it shows a different sentence.
extract_entities_from_json(manual_json,ncbi_json,bc2gm_json,linn_json)

[[69, 89, 'lethal lung diseases', 'DS']]

In [153]:
# NOTE(review): same call as the preceding cell; the saved output comes from an
# even earlier execution (153) on yet another sentence and is stale.
extract_entities_from_json(manual_json,ncbi_json,bc2gm_json,linn_json)

[[13, 16, 'aru', 'GP'],
 [332, 343, 'A. thaliana', 'OG'],
 [120, 124, 'OST3', 'GP'],
 [142, 175, 'oligosaccharyltransferase complex', 'GP'],
 [120, 134, 'OST3/6 subunit', 'GP'],
 [142, 167, 'oligosaccharyltransferase', 'GP'],
 [104, 107, 'ARU', 'GP'],
 [13, 24, 'aru mutants', 'GP'],
 [332, 348, 'A. thaliana OST3', 'OG'],
 [83, 94, 'A. thaliana', 'OG']]

In [None]:
from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
from tqdm import tqdm
import csv
import pandas as pd

data_path = '/nfs/gns/literature/machine-learning/Datasets/NER_Datasets/NCBI-disease-IOB/test.tsv'
targets = ['Disease']
NCBI_test,y = load_IOBdataset(data_path,targets)

result_path = '/nfs/gns/literature/machine-learning/fused_models_on_test_data/'
result_file_name = 'PCSE_fused_on_NCBI_IOB.csv'

# Model tag -> tag name written to the result file.
ds_to_disease = {'B-DS': 'B-Disease', 'I-DS': 'I-Disease'}

# NOTE: the file is opened in append mode, so re-running this cell adds a second
# copy of the predictions; delete the file first when a clean result is needed.
with open(result_path + result_file_name, 'a', newline='\n') as f1:
    public_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')

    for each_sentence in tqdm(NCBI_test):
        text_temp = each_sentence

        # One Sentence per model: predict() stores its tags on the object it is given.
        sentence_1 = Sentence(text_temp)
        sentence_2 = Sentence(text_temp)
        empc_model.predict(sentence_1)
        disease_model.predict(sentence_2)

        # The loop used to iterate over `sentence`, a name this cell never defines,
        # so it raised NameError (or silently reused a stale kernel variable).
        # NOTE(review): fusion policy is an assumption - take empc_model's tag and
        # fall back to disease_model's tag when empc_model left the token untagged.
        # Confirm this is the intended way to combine the two models.
        for manual_token, disease_token in zip(sentence_1, sentence_2):
            ner_value = manual_token.get_tag('ner').value
            if ner_value in ('', 'O'):
                ner_value = disease_token.get_tag('ner').value

            # Rename the models' DS tags to the 'Disease' tag names.
            ner_value = ds_to_disease.get(ner_value, ner_value)

            public_writer.writerow([manual_token.text, ner_value])
        # Blank line separates sentences in the output file.
        public_writer.writerow('')