In [1]:
import glob
from tqdm import tqdm 
from collections import defaultdict

root_path ='/home/santosh/Work/Datasets/Datasets/Covid-19pubmed/'

annotation_path = root_path+'annotations/'
text_path = root_path+'raw/'

# annotation_files = sorted(glob.glob(annotation_path + '*.a*'))
text_files = sorted(glob.glob(text_path + '*.txt*'))

In [2]:
# load sci_spacy
# https://allenai.github.io/scispacy/

import scispacy
import spacy
# import en_core_web_sm
import en_core_sci_sm
nlp = en_core_sci_sm.load() # for sentencising. The best sentenciser for biomedical text
# nlp = en_core_web_sm.load()

In [3]:
# get datasets for different entity type as well as datasets with all the entities in one.
non_overlapping_dataset = defaultdict(list) # the entities are non overlapping here
overlapping_dataset = defaultdict(list) 
Disorder_dataset= defaultdict(list) #(DISO)
Species_dataset= defaultdict(list) #(SPEC)
Chemical_Drug_dataset= defaultdict(list) #(CHED)
Gene_Protein_dataset = defaultdict(list)#(PRGE)
Enzyme_dataset = defaultdict(list)#(ENZY)
Anatomy_dataset = defaultdict(list)#(ANAT)
Biological_Process_dataset = defaultdict(list)#(PROC)
Molecular_Function_dataset = defaultdict(list)#(FUNC)
Cellular_Component_dataset = defaultdict(list)#(COMP)
Pathway_dataset = defaultdict(list)#(PATH)
microRNA_dataset = defaultdict(list) #(MRNA)

# iter = 0
for each_text_path in tqdm(text_files):

#     iter = iter+1
#     if iter==10:
#         break
    annotation_list =[] # list of entities from the annotations file
    text_list = [] # sentences list from the text file, with sentence spans
    non_overlapping_annotation_list = [] # list with non overlapping entities
    
    # read the text file 
    with open(each_text_path) as text_file:
        text = text_file.read()
        # sentencise the text and get the spans
        doc = nlp(text)
        for sent_ in doc.sents:
            sent = str(sent_).replace('\n',' ')
            text_list.append([sent, sent_.start_char, sent_.end_char])
    
    # read the annotations for the respective raw text file
    each_annotation_path = annotation_path+each_text_path.split('/')[-1].replace('txt','a1')
    
#     print(each_text_path)
#     print(each_annotation_path)
    
    with open(each_annotation_path) as annotation_file:
        annotation = annotation_file.readlines()
    
    # load the anotations into a list. There is a small problem here due to data formating consistencies.
    for each_line in annotation:
        temp_ = each_line.split()
        if 'T' in temp_[0]:
            if len(temp_)>5:
                merged_anno = temp_[0:4]+ [' '.join(temp_[4:])] # because of the general split we will have to merge entities with spaces together
                annotation_list.append(merged_anno)
            
            else:
                annotation_list.append(temp_)

                # Remove the overlaps
    for inx in range(-1, len(annotation_list)):
        if inx == len(annotation_list)-1:
            break
        if inx ==-1:
            non_overlapping_annotation_list.append(annotation_list[inx+1])
        else:
            if annotation_list[inx+1][2]>annotation_list[inx][3]:
                non_overlapping_annotation_list.append(annotation_list[inx+1])
            

    # we need to remove the off-set for the sentence span to reflect the span of the entities at the sentence level rather than at the document level
    for each_annotation in annotation_list:    
            
        st_ann_sp = int(each_annotation[2]) #  start of annotation span
        en_ann_sp = int(each_annotation[3]) #  end of annotation span
        ann_type = each_annotation[1] #  annotation type
        ann = each_annotation[4] # annotation
            
        for each_text in text_list:
            
            snt_text = str(each_text[0]) # sentence text
            st_snt_sp = int(each_text[1]) # Start of sentence span
            en_snt_sp = int(each_text[2]) # end of sentence span
            
                # check if the annotation span is with in the sentence span 
            if st_snt_sp <= st_ann_sp <= en_snt_sp and st_snt_sp <= en_ann_sp <= en_snt_sp:
                if ann in snt_text: # process only if the annotation is in the sentence 
                    anno_list = [st_ann_sp-st_snt_sp, en_ann_sp-st_snt_sp,ann, ann_type] # get the annotation details
                    if snt_text[st_ann_sp-st_snt_sp:en_ann_sp-st_snt_sp] == ann: # final check if annotation is indeed in the text and has the right span
                        if ann_type == 'SPEC':
                            Species_dataset[snt_text].append(anno_list)
                        elif ann_type == 'DISO':
                            Disorder_dataset[snt_text].append(anno_list)
                        elif ann_type == 'CHED':
                            Chemical_Drug_dataset[snt_text].append(anno_list)
                        elif ann_type == 'PRGE':
                            Gene_Protein_dataset[snt_text].append(anno_list)
                        elif ann_type == 'ENZY':
                            Enzyme_dataset[snt_text].append(anno_list)
                        elif ann_type == 'ANAT':
                            Anatomy_dataset[snt_text].append(anno_list)
                        elif ann_type == 'PROC':
                            Biological_Process_dataset[snt_text].append(anno_list)
                        elif ann_type == 'FUNC':
                            Molecular_Function_dataset[snt_text].append(anno_list)
                        elif ann_type == 'COMP':
                            Cellular_Component_dataset[snt_text].append(anno_list)
                        elif ann_type == 'PATH':
                            Pathway_dataset[snt_text].append(anno_list)
                        elif ann_type == 'MRNA':
                            microRNA_dataset[snt_text].append(anno_list)

#                 overlapping_dataset[each_text[0]].append([int(each_annotation[2])-int(each_text[1]), int(each_annotation[3])-int(each_text[1]),each_annotation[4], each_annotation[1]])                                                        
                
                # check if the annotations have overlap, the entities span in sentence should be greater than the previous one
    
    for each_annotation in non_overlapping_annotation_list:    

        st_ann_sp = int(each_annotation[2]) #  start of annotation span
        en_ann_sp = int(each_annotation[3]) #  end of annotation span
        ann_type = each_annotation[1] #  annotation type
        ann = each_annotation[4] # annotation

        for each_text in text_list:

            snt_text = str(each_text[0]) # sentence text
            st_snt_sp = int(each_text[1]) # Start of sentence span
            en_snt_sp = int(each_text[2]) # end of sentence span
            
            if st_snt_sp <= st_ann_sp <= en_snt_sp and st_snt_sp <= en_ann_sp <= en_snt_sp:
                if ann in snt_text: # process only if the annotation is in the sentence 
                    anno_list = [st_ann_sp-st_snt_sp, en_ann_sp-st_snt_sp,ann, ann_type] # get the annotation details
                    if snt_text[st_ann_sp-st_snt_sp:en_ann_sp-st_snt_sp] == ann: # final check if annotation is indeed in the text and has the right span
                        non_overlapping_dataset[snt_text].append(anno_list)
    
#     break

100%|██████████| 17740/17740 [09:20<00:00, 31.63it/s]


In [4]:
# Generate train, test and dev pmc ids
import math
import random
import jsonlines
import os
import pathlib
import csv
from nltk.tokenize import WordPunctTokenizer, wordpunct_tokenize


def get_train_dev_test_indxs(total_num_annotations):

    percentage=0.70
    iter = 0

    trainids = []
    devids = []
    testids =[]

    nLines = total_num_annotations
    nTrain = int(nLines*percentage) 
    nValid = math.floor((nLines - nTrain)/2)
    nTest = nLines - (nTrain+nValid)

    deck = list(range(0, nLines))
    random.seed(45) # This will be fixed for reproducibility
    random.shuffle(deck)

    train_ids = deck[0:nTrain]
    devel_ids = deck[nTrain:nTrain+nValid]
    test_ids = deck[nTrain+nValid:nTrain+nValid+nTest]

    return train_ids, devel_ids, test_ids


In [5]:
def convert_to_jsonl_format(dictionary_dataset, train_ids, devel_ids, test_ids,path, folder_name): 
    
    result_path = cwd_+folder_name
    pathlib.Path(result_path).mkdir(parents=True, exist_ok=True)
    
    train_jsonl_data = []
    devel_jsonl_data=[]
    test_jsonl_data = []
    
    iter = 1
    
    for key, values in tqdm(dictionary_dataset.items(), total=len(dictionary_dataset)):
        
        text = str(key)
        entities =[]

        for each_ner in values:
            point_start = int(each_ner[0])
            point_end = int(each_ner[1])
            label = each_ner[3]
            entities.append((point_start, point_end,label))
        
        if iter in train_ids:
            train_jsonl_data.append((text, {"entities" : entities}))
        elif iter in devel_ids:
            devel_jsonl_data.append((text, {"entities" : entities}))
        elif iter in test_ids:
            test_jsonl_data.append((text, {"entities" : entities}))            
    
        iter = iter+1
        
    with jsonlines.open(result_path+'train.json', mode='w') as writer:
        for each_line in test_jsonl_data:
            writer.write(each_line)

    with jsonlines.open(result_path+'devel.json', mode='w') as writer:
        for each_line in devel_jsonl_data:
            writer.write(each_line)
            
    with jsonlines.open(result_path+'test.json', mode='w') as writer:
        for each_line in test_jsonl_data:
            writer.write(each_line)  

In [6]:
def find_sub_span(sub_span_range, full_spans_range):
    # if a sub span is present in full span return it
    if sub_span_range[0] in range(full_spans_range[0],full_spans_range[1]):
        return sub_span_range

    
def convert2IOB(text_data, ner_tags):
    tokenizer = WordPunctTokenizer()

#     print(text_data, ner_tags)
    tokens = []
    ners = []
    spans = []

    split_text = tokenizer.tokenize(text_data)
    span_text = list(tokenizer.span_tokenize(text_data))
    # for each word token append 'O'
    arr = ['O'] * len(split_text)

    for each_tag in ner_tags:
        span_list = (int(each_tag[0]), int(each_tag[1]))
        token_list = wordpunct_tokenize(each_tag[2])
        ner_list = wordpunct_tokenize(each_tag[3])

        if (len(token_list) > len(ner_list)):
            ner_list = len(token_list) * ner_list
        for i in range(0, len(ner_list)):
            # The logic here is look for the first B-tag and then append I-tag next
            if (i == 0):
                ner_list[i] = 'B-' + ner_list[i]
            else:
                ner_list[i] = 'I-' + ner_list[i]

        tokens.append(token_list)
        ners.append(ner_list)
        spans.append(span_list)

    split_token_span_list = list(zip(split_text, span_text))
    span_ner_list = list(zip(spans, ners))

    
    sub_spans =[] # get sub spans from the full spans of the ner

    for each_span_ner_list in span_ner_list:
        # in full range ner e.g., [144, 150, 'COVID-19', 'DISO']
        count = 0
        # count is to keep track of the B, I, sub tags in the ner list
        for each_token in split_token_span_list:
            sub_spans_ = find_sub_span(each_token[1], each_span_ner_list[0])
            if sub_spans_:
                sub_spans.append([sub_spans_,each_span_ner_list[1][count]])
                count = count+1
            
            
    
    for i, each_span_token in enumerate(split_token_span_list):
        for each_ner_span in sub_spans:
            if each_span_token[1] == each_ner_span[0]:
                arr[i] = ''.join(each_ner_span[1])

    return zip(split_text, arr)

In [7]:
def convert_to_IOB_format(dictionary_dataset, train_ids, devel_ids, test_ids, path, folder_name):
    
    result_path = cwd_+folder_name
    pathlib.Path(result_path).mkdir(parents=True, exist_ok=True)
    
    with open(result_path + 'train.csv', 'w', newline='\n') as f1, open(result_path + 'devel.csv', 'w',
                        newline='\n') as f2, open(result_path + 'test.csv', 'w', newline='\n') as f3:

        train_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')
        dev_writer = csv.writer(f2, delimiter='\t', lineterminator='\n')
        test_writer = csv.writer(f3, delimiter='\t', lineterminator='\n')
        
        iter = 0

        for key, values in tqdm(dictionary_dataset.items(), total=len(dictionary_dataset)):
            
            text = str(key)
#             print(text)
            tagged_tokens = convert2IOB(text, values)


            if iter in train_ids:
                for each_token in tagged_tokens:
                    train_writer.writerow(list(each_token))
                train_writer.writerow('')

            elif iter in devel_ids:
                for each_token in tagged_tokens:
                    dev_writer.writerow(list(each_token))
                dev_writer.writerow('')

            elif iter in test_ids:
                for each_token in tagged_tokens:
                    test_writer.writerow(list(each_token))
                test_writer.writerow('')
            
            iter = iter+1

In [8]:
cwd_ = os.getcwd()+'/Datasets/'
print(cwd_)

/home/santosh/Work/GitHub/COVID-19-Named-Entity-Recognition-/Datasets/


In [10]:
non_overlapping_dataset 
print('non_overlapping_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(non_overlapping_dataset))
convert_to_IOB_format(non_overlapping_dataset,trainidx, develidx, testidx, cwd_, 'BIO/non_overlapping_dataset/' )
convert_to_jsonl_format(non_overlapping_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/non_overlapping_dataset/' )


# overlapping_dataset
# trainidx, develidx, testidx = get_train_dev_test_indxs(len(overlapping_dataset))
# convert_to_IOB_format(non_overlapping_dataset,trainidx, develidx, testidx, cwd_, 'BIO/overlapping_dataset/' )
# convert_to_jsonl_format(non_overlapping_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/overlapping_dataset/' )

# Disorder_dataset
print('Disorder_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Disorder_dataset))
convert_to_IOB_format(Disorder_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Disorder_dataset/' )
convert_to_jsonl_format(Disorder_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Disorder_dataset/' )

# Species_dataset
print('Species_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Species_dataset))
convert_to_IOB_format(Species_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Species_dataset/' )
convert_to_jsonl_format(Species_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Species_dataset/' )

# Chemical_Drug_dataset
print('Chemical_Drug_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Chemical_Drug_dataset))
convert_to_IOB_format(Chemical_Drug_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Chemical_Drug_dataset/' )
convert_to_jsonl_format(Chemical_Drug_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Chemical_Drug_dataset/' )

# Gene_Protein_dataset
print('Gene_Protein_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Gene_Protein_dataset))
convert_to_IOB_format(Gene_Protein_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Gene_Protein_dataset/' )
convert_to_jsonl_format(Gene_Protein_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Gene_Protein_dataset/' )

# Enzyme_dataset
print('Enzyme_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Enzyme_dataset))
convert_to_IOB_format(Enzyme_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Enzyme_dataset/' )
convert_to_jsonl_format(Enzyme_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Enzyme_dataset/' )

# Anatomy_dataset
print('Anatomy_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Anatomy_dataset))
convert_to_IOB_format(Anatomy_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Anatomy_dataset/' )
convert_to_jsonl_format(Anatomy_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Anatomy_dataset/' )

# Biological_Process_dataset
print('Biological_Process_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Biological_Process_dataset))
convert_to_IOB_format(Biological_Process_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Biological_Process_dataset/' )
convert_to_jsonl_format(Biological_Process_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Biological_Process_dataset/' )

# Molecular_Function_dataset
print('Molecular_Function_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Molecular_Function_dataset))
convert_to_IOB_format(Molecular_Function_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Molecular_Function_dataset/' )
convert_to_jsonl_format(Molecular_Function_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Molecular_Function_dataset/' )

# Cellular_Component_dataset
print('Cellular_Component_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Cellular_Component_dataset))
convert_to_IOB_format(Cellular_Component_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Cellular_Component_dataset/' )
convert_to_jsonl_format(Cellular_Component_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Cellular_Component_dataset/' )

# Pathway_dataset
print('Pathway_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Pathway_dataset))
convert_to_IOB_format(Pathway_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Pathway_dataset/' )
convert_to_jsonl_format(Pathway_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Pathway_dataset/' )

# microRNA_dataset
print('microRNA_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(microRNA_dataset))
convert_to_IOB_format(microRNA_dataset,trainidx, develidx, testidx, cwd_, 'BIO/microRNA_dataset/' )
convert_to_jsonl_format(microRNA_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/microRNA_dataset/' )

  0%|          | 59/151052 [00:00<04:20, 580.65it/s]

non_overlapping_dataset


100%|██████████| 151052/151052 [03:57<00:00, 636.10it/s]
100%|██████████| 151052/151052 [03:34<00:00, 704.42it/s]
  0%|          | 135/98950 [00:00<01:13, 1341.74it/s]

Disorder_dataset


100%|██████████| 98950/98950 [01:34<00:00, 1048.31it/s]
100%|██████████| 98950/98950 [01:24<00:00, 1174.72it/s]
  0%|          | 192/67654 [00:00<00:35, 1901.81it/s]

Species_dataset


100%|██████████| 67654/67654 [00:45<00:00, 1500.53it/s]
100%|██████████| 67654/67654 [00:38<00:00, 1751.94it/s]
  1%|          | 269/43779 [00:00<00:16, 2677.48it/s]

Chemical_Drug_dataset


100%|██████████| 43779/43779 [00:19<00:00, 2191.35it/s]
100%|██████████| 43779/43779 [00:16<00:00, 2700.40it/s]
  1%|          | 336/29888 [00:00<00:08, 3356.84it/s]

Gene_Protein_dataset


100%|██████████| 29888/29888 [00:10<00:00, 2833.95it/s]
100%|██████████| 29888/29888 [00:07<00:00, 4074.19it/s]
  0%|          | 606/151052 [00:00<00:24, 6047.58it/s]

Enzyme_dataset


100%|██████████| 151052/151052 [00:25<00:00, 5830.00it/s]
100%|██████████| 151052/151052 [00:10<00:00, 14513.13it/s]
  3%|▎         | 216/6188 [00:00<00:02, 2157.24it/s]

Anatomy_dataset


100%|██████████| 6188/6188 [00:03<00:00, 2001.50it/s]
100%|██████████| 6188/6188 [00:02<00:00, 2392.14it/s]
  0%|          | 241/50554 [00:00<00:20, 2405.00it/s]

Biological_Process_dataset


100%|██████████| 50554/50554 [00:25<00:00, 1961.53it/s]
100%|██████████| 50554/50554 [00:21<00:00, 2368.57it/s]
  5%|▌         | 605/11398 [00:00<00:01, 6046.59it/s]

Molecular_Function_dataset


100%|██████████| 11398/11398 [00:02<00:00, 5530.12it/s]
100%|██████████| 11398/11398 [00:01<00:00, 11081.20it/s]
  1%|▏         | 402/27208 [00:00<00:06, 4012.31it/s]

Cellular_Component_dataset


100%|██████████| 27208/27208 [00:08<00:00, 3192.33it/s]
100%|██████████| 27208/27208 [00:06<00:00, 4398.19it/s]
 16%|█▌        | 907/5637 [00:00<00:00, 9067.72it/s]

Pathway_dataset


100%|██████████| 5637/5637 [00:00<00:00, 8008.20it/s]
100%|██████████| 5637/5637 [00:00<00:00, 22869.79it/s]
100%|██████████| 10/10 [00:00<00:00, 4028.34it/s]
100%|██████████| 10/10 [00:00<00:00, 26852.14it/s]

microRNA_dataset





In [12]:
# Enzyme_dataset
print('Enzyme_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Enzyme_dataset))
convert_to_IOB_format(Enzyme_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Enzyme_dataset/' )
convert_to_jsonl_format(Enzyme_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Enzyme_dataset/' )

# Anatomy_dataset
print('Anatomy_dataset')
trainidx, develidx, testidx = get_train_dev_test_indxs(len(Anatomy_dataset))
convert_to_IOB_format(Anatomy_dataset,trainidx, develidx, testidx, cwd_, 'BIO/Anatomy_dataset/' )
convert_to_jsonl_format(Anatomy_dataset,trainidx, develidx, testidx, cwd_, 'SpaCy/Anatomy_dataset/' )


 12%|█▏        | 725/6188 [00:00<00:00, 7248.71it/s]

Enzyme_dataset


100%|██████████| 6188/6188 [00:00<00:00, 7131.38it/s]
100%|██████████| 6188/6188 [00:00<00:00, 19277.85it/s]
  0%|          | 224/59830 [00:00<00:26, 2234.70it/s]

Anatomy_dataset


100%|██████████| 59830/59830 [00:37<00:00, 1600.37it/s]
100%|██████████| 59830/59830 [00:31<00:00, 1919.05it/s]
