In [45]:
from bs4 import BeautifulSoup
import lxml
import pandas as pd
import os
from tqdm import tqdm 
from collections import defaultdict

In [19]:
# Root folder of the dataset collection; the CoMAGC corpus is read from its 'CoMAGC/' subfolder.
datapath = '/nfs/gns/literature/Santosh_Tirunagari/GitHub/Unsupervised-Protein-Genes-Diseases-Extraction/Datasets/'
# Read the whole XML file into memory. The context manager closes the handle
# as soon as the text has been read instead of leaving it open in the kernel.
with open(datapath+'CoMAGC/'+'CoMAGC.xml','r') as infile:
    contents = infile.read()
# Parse the markup with BeautifulSoup's built-in 'html.parser' backend.
soup = BeautifulSoup(contents,'html.parser')

In [20]:
# Collect every <annotation_unit> element of the parsed corpus; each one is
# turned into entity/keyword records by the extraction loop further down.
annotations = soup.find_all('annotation_unit')


In [54]:
def _span_record(element, label=None):
    """Build ``[start, end, text, label]`` from one annotation element.

    Parameters
    ----------
    element : bs4 tag or None
        Child element of an ``annotation_unit`` carrying a ``range="start-end"``
        attribute. ``None`` is what attribute-style lookup yields when the
        child is absent.
    label : str, optional
        Fixed label for the record. When omitted, the element's ``type``
        attribute is used instead.

    Returns
    -------
    list or None
        The record, or ``None`` when the element is missing or its ``range``
        attribute is absent or malformed.
    """
    try:
        bounds = element.get('range').split('-')
        start, end = int(bounds[0]), int(bounds[1])
        if label is None:
            label = element.get('type')
        return [start, end, element.text, label]
    except (AttributeError, IndexError, ValueError):
        # AttributeError: element or its 'range' attribute is missing (None).
        # IndexError:     'range' holds no '-' separator.
        # ValueError:     an offset is not an integer.
        # Anything else (including KeyboardInterrupt) is allowed to propagate.
        return None


# sentence text -> list of [start, end, text, label] records found in that sentence
reldataset = defaultdict(list)

for repElem in tqdm(annotations):
    sentence = repElem.sentence.text
    # Order matters: records are appended as gene, cancer term, keyword 1, keyword 2.
    candidates = (
        (repElem.gene, 'GP'),
        (repElem.cancer_term, 'DS'),
        (repElem.expression_change_keyword_1, None),  # label comes from the 'type' attribute
        (repElem.expression_change_keyword_2, None),  # label comes from the 'type' attribute
    )
    for element, label in candidates:
        record = _span_record(element, label)
        if record is not None:
            reldataset[sentence].append(record)

        

100%|██████████| 821/821 [00:00<00:00, 1055.77it/s]


In [55]:
reldataset

defaultdict(list,
            {'Thus, FGF6 is increased in PIN and prostate cancer and can promote the proliferation of the transformed prostatic epithelial cells via paracrine and autocrine mechanisms.': [[6,
               9,
               'FGF6',
               'GP'],
              [35, 49, 'prostate cancer', 'DS'],
              [14, 22, 'increased', 'Positive_regulation']],
             'Isolation and characterization of the major form of human MUC18 cDNA gene and correlation of MUC18 over-expression in prostate cancer cell lines and tissues with malignant progression.': [[93,
               97,
               'MUC18',
               'GP'],
              [118, 132, 'prostate cancer', 'DS'],
              [99, 113, 'over-expression', 'Gene_expression'],
              [99, 113, 'over-expression', 'Positive_regulation']],
             'We therefore conclude that MUC18 expression is increased during prostate cancer initiation (high grade PIN) and progression to carcinoma, and in meta

In [56]:
# Helpers for splitting the sentences into train, dev and test index sets
import math
import random
import os
import pathlib
import csv
from nltk.tokenize import WordPunctTokenizer, wordpunct_tokenize


def get_train_dev_test_indxs(total_num_annotations, percentage=0.80, seed=45):
    """Shuffle ``range(total_num_annotations)`` and cut it into train/dev/test index lists.

    Parameters
    ----------
    total_num_annotations : int
        Number of items to split; indices ``0 .. total_num_annotations - 1``
        are distributed over the three splits.
    percentage : float, optional
        Fraction of the items assigned to the training split (default 0.80).
        Must lie in ``[0, 1]``.
    seed : int, optional
        Seed passed to ``random.seed`` before shuffling (default 45), which
        makes the split reproducible.

    Returns
    -------
    tuple of (list, list, list)
        ``(train_ids, devel_ids, test_ids)``: disjoint lists that together
        contain every index exactly once.

    Raises
    ------
    ValueError
        If ``percentage`` is outside ``[0, 1]``.

    Notes
    -----
    The module-level ``random`` generator is re-seeded on every call, so the
    global random state is reset as a side effect.
    """
    if not 0 <= percentage <= 1:
        raise ValueError("percentage must be between 0 and 1, got %r" % (percentage,))

    nLines = total_num_annotations
    nTrain = int(nLines * percentage)
    # The remainder is halved; when it is odd the extra item goes to the test split.
    nValid = math.floor((nLines - nTrain) / 2)

    deck = list(range(nLines))
    random.seed(seed)  # fixed seed for reproducibility
    random.shuffle(deck)

    train_ids = deck[:nTrain]
    devel_ids = deck[nTrain:nTrain + nValid]
    test_ids = deck[nTrain + nValid:]

    return train_ids, devel_ids, test_ids

In [61]:
def find_sub_span(sub_span_range, full_spans_range):
    """Return ``sub_span_range`` when its start offset lies inside ``full_spans_range``.

    Parameters
    ----------
    sub_span_range : sequence
        ``(start, end)`` of a token; only ``start`` is inspected.
    full_spans_range : sequence
        ``(start, end)`` of an annotation. ``end`` is treated as inclusive,
        matching the ``range`` attributes parsed in this notebook (for
        example 'FGF6' at 6-9 covers four characters).

    Returns
    -------
    The ``sub_span_range`` object itself when it starts inside the annotation,
    otherwise ``None``.
    """
    # Inclusive upper bound: a token that starts on the annotation's last
    # character (such as the '2' of 'IL-2') still belongs to the annotation.
    if full_spans_range[0] <= sub_span_range[0] <= full_spans_range[1]:
        return sub_span_range
    return None


def convert2IOB(text_data, ner_tags):
    """Tokenise ``text_data`` and attach an IOB tag to every token.

    Parameters
    ----------
    text_data : str
        Sentence to tokenise with NLTK's ``WordPunctTokenizer``.
    ner_tags : list
        Annotations indexed as ``[start, end, surface_text, label]``, where
        ``start``/``end`` are character offsets into ``text_data``. If the
        string ``'None'`` is an element of the list, the sentence is treated
        as having no annotations.

    Returns
    -------
    zip
        Lazy iterator of ``(token, tag)`` pairs. ``tag`` is ``'O'`` for
        unannotated tokens, ``'B-<label>'`` for the first token of an
        annotation and ``'I-<label>'`` for the tokens that follow.

    Raises
    ------
    IndexError
        If more sentence tokens fall inside an annotation's range than there
        are prepared tags for it.

    Notes
    -----
    Whether a token belongs to an annotation is decided by ``find_sub_span``
    from the token's start offset. When several annotations cover the same
    token, the one processed last wins; annotations are processed in order of
    decreasing label length.
    """
    tokenizer = WordPunctTokenizer()
    split_text = tokenizer.tokenize(text_data)
    span_text = list(tokenizer.span_tokenize(text_data))
    # Every token starts out as 'O' (outside any annotation).
    arr = ['O'] * len(split_text)

    if 'None' in ner_tags:
        return zip(split_text, arr)

    # Longest label names first, so that on overlap the annotation with the
    # shorter label is applied last and therefore takes precedence.
    ner_tags = sorted(ner_tags, key=lambda x: len(x[3]), reverse=True)

    # token span -> IOB tag; later annotations overwrite earlier ones
    tag_by_token_span = {}

    for each_tag in ner_tags:
        full_span = (each_tag[0], each_tag[1])
        token_list = wordpunct_tokenize(each_tag[2])
        ner_list = wordpunct_tokenize(each_tag[3])

        # One tag per token of the annotated text: repeat the label as needed.
        if len(token_list) > len(ner_list):
            ner_list = len(token_list) * ner_list
        # First tag gets the B- prefix, every following one the I- prefix.
        ner_list = [('B-' if position == 0 else 'I-') + label
                    for position, label in enumerate(ner_list)]

        # count walks through ner_list as sentence tokens are matched in order.
        count = 0
        for token_span in span_text:
            sub_span = find_sub_span(token_span, full_span)
            if sub_span:
                tag_by_token_span[sub_span] = ner_list[count]
                count = count + 1

    for i, token_span in enumerate(span_text):
        if token_span in tag_by_token_span:
            arr[i] = tag_by_token_span[token_span]

    return zip(split_text, arr)

In [62]:
def convert_to_IOB_format(dictionary_dataset, train_ids, devel_ids, test_ids, path, folder_name):
    """Write the dataset as three tab-separated IOB files: train, devel and test.

    Parameters
    ----------
    dictionary_dataset : dict
        Maps each sentence to its list of annotations; both are handed to
        ``convert2IOB``.
    train_ids, devel_ids, test_ids : iterable of int
        Positions (in the iteration order of ``dictionary_dataset``) that
        belong to each split. A position listed in several splits goes to the
        first match in the order train, devel, test; a position listed in
        none is skipped.
    path : str
        Root directory of the output.
    folder_name : str
        Appended to ``path`` by plain string concatenation. The result must
        end with a path separator, because the file names are concatenated
        onto it as well.

    Returns
    -------
    None
        The function only writes ``train.csv``, ``devel.csv`` and ``test.csv``
        into ``path + folder_name`` (created if missing). Each file holds one
        ``token<TAB>tag`` row per token and an empty row after every sentence.
    """
    # Use the `path` argument rather than a module-level variable, so the
    # function does not depend on notebook cell execution order.
    result_path = path + folder_name
    pathlib.Path(result_path).mkdir(parents=True, exist_ok=True)

    # Sets give constant-time membership tests inside the loop.
    train_set, devel_set, test_set = set(train_ids), set(devel_ids), set(test_ids)

    with open(result_path + 'train.csv', 'w', newline='\n') as f1, \
            open(result_path + 'devel.csv', 'w', newline='\n') as f2, \
            open(result_path + 'test.csv', 'w', newline='\n') as f3:

        train_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')
        dev_writer = csv.writer(f2, delimiter='\t', lineterminator='\n')
        test_writer = csv.writer(f3, delimiter='\t', lineterminator='\n')

        items = tqdm(dictionary_dataset.items(), total=len(dictionary_dataset))
        for index, (key, values) in enumerate(items):
            tagged_tokens = convert2IOB(str(key), values)

            if index in train_set:
                writer = train_writer
            elif index in devel_set:
                writer = dev_writer
            elif index in test_set:
                writer = test_writer
            else:
                continue

            for each_token in tagged_tokens:
                writer.writerow(list(each_token))
            # An empty row separates consecutive sentences.
            writer.writerow([])

In [63]:
# Root directory under which the IOB split files are written; here it is
# simply the dataset directory.
cwd_ = datapath

In [64]:
# Split the sentence positions into train/devel/test index lists (seeded
# shuffle), then write each split as a tab-separated IOB file into the
# 'relation_dataset' folder under cwd_.
trainidx, develidx, testidx = get_train_dev_test_indxs(len(reldataset))
convert_to_IOB_format(reldataset,trainidx, develidx, testidx, cwd_, '/relation_dataset/' )

100%|██████████| 610/610 [00:00<00:00, 2966.99it/s]
