In [None]:
import json
import codecs
import os
import stanza
from stanza import Pipeline
from tqdm import tqdm

In [None]:
# Directory handed to the Stanza Pipeline constructor as its `dir` argument
# (presumably where the downloaded model files live -- confirm for your setup).
MODELS_DIR = './stanfordmodel'

In [None]:
# Shared Stanza pipeline, built once and reused by find_local and getStanfordResult.
# Despite the name it is configured with the tokenize, pos, depparse and lemma
# processors, not tokenisation alone.
tokenize = Pipeline(dir=MODELS_DIR,processors='tokenize,pos,depparse,lemma')

In [None]:
# Input folders for the entity annotations, the event annotations and the raw text.
# They are listed by getFileNames and joined with each file name in the processing loop.
entity_path = './dataset/train/entity'
event_path = './dataset/train/event'
text_path = './dataset/train/text'

In [None]:
# Event types that getResultJsonList keeps; events of any other type are dropped.
trigger_set = [
    'Dephosphorylation',
    'Binding',
    'Blood_vessel_development',
    'Breakdown',
    'Catabolism',
    'Cell_proliferation',
    'Death',
    'Development',
    'Gene_expression',
    'Growth',
    'Localization',
    'Negative_regulation',
    'Positive_regulation',
    'Phosphorylation',
    'Planned_process',
    'Regulation',
    'Remodeling',
    'Synthesis',
    'Transcription',
]

# Argument roles that getResultJsonList keeps; arguments with other roles are dropped.
role_set = [
    'Cause',
    'Theme',
    'Theme2',
]

In [None]:
# List the file names contained in each of the three folders.
def getFileNames(path1,path2,path3):
    """Return the sorted directory listings of three folders.

    Args:
        path1: folder whose listing is returned first.
        path2: folder whose listing is returned second.
        path3: folder whose listing is returned third.

    Returns:
        A 3-tuple of lists of entry names, each sorted in ascending order.
    """
    listings = [os.listdir(folder) for folder in (path1, path2, path3)]
    first, second, third = (sorted(names) for names in listings)
    return first, second, third

In [None]:
def read_textFile(path):
    """Read a UTF-8 text file and cut it into a list of sentence strings.

    Lines that consist of exactly a newline are skipped; every other line is
    stripped. The first kept line is returned unchanged as the first item.
    Each later line gets a single leading space, is split on '.', loses the
    piece after its last '.', and every remaining piece is emitted with the
    '.' put back. Text that follows the last '.' of a line is therefore not
    part of the result.

    Args:
        path: path of the text file.

    Returns:
        List of strings: the first line, then one ' ...' + '.' item per piece.
    """
    with codecs.open(path,'r',encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    paragraphs = [raw.strip() for raw in raw_lines if raw != '\n']

    sentences = []
    for position, paragraph in enumerate(paragraphs):
        if position == 0:
            # First kept line is taken as-is, without sentence splitting.
            sentences.append(paragraph)
            continue
        pieces = (' ' + paragraph).split('.')[:-1]
        sentences.extend(piece + '.' for piece in pieces)

    return sentences

In [None]:
def read_entityFile(path):
    """Parse a tab-separated entity annotation file into a dict keyed by tag.

    Each non-blank line is expected to look like
    ``<tag>\\t<type> <start> <end>\\t<text>``. Blank lines are skipped.

    Args:
        path: path of the UTF-8 entity file.

    Returns:
        Dict mapping each tag to a dict with the keys 'entity_type',
        'entity_ch_st', 'entity_ch_ed' (both kept as strings) and
        'entity_text'.

    Raises:
        IndexError: if a non-blank line has fewer fields than expected.
    """
    entity_dic = {}
    # Use a context manager so the file handle is closed deterministically.
    with codecs.open(path,'r',encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line carries no annotation.
                continue
            entity = line.split('\t')
            infos = entity[1].split(' ')
            entity_dic[entity[0]] = {
                'entity_type': infos[0],
                'entity_ch_st': infos[1],
                'entity_ch_ed': infos[2],
                'entity_text': entity[2],
            }

    return entity_dic

In [None]:
def read_eventFile(path):
    """Parse a tab-separated event annotation file into trigger and event dicts.

    Lines whose tag starts with 'T' describe triggers
    (``<tag>\\t<type> <start> <end>\\t<text>``); lines whose tag starts with 'E'
    describe events (``<tag>\\t<type>:<trigger> [<role>:<arg> ...]``). Every
    other line, including blank ones, is ignored.

    Args:
        path: path of the UTF-8 event file.

    Returns:
        (trigger_dic, event_dic) where
        * trigger_dic maps a trigger tag to a dict with 'trigger_ch_st',
          'trigger_ch_ed' (both kept as strings) and 'trigger_text';
        * event_dic maps an event tag to a dict with 'event_type', 'trigger'
          (the trigger tag) and 'arguments'. 'arguments' is the sentinel list
          ['NONE'] when the event has no arguments, otherwise a list of
          {'arg_role', 'arg_tag'} dicts.

    Raises:
        IndexError: if a trigger or event line has fewer fields than expected.
    """
    # Read inside a context manager so the file handle is always closed.
    with codecs.open(path,'r',encoding='utf-8') as f:
        tri_event_list = [line.strip().split('\t') for line in f.readlines()]

    event_dic = {}
    trigger_dic = {}

    for item in tri_event_list:
        tag = item[0]
        if tag.startswith('T'):
            temp = item[1].split()
            trigger_dic[tag] = {
                'trigger_ch_st': temp[1],
                'trigger_ch_ed': temp[2],
                'trigger_text': item[2],
            }
        elif tag.startswith('E'):
            temp = item[1].split()
            # First token is "<event type>:<trigger tag>".
            head = temp[0].split(':')
            if len(temp) == 1:
                arguments = ['NONE']
            else:
                arguments = []
                for arg in temp[1:]:
                    parts = arg.split(':')
                    arguments.append({'arg_role': parts[0], 'arg_tag': parts[-1]})
            event_dic[tag] = {
                'event_type': head[0],
                'trigger': head[1],
                'arguments': arguments,
            }

    return trigger_dic,event_dic

In [None]:
# Tokens for which find_local raises its offset correction by 1.
punk = list(".,:!?%()<>{}[]'\"")
# Tokens for which find_local raises its offset correction by 2.
special_punk = list("-/")

In [None]:
def find_local(text_index,text_list,ch_s,ch_e):
    """Estimate the token span of a character span inside one sentence.

    The sentence ``text_list[text_index]`` is stripped and run through the
    module-level ``tokenize`` pipeline. A running character count is then
    advanced token by token. It starts at 0 for the first sentence and at
    (total length of all earlier ``text_list`` items) + 1 for any other one.
    For the token at position k the estimated offset is
    ``count + k - correction``, where ``correction`` grows by 1 for every token
    found in ``punk`` and by 2 for every token found in ``special_punk``.
    NOTE(review): presumably k stands for one space per preceding token and
    the correction for punctuation written without surrounding spaces; this is
    a heuristic, confirm against the data.

    Args:
        text_index: index of the sentence inside ``text_list``.
        text_list: list of sentence strings.
        ch_s: character offset the start estimate is compared with.
        ch_e: character offset the end estimate is compared with.

    Returns:
        (start, end): ``start`` is the index of the first token whose estimated
        start offset equals ``ch_s``; ``end`` is one past the first token whose
        estimated end offset equals ``ch_e``. Each stays 0 when nothing matches.
    """
    def correction_for(token):
        # Amount added to the running correction for one token.
        amount = 0
        if token in punk:
            amount += 1
        if token in special_punk:
            amount += 2
        return amount

    doc = tokenize(text_list[text_index].strip())
    tokens = [word.text for sent in doc.sentences for word in sent.words]

    if text_index == 0:
        base = 0
    else:
        base = sum(len(text_list[i]) for i in range(text_index)) + 1

    start = 0
    end = 0
    consumed = base
    correction = 0

    # Pass 1: look for the token whose estimated start offset equals ch_s.
    for position, token in enumerate(tokens):
        before = consumed
        consumed += len(token)
        correction += correction_for(token)
        if before + position - correction == ch_s:
            start = position
            # The counters are reset only on a match; without one, pass 2
            # continues from the accumulated values.
            consumed = base
            correction = 0
            break

    # Pass 2: look for the token whose estimated end offset equals ch_e.
    for position, token in enumerate(tokens):
        consumed += len(token)
        correction += correction_for(token)
        if consumed + position - correction == ch_e:
            end = position + 1
            break

    return start,end

In [None]:
def reconstruct_entityDic(dic_,text_list):
    """Attach sentence index and token span to every entity, in place.

    For each entity the sentences are concatenated one after another. As soon
    as the concatenation is at least ``entity_ch_ed - 1`` characters long, the
    slice at the entity offsets is compared with 'entity_text'. For every such
    check on a sentence other than the first, both offsets are first lowered
    by one, and the decrement accumulates across sentences. On the first
    match the entity gains 'text_index', 'start' and 'end' (the latter two
    from ``find_local``); a later match ends the search for that entity.
    Entities that never match are left without those keys.

    Args:
        dic_: dict as produced by ``read_entityFile``; it is modified in place.
        text_list: list of sentence strings.

    Returns:
        The same dict object that was passed in.
    """
    for info in dic_.values():
        span_start = int(info['entity_ch_st'])
        span_end = int(info['entity_ch_ed'])
        surface = info['entity_text']
        prefix = ''
        for sent_no, sentence in enumerate(text_list):
            prefix += sentence
            if len(prefix) < span_end - 1:
                continue
            if sent_no != 0:
                span_start -= 1
                span_end -= 1
            if prefix[span_start:span_end] != surface:
                continue
            if 'text_index' in info:
                break
            info['text_index'] = sent_no
            info['start'], info['end'] = find_local(sent_no, text_list, span_start, span_end)

    return dic_

In [None]:
def reconstruct_triggerDic(dic_,text_list):
    """Attach sentence index and token span to every trigger, in place.

    For each trigger the sentences are concatenated one after another. As soon
    as the concatenation is at least ``trigger_ch_ed - 1`` characters long,
    the slice at the trigger offsets is compared with 'trigger_text'. For
    every such check on a sentence other than the first, both offsets are
    first lowered by one, and the decrement accumulates across sentences. On
    the first match the trigger gains 'text_index', 'start' and 'end' (the
    latter two from ``find_local``); a later match ends the search for that
    trigger. Triggers that never match are left without those keys.

    Args:
        dic_: trigger dict as produced by ``read_eventFile``; modified in place.
        text_list: list of sentence strings.

    Returns:
        The same dict object that was passed in.
    """
    for info in dic_.values():
        span_start = int(info['trigger_ch_st'])
        span_end = int(info['trigger_ch_ed'])
        surface = info['trigger_text']
        prefix = ''
        for sent_no, sentence in enumerate(text_list):
            prefix += sentence
            if len(prefix) < span_end - 1:
                continue
            if sent_no != 0:
                span_start -= 1
                span_end -= 1
            if prefix[span_start:span_end] != surface:
                continue
            if 'text_index' in info:
                break
            info['text_index'] = sent_no
            info['start'], info['end'] = find_local(sent_no, text_list, span_start, span_end)

    return dic_

In [None]:
def reconstruct_eventDic(dic_,tri_dic):
    """Replace each event's trigger tag with the trigger's own dict, in place.

    Args:
        dic_: event dict whose values hold a trigger tag under 'trigger'.
        tri_dic: dict mapping trigger tags to trigger dicts.

    Returns:
        The same event dict object, with every 'trigger' now pointing at the
        matching entry of ``tri_dic``.

    Raises:
        KeyError: if an event names a trigger tag missing from ``tri_dic``.
    """
    for event in dic_.values():
        event['trigger'] = tri_dic[event['trigger']]

    return dic_

In [None]:
def getStanfordResult(text):
    """Run the shared pipeline on ``text`` and flatten its per-word output.

    Args:
        text: string handed to the module-level ``tokenize`` pipeline.

    Returns:
        (word_list, pos_list, stanford_parse): three parallel lists covering
        every word of every sentence the pipeline returned. They hold the
        word text, the ``upos`` tag, and a string
        ``"<deprel>/dep=<id-1>/gov=<head-1>"`` respectively.
        NOTE(review): ``id`` and ``head`` are used exactly as the pipeline
        reports them; if the pipeline splits ``text`` into several sentences
        these indices may be per-sentence while the lists are flat -- confirm.
    """
    words = [word for sentence in tokenize(text).sentences for word in sentence.words]

    word_list = []
    pos_list = []
    for word in words:
        word_list.append(word.text)
    for word in words:
        pos_list.append(word.upos)
    stanford_parse = [f"{word.deprel}/dep={int(word.id)-1}/gov={int(word.head)-1}" for word in words]

    return word_list,pos_list,stanford_parse

In [None]:
def getResultJsonList(text_list,entity_dic,event_dic):
    """Build one JSON-ready record per sentence of a document.

    Fix over the previous version: the ``return`` used to sit inside the
    sentence loop, so only the first sentence was ever emitted (and an empty
    ``text_list`` returned None). All sentences are now processed and an empty
    input yields an empty list.

    Args:
        text_list: list of sentence strings.
        entity_dic: entity dict after ``reconstruct_entityDic``; every value is
            expected to carry 'text_index', 'start' and 'end'.
        event_dic: event dict after ``reconstruct_eventDic``; every value's
            'trigger' is expected to be a trigger dict carrying 'text_index',
            'start' and 'end'.

    Returns:
        List of dicts, one per sentence, with the keys 'sentence', 'tokens',
        'pos-tags', 'parse', 'gold_entity_mentions' and 'gold_event_mentions'.
        Only events whose type is in ``trigger_set`` and arguments whose role
        is in ``role_set`` are kept.

    Raises:
        KeyError: if an entity or trigger lacks 'text_index'/'start'/'end', or
            an argument refers to an unknown tag. The caller relies on this to
            detect documents that could not be aligned.
    """
    json_result = []
    for i,sent in enumerate(text_list):
        sentence = sent.strip()
        word_list,pos_list,stanford_parse = getStanfordResult(sentence)

        result_dic = {
            "sentence": sentence,
            "tokens": word_list,
            "pos-tags": pos_list,
            "parse": stanford_parse,
        }

        gold_entity_mentions = []
        for value in entity_dic.values():
            if value['text_index'] == i:
                gold_entity_mentions.append({
                    "text": value['entity_text'],
                    "start": value['start'],
                    "end": value['end'],
                    "entity_type": value['entity_type'],
                })
        result_dic["gold_entity_mentions"] = gold_entity_mentions

        gold_event_mentions = []
        for value in event_dic.values():
            trigger_info = value['trigger']
            if trigger_info['text_index'] != i:
                continue
            event_type = value['event_type']
            if event_type not in trigger_set:
                continue

            even_dic = {
                "trigger": {
                    "text": trigger_info['trigger_text'],
                    "start": trigger_info['start'],
                    "end": trigger_info['end'],
                }
            }

            arguments = []
            # ['NONE'] is the sentinel read_eventFile stores for "no arguments".
            if value['arguments'] != ['NONE']:
                for arg in value['arguments']:
                    role = arg['arg_role']
                    if role not in role_set:
                        continue
                    tag = arg['arg_tag']
                    if tag.startswith('T'):
                        # 'T' tags are looked up among the entities.
                        target = entity_dic[tag]
                        target_text = target['entity_text']
                    else:
                        # Any other tag is treated as an event; its trigger is used.
                        target = event_dic[tag]['trigger']
                        target_text = target['trigger_text']
                    arguments.append({
                        "role": role,
                        "text": target_text,
                        "start": target['start'],
                        "end": target['end'],
                    })
            even_dic["arguments"] = arguments
            even_dic["event_type"] = event_type
            gold_event_mentions.append(even_dic)
        result_dic["gold_event_mentions"] = gold_event_mentions
        json_result.append(result_dic)

    # Return only after every sentence has been handled.
    return json_result

In [None]:
def writeJson(path,jsonlist):
    """Serialise ``jsonlist`` to JSON, print it, and write it to ``path``.

    The JSON text uses an indent of 8 and keeps non-ASCII characters
    unescaped; the file is written as UTF-8.

    Args:
        path: destination file path (overwritten if it exists).
        jsonlist: JSON-serialisable object to dump.
    """
    serialized = json.dumps(jsonlist, ensure_ascii=False, indent=8)
    print(serialized)

    with codecs.open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(serialized)

In [None]:
# Sorted file-name lists of the three input folders. The processing loop zips
# them, so files are paired purely by their position in the sorted listings.
entity_dir,event_dir,text_dir = getFileNames(entity_path,event_path,text_path)

In [None]:
# Convert every (entity, event, text) file triple and collect the per-sentence records.
json_result = []
# Materialise the zip so tqdm knows the total and can show real progress.
file_triples = list(zip(entity_dir,event_dir,text_dir))
for entity_fname,event_fname,text_fname in tqdm(file_triples):
    entity_fpath = os.path.join(entity_path,entity_fname)
    event_fpath = os.path.join(event_path,event_fname)
    text_fpath = os.path.join(text_path,text_fname)

    text_list = read_textFile(text_fpath)
    entity_dic = read_entityFile(entity_fpath)
    trigger_dic,event_dic = read_eventFile(event_fpath)

    entity_dic = reconstruct_entityDic(entity_dic,text_list)
    trigger_dic = reconstruct_triggerDic(trigger_dic,text_list)
    event_dic = reconstruct_eventDic(event_dic,trigger_dic)

    try:
        result = getResultJsonList(text_list,entity_dic,event_dic)
    except KeyError:
        # getResultJsonList raises KeyError when an entity or trigger never
        # received 'text_index'/'start'/'end'. Report the files and skip them:
        # falling through would hit an undefined `result` on the first file or
        # append the previous file's records a second time.
        print(entity_fname,event_fname,text_fname)
        continue
    json_result.extend(result)

In [None]:
# Dump the accumulated records to train.json (writeJson also prints the JSON text).
writeJson(path='train.json',jsonlist=json_result)