In [102]:
import os
import glob
import nltk.data
import re

In [103]:
# Load NLTK's pickled Punkt sentence tokenizer for English; get_sentence_list()
# uses it to split each line of a document into sentences.
# NOTE(review): presumably requires the NLTK "punkt" data to be installed - confirm.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [104]:
def get_sentence_list(text_file):
    """Split a plain-text file into a flat list of sentences.

    Every line of ``text_file`` is stripped of surrounding whitespace and
    handed to the module-level ``tokenizer``; the sentences found on each
    line are concatenated in file order.

    Args:
        text_file: path of the text file to read.

    Returns:
        list of sentence strings.
    """
    with open(text_file, 'r') as handle:
        return [
            sentence
            for raw_line in handle
            for sentence in tokenizer.tokenize(raw_line.strip())
        ]

In [105]:
def get_entity_dict(a1_file):
    """Parse an ``.a1`` annotation file into a dict keyed by entity id.

    Each non-blank line must hold three tab-separated fields: the entity id,
    a space-separated "type start end" field, and the entity name.

    Args:
        a1_file: path of the ``.a1`` file to read.

    Returns:
        dict mapping entity id to
        ``[entity_type, start_off, end_off, entity_name]`` where the two
        offsets are ints.

    Raises:
        Exception: if the same entity id appears on more than one line.
    """
    entity_dict = {}
    with open(a1_file, 'r') as reader:
        for line in reader:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no
                # annotation; without this guard they raise IndexError below.
                continue

            tab_del_list = line.split("\t")
            entity_id = tab_del_list[0]
            # Split the "type start end" field once instead of three times.
            span_fields = tab_del_list[1].split(' ')
            entity_type = span_fields[0]
            start_off = int(span_fields[1])
            end_off = int(span_fields[2])
            entity_name = tab_del_list[2]

            if entity_id in entity_dict:
                raise Exception('key error')
            entity_dict[entity_id] = [entity_type, start_off, end_off, entity_name]

    return entity_dict

In [106]:
def get_entity_set(a1_file):
    """Return the distinct entity names listed in an ``.a1`` file.

    The name is taken from the third tab-separated field of every line.

    Args:
        a1_file: path of the ``.a1`` file to read.

    Returns:
        set of entity name strings.
    """
    names = set()
    with open(a1_file, 'r') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split('\t')
            names.add(columns[2])
    return names

def get_event_dict(a2_file):
    """Index the event lines of an ``.a2`` annotation file by event id.

    A line counts as an event line when its first tab-separated field
    contains the letter ``E``; every other line is ignored.

    Args:
        a2_file: path of the ``.a2`` file to read.

    Returns:
        dict mapping event id to the list obtained by splitting the second
        tab-separated field on single spaces.

    Raises:
        Exception: if the same event id appears on more than one line.
    """
    events = {}
    with open(a2_file, 'r') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split('\t')
            record_id = columns[0]
            if 'E' not in record_id:
                continue
            if record_id in events:
                raise Exception('key error')
            events[record_id] = columns[1].split(' ')
    return events

In [107]:
def extract_entity_relation_triple(line, event_dict, entity_dict):
    """Turn one event line into a ``(cause name, effect, result name)`` triple.

    Only lines whose first tab-separated field contains ``E`` and whose
    second field has at least three space-separated items are considered.
    The id after the colon of the second item is treated as the theme and
    the id after the colon of the third item as the cause.

    * Theme is an event: the effect is that event's type and the result is
      the id of that event's first argument. The cause is either the first
      argument of the cause event or, when the cause is not an event, the
      cause id itself.
    * Theme is not an event but the cause is: the effect is this line's own
      event type, the result is the theme id, and the cause is the first
      argument of the cause event.

    A triple is only produced when the ids resolved through ``event_dict``
    contain ``T``; names are looked up in ``entity_dict``.

    Args:
        line: one stripped line of an ``.a2`` file.
        event_dict: mapping as built by ``get_event_dict``.
        entity_dict: mapping as built by ``get_entity_dict``.

    Returns:
        tuple ``(cause_name, effect, result_name)`` or ``None`` when the
        line does not yield a triple.
    """
    columns = line.split('\t')
    if 'E' not in columns[0]:
        return None

    arguments = columns[1].split(' ')
    if len(arguments) < 3:
        return None

    # NOTE(review): roles are taken by position, the role labels before the
    # colon are never inspected - confirm the third item is always a Cause.
    theme_ref = arguments[1].split(':')[1]
    cause_ref = arguments[2].split(':')[1]
    theme_is_event = 'E' in theme_ref
    cause_is_event = 'E' in cause_ref

    if theme_is_event:
        theme_event = event_dict[theme_ref]
        relation = theme_event[0].split(':')[0]
        target_id = theme_event[1].split(':')[1]
        if cause_is_event:
            source_id = event_dict[cause_ref][1].split(':')[1]
            if 'T' not in source_id:
                return None
        else:
            source_id = cause_ref
        if 'T' not in target_id:
            return None
    elif cause_is_event:
        relation = arguments[0].split(':')[0]
        target_id = theme_ref
        source_id = event_dict[cause_ref][1].split(':')[1]
        if 'T' not in source_id:
            return None
    else:
        return None

    return entity_dict[source_id][3], relation, entity_dict[target_id][3]


def extract_entity_relation_triple_in_a2_file(a2_file,event_dict,entity_dict):
    """Collect the relation triples found on the lines of an ``.a2`` file.

    Every stripped line is passed to ``extract_entity_relation_triple``;
    lines for which it returns ``None`` are dropped.

    Args:
        a2_file: path of the ``.a2`` file to read.
        event_dict: mapping as built by ``get_event_dict``.
        entity_dict: mapping as built by ``get_entity_dict``.

    Returns:
        list of triples in file order (duplicates are kept).
    """
    with open(a2_file, 'r') as handle:
        candidates = [
            extract_entity_relation_triple(raw_line.strip(), event_dict, entity_dict)
            for raw_line in handle
        ]
    return [triple for triple in candidates if triple is not None]
             
    

def annotate(text, relation_triple_list,outputpath):
    """Append the sentences of ``text`` that mention a relation triple to a file.

    A sentence matches a triple when both the first and the third element of
    the triple occur in it as substrings; when those two elements are equal
    the string must occur at least twice.  For every sentence with at least
    one match, three lines are appended to ``outputpath``: the sentence, the
    comma-separated ``str()`` of its matching triples, and an empty line.

    Args:
        text: path of the text file to split into sentences.
        relation_triple_list: iterable of 3-tuples; duplicates are ignored.
        outputpath: file to append to (created if missing).
    """
    # De-duplicate once, outside the sentence loop, keeping first-seen order
    # so the order of triples in the output is reproducible between runs.
    unique_triples = []
    seen = set()
    for relation_triple in relation_triple_list:
        if relation_triple not in seen:
            seen.add(relation_triple)
            unique_triples.append(relation_triple)

    # The context manager closes the handle even if tokenizing or writing fails.
    with open(outputpath, 'a') as fw:
        for sentence in get_sentence_list(text):
            sentence_contain_triple_list = []
            for relation_triple in unique_triples:
                cause_name = relation_triple[0]
                result_name = relation_triple[2]
                if cause_name not in sentence or result_name not in sentence:
                    continue
                # Same string on both sides: require two separate mentions.
                if cause_name == result_name and sentence.count(cause_name) < 2:
                    continue
                sentence_contain_triple_list.append(relation_triple)

            if sentence_contain_triple_list:
                fw.write(sentence + '\n')
                fw.write(','.join(str(triple) for triple in sentence_contain_triple_list) + '\n')
                fw.write('\n')
                
def preprocess(text, a1_file, a2_file,outputpath):
    """Extract the relation triples of one document and annotate its sentences.

    Args:
        text: path of the document's text file.
        a1_file: path of the document's ``.a1`` file.
        a2_file: path of the document's ``.a2`` file.
        outputpath: file to which ``annotate`` appends the matched sentences.

    Returns:
        list of relation triples extracted from ``a2_file``.
    """
    entities = get_entity_dict(a1_file)
    events = get_event_dict(a2_file)
    triples = extract_entity_relation_triple_in_a2_file(a2_file, events, entities)
    annotate(text, triples, outputpath)
    return triples


In [108]:
# Paths of one sample document (text plus its .a1/.a2 files), used for the
# single-document trial run of preprocess().
text = './data/BioNLP-ST-2013_GE_train_data_rev3/PMC-2889865-06-Discussion.txt'
a1 = './data/BioNLP-ST-2013_GE_train_data_rev3/PMC-2889865-06-Discussion.a1'
a2 = './data/BioNLP-ST-2013_GE_train_data_rev3/PMC-2889865-06-Discussion.a2'
# Output file used when preprocess() is run over every document in the dataset folder.
outputpath="Total_output.txt"

In [109]:
# Trial run on the sample document: matched sentences are appended to
# "temp.txt" and the returned triples are shown as the cell output.
preprocess(text,a1, a2 ,"temp.txt")

[('IL-2', 'Gene_expression', 'IL-2'),
 ('IL-2', 'Localization', 'IL-2'),
 ('Bcl10', 'Protein_catabolism', 'Bcl10')]

In [110]:
# Folder scanned for the .txt/.a1/.a2 files of every document to process.
dataset_folder = './data/Total/'

In [111]:
# Build the list of document stems (file name without its extension) found in
# the dataset folder.  os.path.splitext strips only the final extension, so
# names containing extra dots are kept whole.  Hidden files and the
# LICENSE/README files are not documents and are skipped.  Sorting makes the
# processing order, and therefore the output file, reproducible.
file_list = sorted(set(
    os.path.splitext(name)[0]
    for name in os.listdir(dataset_folder)
    if not name.startswith('.')
))
file_list = [stem for stem in file_list if stem not in ('LICENSE', 'README')]
# A real list (not a one-shot map iterator) so the paths can be iterated again
# when a later cell is re-run.
path_list = [os.path.join(dataset_folder, stem) for stem in file_list]

In [112]:
# Start from an empty output file: annotate() only appends, so re-running this
# cell would otherwise duplicate every entry and inflate any count taken from
# the file.
with open(outputpath, 'w'):
    pass

# Process every document and gather the triples of the whole corpus.
total_list=[]
for path in path_list:
    text = path+'.txt'
    a1_file = path+'.a1'
    a2_file = path+'.a2'
    total_list.extend(preprocess(text,a1_file, a2_file,outputpath))

In [123]:
# Count the lines of the aggregated output file.
count = 0
with open(outputpath,'r') as reader:
    count = sum(1 for _line in reader)

In [125]:
# annotate() writes three lines per matched sentence (sentence, triples, blank
# line).  Floor division keeps the result an integer on Python 3 as well.
count // 3

3870