In [1]:
import os
import re
import numpy as np
import pandas as pd
from collections import defaultdict

# Root directory that holds the Causal-TimeBank TimeML documents.
folder = r"D:\50 CausalCF\data\Causal-TimeBank-TimeML"

# Walk the directory tree and keep the full path of every ".tml" file found.
list_of_filepaths = [
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk(folder)
    for filename in filenames
    if filename.endswith(".tml")
]

# Quick sanity check: number of documents found and the first path.
print(len(list_of_filepaths))
print(list_of_filepaths[0])

183
D:\50 CausalCF\data\Causal-TimeBank-TimeML\ABC19980108.1830.0711.tml


In [2]:
# Display the fifth collected path.
list_of_filepaths[4]

'D:\\50 CausalCF\\data\\Causal-TimeBank-TimeML\\AP900815-0044.tml'

In [4]:
def get_text_chunk_loc(lines):
    """Locate the ``<TEXT>`` ... ``</TEXT>`` section in a list of file lines.

    Args:
        lines: list of strings, one per line of the file.

    Returns:
        A ``(start, end)`` tuple of integer line indices. ``start`` is the index
        of the last line containing ``'<TEXT>'`` that precedes the closing
        marker (0 if there is none); ``end`` is the index of the first line
        containing ``'</TEXT>'`` (``len(lines)`` if there is none).
    """
    # Default `end` to len(lines) rather than a float infinity so the result is
    # always a valid slice bound, even when the closing marker is missing.
    start, end = 0, len(lines)
    for i, row in enumerate(lines):
        if '<TEXT>' in row:
            start = i
        elif '</TEXT>' in row:
            end = i
            break  # everything after the closing marker is irrelevant here
    return (start, end)


def re_search_within(string, pattern):
    """Return capture group 1 of the first match of ``pattern`` inside ``string``.

    Args:
        string: text to scan.
        pattern: regular expression containing at least one capture group.

    Returns:
        The text captured by the first group of the first match.

    Raises:
        AttributeError: if ``pattern`` does not match anywhere in ``string``
            (the search result is ``None`` in that case).
    """
    first_match = re.search(pattern, string)
    return first_match.group(1)
    

def run(fn):
    """Parse one ``.tml`` file into a DataFrame of argument-pair examples.

    The file is split into the lines between ``<TEXT>`` and ``</TEXT>`` (each
    line is treated as one sentence, indexed from 0) and the annotation lines
    that follow ``</TEXT>`` (the final line of the file is dropped).

    Annotation lines containing ``MAKEINSTANCE`` give an ``eiid -> eventID``
    mapping. Every other annotation line is read as a link: its ``lid`` plus
    the values of its last two space-separated attributes (head, tail). Links
    whose line contains ``CLINK`` are labelled causal (1); all others 0.

    For each link one row is produced:

    * both arguments in the same sentence: ``text`` is that sentence and
      ``num_sents`` is 1;
    * arguments in different sentences: ``text`` is the two sentences joined in
      document order, ``sent_id`` is ``"<min>;<max>"``, ``num_sents`` is the
      span length, and ``context`` holds every sentence in the span when the
      two sentences are not adjacent (otherwise ``''``). Links with an argument
      at a negative sentence id (the pre-registered ``'t0'`` entry) are skipped.

    Args:
        fn: path of the ``.tml`` file to parse.

    Returns:
        pandas.DataFrame with columns ``corpus, doc_id, sent_id, eg_id, index,
        text, text_w_pairs, seq_label, pair_label, context, num_sents``.
        ``seq_label`` is the maximum ``pair_label`` over all rows sharing the
        same ``(corpus, doc_id, sent_id)``.

    Raises:
        ValueError: if the same tag id is opened more than once in the text.
    """
    # Number of characters added by wrapping the first argument in
    # '<ARGn>' ... '</ARGn>'; positions after it shift by this amount.
    # NOTE(review): this shift is only correct when the two argument spans do
    # not overlap -- confirm this holds for the corpus.
    arg_tag_overhead = len('<ARG0>') + len('</ARG0>')

    # Open File
    # Plain 'r' already applies universal-newline handling; the legacy 'rU'
    # mode was removed in Python 3.11 and raises ValueError there.
    with open(fn, 'r') as f:
        lines = f.readlines()

    # Get Segments
    t_start, t_end = get_text_chunk_loc(lines)
    text_lines = lines[t_start+1:t_end]
    anns = lines[t_end+1:-1]

    # Format Annotations
    eiid_to_eid = {}
    rel_to_es = {}
    causal_rels = set()

    for row in anns:
        if 'MAKEINSTANCE' in row:
            eid = re_search_within(row, r'eventID="(\S*)"')
            eiid = re_search_within(row, r'eiid="(\S*)"')
            eiid_to_eid[eiid] = eid
        else:
            lid = re_search_within(row, r'lid="(\S*)"')
            # head / tail ids are the values of the last two attributes
            splitted = row.split(' ')
            headid = re_search_within(splitted[-2], r'="(\S*)"')
            tailid = re_search_within(splitted[-1], r'="(\S*)"')
            rel_to_es[lid] = (headid, tailid)
            if 'CLINK' in row:
                causal_rels.add(lid)

    # Format Text
    sentid_to_text = {}
    # tag id -> [start offset, end offset, sentence id] in the cleaned sentence
    etid_to_loc = {}
    etid_to_loc['t0'] = [-1,-1,-1]  # sentinel entry: never located in a sentence

    for sentid, sent in enumerate(text_lines):
        # remove c-signal labels
        sent = re.sub('<(C-SIGNAL[^<]*)>', '', sent)
        sent = re.sub('<(/C-SIGNAL[^<]*)>', '', sent)
        # get cleaned sent
        cleaned_sent = ''
        prev = 0
        prev_etid = None
        accounting = 0  # total length of tags removed so far in this sentence
        for match in re.finditer('<([^<]*)>', sent):
            start, end = match.span()
            ann = match.group()
            if ' ' not in ann: # end
                # closing tag: complete the location of the tag opened last
                etid_to_loc[prev_etid].extend([start-accounting, sentid])
                prev_etid = None
            else:
                # opening tag: its id is the value of the first attribute
                etid = re_search_within(ann.split(' ')[1], r'="(\S*)"')
                if etid in etid_to_loc.keys():
                    raise ValueError('There is already this etid! Non consequtive words exists!')
                etid_to_loc[etid] = [start-accounting]
                prev_etid = etid
            cleaned_sent += sent[prev:start]
            accounting += len(ann)
            prev = end
        cleaned_sent += sent[prev:]
        sentid_to_text[sentid] = cleaned_sent

    # Generate Examples
    def jump(etid):
        """Map ids starting with 'e' through eiid_to_eid; return others as-is."""
        if etid[0]=='e':
            return eiid_to_eid[etid]
        else:
            return etid

    corpus = 'ctb'
    cols = ['corpus','doc_id','sent_id','eg_id','index','text','text_w_pairs','seq_label','pair_label','context','num_sents']
    data = []
    sentid_counter = defaultdict(int)  # examples emitted so far per sent_id
    for rel_id, (h, t) in rel_to_es.items():
        if etid_to_loc[jump(h)][2] == etid_to_loc[jump(t)][2]:
            # Both arguments are in the same sentence
            s_head, e_head, sentid = etid_to_loc[jump(h)]
            s_tail, e_tail, _ = etid_to_loc[jump(t)]

            num_eg_for_this_sentid = sentid_counter[sentid]
            identifiers = [corpus,os.path.basename(fn),str(sentid),str(num_eg_for_this_sentid)]
            unique_index = '_'.join(identifiers)
            text = sentid_to_text[sentid]

            # Tag the earlier argument first, then shift the later one's offsets
            if s_head<s_tail:
                text_w_pairs = text[:s_head]+'<ARG0>'+text[s_head:e_head]+'</ARG0>'+text[e_head:]
                text_w_pairs = (
                    text_w_pairs[:s_tail+arg_tag_overhead]+'<ARG1>'
                    +text_w_pairs[s_tail+arg_tag_overhead:e_tail+arg_tag_overhead]+'</ARG1>'
                    +text_w_pairs[e_tail+arg_tag_overhead:]
                )
            else:
                text_w_pairs = text[:s_tail]+'<ARG1>'+text[s_tail:e_tail]+'</ARG1>'+text[e_tail:]
                text_w_pairs = (
                    text_w_pairs[:s_head+arg_tag_overhead]+'<ARG0>'
                    +text_w_pairs[s_head+arg_tag_overhead:e_head+arg_tag_overhead]+'</ARG0>'
                    +text_w_pairs[e_head+arg_tag_overhead:]
                )

            # seq_label is finalised after the loop by the groupby 'max' below
            seq_label = pair_label = 1 if rel_id in causal_rels else 0

            data.append(
                identifiers+[
                    unique_index,
                    text.strip(),
                    text_w_pairs.strip(),
                    seq_label,
                    pair_label,
                    '',1
                ]
            )
            sentid_counter[sentid]+=1
        else:
            # Arguments are in different sentences
            s_head, e_head, sentid_head = etid_to_loc[jump(h)]
            s_tail, e_tail, sentid_tail = etid_to_loc[jump(t)]

            # Skip links whose argument has no sentence (the 't0' sentinel)
            if sentid_head<0 or sentid_tail<0:
                continue

            _min = min(sentid_head,sentid_tail)
            _max = max(sentid_head,sentid_tail)
            sentid = f'{_min};{_max}'

            num_eg_for_this_sentid = sentid_counter[sentid]
            identifiers = [corpus,os.path.basename(fn),str(sentid),str(num_eg_for_this_sentid)]
            unique_index = '_'.join(identifiers)

            text_head = sentid_to_text[sentid_head]
            text_head_w_pairs = text_head[:s_head]+'<ARG0>'+text_head[s_head:e_head]+'</ARG0>'+text_head[e_head:]
            text_tail = sentid_to_text[sentid_tail]
            text_tail_w_pairs = text_tail[:s_tail]+'<ARG1>'+text_tail[s_tail:e_tail]+'</ARG1>'+text_tail[e_tail:]

            # Context is only filled when there are sentences between the two
            if abs(sentid_head-sentid_tail)<=1:
                context = ''
            else:
                context = ' '.join([sentid_to_text[s].strip() for s in range(_min, _max+1)])

            # Join the two sentences in document order
            if sentid_head<sentid_tail:
                text = text_head.strip() + ' ' + text_tail.strip()
                text_w_pairs = text_head_w_pairs.strip() + ' ' + text_tail_w_pairs.strip()
            else:
                text = text_tail.strip() + ' ' + text_head.strip()
                text_w_pairs = text_tail_w_pairs.strip() + ' ' + text_head_w_pairs.strip()

            seq_label = pair_label = 1 if rel_id in causal_rels else 0

            data.append(
                identifiers+[
                    unique_index,
                    text,
                    text_w_pairs,
                    seq_label,
                    pair_label,
                    context,
                    _max-_min+1
                ]
            )
            sentid_counter[sentid]+=1

    data = pd.DataFrame(data, columns=cols)
    data['sent_id'] = data['sent_id'].astype(str)
    # A sentence (or sentence span) is positive if any of its pairs is positive
    data['seq_label'] = data.groupby(['corpus','doc_id','sent_id'])['seq_label'].transform('max')

    return data


# Run the parser on the first collected file and display the resulting frame.
run(list_of_filepaths[0])



Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents
0,ctb,ABC19980108.1830.0711.tml,1,0,ctb_ABC19980108.1830.0711.tml_1_0,The financial assistance from the World Bank a...,The financial <ARG0>assistance</ARG0> from the...,0,0,,1
1,ctb,ABC19980108.1830.0711.tml,2,0,ctb_ABC19980108.1830.0711.tml_2_0,"In the last twenty four hours, the value of th...","In <ARG0>the last twenty four hours</ARG0>, th...",0,0,,1
2,ctb,ABC19980108.1830.0711.tml,2;4,0,ctb_ABC19980108.1830.0711.tml_2;4_0,"In the last twenty four hours, the value of th...","In <ARG0>the last twenty four hours</ARG0>, th...",0,0,"In the last twenty four hours, the value of th...",3
3,ctb,ABC19980108.1830.0711.tml,2;3,0,ctb_ABC19980108.1830.0711.tml_2;3_0,"In the last twenty four hours, the value of th...","In <ARG1>the last twenty four hours</ARG1>, th...",0,0,,2
4,ctb,ABC19980108.1830.0711.tml,2;6,0,ctb_ABC19980108.1830.0711.tml_2;6_0,"In the last twenty four hours, the value of th...","In <ARG1>the last twenty four hours</ARG1>, th...",0,0,"In the last twenty four hours, the value of th...",5
5,ctb,ABC19980108.1830.0711.tml,10,0,ctb_ABC19980108.1830.0711.tml_10_0,So when Wong Kwan spent seventy million dollar...,So when Wong Kwan <ARG1>spent</ARG1> seventy m...,0,0,,1
6,ctb,ABC19980108.1830.0711.tml,10;11,0,ctb_ABC19980108.1830.0711.tml_10;11_0,So when Wong Kwan spent seventy million dollar...,So when Wong Kwan <ARG1>spent</ARG1> seventy m...,0,0,,2
7,ctb,ABC19980108.1830.0711.tml,11,0,ctb_ABC19980108.1830.0711.tml_11_0,He sold the property to five buyers and said h...,He <ARG1>sold</ARG1> the property to five buye...,0,0,,1
8,ctb,ABC19980108.1830.0711.tml,13,0,ctb_ABC19980108.1830.0711.tml_13_0,"Now with new construction under way, three of ...",Now with new <ARG0>construction</ARG0> under w...,0,0,,1
9,ctb,ABC19980108.1830.0711.tml,18,0,ctb_ABC19980108.1830.0711.tml_18_0,"She estimates her properties, worth a hundred ...","She estimates her properties, <ARG0>worth</ARG...",0,0,,1


In [5]:
# Parse every corpus file first, then concatenate once. Calling pd.concat
# inside the loop re-copies all accumulated rows on each iteration and starts
# from an empty frame.
frames = [run(fn) for fn in list_of_filepaths]

# pd.concat raises on an empty list, so fall back to an empty frame.
# Each file's own row index is kept (no ignore_index), as before.
data = pd.concat(frames) if frames else pd.DataFrame()

data



Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents
0,ctb,ABC19980108.1830.0711.tml,1,0,ctb_ABC19980108.1830.0711.tml_1_0,The financial assistance from the World Bank a...,The financial <ARG0>assistance</ARG0> from the...,0,0,,1
1,ctb,ABC19980108.1830.0711.tml,2,0,ctb_ABC19980108.1830.0711.tml_2_0,"In the last twenty four hours, the value of th...","In <ARG0>the last twenty four hours</ARG0>, th...",0,0,,1
2,ctb,ABC19980108.1830.0711.tml,2;4,0,ctb_ABC19980108.1830.0711.tml_2;4_0,"In the last twenty four hours, the value of th...","In <ARG0>the last twenty four hours</ARG0>, th...",0,0,"In the last twenty four hours, the value of th...",3
3,ctb,ABC19980108.1830.0711.tml,2;3,0,ctb_ABC19980108.1830.0711.tml_2;3_0,"In the last twenty four hours, the value of th...","In <ARG1>the last twenty four hours</ARG1>, th...",0,0,,2
4,ctb,ABC19980108.1830.0711.tml,2;6,0,ctb_ABC19980108.1830.0711.tml_2;6_0,"In the last twenty four hours, the value of th...","In <ARG1>the last twenty four hours</ARG1>, th...",0,0,"In the last twenty four hours, the value of th...",5
...,...,...,...,...,...,...,...,...,...,...,...
1,ctb,wsj_1040.tml,5,0,ctb_wsj_1040.tml_5_0,In New York Stock Exchange composite trading y...,In New York Stock Exchange composite trading <...,0,0,,1
2,ctb,wsj_1040.tml,3,0,ctb_wsj_1040.tml_3_0,The move rewards shareholders and should impro...,The move <ARG0>rewards</ARG0> shareholders and...,1,1,,1
0,ctb,wsj_1042.tml,4,0,ctb_wsj_1042.tml_4_0,The warrants may be exercised until 90 days af...,The warrants may be <ARG0>exercised</ARG0> unt...,0,0,,1
0,ctb,wsj_1073.tml,2,0,ctb_wsj_1073.tml_2_0,The unit makes intravenous pumps used by hospi...,The unit makes intravenous pumps used by hospi...,0,0,,1


In [6]:
# Show the text of the first example (positional access).
data['text'].iloc[0]

'The financial assistance from the World Bank and the International Monetary Fund are not helping.'

In [7]:
from collections import Counter

# Label distributions, first over every example and then restricted to the
# examples whose arguments share one sentence.
single_sentence = data.loc[data['num_sents']==1]
sent_key = ['corpus','doc_id','sent_id']

for heading, subset in (('All Examples', data), ('\nSingle Sentence Examples', single_sentence)):
    print(heading)
    print('Seq Level:', Counter(subset['seq_label'])) # if sentence level causality exists
    print('Pair Level:', Counter(subset['pair_label'])) # if ARG0-ARG1 pair level causality exists
    # one row per (corpus, doc_id, sent_id) so each sentence is counted once
    print('Seq Level (Unique):', Counter(subset.drop_duplicates(subset=sent_key)['seq_label']))

All Examples
Seq Level: Counter({0: 3381, 1: 818})
Pair Level: Counter({0: 3881, 1: 318})
Seq Level (Unique): Counter({0: 2284, 1: 276})

Single Sentence Examples
Seq Level: Counter({0: 2255, 1: 788})
Pair Level: Counter({0: 2745, 1: 298})
Seq Level (Unique): Counter({0: 1285, 1: 256})


In [8]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('cleaned', exist_ok=True)
data.to_csv('cleaned/ctb.csv', index=False,encoding='utf-8-sig')