In [1]:
import os
import re
import pandas as pd
import numpy as np
from collections import defaultdict

folder = r"D:\59 Github Projects\0015 BECAUSE\BECAUSE-2.0"

# Recursively gather the path of every brat annotation (.ann) file under the
# corpus root, in the order os.walk yields them.
list_of_filepaths = [
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk(folder)
    for filename in filenames
    if filename.endswith(".ann")
]

# Quick sanity check: number of annotation files found and the first path.
print(len(list_of_filepaths))
print(list_of_filepaths[0])

121
D:\59 Github Projects\0015 BECAUSE\BECAUSE-2.0\CongressionalHearings\CHRG-110hhrg44900-1.ann


In [2]:
def get_txt_fp_from_ann_fp(fn):
    """Return the path of the raw-text file that belongs to annotation file ``fn``.

    Paths that do not contain ``'PTB'`` simply have their extension replaced
    by ``.txt``. Paths containing ``'PTB'`` are redirected to the PDTB raw-data
    folder: the document number is taken from the file name (the part after
    the last ``_`` and before the first ``.``), its first two digits select the
    sub-folder, and documents 1506 and 2004 use the manually re-spaced ``_e``
    variant.
    """
    # Everything outside PTB keeps its text next to the annotation file.
    if 'PTB' not in fn:
        return os.path.splitext(fn)[0]+'.txt'

    pdtb_dir = r"D:\08 Thesis\09 Datasets\07. PennTreeBank V3.0\PDTB-3.0\data\raw"
    pdtb_num = os.path.basename(fn).split('_')[-1].split('.')[0]
    section = pdtb_num[0:2]

    # EXCEPTIONS: these documents had their line spacing edited manually.
    manually_edited = int(pdtb_num) in [1506, 2004]
    suffix = '_e' if manually_edited else ''
    return os.path.join(pdtb_dir, section, f'wsj_{pdtb_num}{suffix}')
    
# Show the mapping for one MASC file and two PTB files (wsj_2004 is one of the
# manually re-spaced exceptions).
for example_ann_fp in (
    'D:\\59 Github Projects\\0015 BECAUSE\\BECAUSE-2.0\\MASC\\20000410_nyt-NEW.ann',
    'D:\\59 Github Projects\\0015 BECAUSE\\BECAUSE-2.0\\PTB\\wsj_2125.ann',
    'D:\\59 Github Projects\\0015 BECAUSE\\BECAUSE-2.0\\PTB\\wsj_2004.ann',
):
    print(get_txt_fp_from_ann_fp(example_ann_fp))

D:\59 Github Projects\0015 BECAUSE\BECAUSE-2.0\MASC\20000410_nyt-NEW.txt
D:\08 Thesis\09 Datasets\07. PennTreeBank V3.0\PDTB-3.0\data\raw\21\wsj_2125
D:\08 Thesis\09 Datasets\07. PennTreeBank V3.0\PDTB-3.0\data\raw\20\wsj_2004_e


In [3]:
import nltk

def parse_ann_file(fn):
    """Parse a brat-style ``.ann`` file into lookup dictionaries.

    Every non-blank line is split on tabs into an id, an info field and
    (for text-bound annotations) the covered text. The first character of the
    id decides how the line is handled:

    * ``T`` -> ``arguments[id] = {'type', 'loc', 'text'}`` where ``loc`` is a
      list of ``(start, end)`` integer pairs; discontinuous spans
      (``"Consequence 7824 7833;7837 7843"``) yield several pairs.
    * ``A`` -> ``attributes[id] = {'type', 'index'[, 'value']}``.
    * ``E`` -> ``relations[id] = {role: argument_id, ...}``; the first
      space-separated token of the info field is skipped.

    Lines whose info field contains ``AnnotatorNotes`` are set aside, and
    lines with any other id prefix are ignored.

    Parameters
    ----------
    fn : str
        Path of the ``.ann`` file.

    Returns
    -------
    tuple
        ``(arguments, relations, attributes, r_attributes)`` where
        ``r_attributes`` maps the id an attribute points at to
        ``{attribute_id: attribute_dict}``.

    Raises
    ------
    ValueError
        If an attribute line does not have two or three space-separated fields.
    """
    # The legacy 'rU' mode was removed in Python 3.11 (open() raises ValueError);
    # plain text mode already applies universal-newline translation.
    with open(fn, 'r') as f:
        annotations = f.readlines()

    arguments = {} #T
    relations = {} #E
    attributes = {} #A
    notes = []  # collected but currently not returned

    for line in annotations:
        # Blank lines (e.g. a trailing empty line) carry no annotation.
        if not line.strip():
            continue

        ann = line.strip().split('\t')
        index_, info = ann[0], ann[1]

        # notes to ignore for now
        if 'AnnotatorNotes' in info:
            notes.append(ann)
            continue

        kind = index_[0]

        # ARGUMENTS
        if kind == 'T':
            text_ = ann[2]

            # A discontinuous annotation lists extra "start end" fragments
            # after ';', e.g. "Consequence 7824 7833;7837 7843".
            head, *extra_fragments = info.split(';')
            type_, start, end = head.split(' ')
            loc = [(int(start), int(end))]
            for fragment in extra_fragments:
                start, end = fragment.split(' ')
                loc.append((int(start), int(end)))

            arguments[index_] = {
                'type': type_,
                'loc': loc,
                'text': text_
            }

        # ATTRIBUTES
        elif kind == 'A':
            fields = info.split(' ')
            if len(fields) == 2:
                attributes[index_] = {
                    'type': fields[0],
                    'index': fields[1]
                }
            elif len(fields) == 3:
                attributes[index_] = {
                    'type': fields[0],
                    'index': fields[1],
                    'value': fields[2]
                }
            else:
                raise ValueError(
                    f'Attribute {index_!r} in {fn!r} has {len(fields)} fields; expected 2 or 3: {info!r}'
                )

        # RELATIONS
        elif kind == 'E':
            fields = info.split(' ')
            relations[index_] = {}
            # fields[0] is skipped; the remaining tokens are "role:argument_id".
            for arg_info in fields[1:]:
                argname, argindex = arg_info.split(':')
                relations[index_][argname] = argindex

    # reverse index to allow search by relation id (#E format)
    r_attributes = defaultdict(dict)
    for k, v in attributes.items():
        r_attributes[v['index']][k] = v

    return arguments, relations, attributes, r_attributes


def split_doc_into_sentences(doc):
    """Split ``doc`` into sentences by delegating to ``nltk.sent_tokenize``."""
    sentences = nltk.sent_tokenize(doc)
    return sentences


def parse_txt_file(fn, buffer = 10):
    """Load the raw text behind ``fn`` and index where each sentence ends.

    The text file is located with ``get_txt_fp_from_ann_fp(fn)``. Each line has
    its newline removed and a single space appended, so a character offset in
    the joined document equals the offset in the newline-separated file.

    Sentences are the non-empty lines themselves when ``'PTB'`` occurs in
    ``fn``, and the output of ``split_doc_into_sentences`` otherwise.

    Parameters
    ----------
    fn : str
        Annotation (or text) file path; only used to locate the text file and
        to detect PTB documents.
    buffer : int, default 10
        How many characters before the previous sentence boundary the search
        for the next sentence may start.

    Returns
    -------
    document : str
        The joined text without the trailing separator space.
    sent2locid : dict
        ``{-1: 0, 0: boundary_0, 1: boundary_1, ...}`` where ``boundary_i`` is
        one past the end of sentence ``i`` plus one, and one extra final key
        holding the document length.

    Raises
    ------
    ValueError
        If a sentence cannot be found in the document from the search start.
    """
    # The legacy 'rU' mode was removed in Python 3.11 (open() raises ValueError);
    # plain text mode already applies universal-newline translation.
    with open(get_txt_fp_from_ann_fp(fn), 'r') as f:
        lines = [line.replace('\n', '') for line in f.readlines()]

    is_ptb = 'PTB' in fn

    # One space per removed newline keeps character offsets aligned.
    document = ''.join(line + ' ' for line in lines)

    if is_ptb: # alr split by lines
        sents = [line for line in lines if line != '']
    else:
        document = document[:-1]
        sents = split_doc_into_sentences(document)

    sent2locid = {-1:0}
    sent_id = 0

    for sent in sents:
        if sent == '':
            sent2locid[sent_id-1] += 1
            continue
        start_from = max(0, sent2locid[sent_id-1] - buffer)
        found_at = document.find(sent, start_from)
        if found_at < 0:
            raise ValueError(
                f'Sentence {sent_id} of {fn!r} not found in the document from offset {start_from}: {sent!r}'
            )
        # Boundary = end of the sentence plus the separator that follows it.
        sent2locid[sent_id] = found_at + len(sent) + 1
        sent_id += 1

    sent2locid[max(sent2locid.keys())+1] = len(document)

    # Only the PTB branch still carries the trailing separator space here; the
    # other branch removed it before tokenising, and stripping again would
    # drop the document's last real character.
    if is_ptb:
        document = document[:-1]
    return document, sent2locid
    
    
def find_sents_needed(sent2locid, search_min, search_max):
    """Return the sentence ids that cover the offsets ``search_min``..``search_max``.

    ``sent2locid`` is iterated in insertion order as ``{sentence_id: boundary}``
    pairs. Every boundary at or before ``search_min`` restarts the result with
    the following sentence id, every boundary after that up to ``search_max``
    adds the following sentence id, and iteration stops at the first boundary
    beyond ``search_max``.
    """
    needed = []
    for sent_idx, boundary in sent2locid.items():
        if boundary <= search_min:
            # Still before the span: the next sentence becomes the sole candidate.
            needed = [sent_idx + 1]
        elif boundary > search_max:
            # Past the span: everything required has been collected.
            break
        else:
            needed.append(sent_idx + 1)
    return needed
    
    
def readjust_arguments(document, arguments, buffer=30):
    """Re-locate every argument span inside ``document``.

    For each ``(start, end)`` fragment of an argument, the matching slice of
    the argument text is searched for in ``document`` beginning ``buffer``
    characters before the annotated ``start``. If the literal text is not
    found, the search is retried with backticks replaced by ``'`` and ``''``
    replaced by ``"``.

    The input dictionaries are not modified; adjusted copies are returned, so a
    failed call leaves ``arguments`` untouched and the caller can retry with a
    different ``buffer``.

    Parameters
    ----------
    document : str
        Text to search in.
    arguments : dict
        ``{argument_id: {'type', 'loc', 'text'}}``. Fragments of a
        discontinuous argument are expected to be separated by one character
        in ``'text'``.
    buffer : int, default 30
        Number of characters before the annotated start at which to begin
        searching.

    Returns
    -------
    dict
        Same keys as ``arguments``; each value is a copy with ``'loc'`` replaced
        by the located offsets. ``'text'`` is replaced (fragments joined by a
        space) only when quote normalisation was needed for some fragment.

    Raises
    ------
    ValueError
        If a fragment cannot be found even after quote normalisation.
    """
    new_arguments = {}

    for a_id, a_v in arguments.items():
        original_text = a_v['text']
        cursor = 0            # where the current fragment starts in original_text
        new_loc = []
        fragments = []
        normalised = False

        for (start, end) in a_v['loc']:
            length = end - start
            start_from = max(0, start - buffer)
            window = document[start_from:]

            # Always slice the untouched original text, so that normalising
            # one fragment cannot corrupt the slices of later fragments.
            fragment = original_text[cursor:cursor + length]
            match = re.search(re.escape(fragment), window)
            if match is None:
                # Retry with backticks and doubled single quotes rewritten as
                # plain quote characters.
                fragment = fragment.replace("`", "'").replace("''", '"')
                match = re.search(re.escape(fragment), window)
                if match is None:
                    raise ValueError(
                        f'Could not locate fragment {fragment!r} of argument {a_id!r} '
                        f'from offset {start_from} (buffer={buffer}).'
                    )
                normalised = True

            new_loc.append((match.start() + start_from, match.end() + start_from))
            fragments.append(fragment)

            # prepare for next iteration (+1 skips the fragment separator)
            cursor += length + 1

        updated = dict(a_v)
        updated['loc'] = new_loc
        if normalised:
            updated['text'] = ' '.join(fragments)
        new_arguments[a_id] = updated

    return new_arguments


def get_s_e_t_list(s_head=[], e_head=[], s_tail=[], e_tail=[]):
    """Merge head and tail spans into one sorted list of ``(start, end, tag)``.

    Head spans are tagged ``'<ARG0>'`` and tail spans ``'<ARG1>'``. Each side
    may be given either as sequences (``list`` / ``np.ndarray``) of starts and
    ends, or as a single scalar start and end. The result is sorted by start,
    then end, then tag.
    """
    starts, ends, labels = [], [], []

    sides = (
        (s_head, e_head, '<ARG0>'),
        (s_tail, e_tail, '<ARG1>'),
    )
    for side_starts, side_ends, label in sides:
        if isinstance(side_starts, (np.ndarray, list)):
            # Several fragments on this side: one tag per start offset.
            starts.extend(side_starts)
            ends.extend(side_ends)
            labels.extend([label] * len(side_starts))
        else:
            # A single scalar span.
            starts.append(side_starts)
            ends.append(side_ends)
            labels.append(label)

    return sorted(zip(starts, ends, labels))


def run(fn):
    """Turn one BECAUSE ``.ann`` file into a DataFrame of sentence-level examples.

    Steps:

    1. Parse the annotation file and load the matching text.
    2. For paths containing ``'PTB'``, re-align argument offsets against the
       loaded text, trying search buffers of 15, 30 and 50 characters.
    3. Map every argument to the single sentence that contains it.
    4. For every relation that has ``Arg0``/``Arg1`` roles (label 0) or
       ``Cause``/``Effect`` roles (label 1), emit one row in which the head is
       wrapped in ``<ARG0>...</ARG0>`` and the tail in ``<ARG1>...</ARG1>``.

    Parameters
    ----------
    fn : str
        Path of the ``.ann`` file.

    Returns
    -------
    pandas.DataFrame
        Columns ``corpus, doc_id, sent_id, eg_id, index, text, text_w_pairs,
        seq_label, pair_label, context, num_sents``. ``pair_label`` is the
        label of the relation itself; ``seq_label`` is the maximum
        ``pair_label`` over all rows of the same sentence.

    Raises
    ------
    ValueError
        If an argument maps to no sentence or to more than one sentence, or if
        the head and tail of a relation fall in different sentences.
    """
    # Attributes are parsed by parse_ann_file but are not needed here.
    arguments, relations, _, _ = parse_ann_file(fn)
    document, sent2locid = parse_txt_file(os.path.splitext(fn)[0]+'.txt')

    if 'PTB' in fn:
        # Widen the search window step by step until every span is located.
        for buffer in [15,30,50]:
            try:
                arguments = readjust_arguments(document, arguments, buffer)
                break
            except (AttributeError, ValueError):
                # A span was not found with this buffer; try the next one.
                continue
        else:
            # Best effort, as before: carry on, but no longer silently.
            print(f'WARNING: could not re-align all argument spans for {fn}; '
                  f'continuing with the current offsets.')

    # Format Text: sentence id -> text between the previous and this boundary
    sentid_to_text = {
        sent_id: document[sent2locid[sent_id-1]:boundary]
        for sent_id, boundary in sent2locid.items()
        if sent_id >= 0
    }

    # Get eid_to_loc: argument id -> [starts, ends, sentence id]
    eid_to_loc = defaultdict(list)

    for arg_id, arg in arguments.items():
        ss, ee = [], []
        prev_e = np.inf
        for s, e in arg['loc']:
            if prev_e+1 == s: # consecutive fragments are merged into one span
                ee[-1] = int(e)
            else:
                ss.append(int(s))
                ee.append(int(e))
            prev_e = int(e)

        _sents = find_sents_needed(sent2locid, min(ss), max(ss))
        if len(_sents) == 0:
            raise ValueError('No sentence found.')
        if len(_sents) > 1:
            raise ValueError('Argument spans across sentences.')

        eid_to_loc[arg_id] = [ss, ee, int(_sents[0])]

    # Format Annotations
    rel_to_es = {} # rel_to_es[lid]=(headid,tailid)
    causal_rels = set()

    for rel_id, roles in relations.items():
        if 'Arg0' in roles and 'Arg1' in roles:
            rel_to_es[rel_id] = (roles['Arg0'], roles['Arg1'])
        elif 'Cause' in roles and 'Effect' in roles:
            causal_rels.add(rel_id)
            rel_to_es[rel_id] = (roles['Cause'], roles['Effect'])

    # Generate Examples
    # Retain rels that are in same sentence
    corpus = 'because'

    cols = ['corpus','doc_id','sent_id','eg_id','index','text','text_w_pairs','seq_label','pair_label','context','num_sents']
    data = []
    sentid_counter = defaultdict(int)

    for rel_id, (head_id, tail_id) in rel_to_es.items():
        s_head, e_head, sentid_head = eid_to_loc[head_id]
        s_tail, e_tail, sentid_tail = eid_to_loc[tail_id]
        if sentid_head != sentid_tail:
            raise ValueError('BECAUSE Dataset does not have inter-sentence examples!')

        sentid = sentid_head
        num_eg_for_this_sentid = sentid_counter[sentid]
        identifiers = [corpus, os.path.basename(fn), str(sentid), str(num_eg_for_this_sentid)]
        unique_index = '_'.join(identifiers)
        text = sentid_to_text[sentid]

        # Insert the tags left to right, shifting later offsets by the length
        # of the tags already inserted.
        # NOTE(review): the running shift adds both tags of every earlier
        # span, which assumes the spans do not overlap or nest.
        text_w_pairs = text
        added = 0
        sent_start = sent2locid[sentid-1]
        for s, e, open_tag in get_s_e_t_list(s_head=s_head, e_head=e_head, s_tail=s_tail, e_tail=e_tail):
            s = s + added - sent_start
            e = e + added - sent_start
            close_tag = open_tag[0] + '/' + open_tag[1:]
            text_w_pairs = text_w_pairs[:s] + open_tag + text_w_pairs[s:e] + close_tag + text_w_pairs[e:]
            added += len(open_tag) + len(close_tag)

        seq_label = pair_label = 1 if rel_id in causal_rels else 0

        data.append(
            identifiers+[
                unique_index,
                text.strip(),
                text_w_pairs.strip(),
                seq_label,
                pair_label,
                '',1
            ]
        )
        sentid_counter[sentid] += 1

    data = pd.DataFrame(data, columns=cols)
    data['sent_id'] = data['sent_id'].astype(str)
    # A sentence is causal as soon as one of its pairs is.
    data['seq_label'] = data.groupby(['corpus','doc_id','sent_id'])['seq_label'].transform('max')

    return data
    
# Try the pipeline on a single PTB document; the returned DataFrame is the
# cell's displayed output.
fn = 'D:\\59 Github Projects\\0015 BECAUSE\\BECAUSE-2.0\\PTB\\wsj_2125.ann'
run(fn)

  """


Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents
0,because,wsj_2125.ann,7,0,because_wsj_2125.ann_7_0,"Earlier this year, bankers and other investors...","Earlier this year, <ARG1>bankers and other inv...",1,1,,1
1,because,wsj_2125.ann,9,0,because_wsj_2125.ann_9_0,"""Competition from third parties who have cash ...","""Competition from third parties who have cash ...",1,1,,1
2,because,wsj_2125.ann,6,0,because_wsj_2125.ann_6_0,"""The pricing will become more realistic, which...","""The pricing will become more realistic, <ARG0...",1,1,,1
3,because,wsj_2125.ann,11,0,because_wsj_2125.ann_11_0,"At Saks Fifth Avenue, Paul Leblang, senior vic...","At Saks Fifth Avenue, Paul Leblang, senior vic...",1,1,,1
4,because,wsj_2125.ann,12,0,because_wsj_2125.ann_12_0,"""Having to take on less debt would certainly b...","""Having to take on less debt would certainly b...",1,1,,1
5,because,wsj_2125.ann,13,0,because_wsj_2125.ann_13_0,"To make an LBO work, now we are going to need ...","To <ARG1>make an LBO work</ARG1>, now we are g...",1,1,,1
6,because,wsj_2125.ann,13,1,because_wsj_2125.ann_13_1,"To make an LBO work, now we are going to need ...","To make <ARG1>an LBO work</ARG1>, now <ARG0>we...",1,1,,1
7,because,wsj_2125.ann,14,0,because_wsj_2125.ann_14_0,Not only could the Wall Street gyrations damp ...,Not only <ARG1>could the Wall Street gyrations...,1,1,,1
8,because,wsj_2125.ann,16,0,because_wsj_2125.ann_16_0,"However, the lower prices these retail chains ...","However, <ARG0>the lower prices these retail c...",1,1,,1
9,because,wsj_2125.ann,18,0,because_wsj_2125.ann_18_0,"""What's encouraging about this is that retail ...","""What's encouraging about this is that retail ...",0,0,,1


In [4]:
# Process every annotation file, then concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on each
# iteration and starts from an empty frame.
frames = []
for fn in list_of_filepaths:

    if 'NYT' in fn:
        # we do not have access to these
        continue

    frames.append(run(fn))

# pd.concat raises on an empty list, so keep the empty-frame result for that case.
data = pd.concat(frames) if frames else pd.DataFrame()

data

  """


Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents
0,because,CHRG-110hhrg44900-1.ann,5,0,because_CHRG-110hhrg44900-1.ann_5_0,We have gotten the agreement of the Chairman a...,We have gotten the agreement of the Chairman a...,1,1,,1
1,because,CHRG-110hhrg44900-1.ann,13,0,because_CHRG-110hhrg44900-1.ann_13_0,Members want to get a new context because the ...,<ARG1>Members want to get a new context</ARG1>...,1,1,,1
2,because,CHRG-110hhrg44900-1.ann,15,0,because_CHRG-110hhrg44900-1.ann_15_0,"Given the importance of this, and given the in...","Given <ARG0>the importance of this</ARG0>, and...",1,1,,1
3,because,CHRG-110hhrg44900-1.ann,25,0,because_CHRG-110hhrg44900-1.ann_25_0,Where there was a strong argument as recently ...,Where there was a strong argument as recently ...,1,1,,1
4,because,CHRG-110hhrg44900-1.ann,38,0,because_CHRG-110hhrg44900-1.ann_38_0,Something that simple causes problems in subpr...,Something that simple causes problems in subpr...,1,1,,1
...,...,...,...,...,...,...,...,...,...,...,...
7,because,wsj_2125.ann,14,0,because_wsj_2125.ann_14_0,Not only could the Wall Street gyrations damp ...,Not only <ARG1>could the Wall Street gyrations...,1,1,,1
8,because,wsj_2125.ann,16,0,because_wsj_2125.ann_16_0,"However, the lower prices these retail chains ...","However, <ARG0>the lower prices these retail c...",1,1,,1
9,because,wsj_2125.ann,18,0,because_wsj_2125.ann_18_0,"""What's encouraging about this is that retail ...","""What's encouraging about this is that retail ...",0,0,,1
10,because,wsj_2125.ann,19,0,because_wsj_2125.ann_19_0,"Still, most retailing observers expect that al...","Still, most retailing observers expect that al...",1,1,,1


In [5]:
# Peek at the text of the very first example.
data['text'].iloc[0]

'We have gotten the agreement of the Chairman and the Secretary, preliminary to any opening statements, to stay until 1 p.m. We will probably have some votes, so we will maximize our time.'

In [6]:
from collections import Counter

# Columns that together identify one sentence.
sentence_key = ['corpus','doc_id','sent_id']

report_sections = (
    ('All Examples', data),
    ('\nSingle Sentence Examples', data.loc[data['num_sents']==1]),
)
for heading, examples in report_sections:
    print(heading)
    # if sentence level causality exists
    print('Seq Level:', Counter(examples['seq_label']))
    # if ARG0-ARG1 pair level causality exists
    print('Pair Level:', Counter(examples['pair_label']))
    # sentence level again, counting every sentence only once
    print('Seq Level (Unique):', Counter(examples.drop_duplicates(subset=sentence_key)['seq_label']))

All Examples
Seq Level: Counter({1: 1039, 0: 206})
Pair Level: Counter({1: 965, 0: 280})
Seq Level (Unique): Counter({1: 761, 0: 193})

Single Sentence Examples
Seq Level: Counter({1: 1039, 0: 206})
Pair Level: Counter({1: 965, 0: 280})
Seq Level (Unique): Counter({1: 761, 0: 193})


In [7]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing.
os.makedirs('cleaned', exist_ok=True)
data.to_csv('cleaned/because.csv', index=False,encoding='utf-8-sig')