### Explore CauseNet
All arguments are SINGLE-WORDS

In [62]:
import json


filepath = r"D:\08 Thesis\20 Open Discussions\CausalRelationExtraction Workshop\CausalRelationExtraction_Demo\wikipedia-extraction-subset-5000.tsv"


def is_valid_article(title):
    forbidden_title_parts = ['Wikipedia:', 'Template:', 'File:',
                            'Portal:', 'Category:', 'Draft:',
                            'List of', 'disambiguation']

    contains_forbidden_title_part = False
    for forbidden_title_part in forbidden_title_parts:
        if forbidden_title_part in title:
            contains_forbidden_title_part = True
            break

    return not contains_forbidden_title_part


def get_text_w_pairs(d, retain_original_spacing=True):
    text = str(d['sentence'])
    splitted_text = d['tokens']
    assert(splitted_text[d['cause_id']] == d['cause_word'])
    assert(splitted_text[d['effect_id']] == d['effect_word'])
    splitted_text = splitted_text[:d['cause_id']] + ["<ARG0>"+str(d['cause_word'])+"</ARG0>"] + splitted_text[d['cause_id']+1:]
    splitted_text = splitted_text[:d['effect_id']] + ["<ARG1>"+str(d['effect_word'])+"</ARG1>"] + splitted_text[d['effect_id']+1:]
    return ' '.join(splitted_text)


for id_, line in enumerate(open(filepath, encoding="utf-8")):
    parts = line.strip().split('\t')
    if parts[0] != 'wikipedia_sentence':
        continue
    assert len(parts) == 11

    if not is_valid_article(parts[2]):
        continue
    
    for match in json.loads(parts[10]):
        sentence_data = {
            "cause_word": match['Cause'][0],
            "cause_id": int(match['Cause'][1]),
            "effect_word": match['Effect'][0],
            "effect_id": int(match['Effect'][1]),
            "pattern": match['Pattern'],
            "sentence": json.loads(parts[7]),
            "tokens": json.loads(parts[8]),
            "dependencies": json.loads(parts[9])
        }        
        print(get_text_w_pairs(sentence_data, retain_original_spacing=True))
    for i,p in enumerate(parts):
        print(i,p)
    break

This <ARG1>change</ARG1> resulted from the Art Director 's <ARG0>branch</ARG0> of the Academy of Motion Picture Arts and Sciences ( AMPAS ) being renamed the Designer 's branch .
0 wikipedia_sentence
1 316
2 Academy Award for Best Production Design
3 859367022
4 2018-09-13T15:53:30Z
5 "Academy Award for Best Production Design"
6 1
7 "This change resulted from the Art Director's branch of the Academy of Motion Picture Arts and Sciences (AMPAS) being renamed the Designer's branch."
8 ["This","change","resulted","from","the","Art","Director","'s","branch","of","the","Academy","of","Motion","Picture","Arts","and","Sciences","(","AMPAS",")","being","renamed","the","Designer","'s","branch","."]
9 "digraph  {\n  N_1 [label=\"This/DT-1\"];\n  N_2 [label=\"change/NN-2\"];\n  N_3 [label=\"resulted/VBD-3\"];\n  N_4 [label=\"from/IN-4\"];\n  N_5 [label=\"the/DT-5\"];\n  N_6 [label=\"Art/NNP-6\"];\n  N_7 [label=\"Director/NNP-7\"];\n  N_8 [label=\"'s/POS-8\"];\n  N_9 [label=\"branch/NN-9\"];\n  N_1

In [60]:
def get_original2spaced(splitted_text, splitted_text_w_spaces, verbose=False):
    original2spaced={}
    tmp=[]
    counter=0
    for ii,tt in enumerate(splitted_text):
        lt=''
        for i,t in enumerate(splitted_text_w_spaces[counter:]):
            lt+=t
            if lt==tt:
                if verbose: print(tt, '-->', lt)
                original2spaced[ii]=(counter,(i+1)+counter)
                # reset
                lt=''
                counter+=(i+1)
                break
    return original2spaced


def get_text_w_pairs(d, retain_original_spacing=True):
    splitted_text = d['tokens']
    assert(splitted_text[d['cause_id']] == d['cause_word'])
    assert(splitted_text[d['effect_id']] == d['effect_word'])
    splitted_text = splitted_text[:d['cause_id']] + ["<ARG0>"+str(d['cause_word'])+"</ARG0>"] + splitted_text[d['cause_id']+1:]
    splitted_text = splitted_text[:d['effect_id']] + ["<ARG1>"+str(d['effect_word'])+"</ARG1>"] + splitted_text[d['effect_id']+1:]
    
    if retain_original_spacing:
        original2spaced = get_original2spaced(
            splitted_text=str(d['sentence']).split(' '), 
            splitted_text_w_spaces=d['tokens'], 
            verbose=False
        )
        text_w_pairs = ''
        for k,(s,e) in original2spaced.items():
            text_w_pairs+= ''.join(splitted_text[s:e])+' '
        text_w_pairs = text_w_pairs[:-1]
    else:
        text_w_pairs = ' '.join(splitted_text)
    
    return text_w_pairs

# Observe differences in spacing, we use the TRUE format
print(get_text_w_pairs(sentence_data, retain_original_spacing=False))
print(get_text_w_pairs(sentence_data, retain_original_spacing=True))

In the Tetrabiblos , the 2nd-century astrological <ARG0>text</ARG0> of the Alexandrian astronomer Ptolemy , Algol is referred to as " the Gorgon of Perseus " and associated with <ARG1>death</ARG1> by decapitation : a theme which mirrors the myth of the hero Perseus 's victory over the snake-haired Gorgon Medusa .
In the Tetrabiblos, the 2nd-century astrological <ARG0>text</ARG0> of the Alexandrian astronomer Ptolemy, Algol is referred to as "the Gorgon of Perseus" and associated with <ARG1>death</ARG1> by decapitation: a theme which mirrors the myth of the hero Perseus's victory over the snake-haired Gorgon Medusa.


### 2. Parse CauseNet

CauseNet provides two types of data

1. "XXXXX.txt" files: Manually annotated training and evaluation data for the concept spotting. (Arguments are single nouns, can be inputs to sequence classification and pair classification.)
> Under r"D:\59 Github Projects\0004 CauseNet\data\concept-spotting\sentences"
> <br>See http://localhost:8888/notebooks/08%20Thesis/03%20Projects/EconomicMeasures/causal_data/Get%20CauseNet_Manual.ipynb

2. "XXXXX.tsv" files: Output data of the extraction components (Arguments can be longer, potentially can be used for all tasks.)
> Under "D:\59 Github Projects\0004 CauseNet\data\causality-graphs\extraction\wikipedia"

to do:
1. check number of unique patterns 

In [19]:
import json
import re
import pandas as pd
from tqdm import tqdm


def is_valid_article(title):
    forbidden_title_parts = ['Wikipedia:', 'Template:', 'File:',
                            'Portal:', 'Category:', 'Draft:',
                            'List of', 'disambiguation']

    contains_forbidden_title_part = False
    for forbidden_title_part in forbidden_title_parts:
        if forbidden_title_part in title:
            contains_forbidden_title_part = True
            break

    return not contains_forbidden_title_part


def get_original2spaced(splitted_text, splitted_text_w_spaces, verbose=False):
    original2spaced={}
    tmp=[]
    counter=0
    for ii,tt in enumerate(splitted_text):
        lt=''
        for i,t in enumerate(splitted_text_w_spaces[counter:]):
            lt+=t
            if lt==tt:
                if verbose: print(tt, '-->', lt)
                original2spaced[ii]=(counter,(i+1)+counter)
                # reset
                lt=''
                counter+=(i+1)
                break
    return original2spaced


def get_text_w_pairs(d, retain_original_spacing=True):
    splitted_text = d['tokens']
    cause_text = splitted_text[d['cause_id']]
    effect_text = splitted_text[d['effect_id']]
    try:
        assert(re.sub("[^a-zA-Z0-9 -]","",cause_text) == re.sub("[^a-zA-Z0-9 -]","",d['cause_word']))
        assert(re.sub("[^a-zA-Z0-9 -]","",effect_text) == re.sub("[^a-zA-Z0-9 -]","",d['effect_word']))
    except:
        return None
    splitted_text = splitted_text[:d['cause_id']] + ["<ARG0>"+str(cause_text)+"</ARG0>"] + splitted_text[d['cause_id']+1:]
    splitted_text = splitted_text[:d['effect_id']] + ["<ARG1>"+str(effect_text)+"</ARG1>"] + splitted_text[d['effect_id']+1:]
    
    if retain_original_spacing:
        original2spaced = get_original2spaced(
            splitted_text=str(d['sentence']).split(' '), 
            splitted_text_w_spaces=d['tokens'], 
            verbose=False
        )
        text_w_pairs = ''
        for k,(s,e) in original2spaced.items():
            text_w_pairs+= ''.join(splitted_text[s:e])+' '
        text_w_pairs = text_w_pairs[:-1]
    else:
        text_w_pairs = ' '.join(splitted_text)
    
    return text_w_pairs



def format_by_document(final_data, data):
    # complete document grouping
    data = pd.DataFrame(data, columns=['doc_id','text','text_w_pairs'])
    data['corpus']='causenet'
    data['seq_label'] = data['pair_label'] = 1 # all causal examples
    data['sent_id']= data.text.factorize()[0] # enumerate sentences in document
    data['eg_id'] = data.groupby('sent_id')['corpus'].cumcount() # enumerate examples per sentence
    data['context'] = '' # all single sentences
    data['num_sents'] = 1 # all single sentences
    data['text_w_pairs'] = data['text_w_pairs'].fillna('')
    data['index'] = data['corpus']+'_'+data['doc_id'].astype(str)+'_'+data['sent_id'].astype(str)+'_'+data['eg_id'].astype(int).astype(str)
    data = data[cols]
    final_data = pd.concat([final_data, data], axis=0)
    data = []
    return final_data, data


sample_max = 50000
filepath = r"D:\59 Github Projects\0004 CauseNet\data\causality-graphs\extraction\wikipedia\wikipedia-extraction.tsv"
cols = ['corpus','doc_id','sent_id','eg_id','index','text','text_w_pairs','seq_label','pair_label','context','num_sents']
final_data = pd.DataFrame()
data = []
counter = 0
doc_counter = 0
current_doc_id = ''
for id_, line in enumerate(tqdm(open(filepath, encoding="utf-8"))):
    parts = line.strip().split('\t')
    if parts[0] != 'wikipedia_sentence':
        continue
    assert len(parts) == 11

    if not is_valid_article(parts[2]):
        continue
    
    for match in json.loads(parts[10]):
        doc_id = parts[1]
        if doc_id!=current_doc_id and current_doc_id!='':
            final_data, data = format_by_document(final_data, data)
            doc_counter+=1
        
        text = json.loads(parts[7])
        sentence_data = {
            "cause_word": match['Cause'][0],
            "cause_id": int(match['Cause'][1]),
            "effect_word": match['Effect'][0],
            "effect_id": int(match['Effect'][1]),
            "pattern": match['Pattern'],
            "sentence": text,
            "tokens": json.loads(parts[8]),
#             "dependencies": json.loads(parts[9])
        }        
        text_w_pairs = get_text_w_pairs(sentence_data, retain_original_spacing=True)
        if text_w_pairs is None:
            print(sentence_data,'\n')
        else:
            data.append([f"{doc_counter}_{doc_id}",text,text_w_pairs])
            current_doc_id = doc_id
            counter+=1
        
    if counter>sample_max+50:
        break

final_data, data = format_by_document(final_data, data)
final_data

179808it [02:30, 1198.46it/s]


Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents
0,causenet,0_316,0,0,causenet_0_316_0_0,This change resulted from the Art Director's b...,This <ARG1>change</ARG1> resulted from the Art...,1,1,,1
0,causenet,1_290,0,0,causenet_1_290_0_0,The Etruscans brought the Greek alphabet to th...,The <ARG0>Etruscans</ARG0> brought the Greek <...,1,1,,1
0,causenet,2_309,0,0,causenet_2_309_0_0,The Seattle Symphony also recorded a version i...,The Seattle Symphony also recorded a version i...,1,1,,1
1,causenet,2_309,1,0,causenet_2_309_1_0,Ravel's high praise of Gershwin in an introduc...,Ravel's high <ARG0>praise</ARG0> of Gershwin i...,1,1,,1
2,causenet,2_309,2,0,causenet_2_309_2_0,"While the studies were cut short, that 1926 tr...","While the studies were cut short, that 1926 <A...",1,1,,1
...,...,...,...,...,...,...,...,...,...,...,...
1,causenet,10452_34467,1,0,causenet_10452_34467_1_0,The new commands took the place of two existin...,The new commands took the place of two existin...,1,1,,1
2,causenet,10452_34467,2,0,causenet_10452_34467_2_0,Such core changes brought incompatibilities:\n...,Such core <ARG0>changes</ARG0> brought,1,1,,1
3,causenet,10452_34467,3,0,causenet_10452_34467_3_0,These changes resulted in minor incompatibilit...,These <ARG0>changes</ARG0> resulted in minor i...,1,1,,1
4,causenet,10452_34467,4,0,causenet_10452_34467_4_0,"An ""attribute"" consists of a foreground and a ...","An ""attribute"" consists of a foreground and a ...",1,1,,1


In [20]:
len(final_data.dropna())

50051

In [21]:
final_data[:sample_max].to_csv('cleaned/causenet.csv', index=False, encoding='utf-8-sig')

In [22]:
final_data.to_csv('cleaned/causenet_all.csv', index=False, encoding='utf-8-sig')

# random split 

In [23]:
import numpy as np

In [29]:
np.random.seed(42)
unique_docs = list(final_data['doc_id'].unique())
r_list = list(np.random.choice(unique_docs, int(round(len(unique_docs)*0.1,0)), replace=False))
r_list

['6134_20452',
 '5270_17753',
 '3210_11109',
 '3643_12478',
 '2908_10046',
 '6319_20979',
 '7523_24714',
 '1780_6416',
 '2157_7617',
 '2544_8816',
 '7861_25829',
 '33_642',
 '5121_17204',
 '7966_26193',
 '450_1969',
 '914_3797',
 '35_624',
 '8583_28184',
 '795_3231',
 '7960_26176',
 '7048_23336',
 '8493_27808',
 '10234_33687',
 '6620_21891',
 '1713_6206',
 '2903_9995',
 '8221_26994',
 '7944_26118',
 '8224_27024',
 '1297_4882',
 '8918_29074',
 '1145_4476',
 '5678_19070',
 '6946_23053',
 '7685_25237',
 '7903_25982',
 '3724_12730',
 '7542_24762',
 '3284_11369',
 '6502_21461',
 '7375_24269',
 '8144_26830',
 '7583_24872',
 '3535_12221',
 '5763_19316',
 '1373_5195',
 '2974_10273',
 '7957_26173',
 '4417_14843',
 '5683_19075',
 '5753_19293',
 '9456_30972',
 '3819_13035',
 '8829_28829',
 '8959_29199',
 '4782_16103',
 '10017_32623',
 '7918_26032',
 '6289_20892',
 '1523_5666',
 '10273_33864',
 '7109_23545',
 '4047_13774',
 '3902_13396',
 '6550_21683',
 '39_627',
 '4429_14909',
 '8832_28758',
 '67

In [30]:
def save_list(data:list, txt_file_path:str):
    with open(txt_file_path, "w") as f:
        for s in data:
            f.write(str(s) +"\n")
            
save_list(r_list, 'cleaned/causenet-test-doc_id.txt')

In [32]:
# check
def open_list(txt_file_path:str, typ:type=int):
    items = []
    with open(txt_file_path, "r") as f:
        for line in f:
            items.append(typ(line.strip()))
    return items

splits = {
    'causenet': {
        'test': open_list('cleaned/causenet-test-doc_id.txt', str)
    }
}

splits

{'causenet': {'test': ['6134_20452',
   '5270_17753',
   '3210_11109',
   '3643_12478',
   '2908_10046',
   '6319_20979',
   '7523_24714',
   '1780_6416',
   '2157_7617',
   '2544_8816',
   '7861_25829',
   '33_642',
   '5121_17204',
   '7966_26193',
   '450_1969',
   '914_3797',
   '35_624',
   '8583_28184',
   '795_3231',
   '7960_26176',
   '7048_23336',
   '8493_27808',
   '10234_33687',
   '6620_21891',
   '1713_6206',
   '2903_9995',
   '8221_26994',
   '7944_26118',
   '8224_27024',
   '1297_4882',
   '8918_29074',
   '1145_4476',
   '5678_19070',
   '6946_23053',
   '7685_25237',
   '7903_25982',
   '3724_12730',
   '7542_24762',
   '3284_11369',
   '6502_21461',
   '7375_24269',
   '8144_26830',
   '7583_24872',
   '3535_12221',
   '5763_19316',
   '1373_5195',
   '2974_10273',
   '7957_26173',
   '4417_14843',
   '5683_19075',
   '5753_19293',
   '9456_30972',
   '3819_13035',
   '8829_28829',
   '8959_29199',
   '4782_16103',
   '10017_32623',
   '7918_26032',
   '6289_20892