In [1]:
import pandas as pd
import numpy as np
import os
import re
from collections import Counter

# Folder the per-split CSV files are read from; the path is relative, so it is
# resolved against the kernel's current working directory.
data_dir = "../data/V2"

### Subtask 1

In [2]:
# Split names; each one maps to a "<split>_subtask1.csv" file under data_dir.
train_splits = ['train','dev']
test_splits = ['test']

# Read every split once and keep it addressable by name.
data_dict = {}
for split in train_splits+test_splits:
    data_dict[split] = pd.read_csv(os.path.join(data_dir, f"{split}_subtask1.csv"))
    print(f'{split}: n={len(data_dict[split])}')

# Stack the splits with a single concat instead of growing a frame inside the
# loop: that re-copied all previously loaded rows on every iteration and
# started from an empty DataFrame, which newer pandas versions warn about.
# The list is materialised before the 'all' entry is added to data_dict.
df = pd.concat(list(data_dict.values()), axis=0, ignore_index=True)

# 'all' is the union of the splits above; `split` is left pointing at it.
split = 'all'
data_dict[split] = df
print(f'{split}: n={len(data_dict[split])}')

train: n=3072
dev: n=340
test: n=313
all: n=3725


In [3]:
# Index the combined frame by its key rather than through the `split` variable,
# which other cells rebind while looping over the splits.
data_dict['all'].columns

Index(['index', 'text', 'label'], dtype='object')

In [4]:
from nltk import word_tokenize, sent_tokenize

In [5]:
# For every split (combined frame first): print the label distribution, then
# the average text length in characters, NLTK word tokens and NLTK sentences,
# reported per label value and for the split as a whole.
for split in ['all'] + train_splits + test_splits:
    df = data_dict[split].copy()
    label_counts = dict(Counter(df['label']))
    print(f"\n{split}'s distribution of Causal Label:", label_counts)

    for causal_label in [1, 0, 'all']:
        # 'all' is a sentinel meaning "do not filter on the label column".
        label_df = df.copy() if causal_label == 'all' else df[df['label'] == causal_label]
        texts = label_df['text']

        # Length of each text at three granularities.
        char_lens = list(map(len, texts))
        word_lens = [len(word_tokenize(text)) for text in texts]
        sent_lens = [len(sent_tokenize(text)) for text in texts]

        avg_chars = round(np.mean(char_lens), 2)
        avg_words = round(np.mean(word_lens), 2)
        avg_sents = round(np.mean(sent_lens), 2)

        print(
            f"For label={causal_label}, n={len(label_df)}, avg #chars: {avg_chars}, "
            f"#words: {avg_words}, #sents: {avg_sents}"
        )


all's distribution of Causal Label: {1: 1986, 0: 1739}
For label=1, n=1986, avg #chars: 193.83, #words: 34.22, #sents: 1.03
For label=0, n=1739, avg #chars: 150.02, #words: 27.0, #sents: 1.06
For label=all, n=3725, avg #chars: 173.38, #words: 30.85, #sents: 1.04

train's distribution of Causal Label: {1: 1623, 0: 1449}
For label=1, n=1623, avg #chars: 189.41, #words: 33.47, #sents: 1.02
For label=0, n=1449, avg #chars: 148.52, #words: 26.71, #sents: 1.06
For label=all, n=3072, avg #chars: 170.12, #words: 30.28, #sents: 1.04

dev's distribution of Causal Label: {1: 185, 0: 155}
For label=1, n=185, avg #chars: 194.11, #words: 34.41, #sents: 1.04
For label=0, n=155, avg #chars: 147.83, #words: 26.85, #sents: 1.04
For label=all, n=340, avg #chars: 173.01, #words: 30.96, #sents: 1.04

test's distribution of Causal Label: {1: 178, 0: 135}
For label=1, n=178, avg #chars: 233.83, #words: 40.81, #sents: 1.07
For label=0, n=135, avg #chars: 168.73, #words: 30.25, #sents: 1.13
For label=all, n=3

### Subtask 2

In [6]:
# Split names; each one maps to a "<split>_subtask2_grouped.csv" file under
# data_dir. (The original cell declared these two lists twice in a row; the
# redundant copy is dropped.)
train_splits = ['train','dev']
test_splits = ['test']

# Read every split and keep only rows that carry at least one relation
# (num_rs > 0), re-indexed from zero.
data_dict = {}
for split in train_splits+test_splits:
    tmp_df = pd.read_csv(os.path.join(data_dir, f"{split}_subtask2_grouped.csv"))
    data_dict[split] = tmp_df[tmp_df['num_rs']>0].reset_index(drop=True)
    print(f'{split}: n={len(data_dict[split])}')

# Stack the filtered splits with a single concat instead of growing a frame
# inside the loop: that re-copied all previously loaded rows on every iteration
# and started from an empty DataFrame, which newer pandas versions warn about.
# The list is materialised before the 'all' entry is added to data_dict.
df = pd.concat(list(data_dict.values()), axis=0, ignore_index=True)

# 'all' is the union of the splits above; `split` is left pointing at it.
split = 'all'
data_dict[split] = df
print(f'{split}: n={len(data_dict[split])}')

train: n=1623
dev: n=185
test: n=90
all: n=1898


In [7]:
# Index the combined frame by its key rather than through the `split` variable,
# which other cells rebind while looping over the splits.
data_dict['all'].columns

Index(['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'text',
       'causal_text_w_pairs', 'num_rs'],
      dtype='object')

In [8]:
from ast import literal_eval

def get_args(text_w_pairs, search_pattern=r'<ARG0>(.*?)</ARG0>', do_join=True):
    """Extract the spans captured by `search_pattern` from a tagged string.

    Parameters
    ----------
    text_w_pairs : str
        Text containing inline markup such as ``<ARG0>...</ARG0>``.
    search_pattern : str
        Regular expression handed to ``re.findall``; the default captures the
        content of every ``<ARG0>`` element.
    do_join : bool
        When True the matches are joined with a single space and returned as
        one string; when False the list of matches is returned unchanged.

    Returns
    -------
    str or list
        Joined string (empty when nothing matches) if `do_join`, otherwise the
        list produced by ``re.findall``.
    """
    matches = re.findall(search_pattern, text_w_pairs)
    return ' '.join(matches) if do_join else matches

# Sanity check: with do_join=False every signal span comes back as its own
# list item.
get_args(
    '<SIG0>Example, sentence</SIG0> <SIG2>this is</SIG2>.',
    search_pattern=r'<SIG.>(.*?)</SIG.>',
    do_join=False,
)

['Example, sentence', 'this is']

In [9]:
# For every split (combined frame first): print sentence and relation counts,
# the average sentence length, and the average cause / effect / signal span
# lengths in NLTK word tokens, plus how many signals each relation carries.
for split in ['all'] + train_splits + test_splits:
    split_df = data_dict[split]

    print(f'\n{split}')
    print('# Sentences:', len(split_df))
    print('# Relations:', split_df['num_rs'].sum())

    # Whole-sentence length in word tokens.
    word_lens = [len(word_tokenize(sentence)) for sentence in split_df['text']]
    print('Avg. # words:', round(np.mean(word_lens), 2))

    # Each row stores its tagged relation texts as a Python literal in string
    # form; parse every row and flatten to one entry per relation.
    relation_texts = [
        tagged_text
        for serialized in split_df['causal_text_w_pairs']
        for tagged_text in list(literal_eval(serialized))
    ]

    # Cause spans are tagged ARG0, effect spans ARG1; multiple spans of the
    # same kind are joined before tokenizing.
    cause_lens = [
        len(word_tokenize(get_args(tagged_text, search_pattern=r'<ARG0>(.*?)</ARG0>')))
        for tagged_text in relation_texts
    ]
    effect_lens = [
        len(word_tokenize(get_args(tagged_text, search_pattern=r'<ARG1>(.*?)</ARG1>')))
        for tagged_text in relation_texts
    ]

    # Signals are kept as separate spans so they can be counted per relation
    # and measured individually.
    signals_per_relation = [
        get_args(tagged_text, r'<SIG.>(.*?)</SIG.>', False)
        for tagged_text in relation_texts
    ]
    signal_counts = [len(spans) for spans in signals_per_relation]
    signal_lens = [
        len(word_tokenize(span))
        for spans in signals_per_relation
        for span in spans
    ]

    print('Cause - Avg. # words:', round(np.mean(cause_lens), 2))
    print('Effect - Avg. # words:', round(np.mean(effect_lens), 2))
    print('Signal - Avg. # words:', round(np.mean(signal_lens), 2))
    print(f'Distribution of signals: {Counter(signal_counts)}')
    print('Avg # signals per example:', round(np.mean(signal_counts), 2))
    # Share of relations that have at least one signal span.
    has_signal = [int(count > 0) for count in signal_counts]
    print('Avg # rels with signals:', round(np.mean(has_signal), 2))


all
# Sentences: 1898
# Relations: 2631
Avg. # words: 33.33
Cause - Avg. # words: 11.63
Effect - Avg. # words: 10.61
Signal - Avg. # words: 1.45
Distribution of signals: Counter({1: 1760, 0: 832, 2: 38, 3: 1})
Avg # signals per example: 0.7
Avg # rels with signals: 0.68

train
# Sentences: 1623
# Relations: 2257
Avg. # words: 33.47
Cause - Avg. # words: 11.56
Effect - Avg. # words: 10.71
Signal - Avg. # words: 1.45
Distribution of signals: Counter({1: 1514, 0: 712, 2: 30, 3: 1})
Avg # signals per example: 0.7
Avg # rels with signals: 0.68

dev
# Sentences: 185
# Relations: 249
Avg. # words: 34.41
Cause - Avg. # words: 12.2
Effect - Avg. # words: 10.18
Signal - Avg. # words: 1.53
Distribution of signals: Counter({1: 154, 0: 92, 2: 3})
Avg # signals per example: 0.64
Avg # rels with signals: 0.63

test
# Sentences: 90
# Relations: 125
Avg. # words: 28.62
Cause - Avg. # words: 11.75
Effect - Avg. # words: 9.54
Signal - Avg. # words: 1.42
Distribution of signals: Counter({1: 92, 0: 28, 2: