## Rhetorical relations classification used in tree building: Step 1. Data preparation

Split the data into train/dev/test sets and save each split to its own `.pkl` file.

Output:
 - ``data_labeling/*``

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from utils.train_test_split import split_train_dev_test

# Partition the corpus under ./data into three collections of file names;
# each is iterated file-by-file when the samples are loaded below.
train, dev, test = split_train_dev_test('./data')

news in train: 0.5344827586206896,	in dev: 0.6470588235294118,	in test: 0.6086956521739131
ling in train: 0.0,	in dev: 0.0,	in test: 0.0
comp in train: 0.0,	in dev: 0.0,	in test: 0.0
blog in train: 0.43103448275862066,	in dev: 0.5294117647058824,	in test: 0.4782608695652174


In [3]:
import pandas as pd
from utils.file_reading import read_gold

# NOTE(review): `table` is not referenced again in the visible cells — confirm it is still needed.
table = read_gold('data/all_pairs')
random_state = 45

# Read the gold annotations (with features) for every file of each split.
# The '.edus' part of each file name is removed before the name is handed to read_gold.
train_samples = [read_gold(path.replace('.edus', ''), features=True) for path in train]
dev_samples = [read_gold(path.replace('.edus', ''), features=True) for path in dev]
test_samples = [read_gold(path.replace('.edus', ''), features=True) for path in test]

# Merge the per-file frames of each split, shuffle the rows with a fixed seed
# and renumber the index from zero.
train_samples = (
    pd.concat(train_samples)
    .sample(frac=1, random_state=random_state)
    .reset_index(drop=True)
)
dev_samples = (
    pd.concat(dev_samples)
    .sample(frac=1, random_state=random_state)
    .reset_index(drop=True)
)
test_samples = (
    pd.concat(test_samples)
    .sample(frac=1, random_state=random_state)
    .reset_index(drop=True)
)

In [4]:
# Distinct category labels present in the test split at this point
# (this cell comes before prepare_data is applied below).
test_samples.category_id.unique()

array(['solutionhood', 'joint', 'evaluation', 'preparation',
       'cause-effect', 'elaboration', 'cause', 'background', 'sequence',
       'restatement', 'purpose', 'evidence', 'attribution', 'contrast',
       'condition', 'same-unit', 'interpretation-evaluation',
       'concession', 'comparison'], dtype=object)

In [5]:
import os
from utils.prepare_sequence import _prepare_sequence


def correct_samples(row):
    """Fix punctuation that ended up at the start of a snippet.

    * A leading ',', '.', '!' or '?' in ``snippet_x`` is dropped.
    * A leading ',' or '.' in ``snippet_y`` is moved to the end of
      ``snippet_x``.

    Empty snippets are left untouched instead of raising ``IndexError``.

    Args:
        row: object with string attributes ``snippet_x`` and ``snippet_y``
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The same ``row`` object, modified in place.
    """
    # str.startswith accepts a tuple of prefixes and is simply False for '',
    # whereas indexing snippet[0] fails on an empty string.
    if row.snippet_x.startswith((',', '.', '!', '?')):
        row.snippet_x = row.snippet_x[1:].strip()
    if row.snippet_y.startswith((',', '.')):
        row.snippet_x += row.snippet_y[0]
        row.snippet_y = row.snippet_y[1:].strip()
    return row

def prepare_data(data, max_len=100):
    """Filter a split and derive the text snippets and relation labels.

    Steps, in order:
      1. keep only rows where both ``tokens_x`` and ``tokens_y`` have fewer
         than ``max_len`` elements;
      2. build ``snippet_x`` / ``snippet_y`` by joining the tokens with spaces
         and repair leading punctuation with ``correct_samples``;
      3. drop rows with an empty snippet and rows labelled ``'span'``;
      4. normalise ``category_id`` (0.0 -> 'same-unit', cut at the first '_',
         merge labels via ``target_map``) and ``order`` (0.0 -> 'NN');
      5. build ``relation`` as ``<category_id>_<order>`` and merge rare
         combinations via ``relation_map``;
      6. run ``_prepare_sequence`` over both snippets.

    Args:
        data: DataFrame with columns ``tokens_x``, ``tokens_y``,
            ``category_id`` and ``order``. It is not modified.
        max_len: exclusive upper bound on the token count of each side.

    Returns:
        A new DataFrame with the rows that survived filtering and the
        added/updated columns ``snippet_x``, ``snippet_y``, ``category_id``,
        ``order`` and ``relation``.
    """
    # Category labels merged into a single class.
    target_map = {
        'relation': 'joint',
        'antithesis': 'contrast',
        'cause': 'cause-effect',
        'effect': 'cause-effect',
        'conclusion': 'restatement',
        'interpretation': 'interpretation-evaluation',
        'evaluation': 'interpretation-evaluation',
        'motivation': 'condition',
    }

    # "<category>_<order>" labels merged into another relation class.
    # NOTE(review): target_map has already renamed 'evaluation' to
    # 'interpretation-evaluation' by the time this map is applied, so the
    # 'evaluation_SN' key can never match — confirm whether
    # 'interpretation-evaluation_SN' was intended.
    relation_map = {
        'restatement_SN': 'restatement_NN',
        'restatement_NS': 'restatement_NN',
        'contrast_SN': 'contrast_NN',
        'contrast_NS': 'contrast_NN',
        'solutionhood_NS': 'elaboration_NS',
        'preparation_NS': 'elaboration_NS',
        'concession_SN': 'preparation_SN',
        'evaluation_SN': 'preparation_SN',
        'elaboration_SN': 'preparation_SN',
        'evidence_SN': 'preparation_SN',
        'background_SN': 'preparation_SN'
    }

    # Explicit copy: columns are assigned below, and writing to a filtered
    # slice triggers pandas' SettingWithCopyWarning.
    short_enough = (data.tokens_x.map(len) < max_len) & (data.tokens_y.map(len) < max_len)
    data = data[short_enough].copy()

    data['snippet_x'] = data.tokens_x.map(' '.join)
    data['snippet_y'] = data.tokens_y.map(' '.join)

    # Discard empty snippets before the punctuation fix so it never has to
    # look at the first character of an empty string.
    data = data[(data.snippet_x.map(len) > 0) & (data.snippet_y.map(len) > 0)]
    data = data.apply(correct_samples, axis=1)

    # The fix can strip a snippet down to nothing, so filter once more.
    keep = (
        (data.snippet_x.map(len) > 0)
        & (data.snippet_y.map(len) > 0)
        & (data.category_id != 'span')
    )
    data = data[keep].copy()

    # The 0.0 placeholder must be replaced *before* splitting: a float has no
    # .split(), and after the split every value is a string so the replace
    # could never match.
    data['category_id'] = data['category_id'].replace([0.0], 'same-unit')
    data['category_id'] = data['category_id'].map(lambda label: label.split('_')[0])
    data['order'] = data['order'].replace([0.0], 'NN')
    data['category_id'] = data['category_id'].replace(target_map, regex=False)

    data['relation'] = data['category_id'] + '_' + data['order']
    data['relation'] = data['relation'].replace(relation_map, regex=False)

    data['snippet_x'] = data.snippet_x.map(_prepare_sequence)
    data['snippet_y'] = data.snippet_y.map(_prepare_sequence)

    return data


# Apply the same filtering and label normalisation to every split.
train_samples = prepare_data(train_samples)
dev_samples = prepare_data(dev_samples)
test_samples = prepare_data(test_samples)

OUT_PATH = 'data_labeling'
# exist_ok=True keeps the cell re-runnable: a plain `mkdir` reports an error
# whenever the directory is already there, and depends on the shell.
os.makedirs(OUT_PATH, exist_ok=True)
train_samples.to_pickle(os.path.join(OUT_PATH, 'train_samples.pkl'))
dev_samples.to_pickle(os.path.join(OUT_PATH, 'dev_samples.pkl'))
test_samples.to_pickle(os.path.join(OUT_PATH, 'test_samples.pkl'))

mkdir: cannot create directory ‘data_labeling’: File exists


In [6]:
# Number of training samples per category_id after prepare_data has merged the labels.
train_samples.category_id.value_counts()

joint                        3675
elaboration                  3158
attribution                  1420
cause-effect                 1339
contrast                     1255
interpretation-evaluation     950
condition                     908
sequence                      586
purpose                       579
preparation                   573
same-unit                     556
evidence                      305
comparison                    302
background                    282
concession                    218
solutionhood                  189
restatement                   104
Name: category_id, dtype: int64

In [7]:
# Absolute sample count per `relation` label in the training split,
# as a bare array (the label names are dropped by `.values`).
counts = train_samples['relation'].value_counts(normalize=False).values
counts

array([3675, 3156, 1255,  984,  832,  748,  738,  591,  586,  556,  521,
        478,  436,  387,  302,  267,  257,  178,  129,  118,  104,  101])

In [8]:
# NOTE(review): this cell is an exact duplicate of the previous one — consider removing it.
counts = train_samples['relation'].value_counts(normalize=False).values
counts

array([3675, 3156, 1255,  984,  832,  748,  738,  591,  586,  556,  521,
        478,  436,  387,  302,  267,  257,  178,  129,  118,  104,  101])

In [9]:
# Stack the three prepared splits into one frame for corpus-wide statistics.
all_data = pd.concat([train_samples, dev_samples, test_samples])

In [10]:
# Absolute sample count per `relation` label over train + dev + test.
all_data['relation'].value_counts(normalize=False)

joint_NN                        5122
elaboration_NS                  4485
contrast_NN                     1812
attribution_SN                  1297
interpretation-evaluation_NS    1246
preparation_SN                  1084
cause-effect_SN                 1067
sequence_NN                      948
cause-effect_NS                  857
same-unit_NN                     789
condition_SN                     719
purpose_NS                       662
attribution_NS                   564
condition_NS                     528
comparison_NN                    402
background_NS                    391
evidence_NS                      339
solutionhood_SN                  278
concession_NS                    176
interpretation-evaluation_SN     166
purpose_SN                       140
restatement_NN                   138
Name: relation, dtype: int64

In [11]:
# Same counts as a bare array; this rebinds `counts`, replacing the
# train-only array assigned in the earlier cells.
counts = all_data['relation'].value_counts(normalize=False).values
counts

array([5122, 4485, 1812, 1297, 1246, 1084, 1067,  948,  857,  789,  719,
        662,  564,  528,  402,  391,  339,  278,  176,  166,  140,  138])

In [12]:
# Sample count per `relation` label in the training split, with label names.
train_samples['relation'].value_counts()

joint_NN                        3675
elaboration_NS                  3156
contrast_NN                     1255
attribution_SN                   984
interpretation-evaluation_NS     832
cause-effect_SN                  748
preparation_SN                   738
cause-effect_NS                  591
sequence_NN                      586
same-unit_NN                     556
condition_SN                     521
purpose_NS                       478
attribution_NS                   436
condition_NS                     387
comparison_NN                    302
background_NS                    267
evidence_NS                      257
solutionhood_SN                  178
concession_NS                    129
interpretation-evaluation_SN     118
restatement_NN                   104
purpose_SN                       101
Name: relation, dtype: int64

In [13]:
# Sample count per `relation` label in the dev split, with label names.
dev_samples['relation'].value_counts()

joint_NN                        837
elaboration_NS                  709
contrast_NN                     318
interpretation-evaluation_NS    266
sequence_NN                     232
preparation_SN                  213
attribution_SN                  187
cause-effect_SN                 185
cause-effect_NS                 172
same-unit_NN                    140
condition_SN                    122
purpose_NS                      100
background_NS                    96
attribution_NS                   84
condition_NS                     71
comparison_NN                    59
solutionhood_SN                  57
evidence_NS                      56
concession_NS                    34
interpretation-evaluation_SN     31
purpose_SN                       18
restatement_NN                   16
Name: relation, dtype: int64