## Binary structure classification used in tree building: Step 1. Negative samples generation

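Generate negative samples for every annotated document (originally ``filename.rs3``, read here as ``data/filename.json``) and save them as ``data/filename.json.neg``; then extract their features and create the train, dev and test sets.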

Output:
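 - ``data/*.json.neg`` (negative samples) and ``data/*.neg.features`` (their extracted features)
 - ``data_structure/*`` (``train_samples.pkl``, ``dev_samples.pkl``, ``test_samples.pkl``)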

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from utils.print_tree import printBTree

import os
import glob
import pandas as pd
import pickle
from tqdm import tqdm_notebook as tqdm
from isanlp.annotation_rst import DiscourseUnit

from utils.evaluation import extr_pairs, extr_pairs_forest
from utils.file_reading import *

import sys

sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')

In [None]:
from isanlp_rst.src.isanlp_rst.rst_tree_predictor import RSTTreePredictor, GoldTreePredictor

In [None]:
class RandomNegativeGenerator(object):
    """Build negative (non-related) snippet pairs for a single document.

    A negative sample is a pair of textually adjacent spans that does not occur
    (in either order) as a ``snippet_x``/``snippet_y`` pair in the gold table.
    """

    def __call__(self, edus, corpus, annot_text, filename=None):
        """Return the negative samples of one document as a DataFrame.

        Args:
            edus: sequence of EDU strings in document order.
            corpus: gold DataFrame with ``snippet_x`` and ``snippet_y`` columns.
            annot_text: full text of the document, used to locate the snippets.
            filename: value stored in the ``filename`` column. When omitted, the
                module-level variable ``filename`` is used (legacy behaviour of the
                notebook loop); ``None`` is stored if that variable does not exist.

        Returns:
            DataFrame with columns ``filename``, ``snippet_x``, ``snippet_y``,
            ``relation`` (always ``False``), ``loc_x`` and ``loc_y``.
            ``loc_x`` is the offset of ``snippet_x`` in ``annot_text`` (-1 if absent);
            ``loc_y`` is the offset of ``snippet_y`` *relative to the end of
            snippet_x*, not an absolute position.
        """
        if filename is None:
            # Backward compatibility: callers used to rely on a global ``filename``.
            filename = globals().get('filename')

        new_set = self.create_training_set(edus, corpus)
        result = [(filename, left, right, relation) for left, right, relation in new_set]

        tmp = pd.DataFrame(result, columns=['filename', 'snippet_x', 'snippet_y', 'relation'])

        # Computed column-wise so that both location columns exist even when
        # there are no negative samples at all.
        loc_x = [annot_text.find(left) for left in tmp['snippet_x']]
        loc_y = [annot_text[start + len(left):].find(right)
                 for start, left, right in zip(loc_x, tmp['snippet_x'], tmp['snippet_y'])]
        tmp['loc_x'] = loc_x
        tmp['loc_y'] = loc_y

        return tmp

    def __name__(self):
        """Return the generator name; callers dispatch on this string."""
        return 'RandomNegativeGenerator'

    def create_training_set(self, edus, gold):
        """Collect unique ``(left, right, False)`` negative pairs.

        Candidates come from three sources, each kept only if the pair is absent
        from ``gold``:
          1. every two consecutive EDUs;
          2. every gold snippet with the EDU right before / right after it;
          3. every two gold snippets whose EDU index ranges are adjacent.

        The result is de-duplicated while keeping first-seen order, so repeated
        runs on the same input yield the same list.
        """
        training_set = []

        # (EDU indices, snippet text) for both sides of every gold pair that
        # could be aligned with the EDU list on both sides.
        snippet_cache = []
        for e in gold.index:
            snippet_x = gold.loc[e, 'snippet_x']
            cache_x = self.extract_snippet_ids(snippet_x, edus)

            snippet_y = gold.loc[e, 'snippet_y']
            cache_y = self.extract_snippet_ids(snippet_y, edus)

            if cache_x and cache_y:
                snippet_cache.append((cache_x, snippet_x))
                snippet_cache.append((cache_y, snippet_y))

        # 1. neighbouring EDUs
        for left, right in zip(edus[:-1], edus[1:]):
            if not self.check_snippet_pair_in_dataset(gold, left, right):
                training_set.append((left, right, False))

        # 2. gold snippets with their neighbouring EDUs
        for i in gold.index:
            training_set += self.extract_negative_samples_for_snippet(gold, edus, gold.loc[i, 'snippet_x'])
            training_set += self.extract_negative_samples_for_snippet(gold, edus, gold.loc[i, 'snippet_y'])

        # 3. gold snippets adjacent to each other (checked in both directions)
        for i, (cache_i, snippet_i) in enumerate(snippet_cache):
            for cache_j, snippet_j in snippet_cache[i + 1:]:
                if cache_i[-1] + 1 == cache_j[0]:
                    if not self.check_snippet_pair_in_dataset(gold, snippet_i, snippet_j):
                        training_set.append((snippet_i, snippet_j, False))

                if cache_j[-1] + 1 == cache_i[0]:
                    if not self.check_snippet_pair_in_dataset(gold, snippet_j, snippet_i):
                        training_set.append((snippet_j, snippet_i, False))

        # dict keys keep insertion order, unlike set(), whose order for strings
        # changes between interpreter runs.
        return list(dict.fromkeys(training_set))

    def extract_snippet_ids(self, snippet, edus):
        """Return indices of the EDUs whose text is a substring of ``snippet``.

        The test is plain substring containment, so a short EDU may also match
        snippets it does not actually belong to.
        """
        return [edu_nm for edu_nm, edu in enumerate(edus) if (edu in snippet)]

    def check_snippet_pair_in_dataset(self, dataset, snippet_left, snippet_right):
        """Tell whether the pair occurs in ``dataset`` in either order."""
        direct = (dataset.snippet_x == snippet_left) & (dataset.snippet_y == snippet_right)
        swapped = (dataset.snippet_y == snippet_left) & (dataset.snippet_x == snippet_right)
        return bool(direct.any() or swapped.any())

    def extract_negative_samples_for_snippet(self, gold, edus, snippet):
        """Pair ``snippet`` with the EDU before and the EDU after it.

        Returns a list of ``(left, right, False)`` tuples for those neighbours
        that do not form a gold pair with ``snippet``; an empty list when the
        snippet cannot be aligned with any EDU.
        """
        training_set = []

        snippet_ids = self.extract_snippet_ids(snippet, edus)

        if not snippet_ids:
            return []

        first_id, last_id = snippet_ids[0], snippet_ids[-1]

        if first_id > 0:
            previous_edu = edus[first_id - 1]
            if not self.check_snippet_pair_in_dataset(gold, snippet, previous_edu):
                training_set.append((previous_edu, snippet, False))

        if last_id < len(edus) - 1:
            next_edu = edus[last_id + 1]
            if not self.check_snippet_pair_in_dataset(gold, snippet, next_edu):
                training_set.append((snippet, next_edu, False))

        return training_set

In [None]:
import sys
import numpy as np


class GreedyNegativeGenerator:
    """ Inversed greedy parser based on gold tree predictor.

    Replays a greedy bottom-up merge of adjacent nodes driven by the scores of
    ``GoldTreePredictor`` and, after each merge, records the newly formed
    neighbouring pairs whose predicted probability is exactly 0 as negative nodes.
    """
    def __init__(self):
        # Merging goes on only while some adjacent-pair score is above this value.
        self.forest_threshold = 0.01

    def __call__(self, edus, corpus, annot_text):
        """Collect negative nodes for one document.

        Args:
            edus: list of leaf nodes; the code reads ``.id`` of the last one and
                ``.start`` / ``.end`` of every node.
            corpus: passed as is to ``GoldTreePredictor``.
            annot_text: document text, sliced with node ``start``/``end`` offsets.

        Returns:
            List of ``DiscourseUnit`` objects created with ``id=None``.
        """
        def to_merge(scores):
            # Index of the best-scoring adjacent pair.
            return np.argmax(np.array(scores))

        negative_nodes = []

        # A fresh predictor is built on every call and kept on the instance.
        self.tree_predictor = GoldTreePredictor(corpus)
        nodes = edus
        max_id = edus[-1].id

        # initialize scores
        features = self.tree_predictor.initialize_features(nodes)
        scores = list(map(self.tree_predictor.predict_pair_proba, features))
        # NOTE(review): `relations` and `nuclearities` are not read anywhere below.
        relations = list(map(self.tree_predictor.predict_label, features))
        nuclearities = list(map(self.tree_predictor.predict_nuclearity, features))

        while len(nodes) > 2 and any([score > self.forest_threshold for score in scores]):
            # select two nodes to merge
            j = to_merge(scores)  # position of the pair in list

            # make the new node by merging node[j] + node[j+1]
            temp = DiscourseUnit(
                id=max_id + 1,
                left=nodes[j],
                right=nodes[j + 1],
                relation=self.tree_predictor.predict_label(features[j]),
                nuclearity=self.tree_predictor.predict_nuclearity(features[j]),
                proba=scores[j],
                text=annot_text[nodes[j].start:nodes[j + 1].end].strip()
            )

            max_id += 1

            # modify the node list
            # (from here on nodes[j] is the merged node and the list is one item shorter)
            nodes = nodes[:j] + [temp] + nodes[j + 2:]

            # modify the scores list
            if j == 0:
                # Merged node is the first one: only the pair with its right neighbour is new.
                features_right = self.tree_predictor.extract_features(nodes[j], nodes[j + 1])
                predicted = self.tree_predictor.predict_pair_proba(features_right)

                scores = [predicted] + scores[j + 2:]
                features = [features_right] + features[j + 2:]

                if predicted == 0:
                    relation = self.tree_predictor.predict_label(features_right)
                    # NOTE(review): the `relation == 'relation'` filter is applied here and to the
                    # left pair of the middle branch, but not to the other two cases — confirm intent.
                    if relation == 'relation':
                        negative_nodes.append(
                            DiscourseUnit(
                                id=None,
                                left=nodes[j],
                                right=nodes[j + 1],
                                relation=relation,
                                nuclearity=self.tree_predictor.predict_nuclearity(features_right),
                                proba=predicted,
                                text=annot_text[nodes[j].start:nodes[j + 1].end].strip()
                        ))

            elif j + 1 < len(nodes):
                # Merged node has neighbours on both sides: re-score both new pairs.
                features_left = self.tree_predictor.extract_features(nodes[j - 1], nodes[j])
                predicted_left = self.tree_predictor.predict_pair_proba(features_left)
                if predicted_left == 0:
                    relation = self.tree_predictor.predict_label(features_left)
                    if relation == 'relation':
                        negative_nodes.append(
                            DiscourseUnit(
                                id=None,
                                left=nodes[j - 1],
                                right=nodes[j],
                                relation=relation,
                                nuclearity=self.tree_predictor.predict_nuclearity(features_left),
                                proba=predicted_left,
                                text=annot_text[nodes[j - 1].start:nodes[j].end].strip()
                        ))

                features_right = self.tree_predictor.extract_features(nodes[j], nodes[j + 1])
                predicted_right = self.tree_predictor.predict_pair_proba(features_right)
                if predicted_right == 0:
                    negative_nodes.append(
                        DiscourseUnit(
                            id=None,
                            left=nodes[j],
                            right=nodes[j + 1],
                            relation=self.tree_predictor.predict_label(features_right),
                            nuclearity=self.tree_predictor.predict_nuclearity(features_right),
                            proba=predicted_right,
                            text=annot_text[nodes[j].start:nodes[j + 1].end].strip()
                    ))

                # Three old entries (pairs j-1, j, j+1) are replaced by the two new ones.
                scores = scores[:j - 1] + [predicted_left] + [predicted_right] + scores[j + 2:]
                features = features[:j - 1] + [features_left] + [features_right] + features[j + 2:]

            else:
                # Merged node is the last one: only the pair with its left neighbour is new.
                features_left = self.tree_predictor.extract_features(nodes[j - 1], nodes[j])
                predicted = self.tree_predictor.predict_pair_proba(features_left)
                if predicted == 0:
                    negative_nodes.append(
                        DiscourseUnit(
                            id=None,
                            left=nodes[j - 1],
                            right=nodes[j],
                            relation=self.tree_predictor.predict_label(features_left),
                            nuclearity=self.tree_predictor.predict_nuclearity(features_left),
                            proba=predicted,
                            text=annot_text[nodes[j - 1].start:nodes[j].end].strip()
                    ))

                scores = scores[:j - 1] + [predicted]
                features = features[:j - 1] + [features_left]

        # NOTE(review): the root built here only replaces the local `nodes`, which is
        # not returned or read afterwards — this looks like a leftover from the parser.
        if len(scores) == 1 and scores[0] > self.forest_threshold:
            root = DiscourseUnit(
                id=max_id + 1,
                left=nodes[0],
                right=nodes[1],
                relation='root',
                proba=scores[0]
            )
            nodes = [root]

        return negative_nodes

    def __name__(self):
        # Called as `gen.__name__()` to pick the processing branch for this generator.
        return 'GreedyNegativeGenerator'

### Make negative samples, save them

In [None]:
# Pick the sampling strategy (the greedy alternative is kept for switching by hand).
gen = RandomNegativeGenerator()
#gen = GreedyNegativeGenerator()

for filename in tqdm(glob.glob('./data/*.json')):
    # Keep the name `filename`: RandomNegativeGenerator.__call__ reads this global.
    filename = filename.replace('.json', '')
    df = read_gold(filename, features=True)
    edus = read_edus(filename)
    annot = read_annotation(filename)

    generator_name = gen.__name__()

    if generator_name == 'RandomNegativeGenerator':
        tmp = gen(edus, df, annot['text'])

    elif generator_name == 'GreedyNegativeGenerator':
        # The greedy generator works on DiscourseUnit leaves, so every EDU string is
        # first located in the text, searching from the end of the previous EDU.
        text = annot['text']
        _edus = []
        last_end = 0
        for edu_id, edu_text in enumerate(edus):
            start = len(text[:last_end]) + text[last_end:].find(edu_text)
            end = start + len(edu_text)
            _edus.append(
                DiscourseUnit(
                    id=edu_id,
                    left=None,
                    right=None,
                    relation='edu',
                    start=start,
                    end=end,
                    orig_text=text,
                    proba=1.
                )
            )
            last_end = end

        negative_forest = gen(_edus, df, text)
        pairs = pd.DataFrame(extr_pairs_forest(negative_forest),
                             columns=['snippet_x', 'snippet_y', 'category_id'])
        tmp = pairs[pairs.category_id == 'no_relation']

    # One file of negative samples per document, next to the source annotation.
    tmp.to_json(filename + '.json.neg')

### Extract features

In [None]:
%%time
from isanlp_rst.src.isanlp_rst.features_processor_default import FeaturesProcessor

# Shared feature extractor, called once per document by the feature-extraction loop.
# Presumably loads its models from the 'models' directory — verify against FeaturesProcessor.
features_processor = FeaturesProcessor(model_dir_path='models', verbose=0)

In [None]:
from tqdm import tqdm_notebook as tqdm


# Pairs with a side of MAX_LEN tokens or more are dropped from the feature files.
MAX_LEN = 100

for filename in tqdm(glob.glob("data/*.json.neg")):
    filename = filename.replace('.json.neg', '')

    df = read_negative(filename).drop(columns=['loc_y'])
    # Discard pairs with an empty side; the filters run one after another.
    for column in ('snippet_x', 'snippet_y'):
        df = df[df[column].str.len() > 0]

    annotation = read_annotation(filename)

    # Annotation layers in the positional order expected by the features processor.
    layers = [annotation[key] for key in ('text',
                                          'tokens',
                                          'sentences',
                                          'lemma',
                                          'morph',
                                          'postag',
                                          'syntax_dep_tree')]
    result = features_processor(df, *layers)

    result = result[result.is_broken == False]

    # Applied sequentially so that each filter only sees rows that passed the previous one.
    for column in ('tokens_x', 'tokens_y'):
        result = result[result[column].map(len) < MAX_LEN]

    result.to_pickle(filename + '.neg.features')

### Make train/dev/test splits

In [None]:
from utils.train_test_split import split_train_dev_test

# Partition the corpus under ./data into train / dev / test collections of file names
# (the split logic itself lives in utils.train_test_split).
train, dev, test = split_train_dev_test('./data')

In [None]:
import pandas as pd
from utils.file_reading import read_gold


# Seed used when shuffling the prepared samples.
random_state = 45


def _load_split(files):
    """Load gold and negative feature tables for ``files`` into one DataFrame.

    For every file the ``.edus`` part of the name is removed to get the document
    path. Gold pairs get ``relation = 1`` (overwriting whatever the column held),
    negative pairs get ``relation = 0``.
    """
    frames = []
    for file in tqdm(files):
        document = file.replace('.edus', '')

        gold = read_gold(document, features=True)
        gold['relation'] = 1

        negative = read_negative(document, features=True)
        negative['relation'] = 0

        frames.extend((gold, negative))

    return pd.concat(frames)


train_samples = _load_split(train)
dev_samples = _load_split(dev)
test_samples = _load_split(test)

In [None]:
import os
from utils.prepare_sequence import _prepare_sequence


def correct_samples(row):
    """Move stray leading punctuation to where it belongs in a snippet pair.

    * A leading ``,``, ``.``, ``!`` or ``?`` is removed from ``snippet_x``.
    * A leading ``,`` or ``.`` of ``snippet_y`` is appended to ``snippet_x`` and
      removed from ``snippet_y``.

    Empty snippets are left untouched instead of raising ``IndexError``; callers
    filter them out afterwards.

    Args:
        row: object with string attributes ``snippet_x`` and ``snippet_y``
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The same ``row`` object, modified in place.
    """
    # str.startswith is safe on empty strings, unlike indexing with [0].
    if row.snippet_x.startswith((',', '.', '!', '?')):
        row.snippet_x = row.snippet_x[1:].strip()
    if row.snippet_y.startswith((',', '.')):
        row.snippet_x += row.snippet_y[0]
        row.snippet_y = row.snippet_y[1:].strip()
    return row

def prepare_data(data, max_len=100):
    """Clean, de-duplicate and shuffle a table of snippet pairs.

    Steps:
      1. keep rows where both ``tokens_x`` and ``tokens_y`` are shorter than ``max_len``;
      2. rebuild ``snippet_x`` / ``snippet_y`` by joining the tokens with spaces;
      3. fix leading punctuation with ``correct_samples`` and drop empty snippets;
      4. pass both snippets through ``_prepare_sequence``;
      5. drop duplicated pairs, keeping the row with the highest ``relation``
         (a pair seen both as gold and as negative stays gold);
      6. shuffle with the module-level ``random_state`` and reset the index.

    Args:
        data: DataFrame with ``tokens_x``, ``tokens_y`` and ``relation`` columns.
        max_len: exclusive upper bound on the token count of each side.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    data = data[data.tokens_x.map(len) < max_len]
    data = data[data.tokens_y.map(len) < max_len]

    # assign() returns a new frame, so no column is written into a filtered
    # slice (which is what used to raise SettingWithCopyWarning).
    data = data.assign(
        snippet_x=data.tokens_x.map(' '.join),
        snippet_y=data.tokens_y.map(' '.join),
    )

    data = data.apply(correct_samples, axis=1)

    data = data[data.snippet_x.map(len) > 0]
    data = data[data.snippet_y.map(len) > 0]

    data = data.assign(
        snippet_x=data.snippet_x.map(_prepare_sequence),
        snippet_y=data.snippet_y.map(_prepare_sequence),
    )

    # Ascending sort + keep='last' keeps the row with the highest relation value.
    data = data.sort_values(['relation'], ascending=True).drop_duplicates(['snippet_x', 'snippet_y'], keep='last')
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return data


train_samples = prepare_data(train_samples)
dev_samples = prepare_data(dev_samples)
test_samples = prepare_data(test_samples)

OUT_PATH = 'data_structure'
# Portable replacement for the `! mkdir` shell escape; does not fail when the
# directory is already there, so the cell can be re-run.
os.makedirs(OUT_PATH, exist_ok=True)
train_samples.to_pickle(os.path.join(OUT_PATH, 'train_samples.pkl'))
dev_samples.to_pickle(os.path.join(OUT_PATH, 'dev_samples.pkl'))
test_samples.to_pickle(os.path.join(OUT_PATH, 'test_samples.pkl'))