## Preprocess the dataset for the ToNy segmenter introduced in https://www.aclweb.org/anthology/W19-2715/

In [None]:
# Install (or upgrade) isanlp from the `discourse` branch of its GitHub repository.
! pip install -U git+https://github.com/IINemo/isanlp.git@discourse

### 1. Prepare dataset for model training and evaluation 

In [None]:
def annot2tags(annot, edus):
    """Build per-sentence lists of EDU-boundary tags for every token.

    The EDUs are consumed in order.  A token is tagged ``'BeginSeg=Yes'`` when
    it is the first token of its sentence, or when the text of the next
    unconsumed EDU starts with the token's surface text; every other token is
    tagged ``'_'``.  Once all EDUs are consumed, every remaining token
    (sentence-initial ones included) is tagged ``'_'``.

    :param annot: mapping with ``'text'`` (str), ``'tokens'`` and ``'sentences'``;
        tokens expose character offsets ``begin``/``end`` into the text,
        sentences expose token offsets ``begin``/``end``
    :param list edus: EDU texts in document order
    :return: list (one item per sentence) of lists of tags (one per token)
    """
    tags = []
    cursor = 0  # index of the next EDU that is still waiting for its first token

    for sentence in annot['sentences']:
        sentence_tags = []

        for position in range(sentence.begin, sentence.end):
            is_first_token = False

            if cursor < len(edus):
                if position == sentence.begin:
                    # A sentence start always opens a new EDU; the EDU text is not checked here.
                    is_first_token = True
                else:
                    token = annot['tokens'][position]
                    original_text = annot['text'][token.begin:token.end]
                    is_first_token = edus[cursor].startswith(original_text)

                if is_first_token:
                    cursor += 1

            sentence_tags.append('BeginSeg=Yes' if is_first_token else '_')

        tags.append(sentence_tags)

    return tags

In [None]:
from isanlp.utils.annotation_conll_converter import AnnotationCONLLConverter

# Shared converter instance; preprocess() iterates over converter(name, annot)
# to obtain the output lines of a document as strings.
converter = AnnotationCONLLConverter()

In [None]:
from utils.train_test_split import split_data
from glob import glob
from tqdm.autonotebook import tqdm
from utils.file_reading import read_annotation, read_edus
import re


# Project helper; both results are iterated as file names by preprocess().
# NOTE(review): presumably a train/test split of the files under data/ - confirm in utils.train_test_split.
train, test = split_data('data/')
# Output files written by preprocess() for the train and the test part.
TRAIN_FILE = 'rus.rst.rrt_train.conll'
TEST_FILE = 'rus.rst.rrt_test.conll'
# Sentence length limit (in tokens) checked by preprocess().
MAX_LEN = 220


def preprocess(files, train=True):
    """Write the given documents to one CoNLL-style file with segmentation tags.

    For every file the annotation and the gold EDUs are read with the project
    helpers, per-token tags are computed with ``annot2tags`` and appended as the
    last tab-separated column of every token line yielded by ``converter``.
    Sentences longer than ``MAX_LEN`` tokens are truncated to ``MAX_LEN`` tokens;
    the following sentences of the document are still written.

    :param files: iterable of file names; every ``'.edus'`` substring is removed
        from a name before it is passed to the readers
    :param bool train: write to ``TRAIN_FILE`` if true, otherwise to ``TEST_FILE``
    """
    print(f'preprocess {"train" if train else "test"} set')

    # Compiled once instead of being looked up for every token line.
    # Removes everything from the first space up to the last tab of the line.
    # NOTE(review): if more than one tab follows the space, the columns in between
    # are removed as well - confirm against the converter's line format.
    space_tail = re.compile(r' .*\t')
    www_link = re.compile(r'www[^ \t]*')
    http_link = re.compile(r'http[^ \t]*')

    # Explicit encoding: the platform default may not be able to represent the corpus text.
    with open(TRAIN_FILE if train else TEST_FILE, 'w', encoding='utf-8') as fo:
        for filename in tqdm(files):
            filename = filename.replace('.edus', '')
            annot = read_annotation(filename)
            edus = read_edus(filename)
            tags = annot2tags(annot, edus)

            # Position of the current token line inside ``tags``.
            sentence = 0
            token = 0

            for string in converter(filename.replace('data/', ''), annot):
                if string == '\n':
                    # Sentence boundary: always written, so a truncated sentence
                    # is still terminated by its blank line.
                    fo.write(string)
                    sentence += 1
                    token = 0

                elif string.startswith('# newdoc id ='):
                    fo.write(string + '\n')

                elif token < MAX_LEN:
                    if ' ' in string:
                        string = space_tail.sub('\t', string)
                    if 'www' in string:
                        string = www_link.sub('_html_', string)
                    if 'http' in string:
                        string = http_link.sub('_html_', string)
                    fo.write(string + '\t' + tags[sentence][token] + '\n')
                    token += 1

                else:
                    # Over-long sentence: skip the remaining tokens up to the next
                    # sentence boundary instead of abandoning the whole document.
                    if token == MAX_LEN:
                        # Report once per truncated sentence.
                        print(filename + ' ::: occured very long sentence; truncate to ' + str(MAX_LEN) + ' tokens.')
                    token += 1

# Write the train part to TRAIN_FILE and the test part to TEST_FILE.
preprocess(train)
preprocess(test, train=False)

In [None]:
%%bash -s "$TRAIN_FILE" "$TEST_FILE"

# Root of the tony directory the generated files are copied into.
export TONY_PATH="../tony/"

# $1 and $2 are the train/test file names passed through `-s` on the %%bash line.
# Expansions are quoted so names with spaces or glob characters are not split or expanded.
cp "${1}" "${TONY_PATH}/data/rus.rst.rrt/${1}"
cp "${2}" "${TONY_PATH}/data/rus.rst.rrt/${2}"

### 2. Scripts for pipeline integration

In [None]:
import os
import numpy as np
from isanlp.annotation_rst import DiscourseUnit
from allennlp.predictors import Predictor


class AllenNLPSegmentator:
    """Splits a text into elementary discourse units with an AllenNLP predictor.

    Instances are callable; positional arguments are read by the index
    constants below (only TEXT, TOKENS and SENTENCES are used).
    """

    # Positions of the annotations in the positional arguments of __call__.
    TEXT = 0
    TOKENS = 1
    SENTENCES = 2
    LEMMA = 3
    POSTAG = 4
    SYNTAX_DEP_TREE = 5

    def __init__(self, model_dir_path):
        """
        :param str model_dir_path: directory that contains ``tony_segmentator/model.tar.gz``
        """
        self._model_path = os.path.join(model_dir_path, 'tony_segmentator', 'model.tar.gz')
        self.predictor = Predictor.from_path(self._model_path)
        # Tag value that marks a token as the first token of an EDU.
        self._separator = 'U-S'

    def __call__(self, *args, **kwargs):
        """
        :param args: annotations in the order given by the class constants
        :param kwargs: accepted but not used
        :return: list of DiscourseUnit
        """
        tokens = args[self.TOKENS]
        boundaries = self._predict(tokens, args[self.SENTENCES])
        return self._build_discourse_units(args[self.TEXT], tokens, boundaries)

    def _predict(self, tokens, sentences):
        """
        :param list tokens: tokens of the whole text; each has a ``text`` attribute
        :param list sentences: sentences with token offsets ``begin``/``end``
        :return: numbers of tokens predicted as EDU left boundaries (1-D integer array)
        """
        tags = []
        for sentence in sentences:
            # The predictor receives the sentence as space-joined token texts.
            # NOTE(review): positions are only right if it returns exactly one tag per token - confirm.
            sentence_text = ' '.join(token.text for token in tokens[sentence.begin:sentence.end])
            tags.extend(self.predictor.predict(sentence_text)['tags'])

        # Built directly from the tag list so that an empty input yields an empty
        # 1-D array instead of depending on how NumPy compares an empty float
        # array with a string.
        return np.array([position for position, tag in enumerate(tags) if tag == self._separator],
                        dtype=np.intp)

    def _build_discourse_units(self, text, tokens, numbers):
        """
        :param text: original text
        :param list tokens: isanlp.annotation.Token
        :param numbers: positions of tokens predicted as EDU left boundaries (beginners)
        :return: list of DiscourseUnit
        """
        if len(numbers) == 0:
            return []

        edus = []
        last = len(numbers) - 1

        # Every EDU except the last one spans up to the first token of the next EDU.
        for i in range(last):
            start = tokens[numbers[i]].begin
            next_start = tokens[numbers[i + 1]].begin
            edus.append(DiscourseUnit(i,
                                      start=start,
                                      end=next_start - 1,
                                      text=text[start:next_start],
                                      relation='elementary'))

        # The last EDU runs to the end of the text; its id is computed explicitly
        # rather than taken from the loop variable, which is unset for a single boundary.
        start = tokens[numbers[-1]].begin
        edus.append(DiscourseUnit(last,
                                  start=start,
                                  end=len(text),
                                  text=text[start:],
                                  relation='elementary'))

        return edus

In [None]:
# Loads the predictor from models/tony_segmentator/model.tar.gz.
segmentator = AllenNLPSegmentator('models')

In [None]:
from utils.file_reading import read_annotation, read_edus
# Annotation of a single document, used below to try the segmentator by hand.
annot = read_annotation('data/news1_1')

In [None]:
# Positional order follows the index constants of AllenNLPSegmentator
# (TEXT, TOKENS, SENTENCES, LEMMA, POSTAG, SYNTAX_DEP_TREE).
res = segmentator(*(
    annot[key]
    for key in ('text', 'tokens', 'sentences', 'lemma', 'postag', 'syntax_dep_tree')
))

In [None]:
# Print the texts of the predicted units 15..24 as a quick visual check.
for unit in res[15:25]:
    print(unit.text)

In [None]:
# Copy the segmentator model directory into the isanlp_rst models directory.
# NOTE(review): if the destination directory already exists, `cp -r` places the copy
# inside it as a nested tony_segmentator/ - confirm the target is absent before running.
! cp -r models/tony_segmentator ../isanlp_rst/models/tony_segmentator