## Preprocess dataset for the system introduced in https://www.aclweb.org/anthology/W19-2715/

In [None]:
! pip install -U git+https://github.com/IINemo/isanlp.git@discourse

### 1. Prepare dataset for model training and evaluation 

In [1]:
# Substrings to replace (keys) and their replacements (values): Latin letters
# mapped to the Cyrillic letters that look the same.
symbol_map = {'x': 'х', 'y': 'у'}


def prepare_token(token):
    """Return ``token`` with every key of ``symbol_map`` replaced by its value.

    The replacements are applied one after another, in the iteration order
    of ``symbol_map``.
    """
    normalized = token
    for source_symbol, target_symbol in symbol_map.items():
        normalized = normalized.replace(source_symbol, target_symbol)
    return normalized

In [12]:
def annot2tags(annot, edus):
    """Tag every token with whether it opens one of the given EDUs.

    The EDUs are consumed in order: a token is a candidate for the start of
    ``edus[cursor]`` when the (normalised) EDU text starts with the
    (normalised) token text.  Inside a sentence a candidate is accepted only
    if the text between the previously accepted boundary and the candidate
    equals the previous EDU, or if the token is the word "сначала".  The
    first candidate of each sentence is accepted unconditionally, because the
    matching state is reset at every sentence start.

    Both sides of every comparison go through ``prepare_token``.  Previously
    the text span was compared raw against a normalised EDU, so an EDU that
    contained one of the replaced letters could never be confirmed and the
    following boundary in the same sentence was lost.

    :param annot: mapping with ``'text'`` (str), ``'tokens'`` and
        ``'sentences'``; tokens expose character offsets ``begin``/``end``,
        sentences expose token indices ``begin``/``end``
    :param edus: EDU texts in document order
    :return: one list per sentence holding ``'BeginSeg=Yes'`` or ``'_'`` for
        each token of that sentence
    """
    text = annot['text']
    tokens = annot['tokens']
    tags = []
    cursor = 0  # index of the EDU whose first token is being looked for

    for sentence in annot['sentences']:
        sentence_tags = []
        # Matching state is local to the sentence.
        previous_first_token = 0
        previous_edu = ''

        for token in range(sentence.begin, sentence.end):
            is_first_token = False

            # Once every EDU has been placed, the remaining tokens are plain.
            if cursor < len(edus):
                tmp_edu = prepare_token(edus[cursor])
                original_text = prepare_token(text[tokens[token].begin:tokens[token].end])

                if tmp_edu.startswith(original_text):
                    if previous_edu:
                        # Text covered since the last accepted boundary,
                        # normalised the same way as the EDU it is compared to.
                        covered = prepare_token(
                            text[tokens[previous_first_token].begin:tokens[token].begin].strip())
                        accepted = covered == previous_edu or original_text.lower() == "сначала"
                    else:
                        accepted = True

                    if accepted:
                        is_first_token = True
                        previous_first_token = token
                        previous_edu = tmp_edu
                        cursor += 1

            sentence_tags.append('BeginSeg=Yes' if is_first_token else '_')

        tags.append(sentence_tags)

    return tags

In [13]:
from isanlp.utils.annotation_conll_converter import AnnotationCONLLConverter

# Shared converter instance; preprocess() calls it as converter(doc_id, annot)
# and iterates over the strings it yields.
converter = AnnotationCONLLConverter()

In [15]:
from utils.train_test_split import split_data
from glob import glob
from tqdm.autonotebook import tqdm
from utils.file_reading import read_annotation, read_edus
import re


# The two collections of file names that are later passed to preprocess().
# NOTE(review): split_data presumably also prints the per-genre proportions
# seen in the cell output - confirm in utils.train_test_split.
train, test = split_data('data/')
# Output files written by preprocess() for the train and the test split.
TRAIN_FILE = 'rus.rst.rrt_train.conll'
TEST_FILE = 'rus.rst.rrt_test.conll'
# Maximum number of tokens written for a single sentence.
MAX_LEN = 230


def preprocess(files, train=True):
    """Write one split of the segmentation dataset in CoNLL format.

    For every file name in ``files`` the ``.edus`` suffix is removed, the
    annotation and the EDUs are read, per-token tags are computed with
    ``annot2tags`` and the lines produced by ``converter`` are written to the
    output file with the tag appended as the last tab-separated column.

    Sentences longer than ``MAX_LEN`` tokens are truncated: the first
    ``MAX_LEN`` tokens are written, the sentence is closed with a blank line
    and its remaining tokens are skipped.  Processing then continues with the
    next sentence (previously the rest of the document was dropped).

    :param files: iterable of file names; a ``.edus`` suffix is stripped
    :param train: write to ``TRAIN_FILE`` when true, to ``TEST_FILE`` otherwise
    """
    print(f'preprocess {"train" if train else "test"} set')

    with open(TRAIN_FILE if train else TEST_FILE, 'w') as fo:
        for filename in tqdm(files):
            filename = filename.replace('.edus', '')
            annot = read_annotation(filename)
            edus = read_edus(filename)
            tags = annot2tags(annot, edus)

            sentence = 0  # index into ``tags``
            token = 0     # index into ``tags[sentence]``
            skip_rest_of_sentence = False  # set after a sentence was truncated

            for string in converter(filename.replace('data/', ''), annot):
                if string == '\n':
                    # Sentence separator.  A truncated sentence has already
                    # been closed with its own blank line.
                    if not skip_rest_of_sentence:
                        fo.write(string)
                    skip_rest_of_sentence = False
                    sentence += 1
                    token = 0
                    continue

                if string.startswith('# newdoc id ='):
                    fo.write(string + '\n')
                    continue

                if skip_rest_of_sentence:
                    continue

                if ' ' in string:
                    # NOTE(review): '.*' is greedy, so everything from the
                    # first space up to the last tab of the line is replaced
                    # by a single tab - confirm this is the intended trimming.
                    string = re.sub(r' .*\t', '\t', string)
                if 'www' in string:
                    string = re.sub(r'www[^\t]*', '_html_', string)
                if 'http' in string:
                    string = re.sub(r'http[^ \t]*', '_html_', string)
                if '[' in string:
                    # Bracketed numeric references such as "[1, 2]".
                    string = re.sub(r'\[(\d+[ ;,с\.]*)+\]', '_ref_', string)

                fo.write(string + '\t' + tags[sentence][token] + '\n')
                token += 1

                if token == MAX_LEN:
                    print(filename + ' ::: occured very long sentence; truncate to ' + str(MAX_LEN) + ' tokens.')
                    fo.write('\n')
                    skip_rest_of_sentence = True

# Write TRAIN_FILE from the train split, then TEST_FILE from the test split.
preprocess(train)
preprocess(test, train=False)

news in train: 0.3886792452830189,	in test: 0.3939393939393939
ling in train: 0.1509433962264151,	in test: 0.15151515151515152
comp in train: 0.1471698113207547,	in test: 0.15151515151515152
blog in train: 0.3132075471698113,	in test: 0.3181818181818182
preprocess train set


HBox(children=(IntProgress(value=0, max=265), HTML(value='')))

data/sci.ling_21 ::: occured very long sentence; truncate to 230 tokens.
data/sci.comp_8 ::: occured very long sentence; truncate to 230 tokens.
data/blogs_2 ::: occured very long sentence; truncate to 230 tokens.

preprocess test set


HBox(children=(IntProgress(value=0, max=67), HTML(value='')))

data/sci.ling_18 ::: occured very long sentence; truncate to 230 tokens.
data/sci.ling_28 ::: occured very long sentence; truncate to 230 tokens.
data/sci.comp_40 ::: occured very long sentence; truncate to 230 tokens.
data/sci.comp_54 ::: occured very long sentence; truncate to 230 tokens.



In [16]:
%%bash -s "$TRAIN_FILE" "$TEST_FILE"

# Abort on the first failed copy instead of silently continuing.
set -e

export TONY_PATH="../tony/"

# $1 / $2 are the train and test file names passed in by the cell magic.
cp "${1}" "${TONY_PATH}/data/rus.rst.rrt/${1}"
cp "${2}" "${TONY_PATH}/data/rus.rst.rrt/${2}"

In [15]:
%%bash -s "$TRAIN_FILE" "$TEST_FILE"

# Abort on the first failed copy instead of silently continuing.
set -e

# Keep a versioned copy of both generated files next to the originals.
cp "${1}" "${1}.002.backup"
cp "${2}" "${2}.002.backup"

### 2. Scripts for the pipeline integration 

In [None]:
import os
import numpy as np
from isanlp.annotation_rst import DiscourseUnit
from allennlp.predictors import Predictor


class AllenNLPSegmentator:
    """Splits a text into elementary discourse units with a tagging model.

    The model is loaded from ``<model_dir_path>/tony_segmentator/model.tar.gz``.
    Tokens whose predicted tag equals ``self._separator`` are treated as the
    left boundaries of the units.
    """

    # Positions of the annotation layers among the positional arguments of
    # __call__.  Only TEXT, TOKENS and SENTENCES are read by this class.
    TEXT = 0
    TOKENS = 1
    SENTENCES = 2
    LEMMA = 3
    POSTAG = 4
    SYNTAX_DEP_TREE = 5

    def __init__(self, model_dir_path):
        """
        :param model_dir_path: directory that contains ``tony_segmentator/model.tar.gz``
        """
        self._model_path = os.path.join(model_dir_path, 'tony_segmentator', 'model.tar.gz')
        self.predictor = Predictor.from_path(self._model_path)
        self._separator = 'U-S'  # predicted tag that marks the first token of a unit

    def __call__(self, *args, **kwargs):
        """
        :param args: annotation layers ordered as the class constants
            (text, tokens, sentences, ...); keyword arguments are ignored
        :return: list of DiscourseUnit
        """
        text = args[self.TEXT]
        tokens = args[self.TOKENS]
        boundaries = self._predict(tokens, args[self.SENTENCES])
        return self._build_discourse_units(text, tokens, boundaries)

    def _predict(self, tokens, sentences):
        """
        :param tokens: objects with a ``text`` attribute
        :param sentences: objects whose ``begin``/``end`` are token indices
        :return: numbers of tokens predicted as EDU left boundaries
        """
        tags = []
        for sentence in sentences:
            # The predictor receives the sentence as space-joined token texts.
            # NOTE(review): positions are only valid if it returns exactly one
            # tag per token - confirm against the predictor's tokenisation.
            sentence_text = ' '.join(token.text for token in tokens[sentence.begin:sentence.end])
            tags += self.predictor.predict(sentence_text)['tags']

        # The positions are collected in plain Python so that an empty tag
        # list yields an empty index array instead of relying on how numpy
        # compares an empty float array with a string.
        return np.array([position for position, tag in enumerate(tags) if tag == self._separator],
                        dtype=np.intp)

    def _build_discourse_units(self, text, tokens, numbers):
        """
        :param text: original text
        :param list tokens: isanlp.annotation.Token
        :param numbers: positions of tokens predicted as EDU left boundaries (beginners)
        :return: list of DiscourseUnit; empty when ``numbers`` is empty
        """
        edus = []
        count = len(numbers)
        if count == 0:
            return edus

        # Every unit but the last one extends to the start of the next unit.
        for i in range(count - 1):
            start = tokens[numbers[i]].begin
            next_start = tokens[numbers[i + 1]].begin
            edus.append(DiscourseUnit(i,
                                      start=start,
                                      end=next_start - 1,
                                      text=text[start:next_start],
                                      relation='elementary'))

        # The last unit extends to the end of the text.
        last_start = tokens[numbers[-1]].begin
        edus.append(DiscourseUnit(count - 1,
                                  start=last_start,
                                  end=len(text),
                                  text=text[last_start:],
                                  relation='elementary'))

        return edus

In [None]:
# Loads the model archive models/tony_segmentator/model.tar.gz.
segmentator = AllenNLPSegmentator('models')

In [None]:
from utils.file_reading import read_annotation, read_edus
# Sample document used to try the segmentator out.
annot = read_annotation('data/news1_1')

In [None]:
# The layers are passed positionally, in the order of the index constants of
# AllenNLPSegmentator; only text, tokens and sentences are read by it.
res = segmentator(annot['text'],
                  annot['tokens'], annot['sentences'], 
                  annot['lemma'], annot['postag'], 
                  annot['syntax_dep_tree'])

In [None]:
# Spot check: print the text of the units with indices 15..24.
for edu in res[15:25]:
    print(edu.text)

In [None]:
! cp -r models/tony_segmentator ../isanlp_rst/models/tony_segmentator

### Map UD tags to Penn Treebank tags

In [50]:
%%bash

set -e

export TONY_PATH=../tony/
export CONV_PATH="${TONY_PATH}/code/contextual_embeddings/conv2ner.py"

# Keep the first backup: re-running this cell after conv2ner.py has been
# overwritten must not replace the saved original with the modified script.
if [ ! -e "${CONV_PATH}.backup" ]; then
    cp "${CONV_PATH}" "${CONV_PATH}.backup"
fi

In [23]:
%%writefile ../tony/code/contextual_embeddings/conv2ner.py 

"""
Convert a file to the NER CoNLL format so that the allennlp dataset reader can be used.

Basically: skip the lines between documents, reduce every token to 4 fields
with the word as the first and the tag as the last one, and format the tags as BIO.

TODO: try BIOUL (L=last, U=unit entity = 1 token)
"""
import sys
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "filepath",
    help="path to file to convert",
)
parser.add_argument(
    "--lemmatize",
    action='store_true',
    default=False,
    help="to use with conll input: replace token with its lemma (useful for turk)",
)
parser.add_argument(
    "--mark-end",
    action='store_true',
    default=False,
    help="add explicit label for end of segment",
)
# Two values: a switch and a threshold (see the default).
parser.add_argument(
    "--split-too-long",
    nargs=2,
    default=[False,180],
    help="split sentences longer than threshold",
)
parser.add_argument(
    "--input-format",
    default="tok",
    help="input format: tok, split.tok, conll",
)

args = parser.parse_args()

# Labels found in the last column of the input -> tags written to the output.
maptags = {
    "_": "O",
    "BeginSeg=Yes": "B-S",
    "Seg=B-Conn": "B-Conn",
    "Seg=I-Conn": "I-Conn",
    "SpaceAfter=No": "O",
    "Typo=Yes": "O",
}

# Direct tag-to-tag replacements.  VERB and PUNCT are deliberately absent:
# convert_tag resolves them through verb_variants and punct_map.
tags_map = {
    'NOUN': 'NN',
    'PROPN': 'NNP',
    'PRON': 'EX',
    'ADV': 'RB',
    'ADP': 'RP',
    'CCONJ': 'CC',
    'ADJ': 'JJ',
    'AUX': 'VB',
    'SCONJ': 'CC',
    'DET': 'DT',
    'PART': 'TO',
    'NUM': 'CD',
    'SYM': 'SYM',
    'X': 'LS',
}

# Tags chosen by the lemma of the token (punctuation marks).
punct_map = {
    "''": "''",
    "'": "''",
    '"': "''",
    "«": "''",
    "»": "''",
    ",": ",",
    ".": ".",
    "?": ".",
    "!": ".",
    "--": ":",
    ":": ":",
    ";": ":",
    "...": ":",
    "-": "HYPH",
    "(": "-LRB-",
    ")": "-RRB-",
    "[": "[",
    "]": "]",
}

# (substring of the morphological features, resulting tag), tried in order.
verb_variants = [
    ("Tense=Past|VerbForm=Fin", "VBD"),
    ("Tense=Past|VerbForm=Part", "VBN"),
    ("Tense=Pres|VerbForm=Part", "VBG"),
    ("Tense=Pres|VerbForm=Fin", "VBP"),
    ("Tense=Imp|VerbForm=Fin", "VB"),
]


def convert_tag(tag, lemma, morph):
    """Translate one token's tag using the tables above.

    Lookup order: ``tags_map`` by ``tag``, then ``punct_map`` by ``lemma``,
    then the first entry of ``verb_variants`` whose feature string occurs in
    ``morph``.  If nothing matches, ``"VB"`` is returned for the tag
    ``"VERB"`` and ``'NN'`` for everything else.

    :param tag: tag to translate
    :param lemma: lemma of the token, used for the punctuation lookup
    :param morph: morphological feature string of the token
    :return: the translated tag
    """
    direct = tags_map.get(tag) or punct_map.get(lemma)
    if direct:
        return direct

    by_morphology = next(
        (penn_tag for features, penn_tag in verb_variants if features in morph),
        None,
    )
    if by_morphology is not None:
        return by_morphology

    return "VB" if tag == "VERB" else 'NN'

# 
# relabel the token before each segment start as "B-E" (explicit segment end)
MARK_END = args.mark_end
# take lemmas instead of token forms (useful for turkish)
# also tag all proper nouns with same token
LEMMATIZE = args.lemmatize
# split for too long sentences (default 180) for bert
# Values given on the command line arrive as strings, so a literal "False"
# would be truthy; explicit "off" spellings are therefore mapped to False.
# Every other string keeps enabling the mode, as before.
_split_flag = args.split_too_long[0]
if isinstance(_split_flag, str):
    _split_flag = _split_flag.strip().lower() not in ("", "0", "false", "no", "off")
SPLIT_TOO_LONG = _split_flag
THRESHOLD = int(args.split_too_long[1])

filepath = args.filepath

input_format = args.input_format


if SPLIT_TOO_LONG:
    print("warning: too-long sentence splitting mode = ON ",file=sys.stderr)


with open(filepath) as f:
    # True right after a line without a tab (blank line or comment)
    start_doc = False
    # output rows; an empty list is printed as an empty line
    res = []
    for line in f:
        if "\t" not in line:
            res.append([])
            start_doc = True
            continue

        fields = line.strip().split()
        # the part before "-" in the first field is the token number
        token_number = int(fields[0].split("-")[0])
        if SPLIT_TOO_LONG and token_number>THRESHOLD:
            # sentence too long: insert a newline to make a separate sequence
            # NOTE(review): this fires for every token past the threshold - confirm intended
            res.append([])
        w = fields[2] if LEMMATIZE else fields[1]
        label = fields[-1].split("|")[0]
        if input_format=="conll" and LEMMATIZE and fields[3]=="PROPN":
            w = "NAME"
        pos = convert_tag(fields[3], fields[2], fields[5])
        tag = maptags.get(label,"O")
        # res[-1] may be missing or empty (first line of the file, or a row
        # inserted by the too-long split), so check before indexing into it
        if not(start_doc) and MARK_END and tag=="B-S" and res and res[-1] and res[-1][-1]!="B-S":
            # then, previous token label is set to B-E to signal end of previous segment
            res[-1][-1] = "B-E"
        start_doc = False
        if label not in maptags:
            print("warning, strange label ",label,file=sys.stderr)
        res.append([w,pos,"O",tag])

    for line in res:
        print("\t".join(line))


Overwriting ../tony/code/contextual_embeddings/conv2ner.py
