## Tree building evaluation on gold EDUs (mostly) and playground for tree building scripts

1. Modifications of library components for tree building
2. Scripts for testing and evaluating Sklearn-, AllenNLP- and gold-annotation-based RST parsers on a manually segmented corpus

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from utils.print_tree import printBTree
#from utils.rst_annotation import DiscourseUnit

import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')

In [None]:
from isanlp.annotation_rst import DiscourseUnit

In [None]:
def printTree(tree):
    """Pretty-print a binary discourse tree with ``printBTree``.

    Inner nodes (those with a truthy ``relation``) are labelled with the
    relation and its probability rounded to two decimals; leaves are
    labelled with their text.

    Args:
        tree: root node exposing ``relation``, ``proba``, ``text``, ``left``
            and ``right`` attributes.

    Returns:
        Whatever ``printBTree`` returns.
    """
    def _describe(node):
        # printBTree expects a (label, left child, right child) triple per node.
        if node.relation:
            label = (node.relation, "%.2f" % (node.proba))
        else:
            label = node.text
        return str(label), node.left, node.right

    # The root has to be handed over together with the node-info callback.
    # NOTE(review): assumes the printBTree(node, nodeInfo) signature - confirm
    # against utils.print_tree.
    return printBTree(tree, _describe)

In [None]:
class DiscourseUnitCreator:
    """Factory of parent ``DiscourseUnit`` nodes with sequential ids.

    Args:
        id: last id already in use; the first created node gets ``id + 1``.
    """

    def __init__(self, id):
        self.id = id

    def __call__(self, left_node, right_node, proba):
        """Create a node joining ``left_node`` and ``right_node``.

        Args:
            left_node: left child.
            right_node: right child.
            proba: probability stored on the new node.

        Returns:
            A ``DiscourseUnit`` with the next free id and ``relation=1``.
        """
        self.id += 1
        return DiscourseUnit(
            id=self.id,  # the instance counter, not the builtin ``id`` function
            left=left_node,
            right=right_node,
            relation=1,
            proba=proba
        )

In [None]:
! mv ../isanlp_rst/models/structure_predictor_lstm backup_simple_structure_predictor

In [None]:
! mkdir ../isanlp_rst/models/structure_predictor_lstm

In [None]:
! cp -r models/structure_predictor_lstm/result_42/model.tar.gz ../isanlp_rst/models/structure_predictor_lstm/

In [None]:
! cp -r models/customization_package ../isanlp_rst/models/

In [None]:
from models.customization_package.model.custom_bimpm_predictor import CustomBiMPMPredictor

In [None]:
# Load the trained structure predictor from its archive; `pr` is reused by the
# smoke-test cells below.
pr = CustomBiMPMPredictor.from_path('models/structure_predictor_lstm/result_42/model.tar.gz', 
                                    predictor_name='custom_bimpm_predictor')

In [None]:
# Smoke test: score a single premise/hypothesis pair passed as a JSON-like dict.
pr.predict_json({"premise":"В мировой парламентской практике есть масса примеров того , как небольшая партия становилась , по сути , самой главной ,",
                "hypothesis":"поскольку именно её немногочисленные голоса обеспечивали решающее большинство при вступлении в коалицию с одной из крупных партий .",
                "metadata":"1"})

In [None]:
# Two-example batch for the batch prediction check below.
# NOTE(review): "metadata" presumably carries the expected label
# ("1" = related pair, "0" = unrelated pair) - confirm against the predictor.
tmp = [{"premise":"В мировой парламентской практике есть масса примеров того , как небольшая партия становилась , по сути , самой главной ,",
                       "hypothesis":"поскольку именно её немногочисленные голоса обеспечивали решающее большинство при вступлении в коалицию с одной из крупных партий .",
                       "metadata":"1"},
                      {"premise":"Именно её немногочисленные голоса обеспечивали решающее большинство при вступлении в коалицию с одной из крупных партий .",
                       "hypothesis":"Следующая новость - про носорогов.",
                       "metadata":"0"}]

In [None]:
# Smoke test: score the whole batch in one call.
pr.predict_batch_json(tmp)

In [None]:
# Smoke test: the keyword-argument entry point with the same pair as above.
pr.predict(premise="В мировой парламентской практике есть масса примеров того , как небольшая партия становилась , по сути , самой главной ,",
                             hypothesis="поскольку именно её немногочисленные голоса обеспечивали решающее большинство при вступлении в коалицию с одной из крупных партий .",
                             metadata="1")

In [None]:
# NOTE(review): scratch cell. It rebinds `pr` to the predictor *class*, and no
# visible cell imports a top-level name `customization_package` (only
# `models.customization_package...` is imported), so this looks like it raises
# NameError - confirm before running.
pr = customization_package.model.custom_bimpm_predictor.CustomBiMPMPredictor

In [None]:
from allennlp.predictors import Predictor

# Load the same archive through the generic AllenNLP entry point; rebinds `pr`.
# NOTE(review): 'custom_bimpm_predictor' must already be registered, presumably
# by the earlier import of the customization package - confirm.
pr = Predictor.from_path('models/structure_predictor_lstm/result_42/model.tar.gz', predictor_name='custom_bimpm_predictor')

In [None]:
from isanlp_rst.src.isanlp_rst.sklearn_classifier import SklearnClassifier
from isanlp_rst.src.isanlp_rst.allennlp_classifier import AllenNLPClassifier
from isanlp_rst.src.isanlp_rst.allennlp_classifier_custom_bimpm import AllenNLPClassifier as LargeAllenNLPClassifier
from isanlp_rst.src.isanlp_rst.rst_tree_predictor import *
from isanlp_rst.src.isanlp_rst.greedy_rst_parser import GreedyRSTParser
from isanlp_rst.src.isanlp_rst.features_extractor import FeaturesExtractor
from isanlp_rst.src.isanlp_rst.features_processor_tokenizer import FeaturesProcessor

In [None]:
from utils.train_test_split import split_train_dev_test

# Corpus split; `test` is iterated as a collection of file paths by the cells below.
train, dev, test = split_train_dev_test('./data')

# Evaluation (Parser)

In [None]:
# Structure (span) predictor variants:
#   (classifier class, model directory name,
#    confidence threshold used for the paragraph-level parser,
#    confidence threshold used for the document-level parser)
_SPAN_PREDICTOR = {
    'lstm': (LargeAllenNLPClassifier, 'structure_predictor_lstm', 0.1, 0.5),
    'ensemble': (SklearnClassifier, 'structure_predictor', 0.15, 0.2),
}

# Relation label predictor variants: (classifier class, model directory name).
_LABEL_PREDICTOR = {
    'lstm': (AllenNLPClassifier, 'label_predictor_lstm'),
    'ensemble': (SklearnClassifier, 'label_predictor'),
}

In [None]:
# Structure classifier, read from the directory the shell cells above copied
# the trained archive into, and the relation label classifier.
binary_classifier = LargeAllenNLPClassifier('../isanlp_rst/models/structure_predictor_lstm/')
label_classifier = AllenNLPClassifier('../isanlp_rst/models/label_predictor_lstm/')

features_processor = FeaturesProcessor(model_dir_path='models', verbose=False)
features_extractor = FeaturesExtractor(features_processor)

# One tree predictor shared by all parsers; no separate sentence-level
# structure predictor is supplied.
predictor = LargeNNTreePredictor(features_processor=features_extractor, 
                            relation_predictor_sentence=None,
                            relation_predictor_text=binary_classifier, 
                            label_predictor=label_classifier)

# Paragraph-level parser: uses the third field of the 'lstm' entry (0.1).
paragraph_parser = GreedyRSTParser(predictor,
                                   confidence_threshold=_SPAN_PREDICTOR['lstm'][2])

# Document-level parser: uses the fourth field of the 'lstm' entry (0.5).
document_parser = GreedyRSTParser(predictor,
                                  confidence_threshold=_SPAN_PREDICTOR['lstm'][3])

In [None]:
LargeNNTreePredictor??

In [None]:
# Second-pass document parser with a threshold 0.15 below the document-level
# one; the evaluation cell applies it when the first pass leaves too many trees.
additional_document_parser = GreedyRSTParser(predictor,
                                             confidence_threshold=_SPAN_PREDICTOR['lstm'][3]-0.15)

In [None]:
from isanlp.annotation import Sentence

def split_by_paragraphs(annot_text, annot_tokens, annot_sentences, annot_lemma, annot_morph, annot_postag,
                        annot_syntax_dep_tree):
    """Split document-level annotations into per-paragraph chunks.

    A paragraph boundary is placed after every token that is followed by a
    newline (looking at the text between it and the next token).

    Args:
        annot_text: raw document text.
        annot_tokens: tokens exposing ``begin`` / ``end`` character offsets.
        annot_sentences: accepted for interface symmetry; not read here,
            sentences are rebuilt per chunk.
        annot_lemma, annot_morph, annot_postag, annot_syntax_dep_tree:
            per-sentence lists with one item per token.

    Returns:
        A list of dicts with keys ``text``, ``tokens``, ``sentences``,
        ``lemma``, ``morph``, ``postag`` and ``syntax_dep_tree``; sentence
        offsets are token indices relative to the start of the chunk.
    """

    def split_on_two(sents, boundary):
        """Cut a list of per-sentence lists after ``boundary`` items in total.

        The sentence containing the cut is divided between both halves, so
        either half may contain an empty list at the cut.
        """
        list_sum = lambda l: sum([len(sublist) for sublist in l])

        # Smallest i whose first i sentences hold at least `boundary` items
        # (or all sentences if there are not enough items).
        i = 1
        while list_sum(sents[:i]) < boundary and i < len(sents):
            i += 1

        # Offset of the cut inside sentence i - 1, capped at its length.
        intersentence_boundary = min(len(sents[i - 1]), boundary - list_sum(sents[:i - 1]))
        return (sents[:i - 1] + [sents[i - 1][:intersentence_boundary]],
                [sents[i - 1][intersentence_boundary:]] + sents[i:])

    def recount_sentences(chunk):
        """Drop empty sentences and rebuild chunk-relative ``Sentence`` spans.

        Emptiness is decided on ``chunk['syntax_dep_tree']``; the other
        per-sentence lists are filtered by the same indices. The chunk is
        modified in place and returned.
        """
        sentences = []
        lemma = []
        morph = []
        postag = []
        syntax_dep_tree = []
        tokens_cursor = 0

        for i, sent in enumerate(chunk['syntax_dep_tree']):
            if len(sent) > 0:
                sentences.append(Sentence(tokens_cursor, tokens_cursor + len(sent)))
                lemma.append(chunk['lemma'][i])
                morph.append(chunk['morph'][i])
                postag.append(chunk['postag'][i])
                syntax_dep_tree.append(chunk['syntax_dep_tree'][i])
                tokens_cursor += len(sent)

        chunk['sentences'] = sentences
        chunk['lemma'] = lemma
        chunk['morph'] = morph
        chunk['postag'] = postag
        chunk['syntax_dep_tree'] = syntax_dep_tree

        return chunk

    chunks = []
    prev_right_boundary = -1  # index of the last token of the previous chunk; -1 before the first one

    for i, token in enumerate(annot_tokens[:-1]):

        # A newline between this token and the next one closes the current paragraph.
        if '\n' in annot_text[token.end:annot_tokens[i + 1].begin]:
            if prev_right_boundary > -1:
                chunk = {
                    'text': annot_text[annot_tokens[prev_right_boundary].end:token.end + 1].strip(),
                    'tokens': annot_tokens[prev_right_boundary + 1:i + 1]
                }
            else:
                chunk = {
                    'text': annot_text[:token.end + 1].strip(),
                    'tokens': annot_tokens[:i + 1]
                }

            # i - prev_right_boundary is the number of tokens in this chunk;
            # each annot_* name is rebound to the not-yet-consumed remainder.
            lemma, annot_lemma = split_on_two(annot_lemma, i - prev_right_boundary)
            morph, annot_morph = split_on_two(annot_morph, i - prev_right_boundary)
            postag, annot_postag = split_on_two(annot_postag, i - prev_right_boundary)
            syntax_dep_tree, annot_syntax_dep_tree = split_on_two(annot_syntax_dep_tree, i - prev_right_boundary)

            chunk.update({
                'lemma': lemma,
                'morph': morph,
                'postag': postag,
                'syntax_dep_tree': syntax_dep_tree,
            })
            chunks.append(recount_sentences(chunk))

            prev_right_boundary = i  # index of the last token in the chunk just closed

    # Whatever remains after the last boundary forms the final chunk.
    # NOTE(review): if no boundary was found, prev_right_boundary is still -1,
    # so 'text' starts after the *last* token while 'tokens' holds every
    # token - confirm callers only pass texts with a newline between tokens.
    chunk = {
        'text': annot_text[annot_tokens[prev_right_boundary].end:].strip(),
        'tokens': annot_tokens[prev_right_boundary + 1:],
        'lemma': annot_lemma,
        'morph': annot_morph,
        'postag': annot_postag,
        'syntax_dep_tree': annot_syntax_dep_tree,
    }

    chunks.append(recount_sentences(chunk))
    return chunks

In [None]:
# Accumulator for per-file evaluation results; a list, because the evaluation
# cell appends to it.
cache = []

In [None]:
def split_by_paragraphs_edus(edus, text):
    """Group consecutive EDUs into paragraphs.

    A paragraph is closed after every EDU that is immediately followed by a
    newline character in ``text``.

    Args:
        edus: EDU strings in document order.
        text: the document text the EDUs were taken from.

    Returns:
        A list of paragraphs, each a non-empty list of EDU strings.
    """
    paragraphs = []
    current = []
    cursor = 0  # end offset of the furthest EDU located so far

    for edu in edus:
        current.append(edu)

        # Search forward from the previous EDU so that a repeated EDU text is
        # matched at its own position, not at its first occurrence.
        start = text.find(edu, cursor)
        if start == -1:
            # Out-of-order EDU: fall back to the first occurrence anywhere.
            start = text.find(edu)
        if start == -1:
            # The EDU is absent from the text; no boundary can be decided.
            continue

        end = start + len(edu)
        cursor = max(cursor, end)
        if end < len(text) and text[end] == '\n':
            paragraphs.append(current)
            current = []

    if current:
        paragraphs.append(current)
    return paragraphs

In [None]:
cache = []

In [None]:
def prepare_gold_pairs(gold_pairs):
    """Normalise gold relation labels to the label set used for evaluation.

    The frame is modified in place: ``category_id`` and ``order`` are
    rewritten and a ``relation`` column of the form ``<name>_<order>`` is
    added.

    Args:
        gold_pairs: DataFrame with ``category_id`` (labels such as
            ``cause_r``) and ``order`` (``NS`` / ``SN`` / ``NN``) columns.

    Returns:
        The same DataFrame object.
    """
    label_col = 'category_id'

    def _drop_last_char(label):
        # 'cause-effect_r' -> 'cause-effect_': the trailing underscore is
        # kept so the order can be appended directly.
        return label[:-1]

    def _name_of(relation):
        return relation.split('_')[0]

    def _order_of(relation):
        return relation.split('_')[1]

    # Rows whose label / order is the number 0.0 become same-unit / NN.
    gold_pairs[label_col] = gold_pairs[label_col].replace([0.0], 'same-unit_m')
    gold_pairs['order'] = gold_pairs['order'].replace([0.0], 'NN')

    # Merge fine-grained categories before the order is attached.
    category_merges = (
        (['antithesis_r',], 'contrast_m'),
        (['cause_r', 'effect_r'], 'cause-effect_r'),
        (['conclusion_r',], 'restatement_m'),
        (['evaluation_r'], 'interpretation-evaluation_r'),
        (['motivation_r',], 'condition_r'),
    )
    for sources, target in category_merges:
        gold_pairs[label_col] = gold_pairs[label_col].replace(sources, target)

    gold_pairs['relation'] = gold_pairs[label_col].map(_drop_last_char) + gold_pairs['order']

    # Order-aware merges; applied sequentially, so the order of entries matters.
    relation_merges = (
        (['restatement_SN', 'restatement_NS'], 'restatement_NN'),
        (['contrast_SN', 'contrast_NS'], 'contrast_NN'),
        (['solutionhood_NS', 'preparation_NS'], 'elaboration_NS'),
        (['concession_SN', 'evaluation_SN', 'elaboration_SN', 'evidence_SN'], 'preparation_SN'),
        ('background_NS', 'elaboration_NS'),
        ('background_SN', 'preparation_SN'),
        ('comparison_NN', 'contrast_NN'),
        ('interpretation-evaluation_SN', 'elaboration_NS'),
        ('evidence_NS', 'elaboration_NS'),
        ('restatement_NN', 'joint_NN'),
        ('sequence_NN', 'joint_NN'),
    )
    for sources, target in relation_merges:
        gold_pairs['relation'] = gold_pairs['relation'].replace(sources, target)

    # Write the final name and order back into their own columns.
    gold_pairs['order'] = gold_pairs['relation'].map(_order_of)
    gold_pairs[label_col] = gold_pairs['relation'].map(_name_of)

    return gold_pairs

In [None]:
cache = []

In [None]:
test

### Find EDUs that span multiple paragraphs and add them to the exceptions

In [None]:
from tqdm import tqdm_notebook as tqdm
from utils.file_reading import *
from utils.evaluation import extr_pairs, extr_pairs_forest


# Bookkeeping list and handy sample paths for interactive runs; none of them
# is read in this cell.
broken_files = []
smallest_file = 'data/news2_4.edus'
coolest_file = 'data/blogs_17.edus'
shit = 'data/blogs_99.edus'
#test[:1]
# For every test file, report each gold EDU that cannot be found verbatim in
# the annotated text once the known line breaks have been repaired.
for file in tqdm(test):
    filename = '.'.join(file.split('.')[:-1])  # strip the file extension
    edus = read_edus(filename)
    #gold = read_gold(filename)
    gold = prepare_gold_pairs(read_gold(filename, features=True))
    
    annot = read_annotation(filename)
    
    # Known line breaks that fall inside a gold EDU: the leading newline of
    # each fragment is replaced with a space.
    for missegmentation in ("\nIMG", 
                            "\nгимнастический коврик;",
                            "\nгантели или бутылки с песком;",
                            "\nнебольшой резиновый мяч;",
                            "\nэластичная лента (эспандер);",
                            "\nхула-хуп (обруч).",
                            "\n200?",
                            "\n300?",
                            "\nНе требуйте странного.",
                            "\nИспользуйте мою модель.",
                            '\n"А чего вы от них требуете?"',
                            '\n"Решить проблемы с тестерами".',
                            "\nКак гончая на дичь.", "\nИ крупная.",
                            "\nВ прошлом году компания удивила рынок",
                            "\nЧужой этики особенно.",
                            "\nНо и своей тоже.",
                            "\nАэропорт имени,",
                            "\nА вот и монголы.",
                            "\nЗолотой Будда.", 
                            "\nДворец Богдо-Хана.",
                            "\nПлощадь Сухэ-Батора.",
                            "\nОдноклассники)",
                            "\nВечерняя площадь.",
                            "\nТугрики.",
                            "\nВнутренние монголы.",
                            "\nВид сверху.",
                            "\nНациональный парк Тэрэлж. IMG IMG",
                            '\nГора "Черепаха".',
                            "\nПуть к медитации.",
                            "\nЖить надо высоко,",
                            "\nЧан с кумысом.",
                            "\nЖилая юрта.",
                            "\nКумыс.",
                            "\nТрадиционное занятие монголов",
                            "\nДвугорбый верблюд мало где",
                            "\nМонгол Шуудан переводится",
                            "\nОвощные буузы.",
                            "\nЗнаменитый чай!"
                            ):
        annot['text'] = annot['text'].replace(missegmentation, ' '+missegmentation[1:])

    # Any EDU still missing from the text is a candidate for the list above.
    for edu in edus:
        if annot['text'].find(edu) == -1:
            print(f'::: {filename} ::: {edu}')

### Evaluate on test

In [None]:
cache = []

In [None]:
from tqdm import tqdm_notebook as tqdm
from utils.file_reading import *
from utils.evaluation import *


# Bookkeeping list and handy sample paths for interactive runs.
broken_files = []
smallest_file = 'data/news2_4.edus'
weirdest_file = 'data/blogs_39.edus'

# Parse every listed file from its gold EDUs (paragraphs first, then the whole
# document) and append the evaluation result to `cache`.
for file in tqdm([weirdest_file]):
    filename = '.'.join(file.split('.')[:-1])  # strip the file extension
    edus = read_edus(filename)
    gold = prepare_gold_pairs(read_gold(filename, features=True))
    annot = read_annotation(filename)
    
    # Known line breaks that fall inside a gold EDU: the leading newline of
    # each fragment is replaced with a space.
    for missegmentation in ("\nIMG", 
                            "\nгимнастический коврик;",
                            "\nгантели или бутылки с песком;",
                            "\nнебольшой резиновый мяч;",
                            "\nэластичная лента (эспандер);",
                            "\nхула-хуп (обруч).",
                            "\n200?",
                            "\n300?",
                            "\nНе требуйте странного.",
                            "\nИспользуйте мою модель.",
                            '\n"А чего вы от них требуете?"',
                            '\n"Решить проблемы с тестерами".',
                            "\nКак гончая на дичь.", "\nИ крупная.",
                            "\nВ прошлом году компания удивила рынок",
                            "\nЧужой этики особенно.",
                            "\nНо и своей тоже.",
                            "\nАэропорт имени,",
                            "\nА вот и монголы.",
                            "\nЗолотой Будда.", 
                            "\nДворец Богдо-Хана.",
                            "\nПлощадь Сухэ-Батора.",
                            "\nОдноклассники)",
                            "\nВечерняя площадь.",
                            "\nТугрики.",
                            "\nВнутренние монголы.",
                            "\nВид сверху.",
                            "\nНациональный парк Тэрэлж. IMG IMG",
                            '\nГора "Черепаха".',
                            "\nПуть к медитации.",
                            "\nЖить надо высоко,",
                            "\nЧан с кумысом.",
                            "\nЖилая юрта.",
                            "\nКумыс.",
                            "\nТрадиционное занятие монголов",
                            "\nДвугорбый верблюд мало где",
                            "\nМонгол Шуудан переводится",
                            "\nОвощные буузы.",
                            "\nЗнаменитый чай!"
                            ):
        annot['text'] = annot['text'].replace(missegmentation, ' '+missegmentation[1:])

    
    if '\n' in annot['text']:
        chunks = split_by_paragraphs(
            annot['text'],
            annot['tokens'], 
            annot['sentences'], 
            annot['lemma'], 
            annot['morph'], 
            annot['postag'], 
            annot['syntax_dep_tree'])
        
        chunked_edus = split_by_paragraphs_edus(edus, annot['text'])
    else:
        # Single-paragraph document: the whole annotation is the only chunk.
        # Without this branch `chunks` / `chunked_edus` would be undefined or
        # left over from the previous file.
        chunks = [{
            'text': annot['text'],
            'tokens': annot['tokens'],
            'sentences': annot['sentences'],
            'lemma': annot['lemma'],
            'morph': annot['morph'],
            'postag': annot['postag'],
            'syntax_dep_tree': annot['syntax_dep_tree'],
        }]
        chunked_edus = [edus]
    
    dus = []
    start_id = 0  # first free DiscourseUnit id for the next paragraph
    last_end = 0  # search cursor in the full text; kept across paragraphs so a
                  # repeated EDU text is not matched in an earlier paragraph
    for i, chunk in enumerate(chunks):
        _edus = []
        
        for max_id, edu_text in enumerate(chunked_edus[i]):
            # Offsets refer to the full document text, not to the chunk.
            start = len(annot['text'][:last_end]) + annot['text'][last_end:].find(edu_text)
            end = start + len(edu_text)
            temp = DiscourseUnit(
                    id=start_id + max_id,  # ids continue after the previous paragraph
                    left=None,
                    right=None,
                    relation='edu',
                    start=start,
                    end=end,
                    orig_text=annot['text'],
                    proba=1.,
                )

            _edus.append(temp)
            last_end = end + 1  # skip the separator expected after an EDU
            
        if len(_edus) == 1:
            # A one-EDU paragraph needs no parsing.
            dus += _edus
            start_id = _edus[-1].id + 1

        elif len(_edus) > 1:
            trees = paragraph_parser(_edus,
                annot['text'], chunk['tokens'], chunk['sentences'], chunk['lemma'],
                chunk['morph'], chunk['postag'], chunk['syntax_dep_tree'])
            
            dus += trees
            start_id = dus[-1].id + 1
        
    # NOTE(review): filename still contains its directory, so the genre passed
    # below is e.g. 'data/blogs' rather than 'blogs' - confirm this is what
    # the parser expects.
    parsed = document_parser(
                dus, 
                annot['text'], 
                annot['tokens'], 
                annot['sentences'], 
                annot['lemma'], 
                annot['morph'], 
                annot['postag'], 
                annot['syntax_dep_tree'],
                genre=filename.split('_')[0])
    
    # More than one tree per 400 characters left over: run a second pass with
    # the lower-threshold parser to merge further.
    if len(parsed) > len(annot['text']) // 400:
        parsed = additional_document_parser(
            parsed, 
            annot['text'], 
            annot['tokens'], 
            annot['sentences'], 
            annot['lemma'], 
            annot['morph'], 
            annot['postag'], 
            annot['syntax_dep_tree'],
            genre=filename.split('_')[0]
        )
    
    # Kept for interactive inspection; not read again in this cell.
    parsed_pairs = pd.DataFrame(extr_pairs_forest(parsed, annot['text']), 
                                columns=['snippet_x', 'snippet_y', 'category_id', 'order'])
    evaluation = eval_pipeline(parsed, edus, gold, annot['text'])
    evaluation['filename'] = file
    cache.append(evaluation)

In [None]:
parsed

In [None]:
annot['text']

In [None]:
print(parsed[13])

In [None]:
print(parsed[8].left)

In [None]:
# One row per evaluated file. For every component, derive precision, recall
# and F1 from its raw counters, then show the files ranked by unlabeled F1.
tmp = pd.DataFrame(cache)
for _component in ('seg', 'unlab', 'lab', 'nuc', 'full'):
    _precision = tmp[f'{_component}_true_pred'] / tmp[f'{_component}_all_pred']
    tmp[f'pr_{_component}'] = _precision
    _recall = tmp[f'{_component}_true_pred'] / tmp[f'{_component}_all_true']
    tmp[f're_{_component}'] = _recall
    tmp[f'f1_{_component}'] = 2 * _precision * _recall / (_precision + _recall)
tmp.sort_values('f1_unlab', ascending=False)

In [None]:
# Scratch F1 computation for recall 44/200 and precision 0.5.
# NOTE(review): this rebinds `pr`, which earlier cells use for the predictor.
re = 44./200
pr = .5
f1 = 2. * pr * re / (pr + re)

In [None]:
f1

Unlabeled tree building score

In [None]:
# Micro average, in percent: counters are summed over all files before the
# ratios are taken.
pr_micro = tmp.unlab_true_pred.sum() / tmp.unlab_all_pred.sum() * 100.
re_micro = tmp.unlab_true_pred.sum() / tmp.unlab_all_true.sum() * 100.
f1_micro = 2. * pr_micro * re_micro / (pr_micro + re_micro)

unlab_micro = (pr_micro, re_micro, f1_micro)
unlab_micro

In [None]:
# Macro average, in percent: per-file precision and recall are summed and
# divided by the number of files; F1 is derived from those two averages.
pr_macro = tmp.pr_unlab.sum() / tmp.shape[0] * 100.
re_macro = tmp.re_unlab.sum() / tmp.shape[0] * 100.
f1_macro = 2. * pr_macro * re_macro / (pr_macro + re_macro)

unlab_macro = (pr_macro, re_macro, f1_macro)
unlab_macro

Labeled tree building score

In [None]:
# Micro average, in percent: counters are summed over all files before the
# ratios are taken.
pr_micro = tmp.lab_true_pred.sum() / tmp.lab_all_pred.sum() * 100.
re_micro = tmp.lab_true_pred.sum() / tmp.lab_all_true.sum() * 100.
f1_micro = 2. * pr_micro * re_micro / (pr_micro + re_micro)

lab_micro = (pr_micro, re_micro, f1_micro)
lab_micro

In [None]:
# Macro average, in percent: per-file precision and recall are summed and
# divided by the number of files; F1 is derived from those two averages.
pr_macro = tmp.pr_lab.sum() / tmp.shape[0] * 100.
re_macro = tmp.re_lab.sum() / tmp.shape[0] * 100.
f1_macro = 2. * pr_macro * re_macro / (pr_macro + re_macro)

lab_macro = (pr_macro, re_macro, f1_macro)
lab_macro

Nuclearity score

In [None]:
# Micro average, in percent: counters are summed over all files before the
# ratios are taken.
pr_micro = tmp.nuc_true_pred.sum() / tmp.nuc_all_pred.sum() * 100.
re_micro = tmp.nuc_true_pred.sum() / tmp.nuc_all_true.sum() * 100.
f1_micro = 2. * pr_micro * re_micro / (pr_micro + re_micro)

nuc_micro = (pr_micro, re_micro, f1_micro)
nuc_micro

In [None]:
# Macro average, in percent: per-file precision and recall are summed and
# divided by the number of files; F1 is derived from those two averages.
pr_macro = tmp.pr_nuc.sum() / tmp.shape[0] * 100.
re_macro = tmp.re_nuc.sum() / tmp.shape[0] * 100.
f1_macro = 2. * pr_macro * re_macro / (pr_macro + re_macro)

nuc_macro = (pr_macro, re_macro, f1_macro)
nuc_macro

Full tree building score

In [None]:
# Micro average, in percent: counters are summed over all files before the
# ratios are taken.
pr_micro = tmp.full_true_pred.sum() / tmp.full_all_pred.sum() * 100.
re_micro = tmp.full_true_pred.sum() / tmp.full_all_true.sum() * 100.
f1_micro = 2. * pr_micro * re_micro / (pr_micro + re_micro)

full_micro = pr_micro, re_micro, f1_micro
full_micro

In [None]:
# Macro average, in percent: per-file precision and recall are summed and
# divided by the number of files; F1 is derived from those two averages.
pr_macro = tmp.pr_full.sum() / tmp.shape[0] * 100.
re_macro = tmp.re_full.sum() / tmp.shape[0] * 100.
f1_macro = 2. * pr_macro * re_macro / (pr_macro + re_macro)

full_macro = (pr_macro, re_macro, f1_macro)
full_macro

Draw a table

In [None]:
# Summary table: one row per component; the first P/R/F1 triple holds the
# micro-averaged scores, the second the macro-averaged ones.
evaluation_table = pd.DataFrame(columns=['component', 'P', 'R', 'F1', 'P', 'R', 'F1'], data=[
    #['segmentation', overall_score['pr_seg'], overall_score['re_seg'], overall_score['f1_seg']],
    ['span', unlab_micro[0], unlab_micro[1], unlab_micro[2], unlab_macro[0], unlab_macro[1], unlab_macro[2]],
    ['nuclearity', nuc_micro[0], nuc_micro[1], nuc_micro[2], nuc_macro[0], nuc_macro[1], nuc_macro[2]],
    ['relation', lab_micro[0], lab_micro[1], lab_micro[2], lab_macro[0], lab_macro[1], lab_macro[2]],
    ['full', full_micro[0], full_micro[1], full_micro[2], full_macro[0], full_macro[1], full_macro[2]],
])

# The column specification must declare all seven columns, otherwise the
# generated tabular does not compile.
print(evaluation_table.to_latex(index=False, float_format='%.2f', column_format='|l|l|l|l|l|l|l|'))