In [None]:
! pip install -U git+https://github.com/IINemo/isanlp.git@discourse

In [None]:
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.processor_mystem import ProcessorMystem
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd

# (host, port) pairs of the remote services; SERVER0 / SERVER2 must already be defined.
address_morph = (SERVER0, 4333)
address_syntax = (SERVER0, 5336)
address_rst = (SERVER2, 3344)

# Each stage is a triple: processor, names of the annotations it consumes, and a
# mapping for its results (presumably processor output name -> pipeline annotation name).
ppl = PipelineCommon([
    # Remote service at `address_syntax`: takes the raw text.
    (
        ProcessorRemote(*address_syntax, '0'),
        ['text'],
        dict(
            sentences='sentences',
            tokens='tokens',
            lemma='lemma',
            syntax_dep_tree='syntax_dep_tree',
            postag='ud_postag',
        ),
    ),
    # Local Mystem processor over the tokens/sentences produced above.
    (
        ProcessorMystem(delay_init=False),
        ['tokens', 'sentences'],
        dict(postag='postag'),
    ),
    # Converts the Mystem 'postag' annotation; its results are stored as 'morph' and 'postag'.
    (
        ConverterMystemToUd(),
        ['postag'],
        dict(morph='morph', postag='postag'),
    ),
    # Remote service at `address_rst`: consumes every annotation gathered so far,
    # its result is stored under 'rst'.
    (
        ProcessorRemote(*address_rst, 'default'),
        ['text', 'tokens', 'sentences', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
        dict(rst='rst'),
    ),
])

In [None]:
from utils.file_reading import read_annotation, read_edus, read_gold

# The same path prefix is handed to all three readers below.
example = 'data/news1_1'

# Raw text fed to the pipeline, plus the gold EDUs and gold pairs used for scoring.
text = read_annotation(example)['text']
gold_edus, gold_pairs = read_edus(example), read_gold(example)

In [None]:
%%time

# Run the whole pipeline on the loaded text; the cell magic above reports the elapsed time.
result = ppl(text)

In [None]:
# Display what the pipeline stored under the 'rst' key.
result['rst']

In [None]:
# Print the element at index 1 of the 'rst' output
# (assumes it holds at least two items — TODO confirm).
print(result['rst'][1])

In [None]:
import pandas as pd


def metric_parseval(parsed_pairs, gold, labeled=False):
    """Score predicted relation pairs against gold pairs (precision, recall, F1).

    Every pair is reduced to the string key ``"<left> <right>"`` built from the
    whitespace-stripped snippets, with ``" <relation>"`` appended when
    ``labeled`` is true. Keys are de-duplicated on both sides before counting.

    Args:
        parsed_pairs: iterable of indexable rows where ``row[0]`` / ``row[1]``
            are the left / right snippet strings and ``row[2]`` is the relation
            label (only read when ``labeled`` is true).
        gold: pandas DataFrame with string columns ``snippet_x`` and
            ``snippet_y`` and, when ``labeled`` is true, ``category_id``.
        labeled: when true, the relation label must match as well.

    Returns:
        dict with keys ``'pr'``, ``'re'`` and ``'f1'``. ``f1`` keeps the small
        ``1e-5`` term in its denominator. A side with no pairs yields ``0.0``
        for the corresponding ratio instead of raising ``ZeroDivisionError``.
    """
    def _key(left, right, relation):
        key = left.strip() + ' ' + right.strip()
        return key + ' ' + relation if labeled else key

    parsed_strings = {
        _key(row[0], row[1], row[2] if labeled else None)
        for row in parsed_pairs
    }

    # Walk the columns directly rather than `gold.loc[i, col]`, which returns a
    # Series (and breaks `.strip()`) as soon as the index is not unique.
    gold_relations = gold['category_id'] if labeled else [None] * len(gold)
    gold_strings = {
        _key(left, right, relation)
        for left, right, relation in zip(gold['snippet_x'], gold['snippet_y'], gold_relations)
    }

    true_pos = len(gold_strings & parsed_strings)
    all_parsed = len(parsed_strings)
    all_gold = len(gold_strings)

    # Nothing parsed / nothing in gold: report 0.0 rather than dividing by zero.
    pr = true_pos / all_parsed if all_parsed else 0.0
    re = true_pos / all_gold if all_gold else 0.0
    f1 = 2 * pr * re / (pr + re + 1e-5)

    return {
        'pr': pr,
        're': re,
        'f1': f1
    }
    
def extr_pairs(tree):
    """Collect ``[left text, right text, relation]`` for every node that has a left child.

    The node's own triple comes first, followed by the triples found in its
    left subtree and then those of its right subtree. A node whose ``left`` is
    falsy contributes nothing.
    """
    if not tree.left:
        return []

    own_pair = [[tree.left.text, tree.right.text, tree.relation]]
    return own_pair + extr_pairs(tree.left) + extr_pairs(tree.right)

def extr_pairs_forest(forest):
    """Concatenate the pairs returned by ``extr_pairs`` for every tree, in forest order."""
    return [pair for tree in forest for pair in extr_pairs(tree)]

def _check_snippet_pair_in_dataset(left_snippet, right_snippet):
    left_snippet = left_snippet.strip()
    right_snippet = right_snippet.strip()
    return ((((gold.snippet_x == left_snippet) & (gold.snippet_y == right_snippet)).sum(axis=0) != 0) 
            or ((gold.snippet_y == left_snippet) & (gold.snippet_x == right_snippet)).sum(axis=0) != 0)

def _not_parsed_as_in_gold(parsed_pairs: pd.DataFrame, gold: pd.DataFrame):
    tmp = pd.merge(gold, parsed_pairs, on=['snippet_x', 'snippet_y'], how='left', suffixes=('_gold', '_parsed'))
    return tmp[pd.isnull(tmp.category_id_parsed)]

def extr_edus(tree):
    """List the ``text`` of every node without a left child, left to right."""
    if not tree.left:
        # No left child: this node's own text is the unit.
        return [tree.text]
    return extr_edus(tree.left) + extr_edus(tree.right)


In [None]:
def eval_segmentation(trees, gold_edus):
    """Score the predicted EDUs of a forest against the gold EDUs.

    The predicted EDUs are the texts returned by ``extr_edus`` for every tree.
    A predicted EDU counts as correct when it is contained in ``gold_edus``.

    Args:
        trees: iterable of trees accepted by ``extr_edus``.
        gold_edus: sized container of gold EDUs supporting ``in``.

    Returns:
        dict with keys ``'pr'`` (correct / predicted), ``'re'``
        (correct / ``len(gold_edus)``) and ``'f1'``. Any ratio whose
        denominator is zero is reported as ``0.0`` instead of raising
        ``ZeroDivisionError`` — this covers an empty forest, empty gold, and
        the case where no predicted EDU matches.
    """
    pred_edus = [edu for tree in trees for edu in extr_edus(tree)]

    true_predictions = sum(1 for edu in pred_edus if edu in gold_edus)
    all_predicted = len(pred_edus)
    all_gold = len(gold_edus)

    pr = true_predictions / all_predicted if all_predicted else 0.0
    re = true_predictions / all_gold if all_gold else 0.0
    # pr + re is zero exactly when nothing matched; F1 is 0 in that case.
    f1 = 2 * pr * re / (pr + re) if pr + re else 0.0
    return {'pr': pr,
            're': re,
            'f1': f1}

In [None]:
def eval_pipeline(trees, gold_edus, gold_pairs):
    """Evaluate a predicted forest on segmentation and on tree building.

    Returns a dict with three entries: ``'segmentation'`` (from
    ``eval_segmentation``), ``'unlabeled_tree_building'`` and
    ``'labeled_tree_building'`` (both from ``metric_parseval`` on the pairs
    extracted from ``trees``, without and with relation labels).
    """
    predicted_pairs = extr_pairs_forest(trees)

    report = {}
    report['segmentation'] = eval_segmentation(trees, gold_edus)
    report['unlabeled_tree_building'] = metric_parseval(predicted_pairs, gold_pairs)
    report['labeled_tree_building'] = metric_parseval(predicted_pairs, gold_pairs, labeled=True)
    return report

In [None]:
# Score the pipeline's 'rst' output against the gold EDUs and gold pairs loaded earlier.
eval_pipeline(result['rst'], gold_edus, gold_pairs)