In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from utils.print_tree import printBTree
#from utils.rst_annotation import DiscourseUnit

import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')

In [None]:
class DiscourseUnit:
    """Node of a binary discourse tree: either a leaf span of text or a relation over two child units."""

    def __init__(self, id, left=None, right=None, text='', start=None, end=None, 
                 orig_text=None, relation=None, nuclearity=None, proba=1.):
        """
        :param int id:
        :param DiscourseUnit left: left child (None for a leaf)
        :param DiscourseUnit right: right child (None for a leaf)
        :param str text: (optional) unit text, used only when ``orig_text`` is not given
        :param int start: start position in original text
        :param int end: end position in original text
        :param str orig_text: (optional) original text; when given, the unit text is cut out of it
        :param string relation: {the relation between left and right components | 'elementary' | 'root'}
        :param string nuclearity: {'NS' | 'SN' | 'NN'}
        :param float proba: predicted probability of the relation occurrence
        """
        self.id = id
        self.left, self.right = left, right
        self.relation, self.nuclearity = relation, nuclearity
        # The probability is stored in its string form.
        self.proba = str(proba)

        # A unit with a left child takes its span from its children instead of the arguments.
        if self.left:
            start, end = left.start, right.end + 1
        self.start, self.end = start, end

        source = orig_text[self.start:self.end] if orig_text else text
        self.text = source.strip()

    def __str__(self):
        return f"id: {self.id}\ntext: {self.text}\nrelation: {self.relation}\nleft: {self.left.text if self.left else None}\nright: {self.right.text if self.right else None}\nstart: {self.start}\nend: {self.end}"


In [None]:
def printTree(tree):
    """
    Pretty-print a discourse tree with ``printBTree``.

    Inner nodes are labelled with ``(relation, probability)``, leaves with their text.

    :param DiscourseUnit tree: root of the tree to print
    :return: whatever ``printBTree`` returns
    """
    def _node_info(node):
        if node.relation:
            # proba may be stored as a string, so convert it before formatting.
            label = (node.relation, "%.2f" % float(node.proba))
        else:
            label = node.text
        return str(label), node.left, node.right

    # The tree itself was previously never handed to the printer.
    # NOTE(review): assumes the signature printBTree(node, nodeInfo) - confirm against utils.print_tree.
    return printBTree(tree, _node_info)

In [None]:
class DiscourseUnitCreator:
    """Factory that merges two units into a new DiscourseUnit with a fresh, increasing id."""

    def __init__(self, id):
        """
        :param int id: last id already in use; created units are numbered from ``id + 1``
        """
        self.id = id
        
    def __call__(self, left_node, right_node, proba):
        """
        :param DiscourseUnit left_node: left child of the new unit
        :param DiscourseUnit right_node: right child of the new unit
        :param float proba: probability attached to the new unit
        :return: the newly created DiscourseUnit
        """
        self.id += 1
        return DiscourseUnit(
            # Use the running counter; a bare ``id`` here would be the builtin function.
            id=self.id,
            left=left_node,
            right=right_node,
            relation=1,
            proba=proba
        )

In [None]:
import os
import pickle

import pandas as pd


class SklearnClassifier:
    """
    Wrapper for sklearn/catboost classification model along with preprocessors, saved in the same directory:
        [required]
        - model.pkl            : trained model
        [optional]
        - drop_columns.pkl     : list of names for columns to drop before prediction
        - categorical_cols.pkl : list of names for columns with categorical features
        - one_hot_encoder.pkl  : trained one-hot sklearn encoder model
        - scaler.pkl           : trained sklearn scaler model
        - label_encoder.pkl    : trained label encoder to decode predictions
    """

    # Bookkeeping columns: excluded from the pickled drop list because
    # _preprocess_features strips them on its own.
    _META_COLUMNS = ('category_id', 'filename', 'order')

    def __init__(self, model_dir_path):
        """
        :param str model_dir_path: directory with model.pkl and the optional preprocessor pickles
        :raises FileNotFoundError: if model.pkl is missing
        """
        self.model_dir_path = model_dir_path

        self._drop_columns = self._load_pickle('drop_columns.pkl')
        if self._drop_columns:
            # Exact membership test against the three names (not a substring test).
            self._drop_columns = [value for value in self._drop_columns
                                  if value not in self._META_COLUMNS]

        self._scaler = self._load_pickle('scaler.pkl')
        self._categorical_cols = self._load_pickle('categorical_cols.pkl')
        self._one_hot_encoder = self._load_pickle('one_hot_encoder.pkl')
        self._label_encoder = self._load_pickle('label_encoder.pkl')

        self._model = self._load_pickle('model.pkl', required=True)
        self.classes_ = self._model.classes_

    def _load_pickle(self, file_name, required=False):
        """Unpickle ``file_name`` from the model directory; optional files that are absent give None."""
        path = os.path.join(self.model_dir_path, file_name)
        if not required and not os.path.isfile(path):
            return None

        # NOTE: unpickling runs arbitrary code - only point this at trusted model directories.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def predict_proba(self, features):
        """Return the model's class probabilities for the preprocessed ``features``."""
        return self._model.predict_proba(self._preprocess_features(features))

    def predict(self, features):
        """Return predictions, decoded with the label encoder when one was loaded."""
        predictions = self._model.predict(self._preprocess_features(features))
        if self._label_encoder:
            return self._label_encoder.inverse_transform(predictions)

        return predictions

    def _preprocess_features(self, _features):
        """
        Apply the stored preprocessors and return a plain array for the model.

        :param pandas.DataFrame _features: raw feature table (not modified)
        :return: array of model inputs (scaled when a scaler is loaded, float64 otherwise)
        """
        import copy  # local import so this block needs no extra top-of-cell import

        features = _features.copy()
        
        if self._categorical_cols:
            if self._label_encoder:
                # Fit a throwaway copy per column: refitting the shared encoder would
                # overwrite the classes that predict() needs for inverse_transform.
                features[self._categorical_cols] = features[self._categorical_cols].apply(
                    lambda col: copy.deepcopy(self._label_encoder).fit_transform(col))

            if self._one_hot_encoder:
                features_ohe = self._one_hot_encoder.transform(features[self._categorical_cols].values)
                # Newer scikit-learn releases expose get_feature_names_out instead of get_feature_names.
                name_getter = (getattr(self._one_hot_encoder, 'get_feature_names_out', None)
                               or self._one_hot_encoder.get_feature_names)
                features_ohe = pd.DataFrame(features_ohe, features.index,
                                            columns=name_getter(self._categorical_cols))

                features = features.join(
                    features_ohe.add_prefix('cat_'), how='right'
                ).drop(columns=self._categorical_cols)

        if self._drop_columns:
            features = features.drop(columns=self._drop_columns)

        if 'category_id' in features.keys():
            # Tolerate tables that carry only some of the bookkeeping columns.
            features = features.drop(columns=list(self._META_COLUMNS), errors='ignore')

        if self._scaler:
            return self._scaler.transform(features.values)

        return features.values.astype('float64')


In [None]:
from allennlp.predictors import Predictor

# Load the archived AllenNLP model into a notebook-level predictor; the cells that follow call it directly.
pr = Predictor.from_path('models/structure_predictor_lstm/results_all/model.tar.gz')

In [None]:
# Smoke test: run the predictor on a single pair of adjacent text fragments.
pr.predict('В целом арабские страны поддерживают эту инициативу ,',
           'так как многие из них - особенно после 2011 года - были серьезно затронуты действиями радикальных исламистов .')

In [None]:
# Same pair through the batch interface, which takes a list of premise/hypothesis dicts.
pr.predict_batch_json([{'premise': 'В целом арабские страны поддерживают эту инициативу ,',
           'hypothesis': 'так как многие из них - особенно после 2011 года - были серьезно затронуты действиями радикальных исламистов .'}], )

In [None]:
import os
import pickle

import pandas as pd


# NOTE(review): this cell repeats an earlier definition of the same class; being the later
# one, it is the definition left bound to the name. Consider deleting one of the two.
class SklearnClassifier:
    """
    Wrapper for sklearn/catboost classification model along with preprocessors, saved in the same directory:
        [required]
        - model.pkl            : trained model
        [optional]
        - drop_columns.pkl     : list of names for columns to drop before prediction
        - categorical_cols.pkl : list of names for columns with categorical features
        - one_hot_encoder.pkl  : trained one-hot sklearn encoder model
        - scaler.pkl           : trained sklearn scaler model
        - label_encoder.pkl    : trained label encoder to decode predictions
    """

    # Bookkeeping columns: excluded from the pickled drop list because
    # _preprocess_features strips them on its own.
    _META_COLUMNS = ('category_id', 'filename', 'order')

    def __init__(self, model_dir_path):
        """
        :param str model_dir_path: directory with model.pkl and the optional preprocessor pickles
        :raises FileNotFoundError: if model.pkl is missing
        """
        self.model_dir_path = model_dir_path

        self._drop_columns = self._load_pickle('drop_columns.pkl')
        if self._drop_columns:
            # Exact membership test against the three names (not a substring test).
            self._drop_columns = [value for value in self._drop_columns
                                  if value not in self._META_COLUMNS]

        self._scaler = self._load_pickle('scaler.pkl')
        self._categorical_cols = self._load_pickle('categorical_cols.pkl')
        self._one_hot_encoder = self._load_pickle('one_hot_encoder.pkl')
        self._label_encoder = self._load_pickle('label_encoder.pkl')

        self._model = self._load_pickle('model.pkl', required=True)
        self.classes_ = self._model.classes_

    def _load_pickle(self, file_name, required=False):
        """Unpickle ``file_name`` from the model directory; optional files that are absent give None."""
        path = os.path.join(self.model_dir_path, file_name)
        if not required and not os.path.isfile(path):
            return None

        # NOTE: unpickling runs arbitrary code - only point this at trusted model directories.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def predict_proba(self, features):
        """Return the model's class probabilities for the preprocessed ``features``."""
        return self._model.predict_proba(self._preprocess_features(features))

    def predict(self, features):
        """Return predictions, decoded with the label encoder when one was loaded."""
        predictions = self._model.predict(self._preprocess_features(features))
        if self._label_encoder:
            return self._label_encoder.inverse_transform(predictions)

        return predictions

    def _preprocess_features(self, _features):
        """
        Apply the stored preprocessors and return a plain array for the model.

        :param pandas.DataFrame _features: raw feature table (not modified)
        :return: array of model inputs (scaled when a scaler is loaded, float64 otherwise)
        """
        import copy  # local import so this block needs no extra top-of-cell import

        features = _features.copy()
        
        if self._categorical_cols:
            if self._label_encoder:
                # Fit a throwaway copy per column: refitting the shared encoder would
                # overwrite the classes that predict() needs for inverse_transform.
                features[self._categorical_cols] = features[self._categorical_cols].apply(
                    lambda col: copy.deepcopy(self._label_encoder).fit_transform(col))

            if self._one_hot_encoder:
                features_ohe = self._one_hot_encoder.transform(features[self._categorical_cols].values)
                # Newer scikit-learn releases expose get_feature_names_out instead of get_feature_names.
                name_getter = (getattr(self._one_hot_encoder, 'get_feature_names_out', None)
                               or self._one_hot_encoder.get_feature_names)
                features_ohe = pd.DataFrame(features_ohe, features.index,
                                            columns=name_getter(self._categorical_cols))

                features = features.join(
                    features_ohe.add_prefix('cat_'), how='right'
                ).drop(columns=self._categorical_cols)

        if self._drop_columns:
            features = features.drop(columns=self._drop_columns)

        if 'category_id' in features.keys():
            # Tolerate tables that carry only some of the bookkeeping columns.
            features = features.drop(columns=list(self._META_COLUMNS), errors='ignore')

        if self._scaler:
            return self._scaler.transform(features.values)

        return features.values.astype('float64')


In [None]:
import os
import pickle
from allennlp.predictors import Predictor

import pandas as pd


class AllenNLPClassifier:
    """
    Wrapper for allennlp classification model along with preprocessors, saved in the same directory:
        [required]
        - model.tar.gz            : trained model
    """

    def __init__(self, model_dir_path):
        """
        :param str model_dir_path: directory that contains model.tar.gz
        """
        self.model_dir_path = model_dir_path
        # Pairs with a snippet longer than this many whitespace tokens are not sent to the model.
        self._max_len = 300
        
        self._model = Predictor.from_path(os.path.join(self.model_dir_path, 'model.tar.gz'))

    def predict_proba(self, snippet_x, snippet_y):
        """
        :param str snippet_x: left text fragment
        :param str snippet_y: right text fragment
        :return: the model's 'probs' output for the pair, or ``[1., 0.]`` for over-long snippets
        """
        if len(snippet_x.split()) > self._max_len or len(snippet_y.split()) > self._max_len:
            return [1., 0.]
        
        return self._model.predict(snippet_x, snippet_y)['probs']
    
    def predict_proba_batch(self, snippet_x, snippet_y):
        """
        :param list snippet_x: left text fragments
        :param list snippet_y: right text fragments, aligned with ``snippet_x``
        :return: list with the model's 'probs' output for every pair
        """
        # Use the wrapped model, not a predictor that happens to exist as a notebook global.
        predictions = self._model.predict_batch_json([
            {'premise': snippet_x[i],
             'hypothesis': snippet_y[i]}
            for i in range(len(snippet_x))])
        return [prediction['probs'] for prediction in predictions]

    def predict(self, snippet_x, snippet_y):
        """Return the model's 'label' output for the pair."""
        return self._model.predict(snippet_x, snippet_y)['label']


In [None]:
import pandas as pd
from isanlp.annotation_rst import DiscourseUnit


class RSTTreePredictor:
    """
    Contains classifiers and processors needed for tree building.
    """

    def __init__(self, features_processor, relation_predictor, label_predictor, nuclearity_predictor):
        """
        :param features_processor: callable that turns snippet pairs into features (may be None)
        :param relation_predictor: classifier scoring whether two units are related (may be None)
        :param label_predictor: classifier for the relation label (may be None)
        :param nuclearity_predictor: classifier for the nuclearity (may be None)
        """
        self.features_processor = features_processor
        self.relation_predictor = relation_predictor

        # Always define the class lists so the attributes exist even without a predictor.
        self.label_predictor = label_predictor
        self.labels = self.label_predictor.classes_ if self.label_predictor else None

        self.nuclearity_predictor = nuclearity_predictor
        self.nuclearities = self.nuclearity_predictor.classes_ if self.nuclearity_predictor else None

        self.genre = None


class GoldTreePredictor(RSTTreePredictor):
    """
    Contains classifiers and processors needed for gold tree building from corpus.
    """

    def __init__(self, corpus):
        """
        :param pandas.DataFrame corpus:
            columns=['snippet_x', 'snippet_y', 'category_id'] (``order`` is also read by predict_nuclearity)
            rows=[all the relations pairs from corpus]
        """
        RSTTreePredictor.__init__(self, None, None, None, None)
        self.corpus = corpus

    def extract_features(self, *args):
        """Build a one-row snippet table from the first two arguments (left and right unit)."""
        return pd.DataFrame({
            'snippet_x': [args[0].text, ],
            'snippet_y': [args[1].text, ]
        })

    def initialize_features(self, *args):
        """Build a snippet table with one row per adjacent pair of the units in ``args[0]``."""
        nodes = args[0]
        return pd.DataFrame({
            'snippet_x': [nodes[i].text for i in range(len(nodes) - 1)],
            'snippet_y': [nodes[i].text for i in range(1, len(nodes))]
        })

    def predict_pair_proba(self, features):
        """Return 1.0 for every row whose snippet pair occurs in the corpus in either order, else 0.0."""
        def _check_snippet_pair_in_dataset(left_snippet, right_snippet):
            forward = (self.corpus.snippet_x == left_snippet) & (self.corpus.snippet_y == right_snippet)
            backward = (self.corpus.snippet_y == left_snippet) & (self.corpus.snippet_x == right_snippet)
            return float(bool(forward.any() or backward.any()))

        result = features.apply(lambda row: _check_snippet_pair_in_dataset(row.snippet_x, row.snippet_y), axis=1)
        return result.values.tolist()

    def _lookup_gold(self, left_snippet, right_snippet, column, default):
        """Return ``column`` of the first corpus row matching the ordered pair, or ``default``."""
        matches = self.corpus[
            (self.corpus.snippet_x == left_snippet) & (self.corpus.snippet_y == right_snippet)][column].values
        if matches.size == 0:
            return default

        return matches[0]

    def _predict_from_corpus(self, features, column, default):
        """Look up ``column`` for one pair (pd.Series) or for every row of a table."""
        if isinstance(features, pd.Series):
            return self._lookup_gold(features.loc['snippet_x'], features.loc['snippet_y'], column, default)

        result = features.apply(
            lambda row: self._lookup_gold(row.snippet_x, row.snippet_y, column, default), axis=1)
        return result.values.tolist()

    def predict_label(self, features):
        """Return the gold ``category_id`` of the pair(s); 'relation' when the pair is not in the corpus."""
        return self._predict_from_corpus(features, 'category_id', 'relation')

    def predict_nuclearity(self, features):
        """Return the gold ``order`` of the pair(s); '_' when the pair is not in the corpus."""
        # The gold value is now returned; before, a found pair fell through and yielded None.
        return self._predict_from_corpus(features, 'order', '_')


class CustomTreePredictor(RSTTreePredictor):
    """
    Contains trained classifiers and feature processors needed for tree prediction.
    """

    def __init__(self, features_processor, relation_predictor, label_predictor=None, nuclearity_predictor=None):
        RSTTreePredictor.__init__(self, features_processor, relation_predictor, label_predictor, nuclearity_predictor)

    def _process_pairs(self, pairs, annot_text, annot_tokens, annot_sentences, annot_lemma, annot_morph,
                       annot_postag, annot_syntax_dep_tree):
        """Run the features processor on ``pairs``; on IndexError log the pairs to errors.log and return -1."""
        try:
            return self.features_processor(pairs, annot_text=annot_text,
                                           annot_tokens=annot_tokens, annot_sentences=annot_sentences,
                                           annot_postag=annot_postag, annot_morph=annot_morph,
                                           annot_lemma=annot_lemma, annot_syntax_dep_tree=annot_syntax_dep_tree)
        except IndexError:
            # 'w+' truncates the file, so only the most recent failure is kept.
            with open('errors.log', 'w+') as f:
                f.write(str(pairs.values))
                f.write(annot_text)
            return -1

    def extract_features(self, left_node: DiscourseUnit, right_node: DiscourseUnit,
                         annot_text, annot_tokens, annot_sentences, annot_lemma, annot_morph, annot_postag,
                         annot_syntax_dep_tree):
        """
        Compute features for a single pair of units.

        :return: output of the features processor, or -1 if it raised IndexError
        """
        pair = pd.DataFrame({
            'snippet_x': [left_node.text.strip()],
            'snippet_y': [right_node.text.strip()],
        })

        return self._process_pairs(pair, annot_text, annot_tokens, annot_sentences, annot_lemma, annot_morph,
                                   annot_postag, annot_syntax_dep_tree)

    def initialize_features(self, nodes,
                            annot_text, annot_tokens, annot_sentences, annot_lemma, annot_morph, annot_postag,
                            annot_syntax_dep_tree):
        """
        Compute features for every adjacent pair in ``nodes``.

        :return: output of the features processor, or -1 if it raised IndexError
        """
        pairs = pd.DataFrame({
            'snippet_x': [node.text.strip() for node in nodes[:-1]],
            'snippet_y': [node.text.strip() for node in nodes[1:]]
        })

        # The shared helper logs ``pairs`` itself, so the error path no longer hits an undefined name.
        return self._process_pairs(pairs, annot_text, annot_tokens, annot_sentences, annot_lemma, annot_morph,
                                   annot_postag, annot_syntax_dep_tree)

    def predict_pair_proba(self, features):
        """
        Score how likely the pair(s) are to be related.

        The positive-class probability gets a fixed bonus when ``same_sentence`` equals 1
        (table and Series inputs only). Returns None for other input types.
        """
        _same_sentence_bonus = 0.5

        if isinstance(features, pd.DataFrame):
            probas = self.relation_predictor.predict_proba(features)
            same_sentence_bonus = [float(flag) * _same_sentence_bonus
                                   for flag in (features['same_sentence'] == 1)]
            return [probas[i][1] + same_sentence_bonus[i] for i in range(len(probas))]

        if isinstance(features, pd.Series):
            return self.relation_predictor.predict_proba(features)[0][1] + (
                    features.loc['same_sentence'] == 1) * _same_sentence_bonus

        if isinstance(features, list):
            return self.relation_predictor.predict_proba([features])[0][1]

    def predict_label(self, features):
        """Return the predicted relation label(s); 'relation' when no label predictor is set."""
        if not self.label_predictor:
            return 'relation'

        if isinstance(features, pd.DataFrame):
            return self.label_predictor.predict(features)

        if isinstance(features, pd.Series):
            # A single row is turned back into a one-row table for the classifier.
            return self.label_predictor.predict(features.to_frame().T)[0]

    def predict_nuclearity(self, features):
        """Return the predicted nuclearity value(s); 'unavail' when no nuclearity predictor is set."""
        if not self.nuclearity_predictor:
            return 'unavail'

        if isinstance(features, pd.DataFrame):
            return self.nuclearity_predictor.predict(features)

        if isinstance(features, pd.Series):
            return self.nuclearity_predictor.predict(features.to_frame().T)[0]


class NNTreePredictor(CustomTreePredictor):
    """
    Contains trained classifiers and feature processors needed for tree prediction.

    The relation predictor is called with two text snippets (not with a feature table).
    """
    
    def initialize_features(self, nodes,
                            annot_text, annot_tokens, annot_sentences, annot_lemma, annot_morph, annot_postag,
                            annot_syntax_dep_tree):
        """
        Compute features for adjacent pairs, then rebuild both snippet columns by joining
        the ``tokens_x`` / ``tokens_y`` token lists with spaces.
        """
        features = super().initialize_features(nodes,
                            annot_text, annot_tokens, annot_sentences, annot_lemma, annot_morph, annot_postag,
                            annot_syntax_dep_tree)
        features['snippet_x'] = features['tokens_x'].map(lambda row: ' '.join(row)).values
        features['snippet_y'] = features['tokens_y'].map(lambda row: ' '.join(row)).values
        
        return features

    def predict_pair_proba(self, features):
        """
        Score how likely the pair(s) are to be related, using the second entry of the
        probabilities the relation predictor returns for a pair of snippets.
        """
        _same_sentence_bonus = 0.

        if isinstance(features, pd.DataFrame):
            probas = features.apply(lambda row: self.relation_predictor.predict_proba(row.snippet_x, row.snippet_y), axis=1).values
            same_sentence_bonus = [float(flag) * _same_sentence_bonus
                                   for flag in (features['same_sentence'] == 1)]

            return [probas[i][1] + same_sentence_bonus[i] for i in range(len(probas))]

        if isinstance(features, pd.Series):
            # predict_proba yields one flat probability list per pair, so index it with [1]
            # exactly as the table and list branches do.
            return self.relation_predictor.predict_proba(' '.join(features.loc['tokens_x']),
                                                         ' '.join(features.loc['tokens_y']))[1] + (
                           features.loc['same_sentence'] == 1) * _same_sentence_bonus

        if isinstance(features, list):
            # NOTE(review): each item is presumably a feature table; the joined snippets are
            # passed on as arrays - confirm the relation predictor accepts that.
            probas = []
            
            for i in range(len(features)):
                snippet_x = features[i]['tokens_x'].map(lambda row: ' '.join(row)).values
                snippet_y = features[i]['tokens_y'].map(lambda row: ' '.join(row)).values
                probas.append(self.relation_predictor.predict_proba(snippet_x, snippet_y)[1])
                
            return probas


In [None]:
import numpy as np
import pandas as pd

from isanlp.annotation_rst import DiscourseUnit


class GreedyRSTParser:
    """Bottom-up parser that repeatedly merges the best-scoring pair of adjacent discourse units."""

    def __init__(self, tree_predictor, forest_threshold=0.05):
        """
        :param RSTTreePredictor tree_predictor:
        :param float forest_threshold: minimum relation probability to append the pair into the tree
        """
        self.tree_predictor = tree_predictor
        self.forest_threshold = forest_threshold

    def __call__(self, edus, annot_text, annot_tokens, annot_sentences, annot_lemma, annot_morph, annot_postag,
                 annot_syntax_dep_tree, genre=None):
        """
        :param list edus: DiscourseUnit
        :param str annot_text: original text
        :param list annot_tokens: isanlp.annotation.Token
        :param list annot_sentences: isanlp.annotation.Sentence
        :param list annot_postag: lists of str for each sentence
        :param annot_lemma: lists of str for each sentence
        :param annot_syntax_dep_tree: list of isanlp.annotation.WordSynt for each sentence
        :param genre: stored on the tree predictor before parsing
        :return: list of DiscourseUnit containing each extracted tree
        """
        self.tree_predictor.genre = genre

        # Nothing to parse: return an empty forest instead of failing on edus[-1].
        if not edus:
            return []

        # Annotation layers are forwarded unchanged to every feature call.
        annotations = (annot_text, annot_tokens, annot_sentences,
                       annot_lemma, annot_morph, annot_postag, annot_syntax_dep_tree)

        nodes = edus
        max_id = edus[-1].id

        # initialize scores: one entry per adjacent pair (nodes[i], nodes[i + 1])
        features = self.tree_predictor.initialize_features(nodes, *annotations)
        scores = self.tree_predictor.predict_pair_proba(features)

        while len(nodes) > 2 and any(score > self.forest_threshold for score in scores):
            # select two nodes to merge
            j = int(np.argmax(np.array(scores)))  # position of the pair in list

            # make the new node by merging node[j] + node[j+1]
            temp = DiscourseUnit(
                id=max_id + 1,
                left=nodes[j],
                right=nodes[j + 1],
                relation=self.tree_predictor.predict_label(features.iloc[j]),
                nuclearity=self.tree_predictor.predict_nuclearity(features.iloc[j]),
                proba=scores[j],
                text=annot_text[nodes[j].start:nodes[j + 1].end].strip()
            )

            max_id += 1

            # modify the node list
            nodes = nodes[:j] + [temp] + nodes[j + 2:]

            # modify the scores list: only pairs touching the merged node are re-scored
            if j == 0:
                # merged node is first: re-score (merged, next)
                _features = self.tree_predictor.extract_features(nodes[j], nodes[j + 1], *annotations)
                _scores = self.tree_predictor.predict_pair_proba(_features)
                scores = _scores + scores[j + 2:]
                features = pd.concat([_features, features.iloc[j + 2:]])

            elif j + 1 < len(nodes):
                # merged node is in the middle: re-score (previous, merged) and (merged, next)
                _features = self.tree_predictor.initialize_features([nodes[j - 1], nodes[j], nodes[j + 1]],
                                                                    *annotations)
                _scores = self.tree_predictor.predict_pair_proba(_features)
                features = pd.concat([features.iloc[:j - 1], _features, features.iloc[j + 2:]])
                scores = scores[:j - 1] + _scores + scores[j + 2:]

            else:
                # merged node is last: re-score (previous, merged)
                _features = self.tree_predictor.extract_features(nodes[j - 1], nodes[j], *annotations)
                _scores = self.tree_predictor.predict_pair_proba(_features)
                scores = scores[:j - 1] + _scores
                features = pd.concat([features.iloc[:j - 1], _features])

        # Two nodes left and their pair is good enough: join them under a root.
        if len(scores) == 1 and scores[0] > self.forest_threshold:
            root = DiscourseUnit(
                id=max_id + 1,
                left=nodes[0],
                right=nodes[1],
                relation='root',
                proba=scores[0]
            )
            nodes = [root]

        return nodes


In [None]:
import numpy as np
import pandas as pd


def get_embeddings(embedder, x, maxlen=100):
    """
    Look up an embedding for each of the first ``maxlen`` tokens of ``x``.

    Tokens are whitespace-separated; everything after the last underscore of a token is
    cut off before the lookup, and a token without an underscore is looked up as is.

    :param embedder: object with ``vector_size`` and item access by word (raises KeyError if unknown)
    :param str x: whitespace-separated tokens
    :param int maxlen: number of token rows in the result
    :return: array of shape (maxlen, embedder.vector_size); rows of unknown or missing tokens stay zero
    """
    # rsplit keeps a token without '_' intact (rfind would give -1 and chop its last character).
    words = [token.rsplit('_', 1)[0] for token in x.split()]
    # One row per token so that result[i] can hold a whole embedding vector.
    result = np.zeros((maxlen, embedder.vector_size))

    for i, word in enumerate(words[:maxlen]):
        try:
            result[i] = embedder[word]
        except KeyError:
            continue

    return result


class FeaturesExtractor:
    """Callable that runs a features processor and then the optional encoders and scaler."""

    # Columns removed together with the categorical ones once they have been one-hot encoded.
    DROP_COLUMNS = ['snippet_x', 'snippet_y', 'snippet_x_tmp', 'snippet_y_tmp', 'postags_x', 'postags_y']

    def __init__(self, processor, scaler=None, categorical_cols=None, one_hot_encoder=None, label_encoder=None):
        """
        :param processor: callable producing the feature table from snippet pairs and annotations
        :param scaler: (optional) object with ``transform`` applied to the final values
        :param list categorical_cols: (optional) names of the categorical columns
        :param one_hot_encoder: (optional) fitted one-hot encoder for the categorical columns
        :param label_encoder: (optional) label encoder, re-fitted on every categorical column
        """
        self.processor = processor
        self.scaler = scaler
        self._categorical_cols = categorical_cols
        self.one_hot_encoder = one_hot_encoder
        self.label_encoder = label_encoder

    def _encode_categoricals(self, frame):
        """Label-encode and/or one-hot encode the categorical columns of ``frame``."""
        cat_cols = self._categorical_cols

        if self.label_encoder:
            frame[cat_cols] = frame[cat_cols].apply(
                lambda column: self.label_encoder.fit_transform(column))

        if not self.one_hot_encoder:
            return frame

        encoded = self.one_hot_encoder.transform(frame[cat_cols].values)
        encoded = pd.DataFrame(encoded, frame.index,
                               columns=self.one_hot_encoder.get_feature_names(cat_cols))
        prefixed = pd.DataFrame(encoded, frame.index).add_prefix('cat_')

        joined = frame.join(prefixed, how='right')
        return joined.drop(columns=cat_cols).drop(columns=self.DROP_COLUMNS)

    def __call__(self, df, 
                 annot_text, annot_tokens, annot_sentences, 
                 annot_lemma, annot_morph, annot_postag, annot_syntax_dep_tree):
        """
        :param pandas.DataFrame df: snippet pairs handed to the processor
        :return: feature table; re-built from the scaled values when a scaler is set
        """
        frame = self.processor(df, 
                               annot_text, annot_tokens, annot_sentences, 
                               annot_lemma, annot_morph, annot_postag, annot_syntax_dep_tree)

        if self._categorical_cols:
            frame = self._encode_categoricals(frame)

        if not self.scaler:
            return frame

        scaled_values = self.scaler.transform(frame.values)
        return pd.DataFrame(scaled_values, index=frame.index, columns=frame.columns)


In [None]:
from utils.train_test_split import split_data

# Split the corpus under data/ into train and test file lists.
# NOTE(review): 0.2 is presumably the test share - confirm against utils.train_test_split.
train, test = split_data('data/', 0.2)

In [None]:
from utils.file_reading import read_edus, read_gold, read_annotation
from utils.evaluation import extr_pairs, extr_pairs_forest

# Evaluation (Parser)

In [None]:
from utils.features_processor_default import FeaturesProcessor

In [None]:
%%time

# Build the feature processor from the artifacts in models/ (timed by the cell magic above).
features_processor = FeaturesProcessor(model_dir_path='models', verbose=False)

In [None]:
# Assemble the parsing pipeline: a neural relation (structure) classifier, a pickled label
# classifier, and a feature extractor without scaler or encoders, combined in a greedy parser
# that merges pairs scoring above 0.2.
binary_classifier = AllenNLPClassifier('models/structure_predictor_lstm/results_all/')
label_classifier = SklearnClassifier('models/label_predictor/')
features_extractor = FeaturesExtractor(features_processor)
predictor = NNTreePredictor(features_extractor, binary_classifier, label_predictor=label_classifier)
parser = GreedyRSTParser(predictor, forest_threshold=0.2)

In [None]:
from tqdm import tqdm_notebook as tqdm
from utils.file_reading import *
from utils.evaluation import extr_pairs, extr_pairs_forest

# Parse every test document and keep (predicted pairs, gold annotation) per file for scoring.
cache = {}
broken_files = []  # NOTE(review): never filled in this cell

for file in tqdm(test):
    # Strip the extension; the readers take the bare file name.
    filename = '.'.join(file.split('.')[:-1])
    edus = read_edus(filename)
    gold = read_gold(filename)
    annot = read_annotation(filename)
    
    # Locate each EDU in the original text, searching from the end of the previous one.
    _edus = []
    last_end = 0
    # NOTE(review): the last EDU is skipped here (len(edus) - 1), while parse_golds further
    # down iterates over all of them - confirm which is intended.
    for max_id in range(len(edus) - 1):
        # NOTE(review): find() gives -1 when the EDU is not found, which is not handled.
        start = len(annot['text'][:last_end]) + annot['text'][last_end:].find(edus[max_id])
        end = start + len(edus[max_id])
        temp = DiscourseUnit(
                id=max_id,
                left=None,
                right=None,
                relation='edu',
                start=start,
                end=end,
                orig_text=annot['text'],
                proba=1.,
            )
        _edus.append(temp)
        # NOTE(review): parse_golds uses `end` without the +1 - confirm which is intended.
        last_end = end + 1
        
    # The genre passed to the parser is the part of the file name before the first underscore.
    parsed = parser(_edus, 
                annot['text'], 
                annot['tokens'], 
                annot['sentences'], 
                annot['lemma'], 
                annot['morph'], 
                annot['postag'], 
                annot['syntax_dep_tree'],
                genre=filename.split('_')[0])
    
    parsed_pairs = pd.DataFrame(extr_pairs_forest(parsed), columns=['snippet_x', 'snippet_y', 'category_id'])
    cache[filename] = (parsed_pairs, gold)

In [None]:
from utils.evaluation import metric_parseval_pd as metric_parseval

# Score every cached document and collect the three counts returned by metric_parseval
# into one table with a row per file.
filenames = []
true_pos = []
all_parsed = []
all_gold = []

for key, value in cache.items():
    # value is the (parsed_pairs, gold) tuple stored for the file
    c_true_pos, c_all_parsed, c_all_gold = metric_parseval(value[0], value[1])
    filenames.append(key)
    true_pos.append(c_true_pos)
    all_parsed.append(c_all_parsed)
    all_gold.append(c_all_gold)
    
results = pd.DataFrame({'filename': filenames, 
                    'true_pos': true_pos,
                    'all_parsed': all_parsed,
                    'all_gold': all_gold})

In [None]:
# Per-file recall, precision and F1; files with a zero denominator end up as NaN.
results['recall'] = results['true_pos'] / results['all_gold']
results['precision'] = results['true_pos'] / results['all_parsed']
results['F1'] = 2 * results['precision'] * results['recall'] / (results['precision'] + results['recall'])

In [None]:
# Averages of the per-file scores over all files: (F1, recall, precision).
results['F1'].mean(), results['recall'].mean(), results['precision'].mean()

In [None]:
# Same averages restricted to files whose name contains 'blog'.
tmp = results[results.filename.str.contains('blog')]

tmp['F1'].mean(), tmp['recall'].mean(), tmp['precision'].mean()

In [None]:
# Same averages restricted to files whose name contains 'news'.
tmp = results[results.filename.str.contains('news')]

tmp['F1'].mean(), tmp['recall'].mean(), tmp['precision'].mean()

In [None]:
# Same averages restricted to files whose name contains 'ling'.
tmp = results[results.filename.str.contains('ling')]

tmp['F1'].mean(), tmp['recall'].mean(), tmp['precision'].mean()

In [None]:
# Same averages restricted to files whose name contains 'comp'.
tmp = results[results.filename.str.contains('comp')]

tmp['F1'].mean(), tmp['recall'].mean(), tmp['precision'].mean()

# Evaluation (Gold)

In [None]:
# Reset the per-file cache.
# NOTE(review): nothing in the cells shown refills it, so the loop over cache.items() that
# follows produces an empty table - confirm whether a filling cell is missing.
cache = {}

In [None]:
from utils.evaluation import metric_parseval

In [None]:
# Score every cached document; each cache value is read as [0] and [1] and
# metric_parseval's result is read as three counts.
# NOTE(review): `cache` is reset to {} two cells above and no visible cell refills
# it before this point, so a top-to-bottom run builds an empty frame here —
# confirm whether this cell is still needed.
filenames = list(cache)
doc_counts = [metric_parseval(entry[0], entry[1]) for entry in cache.values()]
true_pos = [counts[0] for counts in doc_counts]
all_parsed = [counts[1] for counts in doc_counts]
all_gold = [counts[2] for counts in doc_counts]

# One row per document holding the raw counts.
results = pd.DataFrame(dict(filename=filenames,
                            true_pos=true_pos,
                            all_parsed=all_parsed,
                            all_gold=all_gold))

In [None]:
def parse_golds(filename):
    """Re-parse one document with the gold-driven greedy parser and score it.

    Every EDU is aligned to a character span of the annotated text and wrapped in a
    leaf ``DiscourseUnit``; the leaves are then merged by ``GreedyRSTParser`` using
    ``GoldTreePredictor(gold)``, and the extracted pairs are compared with ``gold``.

    :param str filename: corpus file name including an extension; everything after
        the last dot is dropped before the readers are called
    :return: ``(filename_without_extension,) + metric_parseval(parsed_pairs, gold)``
    :rtype: tuple
    """
    import warnings

    filename = '.'.join(filename.split('.')[:-1])
    edus = read_edus(filename)
    gold = read_gold(filename)
    annot = read_annotation(filename)
    text = annot['text']

    _edus = []
    last_end = 0
    for max_id, edu in enumerate(edus):
        # Search forward from the end of the previous EDU so repeated fragments map
        # to successive occurrences; str.find's start offset avoids copying slices
        # of the text on every step.
        found = text.find(edu, last_end)
        if found < 0:
            # Previously this case was hidden by the `-1` arithmetic. The resulting
            # offset (one before the search cursor) is kept so scores stay comparable
            # with earlier runs, but the misalignment is now reported.
            warnings.warn(
                f"parse_golds: EDU #{max_id} not found in the text of {filename!r} "
                f"after offset {last_end}; the spans from here on are unreliable"
            )
            start = last_end - 1
        else:
            start = found
        end = start + len(edu)
        _edus.append(DiscourseUnit(
            id=max_id,
            left=None,
            right=None,
            relation='edu',
            start=start,
            end=end,
            orig_text=text,
            proba=1.,
        ))
        last_end = end

    parser = GreedyRSTParser(GoldTreePredictor(gold), forest_threshold=0.)
    # NOTE(review): the first evaluation cell of this notebook passes the
    # annotations as lemma, morph, postag (the reverse of the order used here), and
    # the other two call sites also pass genre=... — confirm against the parser's
    # call signature.
    parsed = parser(_edus, text, annot['tokens'], annot['sentences'],
                    annot['postag'], annot['morph'], annot['lemma'], annot['syntax_dep_tree'])

    parsed_pairs = pd.DataFrame(extr_pairs_forest(parsed), columns=['snippet_x', 'snippet_y', 'category_id'])

    return (filename,) + metric_parseval(parsed_pairs, gold)

In [None]:
%%time

import multiprocessing as mp

# Score the entries of `test` in 5 worker processes. The context manager shuts the
# workers down even when a worker raises and pool.map() re-raises in the parent;
# the bare pool.close() that was here was skipped in that case.
with mp.Pool(5) as pool:
    result = pool.map(parse_golds, test)

In [None]:
# One row per document: the tuples returned by parse_golds.
results = pd.DataFrame(columns=['filename', 'true_pos', 'all_parsed', 'all_gold'], data=result)

results['recall'] = results['true_pos'] / results['all_gold']
results['precision'] = results['true_pos'] / results['all_parsed']
# F1 is computed straight from the counts: 2*TP / (parsed + gold) is algebraically
# equal to 2*P*R / (P + R), but gives 0.0 instead of NaN (0/0) for a document with
# no true positives. NaN rows are skipped by mean() and placed last by
# sort_values(), which used to hide exactly the worst documents.
results['F1'] = 2 * results['true_pos'] / (results['all_parsed'] + results['all_gold'])

In [None]:
# Documents ordered from lowest to highest F1.
results.sort_values(by='F1', ascending=True)

In [None]:
# Gold count minus parsed count, per document.
results['difference'] = results['all_gold'].sub(results['all_parsed'])

In [None]:
# Documents with the largest gold-minus-parsed gap first.
results.sort_values(by='difference', ascending=False)

### Bad file analysis 

In [None]:
filename = 'data/sci.comp_44'

edus = read_edus(filename)
# Gold pairs, reduced to one row per distinct snippet_y.
gold = (read_gold(filename)
        .sort_values('snippet_y')
        .drop_duplicates(subset=['snippet_y']))
annot = read_annotation(filename)

# Align each EDU to a character span of the annotated text, scanning left to right
# from the end of the previous EDU, and wrap it in a leaf DiscourseUnit.
_edus = []
last_end = 0
for max_id, edu_text in enumerate(edus):
    start = annot['text'].find(edu_text, last_end)
    end = start + len(edu_text)
    _edus.append(DiscourseUnit(id=max_id,
                               left=None,
                               right=None,
                               relation='edu',
                               start=start,
                               end=end,
                               orig_text=annot['text'],
                               proba=1.))
    last_end = end

# Greedy parser driven by the gold relations of this document.
parser = GreedyRSTParser(GoldTreePredictor(gold), forest_threshold=0.)

parsed = parser(_edus,
                annot['text'], annot['tokens'], annot['sentences'],
                annot['postag'], annot['morph'], annot['lemma'],
                annot['syntax_dep_tree'],
                genre=filename.split('_')[0])

In [None]:
# Print the attributes of the first tree whose relation is not 'edu', if any;
# counter ends up 1 when such a tree exists and 0 otherwise.
first_merged = next((node for node in parsed if node.relation != 'edu'), None)
counter = int(first_merged is not None)
if counter:
    print(vars(first_merged))

In [None]:
# Pick the second element of `parsed` for inspection in the following cells.
tree = parsed[1]

In [None]:
# Show every attribute of the selected tree.
vars(tree)

In [None]:
# Show every attribute of the selected tree (same expression as the previous cell).
vars(tree)

In [None]:
# Show every attribute of the selected tree's right child.
vars(tree.right)

In [None]:
from utils.evaluation import metric_parseval, extr_pairs, extr_pairs_forest, _check_snippet_pair_in_dataset, _not_parsed_as_in_gold

# Table of pairs extracted from the parsed trees, one (snippet_x, snippet_y, category_id) per row.
parsed_pairs = pd.DataFrame(data=extr_pairs_forest(parsed),
                            columns=['snippet_x', 'snippet_y', 'category_id'])
# Compare the sizes of the parsed and gold tables.
print(*(frame.shape for frame in (parsed_pairs, gold)))
# NOTE(review): presumably the rows on which the parse and the gold annotation
# disagree — confirm in utils.evaluation.
errors = _not_parsed_as_in_gold(parsed_pairs, gold)

def find_edu_number(edus, error):
    """Yield the index of every EDU whose text occurs inside ``error[2]``.

    :param edus: iterable of EDU strings
    :param error: indexable record whose element ``[2]`` is the string to search in
    :return: generator of matching positions in ``edus``, in ascending order
    """
    yield from (idx for idx, unit in enumerate(edus) if error[2].find(unit) >= 0)

In [None]:
# Display the error rows as a raw array.
errors.values

In [None]:
# Indices of the EDUs whose text occurs inside element [2] of the fourth error row.
list(find_edu_number(edus, errors.iloc[3]))

In [None]:
# Parsed pairs whose left snippet contains this sentence. regex=False makes it a
# literal substring search: str.contains() interprets its pattern as a regular
# expression by default, so the sentence's final '.' matched any character.
parsed_pairs[parsed_pairs.snippet_x.str.contains("В ту пору экзамены можно было сдавать экстерном.", regex=False)]

In [None]:
# Gold pairs whose left snippet contains this sentence. regex=False makes it a
# literal substring search: str.contains() interprets its pattern as a regular
# expression by default, so the sentence's final '.' matched any character.
gold[gold.snippet_x.str.contains("В ту пору экзамены можно было сдавать экстерном.", regex=False)]