## Rhetorical relations classification used in tree building: ESIM

Prepare data and model-related scripts.

Evaluate models.

Make and evaluate ensembles of the ESIM and BiMPM models, and of the ESIM and feature-based models.

Output:
 - ``models/label_predictor_esim/*``

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import pickle
from utils.file_reading import read_edus, read_gold, read_negative, read_annotation

### Make a directory

In [None]:
MODEL_PATH = 'models/label_predictor_esim'
! mkdir $MODEL_PATH

TRAIN_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_train.tsv')
DEV_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_dev.tsv')
TEST_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_test.tsv')

### Prepare train/test sets 

In [None]:
IN_PATH = 'data_labeling'

# Load the pickled train/dev/test sample tables from IN_PATH.
train_samples, dev_samples, test_samples = (
    pd.read_pickle(os.path.join(IN_PATH, f'{split}_samples.pkl'))
    for split in ('train', 'dev', 'test')
)

In [None]:
# Absolute number of training samples per relation label
# (value_counts orders the labels from most to least frequent).
counts = train_samples['relation'].value_counts(normalize=False).values
NUMBER_CLASSES = len(counts)
print("number of classes:", NUMBER_CLASSES)
print("class weights:")
# Inverse-frequency weights scaled so that the rarest class gets weight 1.0.
# NOTE(review): presumably these are the values pasted into "class_weights" of the
# model config; confirm that their order matches the model's label vocabulary.
np.round(counts.min() / counts, decimals=6)

In [None]:
counts = train_samples['relation'].value_counts()

In [None]:
counts

In [None]:
# Turn the existing index into a regular 'index' column so it can be exported as sample id.
train_samples = train_samples.reset_index()
dev_samples = dev_samples.reset_index()
test_samples = test_samples.reset_index()

# Header-less TSV with the columns: label, first snippet, second snippet, sample id.
for _split_frame, _split_path in (
    (train_samples, TRAIN_FILE_PATH),
    (dev_samples, DEV_FILE_PATH),
    (test_samples, TEST_FILE_PATH),
):
    _split_frame[['relation', 'snippet_x', 'snippet_y', 'index']].to_csv(
        _split_path, sep='\t', header=False, index=False)

### Modify model

(Add F1, concatenated encoding)

In [None]:
%%writefile models/bimpm_custom_package/model/esim.py

from typing import Dict, List, Any, Optional

import numpy
import torch

from allennlp.common.checks import check_dimensions_match
from allennlp.data import Vocabulary
from allennlp.models.model import Model
from allennlp.models.esim import ESIM
from allennlp.modules import FeedForward, InputVariationalDropout
from allennlp.modules.matrix_attention.legacy_matrix_attention import LegacyMatrixAttention
from allennlp.modules.similarity_functions.similarity_function import SimilarityFunction
from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
from allennlp.nn.initializers import InitializerApplicator
from allennlp.nn.regularizers import RegularizerApplicator
from allennlp.nn.util import (
    get_text_field_mask,
    masked_softmax,
    weighted_sum,
    masked_max,
    replace_masked_values,
)
from allennlp.training.metrics import CategoricalAccuracy, F1Measure


@Model.register("custom_esim")
class CustomESIM(Model):
    """
    This `Model` implements the ESIM sequence model described in [Enhanced LSTM for Natural Language Inference]
    (https://api.semanticscholar.org/CorpusID:34032948) by Chen et al., 2017, extended with
    class-weighted loss, per-class F1 / macro F1 metrics and optional joint embedding of the pair.
    Registered as a `Model` with name "custom_esim".
    # Parameters
    vocab : `Vocabulary`
    text_field_embedder : `TextFieldEmbedder`
        Used to embed the `premise` and `hypothesis` `TextFields` we get as input to the
        model.
    encoder : `Seq2SeqEncoder`
        Used to encode the premise and hypothesis.
    similarity_function : `SimilarityFunction`
        Wrapped into a `LegacyMatrixAttention` to compute the similarity matrix between encoded
        words in the premise and words in the hypothesis.
    projection_feedforward : `FeedForward`
        The feedforward network used to project down the encoded and enhanced premise and hypothesis.
    inference_encoder : `Seq2SeqEncoder`
        Used to encode the projected premise and hypothesis for prediction.
    output_feedforward : `FeedForward`
        Used to prepare the concatenated premise and hypothesis for prediction.
    output_logit : `FeedForward`
        This feedforward network computes the output logits.
    dropout : `float`, optional (default=`0.5`)
        Dropout percentage to use. A falsy value disables dropout entirely.
    class_weights : `List[float]`, optional (default=`None`)
        One loss weight per output class. When omitted or empty, every class gets weight 1.0.
        One F1 metric (`f1_rel{i}`) is tracked per entry.
    initializer : `InitializerApplicator`, optional (default=`InitializerApplicator()`)
        Used to initialize the model parameters.
    regularizer : `RegularizerApplicator`, optional (default=`None`)
        Passed to the base `Model`.
    encode_together : `bool`, optional (default=`False`)
        If true, the token ids of premise and hypothesis are concatenated along the token axis
        and embedded in one pass; the embeddings are split back afterwards.
    """

    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        similarity_function: SimilarityFunction,
        projection_feedforward: FeedForward,
        inference_encoder: Seq2SeqEncoder,
        output_feedforward: FeedForward,
        output_logit: FeedForward,
        dropout: float = 0.5,
        class_weights: Optional[List[float]] = None,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
        encode_together: bool = False,
    ) -> None:
        super().__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._encoder = encoder
        self.encode_together = encode_together

        self._matrix_attention = LegacyMatrixAttention(similarity_function)
        self._projection_feedforward = projection_feedforward

        self._inference_encoder = inference_encoder

        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
            self.rnn_input_dropout = InputVariationalDropout(dropout)
        else:
            self.dropout = None
            self.rnn_input_dropout = None

        self._output_feedforward = output_feedforward
        self._output_logit = output_logit

        if class_weights:
            # Copy so that a list shared with the caller is never aliased.
            self.class_weights = list(class_weights)
        else:
            # Uniform weights: one per logit produced by the output layer.
            self.class_weights = [1.0] * output_logit.get_output_dim()

        self._num_labels = vocab.get_vocab_size(namespace="labels")

        check_dimensions_match(
            text_field_embedder.get_output_dim(),
            encoder.get_input_dim(),
            "text field embedding dim",
            "encoder input dim",
        )
        check_dimensions_match(
            encoder.get_output_dim() * 4,
            projection_feedforward.get_input_dim(),
            "encoder output dim",
            "projection feedforward input",
        )
        check_dimensions_match(
            projection_feedforward.get_output_dim(),
            inference_encoder.get_input_dim(),
            "proj feedforward output dim",
            "inference lstm input dim",
        )
        # The weighted loss needs exactly one weight per logit; fail early with a clear message.
        check_dimensions_match(
            len(self.class_weights),
            output_logit.get_output_dim(),
            "number of class weights",
            "output logit dim",
        )

        self.metrics = {"accuracy": CategoricalAccuracy()}

        for _class in range(len(self.class_weights)):
            self.metrics.update({
                f"f1_rel{_class}": F1Measure(_class),
            })

        self._loss = torch.nn.CrossEntropyLoss(weight=torch.FloatTensor(self.class_weights))

        initializer(self)

    def _embed(self, text_field: Dict[str, torch.LongTensor]) -> torch.Tensor:
        """Embed a text field and apply the regular dropout when it is enabled."""
        embedded = self._text_field_embedder(text_field)
        return self.dropout(embedded) if self.dropout else embedded

    def _embed_pair(self, premise: Dict[str, torch.LongTensor], hypothesis: Dict[str, torch.LongTensor]):
        """Embed premise and hypothesis in one pass and split the result back into two tensors."""
        joined_pair: Dict[str, torch.LongTensor] = {}
        premise_length = hypothesis_length = 0

        for key in premise.keys():
            premise_length, hypothesis_length = premise[key].size(1), hypothesis[key].size(1)
            # NOTE(review): torch.cat requires all non-token dimensions to agree; for character
            # indexers this presumably holds only when both fields are padded to the same
            # number of characters -- confirm against the dataset reader.
            joined_pair[key] = torch.cat([premise[key], hypothesis[key]], dim=1)

        embedded = self._embed(joined_pair)
        return (
            embedded[:, :premise_length],
            embedded[:, premise_length:premise_length + hypothesis_length],
        )

    def forward(  # type: ignore
        self,
        premise: Dict[str, torch.LongTensor],
        hypothesis: Dict[str, torch.LongTensor],
        label: torch.IntTensor = None,
        metadata: List[Dict[str, Any]] = None,
    ) -> Dict[str, torch.Tensor]:

        """
        # Parameters
        premise : Dict[str, torch.LongTensor]
            From a `TextField`
        hypothesis : Dict[str, torch.LongTensor]
            From a `TextField`
        label : `torch.IntTensor`, optional (default = `None`)
            From a `LabelField`
        metadata : `List[Dict[str, Any]]`, optional (default = `None`)
            Not used by this method.
        # Returns
        An output dictionary consisting of:
        label_logits : `torch.FloatTensor`
            A tensor of shape `(batch_size, num_labels)` representing unnormalised log
            probabilities of the labels.
        label_probs : `torch.FloatTensor`
            A tensor of shape `(batch_size, num_labels)` representing probabilities of the
            labels.
        loss : `torch.FloatTensor`, optional
            A scalar loss to be optimised.
        """

        premise_mask = get_text_field_mask(premise)
        hypothesis_mask = get_text_field_mask(hypothesis)

        if self.encode_together:
            embedded_premise, embedded_hypothesis = self._embed_pair(premise, hypothesis)
        else:
            embedded_premise = self._embed(premise)
            embedded_hypothesis = self._embed(hypothesis)

        # apply dropout for LSTM
        if self.rnn_input_dropout:
            embedded_premise = self.rnn_input_dropout(embedded_premise)
            embedded_hypothesis = self.rnn_input_dropout(embedded_hypothesis)

        # encode premise and hypothesis
        encoded_premise = self._encoder(embedded_premise, premise_mask)
        encoded_hypothesis = self._encoder(embedded_hypothesis, hypothesis_mask)

        # Shape: (batch_size, premise_length, hypothesis_length)
        similarity_matrix = self._matrix_attention(encoded_premise, encoded_hypothesis)

        # Shape: (batch_size, premise_length, hypothesis_length)
        p2h_attention = masked_softmax(similarity_matrix, hypothesis_mask)
        # Shape: (batch_size, premise_length, embedding_dim)
        attended_hypothesis = weighted_sum(encoded_hypothesis, p2h_attention)

        # Shape: (batch_size, hypothesis_length, premise_length)
        h2p_attention = masked_softmax(similarity_matrix.transpose(1, 2).contiguous(), premise_mask)
        # Shape: (batch_size, hypothesis_length, embedding_dim)
        attended_premise = weighted_sum(encoded_premise, h2p_attention)

        # the "enhancement" layer
        premise_enhanced = torch.cat(
            [encoded_premise, attended_hypothesis,
             encoded_premise - attended_hypothesis,
             encoded_premise * attended_hypothesis,
            ],
            dim=-1,
        )
        hypothesis_enhanced = torch.cat(
            [encoded_hypothesis, attended_premise,
             encoded_hypothesis - attended_premise,
             encoded_hypothesis * attended_premise,
            ],
            dim=-1,
        )

        # The projection layer down to the model dimension.  Dropout is not applied before
        # projection.
        projected_enhanced_premise = self._projection_feedforward(premise_enhanced)
        projected_enhanced_hypothesis = self._projection_feedforward(hypothesis_enhanced)

        # Run the inference layer
        if self.rnn_input_dropout:
            projected_enhanced_premise = self.rnn_input_dropout(projected_enhanced_premise)
            projected_enhanced_hypothesis = self.rnn_input_dropout(projected_enhanced_hypothesis)
        v_ai = self._inference_encoder(projected_enhanced_premise, premise_mask)
        v_bi = self._inference_encoder(projected_enhanced_hypothesis, hypothesis_mask)

        # The pooling layer -- max and avg pooling.
        # (batch_size, model_dim)
        v_a_max, _ = replace_masked_values(v_ai, premise_mask.unsqueeze(-1), -1e7).max(dim=1)
        v_b_max, _ = replace_masked_values(v_bi, hypothesis_mask.unsqueeze(-1), -1e7).max(dim=1)

        v_a_avg = torch.sum(v_ai * premise_mask.unsqueeze(-1), dim=1) / torch.sum(
            premise_mask, 1, keepdim=True
        )
        v_b_avg = torch.sum(v_bi * hypothesis_mask.unsqueeze(-1), dim=1) / torch.sum(
            hypothesis_mask, 1, keepdim=True
        )

        # Now concat
        # (batch_size, model_dim * 2 * 4)
        v_all = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1)

        # the final MLP -- apply dropout to input, and MLP applies to output & hidden
        if self.dropout:
            v_all = self.dropout(v_all)

        output_hidden = self._output_feedforward(v_all)
        label_logits = self._output_logit(output_hidden)
        label_probs = torch.nn.functional.softmax(label_logits, dim=-1)

        output_dict = {"label_logits": label_logits, "label_probs": label_probs}

        if label is not None:
            gold = label.long().view(-1)
            output_dict["loss"] = self._loss(label_logits, gold)

            # Accuracy and every per-class F1 are updated with the same batch.
            for metric in self.metrics.values():
                metric(label_logits, gold)

        return output_dict

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return accuracy, one `f1_rel{i}` value per class and their mean as `f1_macro`."""
        metrics = {"accuracy": self.metrics["accuracy"].get_metric(reset=reset)}

        for _class in range(len(self.class_weights)):
            # Index 2 of the value returned by the F1 metric is used as the F1 score.
            metrics.update({
                f"f1_rel{_class}": self.metrics[f"f1_rel{_class}"].get_metric(reset=reset)[2],
            })

        metrics["f1_macro"] = numpy.mean([metrics[f"f1_rel{_class}"] for _class in range(len(self.class_weights))])
        return metrics

    default_predictor = "textual_entailment"

In [None]:
! cp models/bimpm_custom_package/model/esim.py ../../../maintenance_rst/models/customization_package/model/esim.py

### 2. Generate config files

#### ELMo 

In [None]:
%%writefile $MODEL_PATH/config_elmo.json

// Configuration for the custom ESIM relation-label classifier ("custom_esim"), based on:
//   Chen, Qian, et al. "Enhanced LSTM for Natural Language Inference."
//   Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics. 2017.

{
  "dataset_reader": {
    "type": "quora_paraphrase",
    "lazy": false,
    "tokenizer": {
      "type": "word",
      "word_splitter": {
        "type": "just_spaces"
      }
    },
    "token_indexers": {
      "token_characters": {
        "type": "characters",
        "min_padding_length": 30,
      },
      "elmo": {
        "type": "elmo_characters"
     }
    }
  },
  "train_data_path": "label_predictor_esim/nlabel_cf_train.tsv",
  "validation_data_path": "label_predictor_esim/nlabel_cf_dev.tsv",
  "test_data_path": "label_predictor_esim/nlabel_cf_test.tsv",
  "model": {
    // Must be the name the customised ESIM model is registered under.
    "type": "custom_esim",
    "dropout": 0.5,
    // One weight per class; the number of entries must equal "output_logit.hidden_dims".
    "class_weights": [
        0.022915, 0.027624, 0.069509, 0.104457, 0.109012, 0.111773,
        0.123355, 0.139147, 0.153374, 0.157233, 0.159915, 0.16129 ,
        0.191327, 0.202703, 0.288462, 0.337838, 0.347222, 0.535714,
        0.630252, 0.757576, 0.806452, 1.0      ],
    "encode_together": true,
    "text_field_embedder": {
        "token_embedders": {
            "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": "rsv_elmo/options.json",
                    "weight_file": "rsv_elmo/model.hdf5",
                    "do_layer_norm": false,
                    "dropout": 0.1
            },
            "token_characters": {
                "type": "character_encoding",
                "dropout": 0.1,
                "embedding": {
                    "embedding_dim": 20,
                    "padding_index": 0,
                },
                "encoder": {
                    "type": "gru",
                    "input_size": 20,
                    "hidden_size": 50,
                    "num_layers": 1,
                    "bidirectional": true,
                },
            },
      }
    },
    "encoder": {
      "type": "lstm",
      // ELMo output (1024) + bidirectional character GRU output (2 * 50).
      "input_size": 1024+100,
      "hidden_size": 300,
      "num_layers": 1,
      "bidirectional": true
    },
    "similarity_function": {"type": "dot_product"},
    "projection_feedforward": {
      "input_dim": 2400,
      "hidden_dims": 300,
      "num_layers": 1,
      "activations": "relu"
    },
    "inference_encoder": {
      "type": "lstm",
      "input_size": 300,
      "hidden_size": 300,
      "num_layers": 1,
      "bidirectional": true
    },
    "output_feedforward": {
      "input_dim": 2400,
      "num_layers": 1,
      "hidden_dims": 300,
      "activations": "relu",
      "dropout": 0.5
    },
    "output_logit": {
      "input_dim": 300,
      "num_layers": 1,
      "hidden_dims": 22,
      "activations": "linear"
    },
     "initializer": [
      [".*linear_layers.*weight", {"type": "xavier_uniform"}],
      [".*linear_layers.*bias", {"type": "zero"}],
      [".*weight_ih.*", {"type": "xavier_uniform"}],
      [".*weight_hh.*", {"type": "orthogonal"}],
      [".*bias_ih.*", {"type": "zero"}],
      [".*bias_hh.*", {"type": "lstm_hidden_bias"}]
     ]
   },
  "iterator": {
    "type": "bucket",
    "padding_noise": 0,
    "sorting_keys": [["premise", "num_tokens"], ["hypothesis", "num_tokens"]],
    "batch_size": 4
  },
  "trainer": {
    "num_epochs": 200,
    "cuda_device": 0,
    "shuffle": true,
    "optimizer": {
      "type": "adam",
      "lr": 0.001
    },
    "type":"callback",
    "callbacks":[
        {
            "type": "validate"
        },
        {
            "type": "checkpoint",
            "checkpointer":{
                "num_serialized_models_to_keep":1
            }
        },
        {
            "type": "gradient_norm_and_clip", 
            "grad_norm": 5.0
        },
        {
            "type": "track_metrics",
            "patience": 20,
            "validation_metric": "+f1_macro"
        },
        {
            "type": "log_metrics_to_wandb"
        }
    ],
  }
}

In [None]:
! cp -r $MODEL_PATH ../../../maintenance_rst/models/label_predictor_esim

In [None]:
! cp -r $MODEL_PATH/config_elmo.json ../../../maintenance_rst/models/label_predictor_esim/

### 3. Scripts for training/prediction 

#### Option 1. Directly from the config

Train a model

In [None]:
%%writefile models/train_label_predictor_esim.sh
# Train the ESIM label predictor, then predict on the dev and test sets.
# usage:
# $ cd models 
# $ sh train_label_predictor_esim.sh {bert|elmo} result_30

export METHOD=${1}
export RESULT_DIR=${2}
export DEV_FILE_PATH="nlabel_cf_dev.tsv"
export TEST_FILE_PATH="nlabel_cf_test.tsv"

# Abort when an argument is missing: with an empty RESULT_DIR the rm below
# would wipe the whole label_predictor_esim directory.
: "${METHOD:?usage: sh train_label_predictor_esim.sh METHOD RESULT_DIR}"
: "${RESULT_DIR:?usage: sh train_label_predictor_esim.sh METHOD RESULT_DIR}"

# Clear earlier results of THIS model; -f keeps the first run from failing
# when the directory does not exist yet.
rm -rf "label_predictor_esim/${RESULT_DIR}/"
allennlp train -s label_predictor_esim/${RESULT_DIR}/ label_predictor_esim/config_${METHOD}.json \
    --include-package customization_package
allennlp predict --use-dataset-reader --silent \
    --output-file label_predictor_esim/${RESULT_DIR}/predictions_dev.json label_predictor_esim/${RESULT_DIR}/model.tar.gz label_predictor_esim/${DEV_FILE_PATH} \
    --include-package customization_package \
    --predictor textual-entailment
allennlp predict --use-dataset-reader --silent \
    --output-file label_predictor_esim/${RESULT_DIR}/predictions_test.json label_predictor_esim/${RESULT_DIR}/model.tar.gz label_predictor_esim/${TEST_FILE_PATH} \
    --include-package customization_package \
    --predictor textual-entailment

In [None]:
! cp models/train_label_predictor_esim.sh ../../../maintenance_rst/models/

Predict on dev&test

In [None]:
%%writefile models/eval_label_predictor_esim.sh
# Predict on the dev and test sets with an already trained ESIM label predictor.
# usage:
# $ cd models 
# $ sh eval_label_predictor_esim.sh {bert|elmo} result_30

# METHOD is exported but not referenced by the commands below.
export METHOD=${1}
export RESULT_DIR=${2}
export DEV_FILE_PATH="nlabel_cf_dev.tsv"
export TEST_FILE_PATH="nlabel_cf_test.tsv"

# Predictions are written next to the model archive as predictions_{dev,test}.json.
allennlp predict --use-dataset-reader --silent \
    --output-file label_predictor_esim/${RESULT_DIR}/predictions_dev.json label_predictor_esim/${RESULT_DIR}/model.tar.gz label_predictor_esim/${DEV_FILE_PATH} \
    --include-package customization_package \
    --predictor textual-entailment
allennlp predict --use-dataset-reader --silent \
    --output-file label_predictor_esim/${RESULT_DIR}/predictions_test.json label_predictor_esim/${RESULT_DIR}/model.tar.gz label_predictor_esim/${TEST_FILE_PATH} \
    --include-package customization_package \
    --predictor textual-entailment

In [None]:
! cp models/eval_label_predictor_esim.sh ../../../maintenance_rst/models/

(optional) predict on train

In [None]:
%%writefile models/eval_label_predictor_train.sh
# Predict on the training set with an already trained ESIM label predictor.
# usage:
# $ cd models 
# $ sh eval_label_predictor_train.sh {bert|elmo} result_30

# METHOD is exported but not referenced by the command below.
export METHOD=${1}
export RESULT_DIR=${2}
export TEST_FILE_PATH="nlabel_cf_train.tsv"

# Read the model from, and write the predictions to, the ESIM result directory:
# that is where the evaluation cells of this notebook look for predictions_train.json.
allennlp predict --use-dataset-reader --silent \
    --output-file label_predictor_esim/${RESULT_DIR}/predictions_train.json label_predictor_esim/${RESULT_DIR}/model.tar.gz label_predictor_esim/${TEST_FILE_PATH} \
    --include-package customization_package \
    --predictor textual-entailment

#### Option 2. Using wandb for parameters adjustment

In [None]:
%%writefile ../../../maintenance_rst/models/wandb_label_predictor_esim.yaml

name: label_predictor_esim
program: wandb_allennlp # a wrapper console script around the allennlp commands; it is part of wandb-allennlp
method: bayes
## Do not forget to use the command keyword to specify the following command structure
command:
  - ${program} # omit the interpreter, as the allennlp train command is used directly
  - "--subcommand=train"
  - "--include-package=customization_package" # add all packages containing your registered classes here
  - "--config_file=label_predictor_esim/config_elmo.json"
  - ${args}
metric:
    name: best_f1_macro
    goal: maximize
# Each parameter overrides the value at the same path of the config file.
parameters:
    model.encode_together:
        values: ["true", ]    
    iterator.batch_size:
        values: [8,]
    trainer.optimizer.lr:
        values: [0.001,]
    model.dropout:
        values: [0.5]


3. Run training

``wandb sweep wandb_label_predictor_esim.yaml``

(returns %sweepname1)

``wandb sweep wandb_label_predictor2.yaml``

(returns %sweepname2)

``wandb agent --count 1 %sweepname1 && wandb agent --count 1 %sweepname2``

Move the best model into label_predictor_esim

In [None]:
! ls -laht models/wandb

In [None]:
! cp -r models/wandb/run-20201218_123424-kcphaqhi/training_dumps models/label_predictor_esim/esim_elmo

**Or** load from wandb by %sweepname

In [None]:
import wandb
# Look the run up through the wandb public API by its run path.
api = wandb.Api()
run = api.run("tchewik/tmp/7hum4oom")
# Download every file stored with the run, overwriting files that already exist locally.
for file in run.files():
    file.download(replace=True)

In [None]:
! cp -r training_dumps models/label_predictor_bimpm/toasty-sweep-1

And run evaluation from shell

``sh eval_label_predictor_esim.sh {elmo|elmo_fasttext} toasty-sweep-1``

### 4. Evaluate classifier

In [None]:
def load_predictions(path):
    """Read predicted labels from a JSON-lines prediction file.

    Each non-empty line of ``path`` is parsed as a JSON object. If it has a
    non-empty ``"label"`` value, that value is used. Otherwise, if it has a
    non-empty ``"label_probs"`` list, the label at the arg-max position is looked up
    in ``vocabulary/labels.txt`` located in the same directory as ``path``.
    Records with neither key are skipped.

    Args:
        path: Path of the prediction file.

    Returns:
        List of predicted labels, in file order.
    """
    import json  # local import: the notebook imports json only in a later cell

    result = []
    vocab = []

    with open(path, 'r') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline

            line = json.loads(line)
            if line.get("label"):
                result.append(line.get("label"))
            elif line.get("label_probs"):
                if not vocab:
                    # The label vocabulary is read lazily, only when probabilities must be decoded.
                    vocab_path = os.path.join(os.path.dirname(path), 'vocabulary', 'labels.txt')
                    with open(vocab_path, 'r') as vocab_file:
                        vocab = [label.strip() for label in vocab_file]

                result.append(vocab[np.argmax(line.get("label_probs"))])

    print('length of result:', len(result))
    return result

In [None]:
RESULT_DIR = 'esim_elmo'

In [None]:
! mkdir models/label_predictor_esim/$RESULT_DIR

In [None]:
! cp -r ../../../maintenance_rst/models/label_predictor_esim/$RESULT_DIR/*.json models/label_predictor_esim/$RESULT_DIR/

On dev set

In [None]:
import json
import pandas as pd

# Gold labels are the first column of the header-less dev TSV.
_dev_table = pd.read_csv(DEV_FILE_PATH, sep='\t', header=None)
true = _dev_table[0].values.tolist()
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json')

In [None]:
from sklearn.metrics import classification_report

print(classification_report(true[:len(pred)], pred, digits=4))

In [None]:
test_metrics = classification_report(true[:len(pred)], pred, digits=4, output_dict=True)
# Collect the F1 score (in percent) of every dict-valued report entry;
# entries that are not dicts (plain numbers) are skipped.
test_f1 = np.array(
    [entry.get('f1-score') for entry in test_metrics.values() if isinstance(entry, dict)]) * 100

test_f1

In [None]:
len(true)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Macro-averaged scores in percent; gold labels are truncated to the number of predictions.
print('f1: %.2f'%(f1_score(true[:len(pred)], pred, average='macro')*100))
print('pr: %.2f'%(precision_score(true[:len(pred)], pred, average='macro')*100))
print('re: %.2f'%(recall_score(true[:len(pred)], pred, average='macro')*100))

In [None]:
from utils.plot_confusion_matrix import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

# Sorted gold labels fix the row/column order of the matrix and the tick names.
labels = sorted(set(true))
# `labels` is passed by keyword: scikit-learn no longer accepts it positionally.
plot_confusion_matrix(confusion_matrix(true[:len(pred)], pred, labels=labels), target_names=labels, normalize=True)

In [None]:
# Relation classes that keep their own label; every other class is collapsed below.
top_classes = [
    'attribution_NS',
    'attribution_SN',
    'purpose_NS',
    'purpose_SN',
    'condition_SN',
    'contrast_NN',
    'condition_NS',
    'joint_NN',
    'concession_NS',
    'same-unit_NN',
    'elaboration_NS',
    'cause-effect_NS',
]

# Map each remaining label to 'other' plus its last three characters
# (e.g. a label ending in '_NS' becomes 'other_NS').
class_mapper = {
    label: 'other' + label[-3:]
    for label in labels
    if label not in top_classes
}

In [None]:
import numpy as np

# Replace every label outside top_classes with its 'other_*' substitute.
true = [class_mapper.get(value) or value for value in true]
pred = [class_mapper.get(value) or value for value in pred]

# Predictions that fell into an 'other_*' class are counted as 'joint_NN'.
pred_mapper = dict.fromkeys(('other_NN', 'other_NS', 'other_SN'), 'joint_NN')
pred = [pred_mapper.get(value) or value for value in pred]

# Keep only the samples whose gold label is one of the top classes.
_true_array = np.array(true)
_to_stay = (_true_array != 'other_NN') & (_true_array != 'other_SN') & (_true_array != 'other_NS')

_true = _true_array[_to_stay]
_pred = np.array(pred)[_to_stay[:len(pred)]]
labels = list(set(_true))

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score only the samples whose gold label is a top class (_true/_pred),
# the same way the corresponding test-set cell does.
print('f1: %.2f'%(f1_score(_true[:len(_pred)], _pred, average='macro')*100))
print('pr: %.2f'%(precision_score(_true[:len(_pred)], _pred, average='macro')*100))
print('re: %.2f'%(recall_score(_true[:len(_pred)], _pred, average='macro')*100))

In [None]:
labels.sort()

In [None]:
plot_confusion_matrix(confusion_matrix(_true[:len(_pred)], _pred), target_names=labels, normalize=True)

In [None]:
import numpy as np

for rel in np.unique(_true):
    print(rel)

On train set (optional)

In [None]:
import pandas as pd
import json

# Gold labels come from the training TSV of THIS model (first column of the header-less file).
true = pd.read_csv(TRAIN_FILE_PATH, sep='\t', header=None)[0].values.tolist()
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_train.json')

print(classification_report(true[:len(pred)], pred, digits=4))

In [None]:
# The TSV has no header row, so the column names are supplied explicitly
# (same order as the columns exported to the split files).
true_train = pd.read_csv(TRAIN_FILE_PATH, sep='\t', header=None,
                         names=['relation', 'snippet_x', 'snippet_y', 'index'])
# Align with the predictions in case fewer predictions than samples are available.
true_train = true_train.iloc[:len(pred)].copy()
true_train['predicted_relation'] = pred

mispredicted = true_train[true_train.relation != true_train.predicted_relation]
print(mispredicted.shape)

mispredicted.to_csv('mispredicted_relations.csv', sep='\t')

On test set

In [None]:
import json
import pandas as pd

# Gold labels are the first column of the header-less test TSV.
_test_table = pd.read_csv(TEST_FILE_PATH, sep='\t', header=None)
true = _test_table[0].values.tolist()
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_test.json')

_test_report = classification_report(true[:len(pred)], pred, digits=4)
print(_test_report)

In [None]:
test_metrics = classification_report(true[:len(pred)], pred, digits=4, output_dict=True)
# Collect the F1 score (in percent) of every dict-valued report entry;
# entries that are not dicts (plain numbers) are skipped.
test_f1 = np.array(
    [entry.get('f1-score') for entry in test_metrics.values() if isinstance(entry, dict)]) * 100

test_f1

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Macro-averaged scores in percent; gold labels are truncated to the number of predictions.
print('f1: %.2f'%(f1_score(true[:len(pred)], pred, average='macro')*100))
print('pr: %.2f'%(precision_score(true[:len(pred)], pred, average='macro')*100))
print('re: %.2f'%(recall_score(true[:len(pred)], pred, average='macro')*100))

In [None]:
len(true)

In [None]:
# Replace labels outside the top classes with 'other_*', then count predicted
# 'other_*' labels as the class given by pred_mapper.
true = [class_mapper.get(value) or value for value in true]
pred = [class_mapper.get(value) or value for value in pred]
pred = [pred_mapper.get(value) or value for value in pred]

# Keep only the samples whose gold label is one of the top classes.
_true_array = np.array(true)
_to_stay = (_true_array != 'other_NN') & (_true_array != 'other_SN') & (_true_array != 'other_NS')

_true = _true_array[_to_stay]
# Truncate the mask to the number of predictions, as the dev-set cell does,
# so that fewer predictions than gold labels do not raise an IndexError.
_pred = np.array(pred)[_to_stay[:len(pred)]]

In [None]:
print(classification_report(_true[:len(_pred)], _pred, digits=4))

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Macro-averaged scores in percent, restricted to samples whose gold label is a top class.
print('f1: %.2f'%(f1_score(_true[:len(_pred)], _pred, average='macro')*100))
print('pr: %.2f'%(precision_score(_true[:len(_pred)], _pred, average='macro')*100))
print('re: %.2f'%(recall_score(_true[:len(_pred)], _pred, average='macro')*100))

### Ensemble: (Logreg+Catboost) + ESIM

In [None]:
! ls models/label_predictor_esim

In [None]:
import json

# Label vocabulary of the neural model: one label per line, line number = class index.
with open(MODEL_PATH + '/' + RESULT_DIR + '/vocabulary/labels.txt', 'r') as _labels_file:
    model_vocab = [label.strip() for label in _labels_file]

# Label order used to name the probability columns of the feature-based model.
# NOTE(review): presumably this mirrors the class order of the fitted model -- confirm.
catboost_vocab = [
   'attribution_NS', 'attribution_SN', 'background_NS',
   'cause-effect_NS', 'cause-effect_SN', 'comparison_NN',
   'concession_NS', 'condition_NS', 'condition_SN', 'contrast_NN',
   'elaboration_NS', 'evidence_NS', 'interpretation-evaluation_NS',
   'interpretation-evaluation_SN', 'joint_NN', 'preparation_SN',
   'purpose_NS', 'purpose_SN', 'restatement_NN', 'same-unit_NN',
   'sequence_NN', 'solutionhood_SN']

def load_neural_predictions(path, vocab=None):
    """Read per-class probabilities from a JSON-lines prediction file.

    Every non-empty line must be a JSON object with a non-empty ``'probs'`` or
    ``'label_probs'`` list (``'probs'`` takes precedence).

    Args:
        path: Path of the prediction file.
        vocab: Labels naming the probability positions; defaults to the global
            ``model_vocab``.

    Returns:
        One ``{label: probability}`` dict per record, in file order.

    Raises:
        ValueError: If a record has no probabilities or fewer of them than labels.
    """
    import json  # local import keeps the function usable independently of cell order

    labels = model_vocab if vocab is None else vocab
    result = []

    with open(path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline

            record = json.loads(line)
            probs = record.get('probs') or record.get('label_probs')
            if not probs:
                # Fail loudly instead of silently reusing the previous record's probabilities.
                raise ValueError(f'{path}:{line_number}: no "probs" or "label_probs" in record')
            if len(probs) < len(labels):
                raise ValueError(
                    f'{path}:{line_number}: {len(probs)} probabilities for {len(labels)} labels')

            result.append(dict(zip(labels, probs)))

    return result

def load_scikit_predictions(model, X, vocab=None):
    """Return the class probabilities predicted by ``model`` as label-keyed dicts.

    Args:
        model: Object with a ``predict_proba(X)`` method returning one probability
            row per sample.
        X: Feature matrix passed to ``predict_proba``.
        vocab: Labels naming the probability columns, in column order; defaults to
            the global ``catboost_vocab``.

    Returns:
        One ``{label: probability}`` dict per sample.

    Raises:
        ValueError: If a probability row has fewer entries than there are labels.
    """
    # NOTE(review): the label order must match the model's own class order -- confirm
    # against the fitted model when the defaults are used.
    labels = catboost_vocab if vocab is None else vocab
    result = []

    for prediction in model.predict_proba(X):
        if len(prediction) < len(labels):
            raise ValueError(f'{len(prediction)} probabilities for {len(labels)} labels')
        result.append(dict(zip(labels, prediction)))

    return result

def vote_predictions(predictions, soft=True, weights=None):
    """Combine the per-class probabilities of several models.

    Args:
        predictions: List with one entry per model; each entry is a list of
            ``{label: probability}`` dicts, one per sample. All models must cover
            the same number of samples, and every model must provide the labels
            of the first model.
        soft: If true, the weighted probabilities are summed per label; otherwise the
            maximum weighted probability per label is taken.
        weights: One weight per model. ``None`` or ``[1., 1.]`` means weight 1.0
            for every model, whatever their number.

    Returns:
        One ``{label: combined score}`` dict per sample (scores are not normalised).

    Raises:
        ValueError: If the models cover different numbers of samples or there are
            fewer weights than models.
    """
    if not predictions:
        return []

    number_of_samples = len(predictions[0])
    for model_predictions in predictions[1:]:
        if len(model_predictions) != number_of_samples:
            raise ValueError('all models must provide predictions for the same number of samples')

    # [1., 1.] is kept as a synonym of "uniform" for backward compatibility.
    if weights is None or weights == [1., 1.]:
        weights = [1.] * len(predictions)
    elif len(weights) < len(predictions):
        raise ValueError(f'{len(weights)} weights given for {len(predictions)} models')

    combine = sum if soft else max

    result = []
    for i in range(number_of_samples):
        result.append({
            key: combine(model_predictions[i][key] * weight
                         for model_predictions, weight in zip(predictions, weights))
            for key in predictions[0][i]
        })

    return result

def probs_to_classes(pred):
    """Pick the highest-scoring label of every sample.

    Args:
        pred: Iterable of ``{label: score}`` dicts.

    Returns:
        List with the best label per sample. On ties the label that comes first in
        the dict wins; an empty dict yields ``''``. A label is returned even when
        no score is positive.
    """
    return [max(sample, key=sample.get) if sample else '' for sample in pred]

In [None]:
import pickle

# NOTE: unpickling executes arbitrary code -- load these files only from a trusted source.
# Context managers close each file right after it has been read.
with open('models/relation_predictor_baseline/model.pkl', 'rb') as _f:
    fs_catboost_plus_logreg = pickle.load(_f)
with open('models/relation_predictor_baseline/label_encoder.pkl', 'rb') as _f:
    lab_encoder = pickle.load(_f)
with open('models/relation_predictor_baseline/scaler.pkl', 'rb') as _f:
    scaler = pickle.load(_f)
with open('models/relation_predictor_baseline/drop_columns.pkl', 'rb') as _f:
    drop_columns = pickle.load(_f)

On dev set

In [None]:
from sklearn import metrics


TARGET = 'relation'

# Features for the feature-based model: everything except the target and the dropped columns.
y_dev, X_dev = dev_samples[TARGET].to_frame(), dev_samples.drop(TARGET, axis=1).drop(
    columns=drop_columns + ['category_id', 'index'])

X_scaled_np = scaler.transform(X_dev)
X_dev = pd.DataFrame(X_scaled_np, index=X_dev.index)

catboost_predictions = load_scikit_predictions(fs_catboost_plus_logreg, X_dev)
neural_predictions = load_neural_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json')

# vote_predictions takes ONE list holding the predictions of all models.
tmp = vote_predictions([neural_predictions, catboost_predictions], soft=True, weights=[1., 1.])
ensemble_pred = probs_to_classes(tmp)

print('weighted f1: ', metrics.f1_score(y_dev.values, ensemble_pred, average='weighted'))
print('macro f1: ', metrics.f1_score(y_dev.values, ensemble_pred, average='macro'))
print('accuracy: ', metrics.accuracy_score(y_dev.values, ensemble_pred))
print()
print(metrics.classification_report(y_dev, ensemble_pred, digits=4))

On test set

In [None]:
TARGET = 'relation'

# Features for the feature-based model: everything except the target and the dropped columns.
y_test, X_test = test_samples[TARGET].to_frame(), test_samples.drop(TARGET, axis=1).drop(
    columns=drop_columns + ['category_id', 'index'])

X_scaled_np = scaler.transform(X_test)
X_test = pd.DataFrame(X_scaled_np, index=X_test.index)

catboost_predictions = load_scikit_predictions(fs_catboost_plus_logreg, X_test)
neural_predictions = load_neural_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_test.json')

# vote_predictions takes ONE list holding the predictions of all models;
# the weights follow the same order (neural model 1.0, feature-based model 2.0).
tmp = vote_predictions([neural_predictions, catboost_predictions], soft=True, weights=[1., 2.])

ensemble_pred = probs_to_classes(tmp)

print('weighted f1: ', metrics.f1_score(y_test.values, ensemble_pred, average='weighted'))
print('macro f1: ', metrics.f1_score(y_test.values, ensemble_pred, average='macro'))
print('accuracy: ', metrics.accuracy_score(y_test.values, ensemble_pred))
print()
print(metrics.classification_report(y_test, ensemble_pred, digits=4))

In [None]:
test_metrics = metrics.classification_report(y_test, ensemble_pred, digits=4, output_dict=True)
# Collect the F1 score (in percent) of every dict-valued report entry;
# entries that are not dicts (plain numbers) are skipped.
test_f1 = np.array(
    [entry.get('f1-score') for entry in test_metrics.values() if isinstance(entry, dict)]) * 100

test_f1

### Ensemble: BiMPM + ESIM

On dev set

In [None]:
!ls models/label_predictor_bimpm/

In [None]:
from sklearn import metrics


TARGET = 'relation'

# Features for the feature-based model: everything except the target and the dropped columns.
y_dev, X_dev = dev_samples[TARGET].to_frame(), dev_samples.drop(TARGET, axis=1).drop(
    columns=drop_columns + ['category_id', 'index'])

X_scaled_np = scaler.transform(X_dev)
X_dev = pd.DataFrame(X_scaled_np, index=X_dev.index)

# NOTE(review): the BiMPM probabilities are named with the ESIM label vocabulary;
# presumably both models share the same label order -- confirm.
bimpm = load_neural_predictions(f'models/label_predictor_bimpm/winter-sweep-1/predictions_dev.json')
esim = load_neural_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json')
catboost_predictions = load_scikit_predictions(fs_catboost_plus_logreg, X_dev)

# Two-stage vote: per-label maximum over the two neural models first, then a
# summed vote with the feature-based model. Each call takes ONE list of models.
tmp = vote_predictions([bimpm, esim], soft=False, weights=[1., 1.])
tmp = vote_predictions([tmp, catboost_predictions], soft=True, weights=[1., 1.])
ensemble_pred = probs_to_classes(tmp)

print('weighted f1: ', metrics.f1_score(y_dev.values, ensemble_pred, average='weighted'))
print('macro f1: ', metrics.f1_score(y_dev.values, ensemble_pred, average='macro'))
print('accuracy: ', metrics.accuracy_score(y_dev.values, ensemble_pred))
print()
print(metrics.classification_report(y_dev, ensemble_pred, digits=4))

On test set

In [None]:
TARGET = 'relation'

# Features for the feature-based model: everything except the target and the dropped columns.
y_test, X_test = test_samples[TARGET].to_frame(), test_samples.drop(TARGET, axis=1).drop(
    columns=drop_columns + ['category_id', 'index'])

X_scaled_np = scaler.transform(X_test)
X_test = pd.DataFrame(X_scaled_np, index=X_test.index)

# NOTE(review): the BiMPM probabilities are named with the ESIM label vocabulary;
# presumably both models share the same label order -- confirm.
bimpm = load_neural_predictions(f'models/label_predictor_bimpm/winter-sweep-1/predictions_test.json')
esim = load_neural_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_test.json')
catboost_predictions = load_scikit_predictions(fs_catboost_plus_logreg, X_test)

# Single summed vote over all three models; the weights follow the order of the list.
tmp = vote_predictions([bimpm, catboost_predictions, esim], soft=True, weights=[2., 1, 15.])

ensemble_pred = probs_to_classes(tmp)

print('weighted f1: ', metrics.f1_score(y_test.values, ensemble_pred, average='weighted'))
print('macro f1: ', metrics.f1_score(y_test.values, ensemble_pred, average='macro'))
print('accuracy: ', metrics.accuracy_score(y_test.values, ensemble_pred))
print()
print(metrics.classification_report(y_test, ensemble_pred, digits=4))