## Rhetorical relations classification used in tree building: Step 3. BiMPM

1. Prepare data and model-related scripts.
2. Evaluate models.
3. Adjust and evaluate an ensemble of BiMPM and the feature-rich model.

Output:
 - ``models/label_predictor_bimpm/*``

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
import pandas as pd
import numpy as np

### Make a directory

In [None]:
# Directory that holds the BiMPM datasets and configs of this notebook.
MODEL_PATH = 'models/label_predictor_bimpm'
# Portable replacement for `! mkdir $MODEL_PATH`: it also creates the parent
# `models/` directory and does not fail when the notebook is re-run.
os.makedirs(MODEL_PATH, exist_ok=True)

# Tab-separated train/dev/test files located inside MODEL_PATH.
TRAIN_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_train.tsv')
DEV_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_dev.tsv')
TEST_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_test.tsv')

### Prepare train/test sets 

In [None]:
IN_PATH = 'data_labeling'

train_samples = pd.read_pickle(os.path.join(IN_PATH, 'train_samples.pkl'))
dev_samples = pd.read_pickle(os.path.join(IN_PATH, 'dev_samples.pkl'))
test_samples = pd.read_pickle(os.path.join(IN_PATH, 'test_samples.pkl'))

In [None]:
counts = train_samples['relation'].value_counts(normalize=False).values
NUMBER_CLASSES = len(counts)
print("number of classes:", NUMBER_CLASSES)
print("class weights:")
np.round(counts.min() / counts, decimals=6)

In [None]:
train_samples.relation.value_counts()

In [None]:
train_samples = train_samples.reset_index()
train_samples[['relation', 'snippet_x', 'snippet_y', 'index']].to_csv(TRAIN_FILE_PATH, sep='\t', header=False, index=False)

dev_samples = dev_samples.reset_index()
dev_samples[['relation', 'snippet_x', 'snippet_y', 'index']].to_csv(DEV_FILE_PATH, sep='\t', header=False, index=False)

test_samples = test_samples.reset_index()
test_samples[['relation', 'snippet_x', 'snippet_y', 'index']].to_csv(TEST_FILE_PATH, sep='\t', header=False, index=False)

### 2. Generate config files

#### ELMo 

In [None]:
%%writefile $MODEL_PATH/config_elmo.jsonnet

// Configuration for a sentence matching model based on:
//   Wang, Zhiguo, Wael Hamza, and Radu Florian. "Bilateral multi-perspective matching for natural language sentences."
//   Proceedings of the 26th International Joint Conference on Artificial Intelligence. 2017.


local NUM_EPOCHS = 100;
local LR = std.parseJson(std.extVar('LR'));
local LSTM_ENCODER_HIDDEN = 50;
local LSTM_AGG_HIDDEN = std.parseJson(std.extVar('LSTM_AGG_HIDDEN'));

local dataset_reader_type = "quora_paraphrase";
local model_type = "bimpm_custom_package.model.multiclass_bimpm.BiMpm";


// best: LR=0.0005794893218638051
//       LSTM_AGG_HIDDEN=73

{
  "dataset_reader": {
    "type": dataset_reader_type,
    "tokenizer": {
      "type": "just_spaces"
    },
    "token_indexers": {
      "token_characters": {
        "type": "characters",
        "min_padding_length": 30,
      },
      "elmo": {
        "type": "elmo_characters"
     }
    }
  },
  "train_data_path": "label_predictor_bimpm/nlabel_cf_train.tsv",
  "validation_data_path": "label_predictor_bimpm/nlabel_cf_dev.tsv",
  "test_data_path": "label_predictor_bimpm/nlabel_cf_test.tsv",
  "model": {
    "type": model_type,
    "dropout": 0.5,
    "class_weights": [
        0.02518 , 0.029339, 0.073593, 0.100253, 0.107207, 0.114203,
        0.133858, 0.159732, 0.161028, 0.179758, 0.195724, 0.221601,
        0.247917, 0.274194, 0.30829 , 0.34593 , 0.363914, 0.420495,
        0.777778, 0.788079, 0.991667, 1.      ],
    "text_field_embedder": {
        "token_embedders": {
            "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": "rsv_elmo/options.json",
                    "weight_file": "rsv_elmo/model.hdf5",
                    "do_layer_norm": false,
                    "projection_dim": 100,
                    "dropout": 0.0
            },
            "token_characters": {
                "type": "character_encoding",
                "dropout": 0.1,
                "embedding": {
                    "embedding_dim": 20,
                    "sparse": false,
                    "vocab_namespace": "token_characters"
                },
                "encoder": {
                    "type": "gru",
                    "input_size": $.model.text_field_embedder.token_embedders.token_characters.embedding.embedding_dim,
                    "hidden_size": LSTM_ENCODER_HIDDEN,
                    "num_layers": 1,
                    "bidirectional": true,
                    "dropout": 0.4
                },
            },
      }
    },
    "matcher_word": {
      "is_forward": true,
      "hidden_dim": $.model.text_field_embedder.token_embedders.elmo.projection_dim+LSTM_ENCODER_HIDDEN+LSTM_ENCODER_HIDDEN,
      "num_perspectives": 10,
      "with_full_match": false
    },
    "encoder1": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": $.model.text_field_embedder.token_embedders.elmo.projection_dim+LSTM_ENCODER_HIDDEN+LSTM_ENCODER_HIDDEN,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward1": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward1": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "encoder2": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": $.model.matcher_forward1.hidden_dim+$.model.matcher_backward1.hidden_dim,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward2": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward2": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "aggregator":{
      "type": "lstm",
      "bidirectional": true,
      "input_size": 264,
      "hidden_size": LSTM_AGG_HIDDEN,
      "num_layers": 1,
    },
    "classifier_feedforward": {
      "input_dim": LSTM_AGG_HIDDEN*4,
      "num_layers": 1,
      "hidden_dims": [22],
      "activations": ["mish"],
      "dropout": [0.0]
    },
    "initializer": {
      "regexes": [
        [".*linear_layers.*weight", {"type": "xavier_normal"}],
        [".*linear_layers.*bias", {"type": "constant", "val": 0}],
        [".*weight_ih.*", {"type": "xavier_normal"}],
        [".*weight_hh.*", {"type": "orthogonal"}],
        [".*bias.*", {"type": "constant", "val": 0}],
        [".*matcher.*match_weights.*", {"type": "kaiming_normal"}]
      ]
    }
  },
  "data_loader": {
#     "type": 'multiprocess',
#     "max_instances_in_memory": $.data_loader.batch_sampler.batch_size * 10,
    "batch_sampler": {
        "type": "bucket",
        "batch_size": 8,
        "padding_noise": 0.0,
        "sorting_keys": ["premise"],
    },
  },
  "trainer": {
    "num_epochs": NUM_EPOCHS,
    "patience": 5,
    "grad_clipping": 5.0,
    "validation_metric": "+f1_macro",
    "cuda_device": 1,
    "optimizer": {
      "type": "huggingface_adamw",
      "lr": LR
    },
  }
}

In [None]:
!nvidia-smi

In [None]:
%%writefile models/label_bimpm_params.json

[
  {
    "type": "int",
    "attributes": {
      "name": "LSTM_AGG_HIDDEN",
      "low": 50,
      "high": 100
    }
  },
  {
    "type": "float",
    "attributes": {
      "name": "LR",
      "low": 1e-4,
      "high": 1e-3,
      "log": true
    }
  }
]

In [None]:
%%writefile models/tune_label_predictor.sh

export METHOD=label_predictor_bimpm
export STUDY_NAME=label_tuning_0

# Start the search from an empty serialization directory.
# `rm -rf` / `mkdir -p` keep the script quiet and idempotent both on the very
# first run (nothing to delete yet) and on re-runs (`optuna/` already exists).
rm -rf "optuna/$METHOD"
mkdir -p "optuna/$METHOD"

# optuna delete-study --study-name $STUDY_NAME
# Tune the ELMo config over the search space in label_bimpm_params.json,
# maximizing the best validation macro-F1.
allennlp tune ${METHOD}/config_elmo.jsonnet label_bimpm_params.json --serialization-dir optuna/$METHOD \
    --study-name $STUDY_NAME \
    --skip-if-exists \
    --metrics best_validation_f1_macro \
    --direction maximize

In [None]:
import json

def collect_optuna_results(path):
    """Print the best validation macro-F1 of every Optuna trial under ``path``.

    Each ``trial_*/`` sub-directory of ``path`` is inspected; for every trial
    whose ``metrics.json`` can be read, the trial directory and its
    ``best_validation_f1_macro`` value are printed on one line.

    Trials without a readable, complete ``metrics.json`` are skipped on
    purpose (best effort); any other kind of error is no longer hidden.

    Args:
        path: Directory containing the ``trial_*`` directories.
    """
    for trial in glob.glob(os.path.join(path, 'trial_*/')):
        try:
            # The context manager closes the handle even if parsing fails.
            with open(os.path.join(trial, 'metrics.json')) as metrics_file:
                metrics = json.load(metrics_file)
            score = metrics['best_validation_f1_macro']
        except (OSError, ValueError, KeyError, TypeError):
            # Missing file, malformed JSON (JSONDecodeError is a ValueError)
            # or a metrics payload without the expected key: skip this trial.
            continue
        print(trial, score)

In [None]:
collect_optuna_results('models/optuna/label_predictor_bimpm/')

#### ELMo + fasttext 

In [None]:
%%writefile $MODEL_PATH/config_elmo_fasttext.jsonnet

// Configuration for a sentence matching model based on:
//   Wang, Zhiguo, Wael Hamza, and Radu Florian. "Bilateral multi-perspective matching for natural language sentences."
//   Proceedings of the 26th International Joint Conference on Artificial Intelligence. 2017.


local NUM_EPOCHS = 100;
local LR = std.parseJson(std.extVar('LR'));
local LSTM_ENCODER_HIDDEN = 50;
local ELMO_DIM = 1024;
local LSTM_AGG_HIDDEN = std.parseJson(std.extVar('LSTM_AGG_HIDDEN'));

local dataset_reader_type = "quora_paraphrase";
local model_type = "bimpm_custom_package.model.multiclass_bimpm.BiMpm";

// best: LR=0.0005303721829092237
//       LSTM_AGG_HIDDEN=83

{
  "dataset_reader": {
    "type": dataset_reader_type,
    "tokenizer": {
      "type": "just_spaces"
    },
    "token_indexers": {
      "token_characters": {
        "type": "characters",
        "min_padding_length": 30,
      },
      "elmo": {
        "type": "elmo_characters"
      },
      "tokens": {
        "type": "single_id",
        "lowercase_tokens": true
      },
    }
  },
  "train_data_path": "label_predictor_bimpm/nlabel_cf_train.tsv",
  "validation_data_path": "label_predictor_bimpm/nlabel_cf_dev.tsv",
  "test_data_path": "label_predictor_bimpm/nlabel_cf_test.tsv",
  "model": {
    "type": model_type,
    "dropout": 0.5,
    "class_weights": [
        0.02518 , 0.029339, 0.073593, 0.100253, 0.107207, 0.114203,
        0.133858, 0.159732, 0.161028, 0.179758, 0.195724, 0.221601,
        0.247917, 0.274194, 0.30829 , 0.34593 , 0.363914, 0.420495,
        0.777778, 0.788079, 0.991667, 1.      ],
    "encode_together": false,
    "text_field_embedder": {
        "token_embedders": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 300,
                "pretrained_file": "ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec",
                "trainable": false
            },
            "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": "rsv_elmo/options.json",
                    "weight_file": "rsv_elmo/model.hdf5",
                    "do_layer_norm": false,
                    "projection_dim": 100,
                    "dropout": 0.0
            },
            "token_characters": {
                "type": "character_encoding",
                "dropout": 0.1,
                "embedding": {
                    "embedding_dim": 20,
                    "padding_index": 0,
                },
                "encoder": {
                    "type": "gru",
                    "input_size": 20,
                    "hidden_size": LSTM_ENCODER_HIDDEN,
                    "num_layers": 1,
                    "bidirectional": true,
                },
            },
      }
    },
    "matcher_word": {
      "is_forward": true,
      "hidden_dim": $.model.text_field_embedder.token_embedders.elmo.projection_dim+LSTM_ENCODER_HIDDEN*2+$.model.text_field_embedder.token_embedders.tokens.embedding_dim,
      "num_perspectives": 10,
      "with_full_match": false
    },
    "encoder1": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": $.model.text_field_embedder.token_embedders.elmo.projection_dim+LSTM_ENCODER_HIDDEN*2+$.model.text_field_embedder.token_embedders.tokens.embedding_dim,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward1": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward1": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "encoder2": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": $.model.matcher_forward1.hidden_dim+$.model.matcher_backward1.hidden_dim,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward2": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward2": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "aggregator":{
      "type": "lstm",
      "bidirectional": true,
      "input_size": 264,
      "hidden_size": LSTM_AGG_HIDDEN,
      "num_layers": 1,
    },
    "classifier_feedforward": {
      "input_dim": LSTM_AGG_HIDDEN*4,
      "num_layers": 1,
      "hidden_dims": [22],
      "activations": ["mish"],
      "dropout": [0.0]
    },
    "initializer": {
      "regexes": [
        [".*linear_layers.*weight", {"type": "xavier_normal"}],
        [".*linear_layers.*bias", {"type": "constant", "val": 0}],
        [".*weight_ih.*", {"type": "xavier_normal"}],
        [".*weight_hh.*", {"type": "orthogonal"}],
        [".*bias.*", {"type": "constant", "val": 0}],
        [".*matcher.*match_weights.*", {"type": "kaiming_normal"}]
      ]
    }
  },
  "data_loader": {
    "type": 'multiprocess',
    "max_instances_in_memory": $.data_loader.batch_sampler.batch_size * 20,
    "batch_sampler": {
        "type": "bucket",
        "batch_size": 4,
        "padding_noise": 0.0,
        "sorting_keys": ["premise"],
    },
  },
  "trainer": {
    "num_epochs": NUM_EPOCHS,
    "patience": 5,
    "grad_clipping": 5.0,
    "validation_metric": "+f1_macro",
    "cuda_device": 1,
    "optimizer": {
      "type": "huggingface_adamw",
      "lr": LR
    },
  }
}

In [None]:
%%writefile models/tune_label_predictor.sh

export METHOD=label_predictor_bimpm
export STUDY_NAME=label_tuning_1

# Start the search from an empty serialization directory.
# `rm -rf` / `mkdir -p` keep the script quiet and idempotent both on the very
# first run (nothing to delete yet) and on re-runs (`optuna/` already exists).
rm -rf "optuna/$METHOD"
mkdir -p "optuna/$METHOD"

# Tune the ELMo + fastText config over the search space in
# label_bimpm_params.json, maximizing the best validation macro-F1.
allennlp tune ${METHOD}/config_elmo_fasttext.jsonnet label_bimpm_params.json --serialization-dir optuna/$METHOD \
    --study-name $STUDY_NAME \
    --skip-if-exists \
    --metrics best_validation_f1_macro \
    --direction maximize

In [None]:
collect_optuna_results('models/optuna/label_predictor_bimpm/')

In [None]:
pd.Series(sorted([0.40781721303408797, 0.3998988602649082, 0.4202644174749201, 0.41427419673312793, 0.37974115122448315,
           0.4106660695238547, 0.41931628300385043, 0.40962370891462674, 0.4246042018586939, 0.4083355727859519,
           0.424028525298292])).plot(kind='density', bw_method=0.3)

In [None]:
pd.Series(sorted([0.40781721303408797, 0.3998988602649082, 0.4202644174749201, 0.41427419673312793, 
                  0.37974115122448315, 0.4106660695238547, 0.41931628300385043, 0.40962370891462674, 
                  0.4246042018586939, 0.4083355727859519, 0.424028525298292])).plot

In [None]:
! cd models && allennlp best-params --study-name label_tuning_1

In [None]:
! rm -r models/label_predictor_bimpm/elmo_ft/
! mv models/optuna/label_predictor_bimpm/trial_19 models/label_predictor_bimpm/elmo_ft

### 3. Scripts for training/prediction 

#### Option 1. Directly from the config

Train a model

In [None]:
%%writefile models/train_label_predictor.sh
# usage:
# $ cd models
# $ sh train_label_predictor.sh {elmo|elmo_fasttext} result_30

export METHOD=${1}
export RESULT_DIR=${2}
export DEV_FILE_PATH="nlabel_cf_dev.tsv"
export TEST_FILE_PATH="nlabel_cf_test.tsv"

rm -r label_predictor_bimpm/${RESULT_DIR}/
allennlp train -s label_predictor_bimpm/${RESULT_DIR}/ label_predictor_bimpm/config_${METHOD}.json \
    --include-package customization_package
allennlp predict --use-dataset-reader --silent \
    --output-file label_predictor_bimpm/${RESULT_DIR}/predictions_dev.json label_predictor_bimpm/${RESULT_DIR}/model.tar.gz label_predictor_bimpm/${DEV_FILE_PATH} \
    --include-package customization_package \
    --predictor textual-entailment
allennlp predict --use-dataset-reader --silent \
    --output-file label_predictor_bimpm/${RESULT_DIR}/predictions_test.json label_predictor_bimpm/${RESULT_DIR}/model.tar.gz label_predictor_bimpm/${TEST_FILE_PATH} \
    --include-package customization_package \
    --predictor textual-entailment

Predict on dev&test

In [None]:
%%writefile models/eval_label_predictor.sh
# usage:
# $ cd models
# $ sh eval_label_predictor.sh result_30

export RESULT_DIR=${1}
export DEV_FILE_PATH="nlabel_cf_dev.tsv"
export TEST_FILE_PATH="nlabel_cf_test.tsv"

allennlp predict --use-dataset-reader --cuda-device 0 --silent \
    --output-file label_predictor_bimpm/${RESULT_DIR}/predictions_dev.json label_predictor_bimpm/${RESULT_DIR}/model.tar.gz label_predictor_bimpm/${DEV_FILE_PATH} \
    --include-package bimpm_custom_package 
allennlp predict --use-dataset-reader --cuda-device 0 --silent \
    --output-file label_predictor_bimpm/${RESULT_DIR}/predictions_test.json label_predictor_bimpm/${RESULT_DIR}/model.tar.gz label_predictor_bimpm/${TEST_FILE_PATH} \
    --include-package bimpm_custom_package

(optional) predict on train

In [None]:
%%writefile models/eval_label_predictor_train.sh
# usage:
# $ cd models 
# $ sh eval_label_predictor_train.sh {bert|elmo} result_30

export METHOD=${1}
export RESULT_DIR=${2}
export TEST_FILE_PATH="nlabel_cf_train.tsv"

allennlp predict --use-dataset-reader --silent \
    --output-file label_predictor_bimpm/${RESULT_DIR}/predictions_train.json label_predictor_bimpm/${RESULT_DIR}/model.tar.gz label_predictor_bimpm/${TEST_FILE_PATH} \
    --include-package customization_package \
    --predictor textual-entailment

#### Option 2 (OLD!). Using wandb for parameters adjustment

In [None]:
%%writefile models/wandb_label_predictor1.yaml

name: label_predictor_stacked
program: wandb_allennlp # this is a wrapper console script around allennlp commands. It is part of wandb-allennlp
method: bayes
## Do not forget to use the command keyword to specify the following command structure
command:
  - ${program} #omit the interpreter as we use allennlp train command directly
  - "--subcommand=train"
  - "--include-package=bimpm_custom_package" # add all packages containing your registered classes here
  - "--config_file=label_predictor_bimpm/config_elmo.json"
  - ${args}
metric:
    name: best_f1_macro
    goal: maximize
parameters:
    iterator.batch_size:
        values: [4,]
    model.encode_together:
        values: ["true", ]
    trainer.optimizer.lr:
        values: [0.001,]
    model.dropout:
        values: [0.5]


In [None]:
%%writefile models/wandb_label_predictor2.yaml

name: label_predictor_stacked
program: wandb_allennlp # this is a wrapper console script around allennlp commands. It is part of wandb-allennlp
method: bayes
## Do not forget to use the command keyword to specify the following command structure
command:
  - ${program} #omit the interpreter as we use allennlp train command directly
  - "--subcommand=train"
  - "--include-package=bimpm_custom_package" # add all packages containing your registered classes here
  - "--config_file=label_predictor_bimpm/config_elmo_fasttext.json"
  - ${args}
metric:
    name: best_f1_macro
    goal: maximize
parameters:
    iterator.batch_size:
        values: [4,]
    model.encode_together:
        values: ["true", ]
    trainer.optimizer.lr:
        values: [0.001,]
    model.dropout:
        values: [0.5]


In [None]:
! rm -r ../../../maintenance_rst/models/label_predictor_bimpm

In [None]:
! cp -r models/label_predictor_bimpm ../../../maintenance_rst/models/label_predictor_bimpm

3. Run training

``wandb sweep wandb_label_predictor1.yaml``

(returns %sweepname1)

``wandb sweep wandb_label_predictor2.yaml``

(returns %sweepname2)

``wandb agent --count 1 %sweepname1 && wandb agent --count 1 %sweepname2``

Move the best model into ``models/label_predictor_bimpm``

In [None]:
! ls -laht models/wandb

In [None]:
! cp -r models/wandb/run-20200721_172146-kggsduvw/training_dumps models/label_predictor_bimpm/noble-sweep-3

**Or** load from wandb by %sweepname

In [None]:
import wandb
api = wandb.Api()
run = api.run("tchewik/tmp/7hum4oom")
for file in run.files():
    file.download(replace=True)

In [None]:
! cp -r training_dumps models/label_predictor_bimpm/toasty-sweep-1

And run evaluation from shell

``sh eval_label_predictor.sh toasty-sweep-1``

### 4. Evaluate classifier

In [None]:
import json

def load_predictions(path):
    """Collect the predicted labels stored in a JSON-lines file.

    Every line of ``path`` is parsed as a JSON object, and the value under
    its ``"label"`` key is taken.

    Args:
        path: Path to the predictions file (one JSON object per line).

    Returns:
        List of labels, in file order.
    """
    with open(path, 'r') as predictions_file:
        return [json.loads(record)["label"] for record in predictions_file]

In [None]:
RESULT_DIR = 'elmo_ft'

In [None]:
MODEL_PATH = '../../models/label_predictor_bimpm/'

On dev set

In [None]:
import pandas as pd

true = pd.read_csv(DEV_FILE_PATH, sep='\t', header=None)[0].values.tolist()
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json')

print('length of true:', len(true))
print('length of pred:', len(pred))

In [None]:
from sklearn.metrics import classification_report

print(classification_report(true[:len(pred)], pred, digits=4))

In [None]:
test_metrics = classification_report(true[:len(pred)], pred, digits=4, output_dict=True)
test_f1 = np.array(
    [test_metrics[label].get('f1-score') for label in test_metrics if type(test_metrics[label]) == dict]) * 100

test_f1

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

print('f1: %.2f'%(f1_score(true[:len(pred)], pred, average='macro')*100))
print('pr: %.2f'%(precision_score(true[:len(pred)], pred, average='macro')*100))
print('re: %.2f'%(recall_score(true[:len(pred)], pred, average='macro')*100))

In [None]:
from utils.plot_confusion_matrix import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

# Fix the class order once and hand the same list to both the matrix and the
# plot. Without `labels=`, confusion_matrix derives its own order from the
# labels present in the (possibly truncated) gold list and in `pred`, which can
# disagree with `target_names` and mislabel rows/columns. Predicted labels that
# never occur in `true` are left out of the matrix.
labels = sorted(set(true))
plot_confusion_matrix(confusion_matrix(true[:len(pred)], pred, labels=labels), target_names=labels, normalize=True)

In [None]:
# Coarse relation groups: group name -> fine-grained relation names.
high_level_relations = {
    'coherence': [
        'background', 'elaboration', 'restatement',
        'interpretation-evaluation', 'preparation', 'solutionhood',
    ],
    'causal-argumentative:contrastive': ['concession', 'contrast', 'comparison'],
    'causal-argumentative:causal': ['purpose', 'evidence', 'cause-effect'],
    'causal-argumentative:condition': ['condition'],
    'structural': ['sequence', 'joint', 'same-unit'],
    'attribution': ['attribution'],
}

# Lookup table "<relation>_<NN|NS|SN>" -> coarse group name.
class_mapper = {
    f'{relation}_{suffix}': group
    for group, relations in high_level_relations.items()
    for relation in relations
    for suffix in ('NN', 'NS', 'SN')
}

In [None]:
import numpy as np

# Replace each label by its entry in class_mapper; labels without an entry
# are kept unchanged.
true = [class_mapper.get(label) or label for label in true]
pred = [class_mapper.get(label) or label for label in pred]

# Predicted 'other_*' labels are rewritten to 'joint_NN'.
pred_mapper = dict.fromkeys(('other_NN', 'other_NS', 'other_SN'), 'joint_NN')
pred = [pred_mapper.get(label) or label for label in pred]

# Boolean mask selecting the gold samples that are not 'other_*'.
_to_stay = np.logical_and.reduce(
    [np.array(true) != excluded for excluded in ('other_NN', 'other_SN', 'other_NS')])

_true = np.array(true)[_to_stay]
# `pred` may be shorter than `true`, so the mask is cut to its length.
_pred = np.array(pred)[_to_stay[:len(pred)]]
labels = list(set(_true))

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

print('f1: %.2f'%(f1_score(true[:len(pred)], pred, average='macro')*100))
print('pr: %.2f'%(precision_score(true[:len(pred)], pred, average='macro')*100))
print('re: %.2f'%(recall_score(true[:len(pred)], pred, average='macro')*100))

In [None]:
labels.sort()

In [None]:
plot_confusion_matrix(confusion_matrix(_true[:len(_pred)], _pred), target_names=labels, normalize=True)

In [None]:
import numpy as np

for rel in np.unique(_true):
    print(rel)

On train set (optional)

In [None]:
import pandas as pd

true = pd.read_csv('models/label_predictor_bimpm/nlabel_cf_train.tsv', sep='\t', header=None)[0].values.tolist()
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_train.json')

print(classification_report(true[:len(pred)], pred, digits=4))

In [None]:
# Train file that `pred` was produced from. The previous path pointed at
# `label_predictor_lstm`, although the predictions come from the BiMPM train file.
file = TRAIN_FILE_PATH
# The TSV has no header row, so the columns are named explicitly, in the order
# they were exported; otherwise `true_train.relation` does not exist.
true_train = pd.read_csv(file, sep='\t', header=None,
                         names=['relation', 'snippet_x', 'snippet_y', 'index'])
# Keep only the rows that have a prediction (the prediction file may be shorter).
true_train = true_train.iloc[:len(pred)].copy()
true_train['predicted_relation'] = pred

# Rows where the predicted relation differs from the gold one.
mispredicted = true_train[true_train.relation != true_train.predicted_relation]
print(mispredicted.shape)

mispredicted.to_csv('mispredicted_relations.csv', sep='\t')

On test set

In [None]:
import pandas as pd
import json

true = pd.read_csv(TEST_FILE_PATH, sep='\t', header=None)[0].values.tolist()
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_test.json')

print('length of true:', len(true))
print('length of pred:', len(pred))

print(classification_report(true[:len(pred)], pred, digits=4))

In [None]:
test_metrics = classification_report(true[:len(pred)], pred, digits=4, output_dict=True)
test_f1 = np.array(
    [test_metrics[label].get('f1-score') for label in test_metrics if type(test_metrics[label]) == dict]) * 100

test_f1

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

print('f1: %.2f'%(f1_score(true[:len(pred)], pred, average='macro')*100))
print('pr: %.2f'%(precision_score(true[:len(pred)], pred, average='macro')*100))
print('re: %.2f'%(recall_score(true[:len(pred)], pred, average='macro')*100))

In [None]:
true = [class_mapper.get(value) if class_mapper.get(value) else value for value in true]
pred = [class_mapper.get(value) if class_mapper.get(value) else value for value in pred]
pred = [pred_mapper.get(value) if pred_mapper.get(value) else value for value in pred]

_to_stay = (np.array(true) != 'other_NN') & (np.array(true) != 'other_SN') & (np.array(true) != 'other_NS')

_true = np.array(true)[_to_stay]
_pred = np.array(pred)[_to_stay]


print('f1: %.2f'%(f1_score(_true[:len(_pred)], _pred, average='macro')*100))
print('pr: %.2f'%(precision_score(_true[:len(_pred)], _pred, average='macro')*100))
print('re: %.2f'%(recall_score(_true[:len(_pred)], _pred, average='macro')*100))

print(classification_report(_true[:len(_pred)], _pred, digits=4))

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

print('f1: %.2f'%(f1_score(_true[:len(_pred)], _pred, average='macro')*100))
print('pr: %.2f'%(precision_score(_true[:len(_pred)], _pred, average='macro')*100))
print('re: %.2f'%(recall_score(_true[:len(_pred)], _pred, average='macro')*100))

### Ensemble: (Logreg+Catboost) + BiMPM

In [None]:
model_vocab = [
    "joint_NN",
    "elaboration_NS",
    "contrast_NN",
    "attribution_SN",
    "interpretation-evaluation_NS",
    "cause-effect_SN",
    "preparation_SN",
    "sequence_NN",
    "cause-effect_NS",
    "same-unit_NN",
    "condition_SN",
    "purpose_NS",
    "attribution_NS",
    "condition_NS",
    "comparison_NN",
    "background_NS",
    "evidence_NS",
    "solutionhood_SN",
    "concession_NS",
    "interpretation-evaluation_SN",
    "restatement_NN",
    "purpose_SN",
]

catboost_vocab = [
    'attribution_NS', 'attribution_SN', 'background_NS',
    'cause-effect_NS', 'cause-effect_SN', 'comparison_NN',
    'concession_NS', 'condition_NS', 'condition_SN', 'contrast_NN',
    'elaboration_NS', 'evidence_NS', 'interpretation-evaluation_NS',
    'interpretation-evaluation_SN', 'joint_NN', 'preparation_SN',
    'purpose_NS', 'purpose_SN', 'restatement_NN', 'same-unit_NN',
    'sequence_NN', 'solutionhood_SN']

def load_neural_predictions(path):
    """Read per-class probabilities from a JSON-lines prediction file.

    Each line of ``path`` is parsed as a JSON object; its ``'probs'`` list is
    turned into a dict keyed by the labels of ``model_vocab``, where the i-th
    label receives the i-th probability.

    Args:
        path: Path to the predictions file (one JSON object per line).

    Returns:
        List with one ``{label: probability}`` dict per line, in file order.
    """
    predictions = []

    with open(path, 'r') as predictions_file:
        for raw_line in predictions_file:
            class_probs = json.loads(raw_line)['probs']
            predictions.append(
                {label: class_probs[idx] for idx, label in enumerate(model_vocab)})

    return predictions

def load_scikit_predictions(model, X):
    """Return the class probabilities predicted by ``model`` as label-keyed dicts.

    Calls ``model.predict_proba(X)`` and, for every returned row, pairs the
    j-th value with the j-th label of ``catboost_vocab``.

    Args:
        model: Object exposing ``predict_proba``.
        X: Feature matrix passed to ``predict_proba`` unchanged.

    Returns:
        List with one ``{label: probability}`` dict per predicted row.
    """
    return [
        {label: row[col] for col, label in enumerate(catboost_vocab)}
        for row in model.predict_proba(X)
    ]

def vote_predictions(pred1, pred2, soft=True, weights=(1., 1.)):
    """Combine two lists of per-class score dicts sample by sample.

    Args:
        pred1: List of ``{label: score}`` dicts; its keys define the labels
            of the result.
        pred2: List of ``{label: score}`` dicts of the same length; every
            dict must contain the keys of the matching ``pred1`` dict.
        soft: If true, scores are blended as
            ``pred1 * weights[0] + pred2 * weights[1]``; otherwise the larger
            of the two scores is taken and ``weights`` is ignored.
        weights: Pair of blending coefficients (any indexable of length 2).
            The default is an immutable tuple, so it cannot be altered
            between calls.

    Returns:
        List of ``{label: combined score}`` dicts, one per sample.

    Raises:
        AssertionError: If the two lists differ in length.
        KeyError: If a ``pred2`` dict lacks a label present in ``pred1``.
    """
    assert len(pred1) == len(pred2)
    result = []

    for first, second in zip(pred1, pred2):
        if soft:
            sample_result = {
                key: first[key] * weights[0] + second[key] * weights[1]
                for key in first
            }
        else:
            sample_result = {key: max(first[key], second[key]) for key in first}

        result.append(sample_result)

    return result

def probs_to_classes(pred):
    """Pick the highest-scoring label of every sample.

    Args:
        pred: List of ``{label: score}`` dicts.

    Returns:
        Tuple ``(labels, scores)`` of two lists aligned with ``pred``: the
        winning label of each sample and its score. On ties the label that
        comes first in the dict wins. An empty dict yields ``('', 0.)``.
    """
    result = []
    result_proba = []

    for sample in pred:
        if sample:
            # max() over the real scores also works when every score is <= 0
            # (e.g. zero weights or log-scores); comparing against a 0.0
            # starting value returned '' as the class in that case.
            best_class = max(sample, key=sample.get)
            best_prob = sample[best_class]
        else:
            best_class, best_prob = '', 0.

        result.append(best_class)
        result_proba.append(best_prob)

    return result, result_proba

In [None]:
import pickle

baseline_model_path = '../../models/label_predictor_baseline/'
fs_catboost_plus_logreg = pickle.load(open(os.path.join(baseline_model_path, 'model.pkl'), 'rb'))
lab_encoder = pickle.load(open(os.path.join(baseline_model_path, 'label_encoder.pkl'), 'rb'))
scaler = pickle.load(open(os.path.join(baseline_model_path, 'scaler.pkl'), 'rb'))
drop_columns = pickle.load(open(os.path.join(baseline_model_path, 'drop_columns.pkl'), 'rb'))

On dev set

In [None]:
from sklearn import metrics


TARGET = 'relation'

# Gold labels as a one-column frame; the features are the remaining columns minus
# those listed in drop_columns and two bookkeeping columns.
y_dev = dev_samples[TARGET].to_frame()
X_dev = dev_samples.drop(columns=[TARGET]).drop(columns=drop_columns + ['category_id', 'index'])

# Other cells of this notebook write these columns onto dev_samples; strip
# whichever of them are present before the features are scaled.
X_dev = X_dev.drop(columns=[
    column for column in ('pred_category', 'true_category', 'pred_relation', 'pred_proba')
    if column in X_dev.columns])

# Scale with the baseline scaler and keep the original row index.
X_scaled_np = scaler.transform(X_dev)
X_dev = pd.DataFrame(X_scaled_np, index=X_dev.index)

# Per-sample {label: probability} dicts from both models on the dev set.
catboost_predictions = load_scikit_predictions(fs_catboost_plus_logreg, X_dev)
neural_predictions = load_neural_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json')

In [None]:
# Soft-vote both models with fixed weights (0.3 for the neural model, 0.8 for
# the feature-based one) and take the best label per sample. `_` receives the
# winning scores.
tmp = vote_predictions(neural_predictions, catboost_predictions, soft=True, weights=[.3, .8])
ensemble_pred, _ = probs_to_classes(tmp)

_dev_gold = y_dev.values
print('weighted f1: ', metrics.f1_score(_dev_gold, ensemble_pred, average='weighted'))
print('macro f1: ', metrics.f1_score(_dev_gold, ensemble_pred, average='macro'))
print('accuracy: ', metrics.accuracy_score(_dev_gold, ensemble_pred))
print()
print(metrics.classification_report(y_dev, ensemble_pred, digits=4))

In [None]:
# Store the ensemble's predicted relation and its winning score next to each dev sample.
# NOTE(review): `_` is expected to still hold the second value returned by
# probs_to_classes(...) for the [.3, .8] ensemble; any cell that reassigns `_`
# in between changes what is stored here — confirm the cell execution order.
dev_samples['pred_relation'] = ensemble_pred
dev_samples['pred_proba'] = _

In [None]:
# Display the gold dev labels; y_dev is a one-column frame, so the transpose shows them as one row.
y_dev.values.T

In [None]:
# Blending coeffs
#
# Grid-search the two soft-voting weights over 0.1 .. 0.9 (step 0.1) and print
# every combination that improves the best dev macro-F1 seen so far.

macro_f1 = 0.0
for w0 in np.arange(0.1, 1., 0.1):
    for w1 in np.arange(0.1, 1., 0.1):
        # Candidates live in their own names: reusing `tmp`, `ensemble_pred` and `_`
        # here would leave them holding the result of the last grid point, and the
        # cells that follow read those names expecting the weights=[.3, .8] ensemble.
        candidate_scores = vote_predictions(neural_predictions, catboost_predictions, soft=True, weights=[w0, w1])
        candidate_pred, _candidate_proba = probs_to_classes(candidate_scores)
        new_macro_f1 = metrics.f1_score(y_dev.values, candidate_pred, average='macro')
        if new_macro_f1 > macro_f1:
            print(w0, w1, new_macro_f1)
            macro_f1 = new_macro_f1

In [None]:
# Translate gold and predicted relations through class_mapper (predictions
# additionally through pred_mapper); labels without a mapping stay as they are.
true = [class_mapper.get(label) or label for label in (row[0] for row in y_dev.values)]
ensemble_pred = [class_mapper.get(label) or label for label in ensemble_pred]
ensemble_pred = [pred_mapper.get(label) or label for label in ensemble_pred]

# Keep only the samples whose gold label is not one of the three 'other_*' classes.
_true_all = np.array(true)
_pred_all = np.array(ensemble_pred)
_to_stay = (_true_all != 'other_NN') & (_true_all != 'other_SN') & (_true_all != 'other_NS')

_true = _true_all[_to_stay]
_pred = _pred_all[_to_stay]


# Macro-averaged scores in percent, followed by the per-class report.
_true_scored = _true[:len(_pred)]
print('f1: %.2f'%(f1_score(_true_scored, _pred, average='macro')*100))
print('pr: %.2f'%(precision_score(_true_scored, _pred, average='macro')*100))
print('re: %.2f'%(recall_score(_true_scored, _pred, average='macro')*100))

print(classification_report(_true_scored, _pred, digits=4))

In [None]:
# Attach the predicted and gold category arrays (_pred / _true) to the dev samples.
# NOTE(review): _pred and _true were filtered with _to_stay; this presumably relies
# on no 'other_*' rows having been removed, otherwise their length would no longer
# match dev_samples — confirm.
dev_samples['pred_category'] = _pred
dev_samples['true_category'] = _true

On test set

In [None]:
TARGET = 'relation'

# Gold labels as a one-column frame; the features are the remaining columns minus
# those listed in drop_columns and two bookkeeping columns.
y_test, X_test = test_samples[TARGET].to_frame(), test_samples.drop(TARGET, axis=1).drop(
    columns=drop_columns + ['category_id', 'index'])

# Strip the columns that other cells of this notebook write onto test_samples.
# 'pred_proba' belongs in this list as well (as in the dev-set cell): once it has
# been stored, re-running this cell would otherwise pass it to the scaler as an
# extra feature.
for additional_col in ('pred_category', 'true_category', 'pred_relation', 'pred_proba'):
    if additional_col in X_test.keys():
        X_test = X_test.drop(columns=[additional_col])

# Scale with the baseline scaler and keep the original row index.
X_scaled_np = scaler.transform(X_test)
X_test = pd.DataFrame(X_scaled_np, index=X_test.index)

# Per-sample {label: probability} dicts from both models on the test set.
catboost_predictions = load_scikit_predictions(fs_catboost_plus_logreg, X_test)
neural_predictions = load_neural_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_test.json')

# Soft voting with the same fixed weights as on the dev set.
tmp = vote_predictions(neural_predictions, catboost_predictions, soft=True, weights=[.3, .8])

ensemble_pred, proba = probs_to_classes(tmp)

print('weighted f1: ', metrics.f1_score(y_test.values, ensemble_pred, average='weighted'))
print('macro f1: ', metrics.f1_score(y_test.values, ensemble_pred, average='macro'))
print('accuracy: ', metrics.accuracy_score(y_test.values, ensemble_pred))
print()
print(metrics.classification_report(y_test, ensemble_pred, digits=4))

In [None]:
# Store the ensemble's predicted relation and its winning score next to each test sample.
test_samples['pred_relation'] = ensemble_pred
test_samples['pred_proba'] = proba

In [None]:
# F1 scores in percent for every dict-valued entry of the report.
# NOTE(review): with output_dict=True scikit-learn also returns dict entries for
# the averaged rows (e.g. 'macro avg', 'weighted avg'), so those presumably end
# up in this array next to the per-class values — confirm that is intended.
test_metrics = classification_report(y_test, ensemble_pred, digits=4, output_dict=True)
test_f1 = 100 * np.array(
    [entry.get('f1-score') for entry in test_metrics.values() if isinstance(entry, dict)])

test_f1

In [None]:
# Translate gold and predicted relations through class_mapper (predictions
# additionally through pred_mapper); labels without a mapping stay as they are.
true = [class_mapper.get(label) or label for label in (row[0] for row in y_test.values)]
ensemble_pred = [class_mapper.get(label) or label for label in ensemble_pred]
ensemble_pred = [pred_mapper.get(label) or label for label in ensemble_pred]

# Keep only the samples whose gold label is not one of the three 'other_*' classes.
_true_all = np.array(true)
_pred_all = np.array(ensemble_pred)
_to_stay = (_true_all != 'other_NN') & (_true_all != 'other_SN') & (_true_all != 'other_NS')

_true = _true_all[_to_stay]
_pred = _pred_all[_to_stay]


# Macro-averaged scores in percent, followed by the per-class report.
_true_scored = _true[:len(_pred)]
print('f1: %.2f'%(f1_score(_true_scored, _pred, average='macro')*100))
print('pr: %.2f'%(precision_score(_true_scored, _pred, average='macro')*100))
print('re: %.2f'%(recall_score(_true_scored, _pred, average='macro')*100))

print(classification_report(_true_scored, _pred, digits=4))

In [None]:
# Build the axis names from the labels seen in either the gold or the predicted
# array and pass the same list to confusion_matrix. Taking the names from _true
# alone mislabels (or mis-sizes) the axes whenever a label occurs only among the
# predictions, because confusion_matrix by default covers the union of both.
labels = sorted(set(_true[:len(_pred)]) | set(_pred))
plot_confusion_matrix(confusion_matrix(_true[:len(_pred)], _pred, labels=labels), target_names=labels, normalize=True)

In [None]:
# Attach the predicted and gold category arrays (_pred / _true) to the test samples.
# NOTE(review): _pred and _true were filtered with _to_stay; this presumably relies
# on no 'other_*' rows having been removed, otherwise their length would no longer
# match test_samples — confirm.
test_samples['pred_category'] = _pred
test_samples['true_category'] = _true

In [None]:
# Show one misclassified test sample whose gold category is
# 'causal-argumentative:causal' and whose filename starts with 'depression.'.
_is_wrong = test_samples.pred_category != test_samples.true_category
_is_causal = test_samples.true_category == 'causal-argumentative:causal'
_is_depression_file = test_samples.filename.str.startswith('depression.')
_shown_columns = ['snippet_x', 'snippet_y',
                  'relation', 'true_category',
                  'pred_relation', 'pred_category']

test_samples.loc[_is_wrong & _is_causal & _is_depression_file, _shown_columns].head(1)

#### Compare results for RuRSTreebank and Essays

In [None]:
# Category-level scores on the samples whose filename contains 'blogs' or 'news',
# pooled over the test and dev sets.
_data = test_samples[test_samples.filename.str.contains('blogs') | test_samples.filename.str.contains('news')]
_true = _data.true_category.values.tolist()
_pred = _data.pred_category.values.tolist()

# Both halves of the dev mask must come from dev_samples: a mask built from
# test_samples does not line up with the rows of dev_samples.
_data = dev_samples[dev_samples.filename.str.contains('blogs') | dev_samples.filename.str.contains('news')]
_true += _data.true_category.values.tolist()
_pred += _data.pred_category.values.tolist()

print('f1: %.2f'%(f1_score(_true[:len(_pred)], _pred, average='macro')*100))
print('pr: %.2f'%(precision_score(_true[:len(_pred)], _pred, average='macro')*100))
print('re: %.2f'%(recall_score(_true[:len(_pred)], _pred, average='macro')*100))

print(classification_report(_true[:len(_pred)], _pred, digits=4))

In [None]:
# Category-level scores on the samples whose filename contains 'depression.' or
# 'healthy.', pooled over the test and dev sets.
# regex=False makes the trailing dot match literally (str.contains treats the
# pattern as a regular expression by default, where '.' matches any character),
# in line with the startswith('depression.') filter used elsewhere in this notebook.
_data = test_samples[test_samples.filename.str.contains('depression.', regex=False)
                     | test_samples.filename.str.contains('healthy.', regex=False)]
_true = _data.true_category.values.tolist()
_pred = _data.pred_category.values.tolist()

# Both halves of the dev mask must come from dev_samples: a mask built from
# test_samples does not line up with the rows of dev_samples.
_data = dev_samples[dev_samples.filename.str.contains('depression.', regex=False)
                    | dev_samples.filename.str.contains('healthy.', regex=False)]
_true += _data.true_category.values.tolist()
_pred += _data.pred_category.values.tolist()

print('f1: %.2f'%(f1_score(_true[:len(_pred)], _pred, average='macro')*100))
print('pr: %.2f'%(precision_score(_true[:len(_pred)], _pred, average='macro')*100))
print('re: %.2f'%(recall_score(_true[:len(_pred)], _pred, average='macro')*100))

print(classification_report(_true[:len(_pred)], _pred, digits=4))

In [None]:
# Build the axis names from the labels seen in either the gold or the predicted
# list and pass the same list to confusion_matrix. Taking the names from _true
# alone mislabels (or mis-sizes) the axes whenever a label occurs only among the
# predictions, because confusion_matrix by default covers the union of both.
labels = sorted(set(_true[:len(_pred)]) | set(_pred))
plot_confusion_matrix(confusion_matrix(_true[:len(_pred)], _pred, labels=labels), target_names=labels, normalize=True)

### Just for testing 

In [None]:
# from allennlp.predictors.predictor import Predictor

# predictor_name='textual_entailment'
# clf.predict(premise='Сейчас я думаю,', hypothesis='что')