## Rhetorical relations classification used in tree building: Transformer

Prepare data and model-related scripts.

Evaluate models.

Make and evaluate ansembles for Transformer-based and feature-based model.

Output:
 - ``models/relation_predictor_transf/*``

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import glob
import pandas as pd
import numpy as np
import pickle
from utils.file_reading import read_edus, read_gold, read_negative, read_annotation

### Make a directory

In [8]:
MODEL_PATH = 'models/relation_predictor_transf'
if not os.path.isdir(MODEL_PATH):
    os.mkdir(MODEL_PATH)

TRAIN_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_train.json')
DEV_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_dev.json')
TEST_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_test.json')

### Prepare train/test sets 

In [5]:
IN_PATH = 'data_labeling'

train_samples = pd.read_pickle(os.path.join(IN_PATH, 'train_samples.pkl'))
dev_samples = pd.read_pickle(os.path.join(IN_PATH, 'dev_samples.pkl'))
test_samples = pd.read_pickle(os.path.join(IN_PATH, 'test_samples.pkl'))

In [6]:
import razdel

def tokenize(text):
    result = ' '.join([tok.text for tok in razdel.tokenize(text)])
    return result
    
train_samples['snippet_x'] = train_samples.snippet_x.map(tokenize)
train_samples['snippet_y'] = train_samples.snippet_y.map(tokenize)

dev_samples['snippet_x'] = dev_samples.snippet_x.map(tokenize)
dev_samples['snippet_y'] = dev_samples.snippet_y.map(tokenize)

test_samples['snippet_x'] = test_samples.snippet_x.map(tokenize)
test_samples['snippet_y'] = test_samples.snippet_y.map(tokenize)

In [14]:
train = pd.DataFrame({
    'premise': train_samples.snippet_x,
    'hypothesis': train_samples.snippet_y,
    'label': train_samples.relation,
    'idx': train_samples.index
})

dev = pd.DataFrame({
    'premise': dev_samples.snippet_x,
    'hypothesis': dev_samples.snippet_y,
    'label': dev_samples.relation,
    'idx': dev_samples.index
})

test = pd.DataFrame({
    'premise': test_samples.snippet_x,
    'hypothesis': test_samples.snippet_y,
    'label': test_samples.relation,
    'idx': test_samples.index
})

In [16]:
import json

# For transformer_superglue_rte data reader

with open(TRAIN_FILE_PATH, 'w') as fp:
    fp.write('\n'.join(json.dumps(i) for i in train.to_dict('records')) + '\n')

with open(DEV_FILE_PATH, 'w') as fp:
    fp.write('\n'.join(json.dumps(i) for i in dev.to_dict('records')) + '\n')
    
with open(TEST_FILE_PATH, 'w') as fp:
    fp.write('\n'.join(json.dumps(i) for i in test.to_dict('records')) + '\n')

In [36]:
%%writefile $MODEL_PATH/config_rubert.json

local transformer_model = "DeepPavlov/rubert-base-cased";
local transformer_dim = 768;

local epochs = 20;
local gpu_batch_size = 2;
local gradient_accumulation_steps = 16;

{
  "dataset_reader":{
    "type": "transformer_superglue_rte"
  },
  "train_data_path": "relation_predictor_transf/nlabel_cf_train.json",
  "validation_data_path": "relation_predictor_transf/nlabel_cf_dev.json",
  "test_data_path": "relation_predictor_transf/nlabel_cf_test.json",
  "model": {
    "type": "basic_classifier",
    "text_field_embedder": {
      "token_embedders": {
        "tokens": {
          "type": "pretrained_transformer",
          "model_name": transformer_model,
          "max_length": 512
        }
      }
    },
    "seq2vec_encoder": {
       "type": "cls_pooler",
       "embedding_dim": transformer_dim,
    },
    "feedforward": {
      "input_dim": transformer_dim,
      "num_layers": 1,
      "hidden_dims": transformer_dim,
      "activations": "tanh"
    },
    "dropout": 0.1,
    "namespace": "tags"
  },
  "data_loader": {
    "shuffle": true,
    "batch_size": gpu_batch_size
  },
  "trainer": {
    "optimizer": {
      "type": "huggingface_adamw",
      "weight_decay": 0.01,
      "parameter_groups": [[["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}]],
      "lr": 1e-6,
      "eps": 1e-8,
      "correct_bias": true
    },
    "learning_rate_scheduler": {
      "type": "linear_with_warmup",
      "warmup_steps": 100
    },
    // "grad_norm": 1.0,
    "num_epochs": epochs,
    "num_gradient_accumulation_steps": gradient_accumulation_steps,
    "patience": 5,
    "cuda_device": 0,
    "validation_metric": "+accuracy",
  },
  "random_seed": 42,
  "numpy_seed": 42,
  "pytorch_seed": 42,
}

Overwriting models/relation_predictor_transf/config_rubert.json


In [48]:
%%writefile $MODEL_PATH/config_rubert.json

local transformer_model = "DeepPavlov/rubert-base-cased";
local transformer_dim = 768;

local epochs = 100;
local gpu_batch_size = 2;
local gradient_accumulation_steps = 16;

{
  "dataset_reader":{
    "type": "transformer_superglue_rte"
  },
  "train_data_path": "relation_predictor_transf/nlabel_cf_train.json",
  "validation_data_path": "relation_predictor_transf/nlabel_cf_dev.json",
  "test_data_path": "relation_predictor_transf/nlabel_cf_test.json",
  "model": {
    "type": "bimpm_custom_package.model.custom_basic_classifier.CustomBasicClassifier",
    class_weights: [
        0.03, 0.03, 0.08, 0.1, 0.12, 0.14,
        0.14, 0.17, 0.17, 0.18, 0.19, 0.21,
        0.23, 0.26, 0.33, 0.38, 0.39, 0.57,
        0.78, 0.86, 0.97, 1.0],
    "text_field_embedder": {
      "token_embedders": {
        "tokens": {
          "type": "pretrained_transformer",
          "model_name": transformer_model,
          "max_length": 512
        }
      }
    },
    "seq2vec_encoder": {
       type: "bert_pooler",
       pretrained_model: $.model.text_field_embedder.token_embedders.tokens.model_name,
       requires_grad: true,
       dropout: 0.2,
    },
#     "feedforward": {
#       "input_dim": transformer_dim,
#       "num_layers": 1,
#       "hidden_dims": transformer_dim,
#       "activations": "tanh"
#     },
    "dropout": 0.5,
    "namespace": "tags"
  },
  data_loader: {
    batch_sampler: {
      type: 'bucket',
      batch_size: 2,
    },
  },
  "trainer": {
    "optimizer": {
      "type": "huggingface_adamw",
      "weight_decay": 0.01,
#       "parameter_groups": [[["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}]],
      "lr": 2e-5,
      "eps": 1e-8,
      "correct_bias": true
    },
    learning_rate_scheduler: {
      type: "slanted_triangular",
      cut_frac: 0.06
    },
    // "grad_norm": 1.0,
    "num_epochs": epochs,
    num_serialized_models_to_keep: 1,
    "num_gradient_accumulation_steps": gradient_accumulation_steps,
    "patience": 5,
    "cuda_device": 0,
    "validation_metric": "+f1_macro",
  },
  "random_seed": 42,
  "numpy_seed": 42,
  "pytorch_seed": 42,
}

Overwriting models/relation_predictor_transf/config_rubert.json


### 3. Scripts for training/prediction 

Train a model

In [25]:
%%writefile models/train_relation_predictor_bert.sh
# usage:
# $ cd models 
# $ sh train_relation_predictor.sh rubert rubert_01

export METHOD=${1}
export RESULT_DIR=${2}
export DEV_FILE_PATH="nlabel_cf_dev.json"
export TEST_FILE_PATH="nlabel_cf_test.json"

rm -r relation_predictor_transf/${RESULT_DIR}/
allennlp train -s relation_predictor_transf/${RESULT_DIR}/ relation_predictor_transf/config_${METHOD}.json \
    --include-package bimpm_custom_package
allennlp predict --use-dataset-reader --silent \
    --output-file relation_predictor_transf/${RESULT_DIR}/predictions_dev.json relation_predictor_transf/${RESULT_DIR}/model.tar.gz relation_predictor_transf/${DEV_FILE_PATH} \
    --include-package bimpm_custom_package \
    --predictor textual-entailment
allennlp predict --use-dataset-reader --silent \
    --output-file relation_predictor_transf/${RESULT_DIR}/predictions_test.json relation_predictor_transf/${RESULT_DIR}/model.tar.gz relation_predictor_transf/${TEST_FILE_PATH} \
    --include-package bimpm_custom_package \
    --predictor textual-entailment

Overwriting models/train_relation_predictor_bert.sh


### 4. Evaluate classifier

In [26]:
def load_predictions(path):
    result = []
    vocab = []
    
    with open(path, 'r') as file:
        for line in file.readlines():
            line = json.loads(line)
            if line.get("label"):
                result.append(line.get("label"))
            elif line.get("label_probs"):
                if not vocab:
                    vocab = open(path[:path.rfind('/')] + '/vocabulary/labels.txt', 'r').readlines()
                    vocab = [label.strip() for label in vocab]
                
                result.append(vocab[np.argmax(line.get("label_probs"))])
            
    print('length of result:', len(result))
    return result

In [27]:
RESULT_DIR = 'rubert'

In [28]:
DEV_FILE_PATH

'models/relation_predictor_transf/nlabel_cf_dev.json'

In [29]:
import pandas as pd
import json

dev_samples = pd.read_pickle(os.path.join(IN_PATH, 'dev_samples.pkl'))
true = dev_samples.relation.values.tolist()
# pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json')

In [31]:
from sklearn.metrics import classification_report

print(classification_report(true[:len(pred)], pred, digits=4))

NameError: name 'pred' is not defined