In [2]:
%load_ext autoreload
%autoreload 2

### Loading data 

In [3]:
import pandas as pd
from tqdm import tqdm
import razdel
import fasttext
import os
import json
tqdm.pandas()


data = pd.read_pickle('data/train.annotated.pkl')
data['tokens'] = data.annot.map(lambda row: ' '.join([tok.text for tok in row['tokens']]))

In [4]:
data.head(1)

Unnamed: 0,text_id,text,masks_stance,masks_argument,quarantine_stance,quarantine_argument,vaccines_stance,vaccines_argument,annot,tokens
0,17024,"[USER], согласно предписаниям Роспотребнадзора...",-1,-1,1,1,-1,-1,{'text': 'согласно предписаниям Роспотребнадзо...,"согласно предписаниям Роспотребнадзора , все т..."


In [5]:
symbol_mapper = {
    'ё': 'е',
    '“': '«',
    '”': '»'
}

data.tokens = data.tokens.replace(symbol_mapper, regex=True)

In [7]:
import youtokentome as yttm
import re

MODELNAME = 'bpe.model'


def train_bpe(texts, vocab_size=10000):
    texts_filename = 'texts.txt'
    with open(texts_filename, 'w') as f:
        for text in texts:
            f.write(text + '\n')
    
    yttm.BPE.train(data=texts_filename, vocab_size=vocab_size, model=MODELNAME)
            
train_bpe(data.tokens.map(lambda row: row.lower()))
bpe = yttm.BPE(model=MODELNAME)

def bpe_tokenize(text):
    return ' '.join(bpe.encode(text, output_type=yttm.OutputType.SUBWORD))

data['bpe'] = data.tokens.map(lambda row: bpe_tokenize(row.lower()))

In [8]:
data.head(1)

Unnamed: 0,text_id,text,masks_stance,masks_argument,quarantine_stance,quarantine_argument,vaccines_stance,vaccines_argument,annot,tokens,bpe
0,17024,"[USER], согласно предписаниям Роспотребнадзора...",-1,-1,1,1,-1,-1,{'text': 'согласно предписаниям Роспотребнадзо...,"согласно предписаниям Роспотребнадзора , все т...","▁согласно ▁предписа ниям ▁роспотребнадзора ▁, ..."


### CV split 

In [9]:
import json
import os
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
i = 0
for train, test in kf.split(data):
    pathname = f'data/fold_{i}'
    if os.path.isdir(pathname):
        import shutil
        shutil.rmtree(pathname)
    
    os.mkdir(pathname)
    
    data.iloc[train].reset_index(drop=True).to_pickle(os.path.join(pathname, 'train.pkl'))
    data.iloc[test].reset_index(drop=True).to_pickle(os.path.join(pathname, 'test.pkl'))

    bpe_data = pd.DataFrame({
        'text': data.bpe,
        'masks_stance': data.masks_stance,
        'masks_argument': data.masks_argument,
        'quarantine_stance': data.quarantine_stance,
        'quarantine_argument': data.quarantine_argument,
        'vaccines_stance': data.vaccines_stance,
        'vaccines_argument': data.vaccines_argument
    })
    
    with open(os.path.join(pathname, 'train_bpe.json'), 'w') as fp:
        fp.write('\n'.join(json.dumps(i) for i in bpe_data.iloc[train].to_dict('records')) + '\n')
        
    with open(os.path.join(pathname, 'test_bpe.json'), 'w') as fp:
        fp.write('\n'.join(json.dumps(i) for i in bpe_data.iloc[test].to_dict('records')) + '\n')
        
    with open(os.path.join(pathname, 'train_tokens.json'), 'w') as fp:
        fp.write('\n'.join(json.dumps(i) for i in data[['tokens',
                                                        'masks_stance', 'masks_argument',
                                                        'quarantine_stance', 'quarantine_argument',
                                                        'vaccines_stance', 'vaccines_argument'
                                                       ]].iloc[train].to_dict('records')) + '\n')
        
    with open(os.path.join(pathname, 'test_tokens.json'), 'w') as fp:
        fp.write('\n'.join(json.dumps(i) for i in data[['tokens',
                                                        'masks_stance', 'masks_argument',
                                                        'quarantine_stance', 'quarantine_argument',
                                                        'vaccines_stance', 'vaccines_argument'
                                                       ]].iloc[test].to_dict('records')) + '\n')
        
    i += 1
    print(data.iloc[train].shape, data.iloc[test].shape)

In [10]:
! ls -laht data/fold_0

total 54M
-rw-r--r-- 1 root root 809K Jan 30 12:07 test_quarantine_tokens.json
drwxr-xr-x 2 root root 4.0K Jan 30 12:07 .
-rw-r--r-- 1 root root 3.2M Jan 30 12:07 train_quarantine_tokens.json
-rw-r--r-- 1 root root 809K Jan 28 15:35 test_vaccines_tokens.json
-rw-r--r-- 1 root root 3.2M Jan 28 15:35 train_vaccines_tokens.json
-rw-r--r-- 1 root root 808K Jan 28 11:54 test_mask_tokens.json
-rw-r--r-- 1 root root 3.2M Jan 28 11:54 train_mask_tokens.json
drwxr-xr-x 7 root root 4.0K Jan 28 11:52 ..
-rw-r--r-- 1 root root 954K Jan 28 11:52 test_tokens.json
-rw-r--r-- 1 root root 3.8M Jan 28 11:52 train_tokens.json
-rw-r--r-- 1 root root 1.1M Jan 28 11:52 test_bpe.json
-rw-r--r-- 1 root root 4.4M Jan 28 11:52 train_bpe.json
-rw-r--r-- 1 root root 6.3M Jan 28 11:52 test.pkl
-rw-r--r-- 1 root root  26M Jan 28 11:52 train.pkl


### Evaluation functions

The main performance metric in each of the two tasks is the macro F1-score (macro F1rel-score), which is averaged first over three relevance classes (the class “irrelevant” is excluded), and then over topics. More precisely, the following procedure is used:

- for each of the three claims, F1-score is calculated for each class (label) separately;
- F1-scores are averaged over three out of four classes (the “irrelevant” class is excluded) – macro F1rel-score is obtained for a given claim; # aka fine_grained_f1
- macro F1rel-scores for all three claims are averaged – we get macro F1rel-score relative to the task (stance detection or premise classification). # average_f1

As a result, two main macro F1rel-scores will be calculated – one for each task. Participants’ systems will be ranked by these metrics (two separate lists). The F1rel-score for claims and F1-score for individual classes (labels) will be considered auxiliary.

In [11]:
from sklearn.metrics import f1_score
import numpy as np

def fine_grained_f1(true, pred):
    labels = [0, 1, 2]
    return f1_score(true, pred, average='macro', labels=labels)

def average_f1(f1s):
    return np.average(f1s)

### 1. RuBERT 

In [None]:
CONFIG_PATH = 'configs'

if not os.path.isdir(CONFIG_PATH):
    os.mkdir(CONFIG_PATH)

#### 1a Masks 

In [12]:
for fold in tqdm(range(5), desc="Writing train/test files for mask-related classification"):
    pathname = f'data/fold_{fold}'
    
    with open(f'data/fold_{fold}/train_tokens.json', 'r') as file:
        train = pd.read_json(file.read(), lines=True)
    
    train['text'] = train.tokens.map(lambda row: row.lower())
    train['label1'] = train.masks_stance.map(str)
    train['label2'] = train.masks_argument.map(str)
        
    with open(os.path.join(pathname, 'train_mask_tokens.json'), 'w') as fp:
        fp.write('\n'.join(
            json.dumps(i) for i in train[['text', 'label1', 'label2']].to_dict('records')) + '\n')
       
    with open(f'data/fold_{fold}/test_tokens.json', 'r') as file:
        test = pd.read_json(file.read(), lines=True)
        
    test['text'] = test.tokens.map(lambda row: row.lower())
    test['label1'] = test.masks_stance.map(str)
    test['label2'] = test.masks_argument.map(str)
        
    with open(os.path.join(pathname, 'test_mask_tokens.json'), 'w') as fp:
        fp.write('\n'.join(
            json.dumps(i) for i in test[['text', 'label1', 'label2']].to_dict('records')) + '\n')

Writing train/test files for mask-related classification: 100%|██████████| 5/5 [00:00<00:00, 10.37it/s]


In [21]:
%%writefile configs/rubert_masks_0.jsonnet

local embedding_dim = 768;
local foldnum = 0;
local max_length = 512;
local lr = std.parseJson(std.extVar('lr'));
local dropout = std.parseJson(std.extVar('dropout'));
local batch_size = std.parseJson(std.extVar('batch_size'));
local dataset_reader_type = "models_scripts.dataset_readers.two_outputs_rdr.TwoOutputsTextClassificationJsonReader";
local model_type = "models_scripts.models.two_outputs_clf.TwoOutputsTextClassifier";
local model_name = "DeepPavlov/rubert-base-cased";

{
  vocabulary: {
    non_padded_namespaces: ["tokens", "labels1", "labels2"]
  },
  dataset_reader: {
      type: dataset_reader_type,
      num_labels1: 4,
      num_labels2: 4,
      tokenizer: {
        type: "pretrained_transformer",
        model_name: model_name,
        max_length: max_length,
      },
      token_indexers: {
          tokens: {
            type: "pretrained_transformer",
            model_name: model_name,
            max_length: max_length,
            namespace: 'tokens',
      },
    },
  },
  train_data_path: 'data/fold_' + foldnum + '/train_mask_tokens.json',
  validation_data_path: 'data/fold_' + foldnum + '/test_mask_tokens.json',
  model: {
    type: model_type,
    dropout: dropout,
    num_labels1: 4,
    num_labels2: 4,
    label1_weights: [0.2, 0.3, 1.0, 1.0],
    label2_weights: [0.1, 0.1, 1.0, 1.0],
    text_field_embedder: {
        token_embedders: {
            tokens: {
              type: "pretrained_transformer",
              model_name: model_name,
              max_length: max_length,
            },
        },
    },
    seq2vec_encoder: {
        type: "bert_pooler",
           pretrained_model: model_name,
           requires_grad: true,
           dropout: 0.2,
    }
  },
  data_loader: {
    batch_sampler: {
      type: 'bucket',
      batch_size: batch_size,
    },
  },
  trainer: {
    optimizer: {
      type: "huggingface_adamw",
      lr: lr,
      weight_decay: 0.1,
    },
    learning_rate_scheduler: {
      type: "slanted_triangular",
      cut_frac: 0.06
    },
    validation_metric: '+all_mean',
    num_serialized_models_to_keep: 1,
    num_epochs: 30,
    patience: 3,
    cuda_device: 1,
  },
}

Writing configs/rubert_masks_0.jsonnet


In [1]:
%%writefile configs/convbert_masks_0.jsonnet

local embedding_dim = 768;
local foldnum = 0;
local max_length = 512;
local lr = std.parseJson(std.extVar('lr'));
local dropout = std.parseJson(std.extVar('dropout'));
local batch_size = std.parseJson(std.extVar('batch_size'));
local dataset_reader_type = "models_scripts.dataset_readers.two_outputs_rdr.TwoOutputsTextClassificationJsonReader";
local model_type = "models_scripts.models.two_outputs_clf.TwoOutputsTextClassifier";
local model_name = "DeepPavlov/rubert-base-cased-conversational";

{
  vocabulary: {
    non_padded_namespaces: ["tokens", "labels1", "labels2"]
  },
  dataset_reader: {
      type: dataset_reader_type,
      num_labels1: 4,
      num_labels2: 4,
      tokenizer: {
        type: "pretrained_transformer",
        model_name: model_name,
        max_length: max_length,
      },
      token_indexers: {
          tokens: {
            type: "pretrained_transformer",
            model_name: model_name,
            max_length: max_length,
            namespace: 'tokens',
      },
    },
  },
  train_data_path: 'data/fold_' + foldnum + '/train_mask_tokens.json',
  validation_data_path: 'data/fold_' + foldnum + '/test_mask_tokens.json',
  model: {
    type: model_type,
    dropout: dropout,
    num_labels1: 4,
    num_labels2: 4,
    label1_weights: [0.2, 0.3, 1.0, 1.0],
    label2_weights: [0.1, 0.1, 1.0, 1.0],
    text_field_embedder: {
        token_embedders: {
            tokens: {
              type: "pretrained_transformer",
              model_name: model_name,
              max_length: max_length,
            },
        },
    },
    seq2vec_encoder: {
        type: "bert_pooler",
           pretrained_model: model_name,
           requires_grad: true,
           dropout: 0.2,
    }
  },
  data_loader: {
    batch_sampler: {
      type: 'bucket',
      batch_size: batch_size,
    },
  },
  trainer: {
    optimizer: {
      type: "huggingface_adamw",
      lr: lr,
      weight_decay: 0.1,
    },
    learning_rate_scheduler: {
      type: "slanted_triangular",
      cut_frac: 0.06
    },
    validation_metric: '+all_mean',
    num_serialized_models_to_keep: 1,
    num_epochs: 30,
    patience: 3,
    cuda_device: 1,
  },
}

Overwriting configs/convbert_masks_0.jsonnet


In [2]:
%%writefile configs/base_model_params.json

[
  {
    "type": "int",
    "attributes": {
      "name": "batch_size",
      "low": 2,
      "high": 8
    }
  },
  {
    "type": "float",
    "attributes": {
      "name": "dropout",
      "low": 0.0,
      "high": 0.5
    }
  },
  {
    "type": "float",
    "attributes": {
      "name": "lr",
      "low": 2e-6,
      "high": 2e-4,
      "log": true
    }
  }
]

Overwriting configs/base_model_params.json


In [4]:
%%writefile tune_convbert.sh

export METHOD=convbert_masks
# rm -r $METHOD
# mkdir $METHOD
export FOLD=0
export STUDY_NAME=convbert_test30
# optuna delete-study --study-name $STUDY_NAME
allennlp tune configs/${METHOD}_${FOLD}.jsonnet configs/base_model_params.json --serialization-dir $METHOD/fold_${FOLD} \
    --study-name $STUDY_NAME \
    --skip-if-exists \
    --metrics best_validation_all_mean \
    --direction maximize

Overwriting tune_convbert.sh


In [46]:
! allennlp best-params --study-name convbert_test2

2022-01-28 14:46:10,137 - INFO - allennlp.common.plugins - Plugin allennlp_models available
2022-01-28 14:46:10,773 - INFO - allennlp.common.plugins - Plugin allennlp_optuna available
batch_size=4 dropout=0.4424265997222608 lr=2.533292198731432e-05


In [25]:
import glob
import json
import os


def find_top_trials(path):
    mean_all = dict()
    for directory in glob.glob(path):
        try:
            metrics = json.load(open(os.path.join(directory, 'metrics.json'), 'r'))
            mean_all[directory] = (metrics.get('best_validation_all_mean'), 
                                   metrics.get('best_validation_f1_1_mean'),
                                   metrics.get('best_validation_f1_2_mean'))
        except:
            pass
    return {key: value for key, value in sorted(mean_all.items(), key=lambda x: x[1][0], reverse=True)}

In [41]:
find_top_trials('convbert_masks/fold_0/trial_*')

{}

In [48]:
%%writefile convbert_masks.sh

export METHOD=convbert_masks
rm -r $METHOD
mkdir $METHOD
export batch_size=4
export dropout=0.44
export lr=2.5e-05

python utils/make_k_copies.py --filename configs/${METHOD}_0.jsonnet --k 5

export FOLD=0
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_mask_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_mask_tokens.json \
                 --include-package models_scripts

export FOLD=1
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_mask_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_mask_tokens.json \
                 --include-package models_scripts

export FOLD=2
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_mask_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_mask_tokens.json \
                 --include-package models_scripts

export FOLD=3
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_mask_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_mask_tokens.json \
                 --include-package models_scripts

export FOLD=4
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_mask_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_mask_tokens.json \
                 --include-package models_scripts

Overwriting convbert_masks.sh


#### 1b Vaccines

In [49]:
for fold in tqdm(range(5), desc="Writing train/test files for vaccines-related classification"):
    pathname = f'data/fold_{fold}'
    
    with open(f'data/fold_{fold}/train_tokens.json', 'r') as file:
        train = pd.read_json(file.read(), lines=True)
    
    train['text'] = train.tokens.map(lambda row: row.lower())
    train['label1'] = train.vaccines_stance.map(str)
    train['label2'] = train.vaccines_argument.map(str)
        
    with open(os.path.join(pathname, 'train_vaccines_tokens.json'), 'w') as fp:
        fp.write('\n'.join(
            json.dumps(i) for i in train[['text', 'label1', 'label2']].to_dict('records')) + '\n')
       
    with open(f'data/fold_{fold}/test_tokens.json', 'r') as file:
        test = pd.read_json(file.read(), lines=True)
        
    test['text'] = test.tokens.map(lambda row: row.lower())
    test['label1'] = test.vaccines_stance.map(str)
    test['label2'] = test.vaccines_argument.map(str)
        
    with open(os.path.join(pathname, 'test_vaccines_tokens.json'), 'w') as fp:
        fp.write('\n'.join(
            json.dumps(i) for i in test[['text', 'label1', 'label2']].to_dict('records')) + '\n')

Writing train/test files for vaccines-related classification: 100%|██████████| 5/5 [00:00<00:00,  5.38it/s]


In [39]:
%%writefile configs/convbert_vaccines_0.jsonnet

local embedding_dim = 768;
local foldnum = 0;
local max_length = 512;
local lr = std.parseJson(std.extVar('lr'));
local dropout = std.parseJson(std.extVar('dropout'));
local batch_size = std.parseJson(std.extVar('batch_size'));
local dataset_reader_type = "models_scripts.dataset_readers.two_outputs_rdr.TwoOutputsTextClassificationJsonReader";
local model_type = "models_scripts.models.two_outputs_clf.TwoOutputsTextClassifier";
local model_name = "DeepPavlov/rubert-base-cased-conversational";

{
  vocabulary: {
    non_padded_namespaces: ["tokens", "labels1", "labels2"]
  },
  dataset_reader: {
      type: dataset_reader_type,
      num_labels1: 4,
      num_labels2: 4,
      tokenizer: {
        type: "pretrained_transformer",
        model_name: model_name,
        max_length: max_length,
      },
      token_indexers: {
          tokens: {
            type: "pretrained_transformer",
            model_name: model_name,
            max_length: max_length,
            namespace: 'tokens',
      },
    },
  },
  train_data_path: 'data/fold_' + foldnum + '/train_vaccines_tokens.json',
  validation_data_path: 'data/fold_' + foldnum + '/test_vaccines_tokens.json',
  model: {
    type: model_type,
    dropout: dropout,
    num_labels1: 4,
    num_labels2: 4,
    label1_weights: [0.1, 0.4, 1.0, 1.0],
    label2_weights: [0.1, 0.1, 0.6, 1.0],
    text_field_embedder: {
        token_embedders: {
            tokens: {
              type: "pretrained_transformer",
              model_name: model_name,
              max_length: max_length,
            },
        },
    },
    seq2vec_encoder: {
        type: "bert_pooler",
           pretrained_model: model_name,
           requires_grad: true,
           dropout: 0.2,
    }
  },
  data_loader: {
    batch_sampler: {
      type: 'bucket',
      batch_size: batch_size,
    },
  },
  trainer: {
    optimizer: {
      type: "huggingface_adamw",
      lr: lr,
      weight_decay: 0.1,
    },
    learning_rate_scheduler: {
      type: "slanted_triangular",
      cut_frac: 0.06
    },
    validation_metric: '+all_mean',
    num_serialized_models_to_keep: 1,
    num_epochs: 30,
    patience: 3,
    cuda_device: 1,
  },
}

Overwriting configs/convbert_vaccines_0.jsonnet


In [62]:
%%writefile tune_convbert.sh

export METHOD=convbert_vaccines
# rm -r $METHOD
# mkdir $METHOD
export FOLD=0
export STUDY_NAME=convbert_test3
# optuna delete-study --study-name $STUDY_NAME
allennlp tune configs/${METHOD}_${FOLD}.jsonnet configs/base_model_params.json --serialization-dir $METHOD/fold_${FOLD} \
    --study-name $STUDY_NAME \
    --skip-if-exists \
    --metrics best_validation_all_mean \
    --direction maximize

Overwriting tune_convbert.sh


In [53]:
! allennlp best-params --study-name convbert_test3

2022-01-30 11:08:21,283 - INFO - allennlp.common.plugins - Plugin allennlp_models available
2022-01-30 11:08:22,059 - INFO - allennlp.common.plugins - Plugin allennlp_optuna available
batch_size=8 dropout=0.16807920937155635 lr=5.389378433438082e-06


In [None]:
find_top_trials('convbert_vaccines/fold_0/trial_*')

In [63]:
%%writefile convbert_vaccines.sh

export METHOD=convbert_vaccines
rm -r $METHOD
mkdir $METHOD
export batch_size=8
export dropout=0.2
export lr=5e-06

python utils/make_k_copies.py --filename configs/${METHOD}_0.jsonnet --k 5

export FOLD=0
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_vaccines_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_vaccines_tokens.json \
                 --include-package models_scripts

export FOLD=1
rm -r $METHOD/fold_${FOLD}
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_vaccines_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_vaccines_tokens.json \
                 --include-package models_scripts

export FOLD=2
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_vaccines_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_vaccines_tokens.json \
                 --include-package models_scripts

export FOLD=3
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_vaccines_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_vaccines_tokens.json \
                 --include-package models_scripts

export FOLD=4
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_vaccines_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_vaccines_tokens.json \
                 --include-package models_scripts

Overwriting convbert_vaccines.sh


#### 1c Quarantine

In [58]:
for fold in tqdm(range(5), desc="Writing train/test files for quarantine-related classification"):
    pathname = f'data/fold_{fold}'
    
    with open(f'data/fold_{fold}/train_tokens.json', 'r') as file:
        train = pd.read_json(file.read(), lines=True)
    
    train['text'] = train.tokens.map(lambda row: row.lower())
    train['label1'] = train.quarantine_stance.map(str)
    train['label2'] = train.quarantine_argument.map(str)
        
    with open(os.path.join(pathname, 'train_quarantine_tokens.json'), 'w') as fp:
        fp.write('\n'.join(
            json.dumps(i) for i in train[['text', 'label1', 'label2']].to_dict('records')) + '\n')
       
    with open(f'data/fold_{fold}/test_tokens.json', 'r') as file:
        test = pd.read_json(file.read(), lines=True)
        
    test['text'] = test.tokens.map(lambda row: row.lower())
    test['label1'] = test.quarantine_stance.map(str)
    test['label2'] = test.quarantine_argument.map(str)
        
    with open(os.path.join(pathname, 'test_quarantine_tokens.json'), 'w') as fp:
        fp.write('\n'.join(
            json.dumps(i) for i in test[['text', 'label1', 'label2']].to_dict('records')) + '\n')

Writing train/test files for quarantine-related classification: 100%|██████████| 5/5 [00:01<00:00,  3.33it/s]


In [7]:
%%writefile configs/convbert_quarantine_0.jsonnet

local embedding_dim = 768;
local foldnum = 0;
local max_length = 512;
local lr = std.parseJson(std.extVar('lr'));
local dropout = std.parseJson(std.extVar('dropout'));
local batch_size = std.parseJson(std.extVar('batch_size'));
local dataset_reader_type = "models_scripts.dataset_readers.two_outputs_rdr.TwoOutputsTextClassificationJsonReader";
local model_type = "models_scripts.models.two_outputs_clf.TwoOutputsTextClassifier";
local model_name = "DeepPavlov/rubert-base-cased-conversational";

{
  vocabulary: {
    non_padded_namespaces: ["tokens", "labels1", "labels2"]
  },
  dataset_reader: {
      type: dataset_reader_type,
      num_labels1: 4,
      num_labels2: 4,
      tokenizer: {
        type: "pretrained_transformer",
        model_name: model_name,
        max_length: max_length,
      },
      token_indexers: {
          tokens: {
            type: "pretrained_transformer",
            model_name: model_name,
            max_length: max_length,
            namespace: 'tokens',
      },
    },
  },
  train_data_path: 'data/fold_' + foldnum + '/train_quarantine_tokens.json',
  validation_data_path: 'data/fold_' + foldnum + '/test_quarantine_tokens.json',
  model: {
    type: model_type,
    dropout: dropout,
    num_labels1: 4,
    num_labels2: 4,
    label1_weights: [0.1, 0.1, 0.3, 1.0],
    label2_weights: [0.1, 0.1, 0.6, 1.0],
    text_field_embedder: {
        token_embedders: {
            tokens: {
              type: "pretrained_transformer",
              model_name: model_name,
              max_length: max_length,
            },
        },
    },
    seq2vec_encoder: {
        type: "bert_pooler",
           pretrained_model: model_name,
           requires_grad: true,
           dropout: 0.2,
    }
  },
  data_loader: {
    batch_sampler: {
      type: 'bucket',
      batch_size: batch_size,
    },
  },
  trainer: {
    optimizer: {
      type: "huggingface_adamw",
      lr: lr,
      weight_decay: 0.1,
    },
    learning_rate_scheduler: {
      type: "slanted_triangular",
      cut_frac: 0.06
    },
    validation_metric: '+all_mean',
    num_serialized_models_to_keep: 1,
    num_epochs: 30,
    patience: 3,
    cuda_device: 1,
  },
}

Overwriting configs/convbert_quarantine_0.jsonnet


In [8]:
%%writefile tune_convbert_q.sh

export METHOD=convbert_quarantine
rm -r $METHOD
mkdir $METHOD
export FOLD=0
export STUDY_NAME=convbert_test10
# optuna delete-study --study-name $STUDY_NAME
allennlp tune configs/${METHOD}_${FOLD}.jsonnet configs/base_model_params.json --serialization-dir $METHOD/fold_${FOLD} \
    --study-name $STUDY_NAME \
    --skip-if-exists \
    --metrics best_validation_all_mean \
    --direction maximize

Overwriting tune_convbert_q.sh


In [8]:
! allennlp best-params --study-name convbert_test10

2022-02-04 21:00:09,211 - INFO - allennlp.common.plugins - Plugin allennlp_models available
2022-02-04 21:00:09,834 - INFO - allennlp.common.plugins - Plugin allennlp_optuna available
batch_size=6 dropout=0.2992458712067729 lr=1.1411634348494446e-05


In [None]:
find_top_trials('convbert_quarantine/fold_0/trial_*')

In [9]:
%%writefile convbert_quarantine.sh

export METHOD=convbert_quarantine
rm -r $METHOD
mkdir $METHOD
export batch_size=6
export dropout=0.3
export lr=1e-05


python utils/make_k_copies.py --filename configs/${METHOD}_0.jsonnet --k 5

export FOLD=0
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_quarantine_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_quarantine_tokens.json \
                 --include-package models_scripts

export FOLD=1
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_quarantine_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_quarantine_tokens.json \
                 --include-package models_scripts

export FOLD=2
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_quarantine_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_quarantine_tokens.json \
                 --include-package models_scripts

export FOLD=3
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_quarantine_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_quarantine_tokens.json \
                 --include-package models_scripts

export FOLD=4
allennlp train -s $METHOD/fold_${FOLD} configs/${METHOD}_${FOLD}.jsonnet
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_test.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/test_quarantine_tokens.json \
                 --include-package models_scripts
                 
allennlp predict --use-dataset-reader --silent \
                 --output-file $METHOD/fold_${FOLD}/predictions_train.json $METHOD/fold_${FOLD}/model.tar.gz \
                 data/fold_${FOLD}/train_quarantine_tokens.json \
                 --include-package models_scripts

Overwriting convbert_quarantine.sh


### Evaluation 

In [334]:
f1_stances = []
f1_arguments = []

f1_masks_stances = []
f1_masks_arguments = []
f1_vaccines_stances = []
f1_vaccines_arguments = []
f1_quarantine_stances = []
f1_quarantine_arguments = []


for fold in range(5):
    pathname = f'data/fold_{fold}'
    
    with open(f'data/fold_{fold}/test_tokens.json', 'r') as file:
        test = pd.read_json(file.read(), lines=True)
        
    with open(f'convbert_masks/fold_{fold}/predictions_test.json', 'r') as file:
        pred = pd.read_json(file.read(), lines=True)
        pred_masks_stance = pred.label1
        pred_masks_argument = pred.label2

    f1_masks_stance = fine_grained_f1(test.masks_stance, pred_masks_stance)
    f1_masks_argument = fine_grained_f1(test.masks_argument, pred_masks_argument)
    print(f'Masks stance: {(f1_masks_stance*100).round(2)}, argument: {(f1_masks_argument*100).round(2)}')
    f1_masks_stances.append(f1_masks_stance)
    f1_masks_arguments.append(f1_masks_argument)
    
    with open(f'convbert_vaccines/fold_{fold}/predictions_test.json', 'r') as file:
        pred = pd.read_json(file.read(), lines=True)
        pred_vac_stance = pred.label1
        pred_vac_argument = pred.label2
        
    f1_vac_stance = fine_grained_f1(test.vaccines_stance, pred_vac_stance)
    f1_vac_argument = fine_grained_f1(test.vaccines_argument, pred_vac_argument)
    print(f'Vaccines stance: {(f1_vac_stance*100).round(2)}, argument: {(f1_vac_argument*100).round(2)}')
    f1_vaccines_stances.append(f1_vac_stance)
    f1_vaccines_arguments.append(f1_vac_argument)
    
    with open(f'convbert_quarantine/fold_{fold}/predictions_test.json', 'r') as file:
        pred = pd.read_json(file.read(), lines=True)
        pred_quarantine_stance = pred.label1
        pred_quarantine_argument = pred.label2
        
    f1_quarantine_stance = fine_grained_f1(test.quarantine_stance, pred_quarantine_stance)
    f1_quarantine_argument = fine_grained_f1(test.quarantine_argument, pred_quarantine_argument)
    print(f'Quarantine stance: {(f1_quarantine_stance*100).round(2)}, argument: {(f1_quarantine_argument*100).round(2)}')
    f1_quarantine_stances.append(f1_quarantine_stance)
    f1_quarantine_arguments.append(f1_quarantine_argument)

    f1_stance = average_f1([f1_masks_stance, f1_vac_stance])
    f1_arg = average_f1([f1_masks_argument, f1_vac_argument])
    
    print(f'Fold {fold} ------------------')
    print('F1 stance\t:::', (f1_stance * 100).round(2))
    print('F1 argument\t:::', (f1_arg * 100).round(2))
    
    f1_stances.append(f1_stance)
    f1_arguments.append(f1_arg)

Masks stance: 60.03, argument: 67.96
Vaccines stance: 58.88, argument: 39.68
Quarantine stance: 42.39, argument: 42.72
Fold 0 ------------------
F1 stance	::: 59.45
F1 argument	::: 53.82
Masks stance: 61.89, argument: 69.94
Vaccines stance: 56.96, argument: 59.94
Quarantine stance: 46.35, argument: 43.7
Fold 1 ------------------
F1 stance	::: 59.42
F1 argument	::: 64.94
Masks stance: 59.47, argument: 56.58
Vaccines stance: 52.76, argument: 48.53
Quarantine stance: 54.79, argument: 51.01
Fold 2 ------------------
F1 stance	::: 56.12
F1 argument	::: 52.56
Masks stance: 61.68, argument: 69.33
Vaccines stance: 47.73, argument: 37.15
Quarantine stance: 38.12, argument: 29.76
Fold 3 ------------------
F1 stance	::: 54.7
F1 argument	::: 53.24
Masks stance: 62.32, argument: 64.1
Vaccines stance: 45.8, argument: 46.69
Quarantine stance: 58.93, argument: 52.75
Fold 4 ------------------
F1 stance	::: 54.06
F1 argument	::: 55.4


In [335]:
print(f'stance \t\t{(np.mean(f1_stances) * 100).round(1)} ± {(np.std(f1_stances) * 100).round(1)}')
print(f'argument\t{(np.mean(f1_arguments) * 100).round(12)} ± {(np.std(f1_arguments) * 100).round(1)}')

stance 		56.8 ± 2.3
argument	55.99121114449 ± 4.6


In [336]:
print(f'mask stance \t{(np.mean(f1_masks_stances) * 100).round(1)} ± {(np.std(f1_masks_stances) * 100).round(1)}')
print(f'mask argument\t{(np.mean(f1_masks_arguments) * 100).round(1)} ± {(np.std(f1_masks_arguments) * 100).round(1)}')

mask stance 	61.1 ± 1.1
mask argument	65.6 ± 4.9


In [337]:
print(f'vaccine stance \t\t{(np.mean(f1_vaccines_stances) * 100).round(1)} ± {(np.std(f1_vaccines_stances) * 100).round(1)}')
print(f'vaccine argument\t{(np.mean(f1_vaccines_arguments) * 100).round(1)} ± {(np.std(f1_vaccines_arguments) * 100).round(1)}')

vaccine stance 		52.4 ± 5.1
vaccine argument	46.4 ± 8.0


In [338]:
print(f'quarantine stance\t{(np.mean(f1_quarantine_stances) * 100).round(1)} ± {(np.std(f1_quarantine_stances) * 100).round(1)}')
print(f'quarantine argument\t{(np.mean(f1_quarantine_arguments) * 100).round(1)} ± {(np.std(f1_quarantine_arguments) * 100).round(1)}')

quarantine stance	48.1 ± 7.7
quarantine argument	44.0 ± 8.1


### Predict on unlabeled data 

In [None]:
dev = pd.read_csv('data/val_empty.tsv', sep='\t')
dev['tokens'] = dev.text.map(tokenize)

In [17]:
model_predictions = [model.predict(text, k=DEPTH) for idx, text in dev.tokens.iteritems()]
pred_masks_stance = [load_prediction('masks_stance', pred) for pred in model_predictions]
pred_masks_argument = [load_prediction('masks_argument', pred) for pred in model_predictions]
pred_quarantine_stance = [load_prediction('quarantine_stance', pred) for pred in model_predictions]
pred_quarantine_argument = [load_prediction('quarantine_argument', pred) for pred in model_predictions]
pred_vac_stance = [load_prediction('vaccines_stance', pred) for pred in model_predictions]
pred_vac_argument = [load_prediction('vaccines_argument', pred) for pred in model_predictions]

In [18]:
dev.masks_stance = pred_masks_stance
dev.masks_argument = pred_masks_argument
dev.quarantine_stance = pred_quarantine_stance
dev.quarantine_argument = pred_quarantine_argument
dev.vaccines_stance = pred_vac_stance
dev.vaccines_argument = pred_vac_argument

In [27]:
dev[['text_id', 'text', 'masks_stance', 'masks_argument', 'quarantine_stance', 'quarantine_argument',
     'vaccines_stance', 'vaccines_argument']].to_csv('data/baseline_dev.tsv', sep='\t', index=None)