# RATIO 2019 - Benchmarking Workshop

Reference: the XLNet Colab example notebook — https://github.com/zihangdai/xlnet/blob/master/notebooks/colab_imdb_gpu.ipynb

### Setup

In [None]:
# Create the conda environment described by environment.yml.
! conda env create -f environment.yml
# Activate the environment, install git-lfs into it and pull the LFS-tracked files.
# NOTE(review): the activation presumably only applies to this one shell line, not to the notebook kernel — confirm.
! conda activate argmining19-ssc && conda install -y -c conda-forge git-lfs && git lfs install && git lfs pull

In [40]:
! pip install sentencepiece
! pip install absl-py
# ! pip install tensorflow-auto-detect
! pip install tensorflow-gpu



In [8]:
import os

if not os.path.exists('data/xlnet_cased_L-24_H-1024_A-16'):
    ! wget https://storage.googleapis.com/xlnet/released_models/cased_L-24_H-1024_A-16.zip
    ! mv cased_L-24_H-1024_A-16.zip data/
    ! cd data/ && unzip cased_L-24_H-1024_A-16.zip
else:
    print('Have XLNet model already!')

Have XLNet model already!


In [4]:
if not os.path.exists('xlnet'):
    ! git clone https://github.com/zihangdai/xlnet.git
else:
    print('Should have repo already!')
    ! cd xlnet && git pull

Cloning into 'xlnet'...
remote: Enumerating objects: 13, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 107 (delta 3), reused 5 (delta 1), pack-reused 94[K
Receiving objects: 100% (107/107), 143.88 KiB | 767.00 KiB/s, done.
Resolving deltas: 100% (46/46), done.


### Convert data format

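The TSV files written below use a GLUE-style sentence-pair layout; see the GLUE data download script for reference: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e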

In [11]:
import csv

import pandas as pd

In [12]:
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'

# Parser settings shared by all four files: every field is double-quoted,
# embedded quotes are backslash-escaped, and the `id` column is the index.
CSV_READ_OPTIONS = dict(quotechar='"',
                        quoting=csv.QUOTE_ALL,
                        encoding='utf-8',
                        escapechar='\\',
                        doublequote=False,
                        index_col='id')

# Cross-topic task
cross_traindev_df = pd.read_csv(data_cross_path.format('training'), **CSV_READ_OPTIONS)
cross_test_df = pd.read_csv(data_cross_path.format('test'), **CSV_READ_OPTIONS)

# Within-topic task
within_traindev_df = pd.read_csv(data_within_path.format('training'), **CSV_READ_OPTIONS)
within_test_df = pd.read_csv(data_within_path.format('test'), **CSV_READ_OPTIONS)

In [13]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    """Set ``row['tag']`` according to the focus topic named in ``row['topic']``.

    The topic text is lower-cased and stripped, then searched for the keywords
    in priority order: "abortion" first, then "gay marriage". When neither
    occurs the tag is the string 'NA'. The row is modified in place and
    returned, which makes the function usable with ``DataFrame.apply(axis=1)``.
    """
    title = row['topic'].lower().strip()
    for keyword in ('abortion', 'gay marriage'):
        if keyword in title:
            row['tag'] = keyword
            break
    else:
        # no focus keyword matched
        row['tag'] = 'NA'
    return row


# Tag every row of all four frames (row-wise apply, so add_tag sees one row at a time).
cross_traindev_df, cross_test_df, within_traindev_df, within_test_df = (
    frame.apply(add_tag, axis=1)
    for frame in (cross_traindev_df, cross_test_df,
                  within_traindev_df, within_test_df)
)

In [14]:
from sklearn.model_selection import train_test_split


def get_train_test_sets(df, ratio=0.30, random_state=1):
    """Split ``df`` into shuffled train and held-out parts.

    Args:
        df: frame with the argument columns, ``topic`` and ``is_same_side``.
        ratio: fraction of rows that goes into the held-out part.
        random_state: seed passed to the splitter.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``; the X frames hold the
        argument texts, argument ids and topic, the y frames the single
        ``is_same_side`` column.
    """
    feature_columns = ['argument1', 'argument2', 'argument1_id', 'argument2_id', 'topic']
    label_columns = ['is_same_side']

    return tuple(train_test_split(df[feature_columns],
                                  df[label_columns],
                                  test_size=ratio,
                                  random_state=random_state,
                                  shuffle=True))

In [45]:
# Split the within-topic training data into train and dev parts
# (default ratio and seed of get_train_test_sets).
X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df)
# The within-topic test frame is used unchanged.
X_test = within_test_df
# Cross-topic variant: use the two lines below instead of the two above.
# X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df)
# X_test = cross_test_df

In [46]:
import os
from tqdm import tqdm


def _tsv_field(value):
    """Return ``value`` as text that fits into one cell of a tab-separated line.

    Tabs and line breaks inside the text would otherwise be read back as
    extra columns / extra rows, so they are replaced by single spaces.
    """
    return str(value).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')


def _same_side_label(value):
    """Map an ``is_same_side`` value to 1 or 0.

    Strings are compared against "true" (case-insensitive) because a
    non-empty string such as "False" would otherwise count as truthy.
    """
    if isinstance(value, str):
        return 1 if value.strip().lower() == 'true' else 0
    return 1 if value else 0


def _write_tsv(path, first_column, frame, first_value):
    """Write ``frame`` to ``path`` as TSV with the argument-pair columns.

    Args:
        path: output file, overwritten if it exists.
        first_column: header name of the leading column ("label" or "index").
        frame: DataFrame with argument1_id, argument2_id, argument1, argument2.
        first_value: callable ``(row_index, row) -> value`` for the leading column.
    """
    header = [first_column, '#1 ID', '#2 ID', '#1 String', '#2 String']
    with open(path, 'w', encoding='utf-8') as fh:
        fh.write("\t".join(header) + "\n")
        for id_, row in tqdm(frame.iterrows(), total=len(frame)):
            fields = [first_value(id_, row), row['argument1_id'],
                      row['argument2_id'], row['argument1'], row['argument2']]
            fh.write("\t".join(_tsv_field(field) for field in fields) + "\n")


DATA_DIR = os.path.join('data/xlnet-in', 'ssc-within')
# DATA_DIR = os.path.join('data/xlnet-in', 'ssc-cross')
# makedirs also creates missing parent directories and is a no-op when the
# directory already exists, so the cell can be re-run safely.
os.makedirs(DATA_DIR, exist_ok=True)

train_df = X_train.join(y_train)
dev_df = X_dev.join(y_dev)
test_df = X_test

# train/dev carry the gold label in the first column ...
_write_tsv(os.path.join(DATA_DIR, 'train.tsv'), 'label', train_df,
           lambda id_, row: _same_side_label(row['is_same_side']))
_write_tsv(os.path.join(DATA_DIR, 'dev.tsv'), 'label', dev_df,
           lambda id_, row: _same_side_label(row['is_same_side']))
# ... the test file has no label and carries the row id instead.
_write_tsv(os.path.join(DATA_DIR, 'test.tsv'), 'index', test_df,
           lambda id_, row: id_)

44732it [00:05, 8215.50it/s]
19171it [00:01, 14331.95it/s]
31475it [00:02, 15577.95it/s]


In [47]:
# Show the header line and the first record of the generated training file.
! head -n 2 data/xlnet-in/ssc-within/train.tsv

label	#1 ID	#2 ID	#1 String	#2 String
1	b67fc3fb-2019-04-17T11:47:41Z-00112-000	b67fc3fb-2019-04-17T11:47:41Z-00112-000	wanted fetuses are beloved "babies"; unwanted ones are "tissue" (inconsistent)	abortions are emotionally and psychologically unsafe.


### Variables

In [59]:
# Settings for the XLNet fine-tuning run; they are concatenated into the
# command line of run_classifier.py in the "Run model" cell.
TASK_NAME = 'ssc-within' #@param{type:"string"}
SCRIPTS_DIR = 'xlnet' #@param {type:"string"}
DATA_DIR = 'data/xlnet-in/ssc-within' #@param {type:"string"}
OUTPUT_DIR = 'data/xlnet-out/ssc-within' #@param {type:"string"}
PRETRAINED_MODEL_DIR = 'data/xlnet_cased_L-24_H-1024_A-16' #@param {type:"string"}
CHECKPOINT_DIR = 'data/xlnet-chkp/ssc-within' #@param {type:"string"}

# Kept as strings because they are joined into the command with `+`.
MAX_SEQ_LEN = '128'
BATCH_SIZE = '4'

### Run model

In [60]:
# Build the fine-tuning call for run_classifier.py from the variables above.
# Each trailing backslash continues the string literal, so the command is one
# single shell line. Training and evaluation are both enabled, and
# eval_all_ckpt asks the script to evaluate all saved checkpoints.
train_command = "python3 " + SCRIPTS_DIR + "/run_classifier.py \
  --do_train=True \
  --do_eval=True \
  --eval_all_ckpt=True \
  --task_name=" + TASK_NAME + " \
  --data_dir=" + DATA_DIR + " \
  --output_dir=" + OUTPUT_DIR + " \
  --model_dir=" + CHECKPOINT_DIR + " \
  --uncased=False \
  --spiece_model_file=" + PRETRAINED_MODEL_DIR + "/spiece.model \
  --model_config_path=" + PRETRAINED_MODEL_DIR + "/xlnet_config.json \
  --init_checkpoint=" + PRETRAINED_MODEL_DIR + "/xlnet_model.ckpt \
  --max_seq_length=" + MAX_SEQ_LEN + " \
  --train_batch_size=" + BATCH_SIZE + " \
  --eval_batch_size=" + BATCH_SIZE + " \
  --num_hosts=1 \
  --num_core_per_host=1 \
  --learning_rate=2e-5 \
  --train_steps=4000 \
  --warmup_steps=500 \
  --save_steps=500 \
  --iterations=500"

# Run the command in a shell; IPython substitutes {train_command} with the string built above.
! {train_command}

W0701 17:50:31.592350 140037696431936 deprecation_wrapper.py:119] From /disk1/users/ekoerner/same-side-classification/argmining19-same-side-classification/xlnet/model_utils.py:295: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0701 17:50:31.593913 140037696431936 deprecation_wrapper.py:119] From xlnet/run_classifier.py:865: The name tf.app.run is deprecated. Please use tf.compat.v1.app.run instead.

W0701 17:50:31.594314 140037696431936 deprecation_wrapper.py:119] From xlnet/run_classifier.py:645: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.

W0701 17:50:31.594387 140037696431936 deprecation_wrapper.py:119] From xlnet/run_classifier.py:645: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.

W0701 17:50:31.594459 140037696431936 deprecation_wrapper.py:119] From xlnet/run_classifier.py:671: The name tf.gfile.Exists is deprecated. Please use tf.io.

W0701 17:50:39.788754 140037696431936 deprecation.py:323] From /disk1/users/ekoerner/same-side-classification/argmining19-same-side-classification/xlnet/model_utils.py:123: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0701 17:50:39.789453 140037696431936 deprecation_wrapper.py:119] From /disk1/users/ekoerner/same-side-classification/argmining19-same-side-classification/xlnet/model_utils.py:131: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.

I0701 17:50:51.597460 140037696431936 estimator.py:1147] Done calling model_fn.
I0701 17:50:51.598851 140037696431936 basic_session_run_hooks.py:541] Create CheckpointSaverHook.
I0701 17:50:56.939395 140037696431936 monitored_session.py:240] Graph was finalized.
2019-07-01 17:50:56.939700: I tensorflow/core/platform/c

I0701 17:55:53.087936 140037696431936 basic_session_run_hooks.py:692] global_step/sec: 3.2974
I0701 17:55:53.089003 140037696431936 basic_session_run_hooks.py:260] loss = 0.67947936, step = 700 (30.327 sec)
I0701 17:56:23.485878 140037696431936 basic_session_run_hooks.py:692] global_step/sec: 3.28969
I0701 17:56:23.486687 140037696431936 basic_session_run_hooks.py:260] loss = 0.6362066, step = 800 (30.398 sec)
I0701 17:56:53.784959 140037696431936 basic_session_run_hooks.py:692] global_step/sec: 3.30044
I0701 17:56:53.785936 140037696431936 basic_session_run_hooks.py:260] loss = 0.8376169, step = 900 (30.299 sec)
I0701 17:57:23.928329 140037696431936 basic_session_run_hooks.py:606] Saving checkpoints for 1000 into data/xlnet-chkp/ssc-within/model.ckpt.
I0701 17:57:27.348334 140037696431936 checkpoint_management.py:95] data/xlnet-chkp/ssc-within/model.ckpt-1000 is not in all_model_checkpoint_paths. Manually adding it.
I0701 17:57:29.178934 140037696431936 basic_session_run_hooks.py:692]

I0701 18:12:07.851596 140037696431936 basic_session_run_hooks.py:692] global_step/sec: 3.2892
I0701 18:12:07.852277 140037696431936 basic_session_run_hooks.py:260] loss = 0.68414915, step = 3800 (30.402 sec)
I0701 18:12:38.287777 140037696431936 basic_session_run_hooks.py:692] global_step/sec: 3.28556
I0701 18:12:38.288453 140037696431936 basic_session_run_hooks.py:260] loss = 0.91425836, step = 3900 (30.436 sec)
I0701 18:13:08.370802 140037696431936 basic_session_run_hooks.py:606] Saving checkpoints for 4000 into data/xlnet-chkp/ssc-within/model.ckpt.
I0701 18:13:12.092422 140037696431936 checkpoint_management.py:95] data/xlnet-chkp/ssc-within/model.ckpt-4000 is not in all_model_checkpoint_paths. Manually adding it.
I0701 18:13:14.057407 140037696431936 estimator.py:368] Loss for final step: 1.038384.
I0701 18:13:14.442138 140037696431936 run_classifier.py:737] Num of eval samples: 19171
I0701 18:13:14.442384 140037696431936 run_classifier.py:413] Create new tfrecord data/xlnet-out/ss

I0701 18:14:08.098958 140037696431936 estimator.py:1145] Calling model_fn.
I0701 18:14:08.104631 140037696431936 modeling.py:453] memory input None
I0701 18:14:08.104758 140037696431936 modeling.py:455] Use float type <dtype: 'float32'>
I0701 18:14:13.995715 140037696431936 run_classifier.py:535] #params: 361319426
I0701 18:14:13.995876 140037696431936 model_utils.py:71] Initialize from the ckpt data/xlnet_cased_L-24_H-1024_A-16/xlnet_model.ckpt
I0701 18:14:14.697473 140037696431936 model_utils.py:85] **** Global Variables ****
I0701 18:14:14.697630 140037696431936 model_utils.py:91]   name = model/transformer/r_w_bias:0, shape = (24, 16, 64), *INIT_FROM_CKPT*
I0701 18:14:14.697690 140037696431936 model_utils.py:91]   name = model/transformer/r_r_bias:0, shape = (24, 16, 64), *INIT_FROM_CKPT*
I0701 18:14:14.697729 140037696431936 model_utils.py:91]   name = model/transformer/word_embedding/lookup_table:0, shape = (32000, 1024), *INIT_FROM_CKPT*
I0701 18:14:14.697766 140037696431936 mod

I0701 18:14:14.728640 140037696431936 estimator.py:1147] Done calling model_fn.
I0701 18:14:14.740313 140037696431936 evaluation.py:255] Starting evaluation at 2019-07-01T18:14:14Z
I0701 18:14:15.189416 140037696431936 monitored_session.py:240] Graph was finalized.
2019-07-01 18:14:15.190423: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:17:00.0
2019-07-01 18:14:15.190958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 1 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:65:00.0
2019-07-01 18:14:15.191003: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
2019-07-01 18:14:15.191012: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-07-01 18:14:1

I0701 18:21:40.754153 140037696431936 model_utils.py:91]   name = model/transformer/layer_23/ff/layer_2/kernel:0, shape = (4096, 1024), *INIT_FROM_CKPT*
I0701 18:21:40.754185 140037696431936 model_utils.py:91]   name = model/transformer/layer_23/ff/layer_2/bias:0, shape = (1024,), *INIT_FROM_CKPT*
I0701 18:21:40.754215 140037696431936 model_utils.py:91]   name = model/transformer/layer_23/ff/LayerNorm/beta:0, shape = (1024,), *INIT_FROM_CKPT*
I0701 18:21:40.754245 140037696431936 model_utils.py:91]   name = model/transformer/layer_23/ff/LayerNorm/gamma:0, shape = (1024,), *INIT_FROM_CKPT*
I0701 18:21:40.754276 140037696431936 model_utils.py:91]   name = model/sequnece_summary/summary/kernel:0, shape = (1024, 1024)
I0701 18:21:40.754307 140037696431936 model_utils.py:91]   name = model/sequnece_summary/summary/bias:0, shape = (1024,)
I0701 18:21:40.754338 140037696431936 model_utils.py:91]   name = model/classification_ssc-within/logit/kernel:0, shape = (1024, 2)
I0701 18:21:40.754370 1

In [61]:
help_command = "python3 " + SCRIPTS_DIR + "/run_classifier.py --help"

! {help_command}

W0701 18:31:50.251414 140541152442176 deprecation_wrapper.py:119] From /disk1/users/ekoerner/same-side-classification/argmining19-same-side-classification/xlnet/model_utils.py:295: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0701 18:31:50.253334 140541152442176 deprecation_wrapper.py:119] From xlnet/run_classifier.py:865: The name tf.app.run is deprecated. Please use tf.compat.v1.app.run instead.


       USAGE: xlnet/run_classifier.py [flags]
flags:

xlnet/run_classifier.py:
  --adam_epsilon: Adam epsilon
    (default: '1e-08')
    (a number)
  --clamp_len: Clamp length
    (default: '-1')
    (an integer)
  --clip: Gradient clipping
    (default: '1.0')
    (a number)
  --cls_scope: Classifier layer scope.
  --data_dir: Directory for input data.
    (default: '')
  --decay_method: poly or cos
    (default: 'poly')
  --[no]do_eval: whether to do eval
    (default: 'false')
  --[no]do_predict: whether to do prediction
    (default: 'fal

In [None]:
# NOTE(review): the line below is not valid Python. It is presumably a deliberate
# barrier that makes "Run All" stop here, before the BERT part — confirm, and
# consider an explicit `raise` with a message instead.
stop it

In [1]:
import datetime
import logging
import os
import random
import time
import warnings

import csv
import gluonnlp as nlp
import matplotlib.pyplot as plt
import mxnet as mx
import numpy as np
import pandas as pd
import seaborn as sns

from bert import *
from mxnet import gluon
from mxnet.gluon.data import SimpleDataset
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.model_selection import train_test_split
from sklearn import utils
from tqdm import tqdm

In [2]:
# Suppress all Python warnings from here on.
warnings.filterwarnings('ignore')

# Log messages of level INFO and above, prefixed with timestamp and level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

In [3]:
# Set a repeatable random state for NumPy, Python's `random` module and MXNet.
# NOTE(review): MXNet is seeded with 10000 while the other two use 100 — confirm this is intended.
np.random.seed(100)
random.seed(100)
mx.random.seed(10000)

In [4]:
# Register tqdm with pandas so that .progress_apply() can be used in place of
# .apply() to get a progress bar.
tqdm.pandas()

In [5]:
class Timer:
    """Context manager that prints how long its ``with`` block took.

    Usage::

        with Timer("training") as timer:
            ...
        timer.time_delta  # elapsed time as datetime.timedelta

    Args:
        name: optional label included in the printed message.

    Attributes:
        time_start: ``time.time()`` value taken on entry (None before entry).
        time_delta: elapsed ``datetime.timedelta``, set on exit (None before).
    """

    def __init__(self, name=None):
        self.name = name
        self.time_start = None
        self.time_delta = None

    def __enter__(self):
        self.time_start = time.time()
        # Return the instance so `with Timer(...) as t` binds the timer
        # instead of None.
        return self

    def __exit__(self, *exc):
        time_end = time.time()
        # Keep the measurement so callers can read it after the block.
        self.time_delta = datetime.timedelta(seconds=(time_end - self.time_start))
        if self.name:
            print("Time for [{}]: {}".format(self.name, self.time_delta))
        else:
            print("Time: {}".format(self.time_delta))
        # A falsy return value lets exceptions raised inside the block propagate.
        return False

# Task 1 - Same Side Classification

## Train model - Baseline

### BERT

- https://gluon-nlp.mxnet.io/examples/sentence_embedding/bert.html

In [39]:
class MyBERTDataset(SimpleDataset):
    """Argument-pair dataset built eagerly from DataFrames.

    Every sample is a list ``[argument1, argument2, label]``. With labels
    ``y`` the label is the string "1" when ``is_same_side`` prints as "True"
    and "0" otherwise; without ``y`` the label is None.

    Args:
        X: DataFrame with the columns ``argument1`` and ``argument2``.
        y: optional DataFrame with the column ``is_same_side``, matched to
            ``X`` through the index.
    """

    def __init__(self, X, y=None):
        self._X = X
        self._y = y
        super(MyBERTDataset, self).__init__(self._convert())

    def _convert(self):
        """Build the full list of samples from the stored frames."""
        if self._y is None:
            # unlabelled data: keep the three-element layout, label stays empty
            return [[row['argument1'], row['argument2'], None]
                    for _, row in self._X.iterrows()]

        # pair each argument row with its label via the shared index
        labelled = self._X.merge(self._y, left_index=True, right_index=True)
        return [[row['argument1'],
                 row['argument2'],
                 "1" if str(row['is_same_side']) == "True" else "0"]
                for _, row in labelled.iterrows()]

    # Sketch of a lazy alternative (not active):
    #
    # def __getitem__(self, idx):
    #     row_X = self._X.iloc[idx]
    #     row_y = self._y.iloc[idx]
    #     return [row_X['argument1'], row_X['argument2'], "1" if row_y['is_same_side'] else "0"]
    #
    # def __len__(self):
    #     return len(self._X)

In [None]:
# df = X_dev.merge(y_dev, left_index=True, right_index=True)
# allsamples = list()
# for _, row in df.iterrows():
#     allsamples.append("1" if row['is_same_side'] == "True" else "0")
# np.unique(allsamples)

In [12]:
def setup_bert(ctx=None, max_len=128):
    """Create the BERT sentence-pair classifier and its training helpers.

    Loads the pretrained ``bert_12_768_12`` model (uncased BookCorpus/Wikipedia
    weights), puts a freshly initialised two-class classifier on top and
    builds the matching tokenizer transform.

    Args:
        ctx: MXNet context for the model. When None, the first GPU is used if
            MXNet reports at least one GPU, otherwise the CPU.
        max_len: maximum sequence length passed to the dataset transform.

    Returns:
        The tuple ``(model, vocabulary, ctx, transform, loss_function, metric,
        all_labels)``.
    """
    if ctx is None:
        # Fall back to the CPU instead of failing on machines without a GPU.
        ctx = mx.gpu(0) if mx.context.num_gpus() else mx.cpu()

    bert_base, vocabulary = nlp.model.get_model('bert_12_768_12',
                                                 dataset_name='book_corpus_wiki_en_uncased',
                                                 pretrained=True, ctx=ctx, use_pooler=True,
                                                 use_decoder=False, use_classifier=False)
    print(bert_base)

    # NOTE(review): `bert` and `dataset` are presumably provided by the
    # `from bert import *` at the top of the notebook — confirm.
    model = bert.BERTClassifier(bert_base, num_classes=2, dropout=0.1)
    # only need to initialize the classifier layer.
    model.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
    model.hybridize(static_alloc=True)

    # softmax cross entropy loss for classification
    loss_function = gluon.loss.SoftmaxCELoss()
    loss_function.hybridize(static_alloc=True)

    metric = mx.metric.Accuracy()

    # use the vocabulary from pre-trained model for tokenization
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
    # the labels for the two classes
    all_labels = ["0", "1"]
    # whether to transform the data as sentence pairs.
    # for single sentence classification, set pair=False
    pair = True
    transform = dataset.BERTDatasetTransform(bert_tokenizer, max_len,
                                             labels=all_labels,
                                             label_dtype='int32',
                                             pad=True,
                                             pair=pair)

    return model, vocabulary, ctx, transform, loss_function, metric, all_labels

In [43]:
def transform_dataset(X, y, transform):
    """Wrap ``X``/``y`` in a MyBERTDataset and apply ``transform`` to it.

    Returns:
        ``(raw_dataset, transformed_dataset)`` — the untransformed samples and
        the result of ``raw_dataset.transform(transform)``.
    """
    raw_dataset = MyBERTDataset(X, y)
    return raw_dataset, raw_dataset.transform(transform)


def predict_out_to_ys(all_predictions, all_labels):
    """Flatten per-batch prediction output into two label-id arrays.

    Args:
        all_predictions: iterable of ``(batch_id, labels, outputs)`` as
            collected by ``predict``; ``labels`` and ``outputs`` must offer
            ``asnumpy()``.
        all_labels: label names; currently unused (ids are returned as-is).

    Returns:
        ``(y_true, y_pred)`` as NumPy arrays of label ids over all batches.
    """
    true_ids, predicted_ids = [], []

    for _batch_id, batch_labels, batch_outputs in all_predictions:
        # labels: first row of the transposed label array of the batch
        true_ids += list(batch_labels.T[0].asnumpy())
        # outputs hold one score per class along axis 1; the predicted class
        # is the position of the largest score
        predicted_ids += list(np.argmax(batch_outputs, axis=1).asnumpy())
        # TODO: convert label_id to label?
        # predicted_ids.extend(all_labels[c] for c in ...)

    return np.array(true_ids), np.array(predicted_ids)

In [14]:
def train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=3, checkpoint_dir="data", use_checkpoints=True):
    """Fine-tune ``model`` on ``data_train`` and return per-batch statistics.

    Args:
        model: network called as ``model(token_ids, segment_ids, valid_length)``.
        data_train: transformed dataset; each item unpacks into
            ``(token_ids, valid_length, segment_ids, label)``.
        ctx: MXNet context the batches are moved to.
        metric: metric object; it is reset at the start of every trained epoch.
        loss_function: loss applied to ``(out, label)``.
        batch_size: batch size for the bucket sampler.
        lr: Adam learning rate.
        num_epochs: number of epochs to walk through (epoch ids 0..num_epochs-1).
        checkpoint_dir: directory for the per-epoch parameter files; None
            puts them into the working directory.
        use_checkpoints: when True, an epoch whose checkpoint file already
            exists is not trained — its parameters are loaded instead — and
            every trained epoch is saved afterwards.

    Returns:
        List with one ``(metric value, batch loss)`` tuple per trained batch.
        The metric value is the one accumulated since the start of the epoch.
        Epochs restored from a checkpoint add no entries.

    Note:
        Only model parameters are saved and loaded. The ``gluon.Trainer`` is
        created anew on every call, so optimizer state is not carried over
        between calls.
    """
    with Timer("setup training"):
        # bucket samples by their length (item[1]) so that a batch holds
        # sequences of similar length
        train_sampler = nlp.data.FixedBucketSampler(
            lengths=[int(item[1]) for item in tqdm(data_train)],
            batch_size=batch_size,
            shuffle=True)
        bert_dataloader = mx.gluon.data.DataLoader(data_train,
                                                   batch_sampler=train_sampler)

        trainer = gluon.Trainer(model.collect_params(), 'adam', {
            'learning_rate': lr,
            'epsilon': 1e-9
        })

        # collect all differentiable parameters
        # grad_req == 'null' indicates no gradients are calculated (e.g. constant parameters)
        # the gradients for these params are clipped later
        params = [p for p in model.collect_params().values() if p.grad_req != 'null']

    # print a progress line every `log_interval` batches
    log_interval = 10
    with Timer("training"):
        stats = list()
        for epoch_id in range(num_epochs):
            if use_checkpoints:
                epoch_checkpoint_savefile = "bert.model.checkpoint{}.params".format(epoch_id)
                if checkpoint_dir is not None:
                    epoch_checkpoint_savefile = os.path.join(checkpoint_dir, epoch_checkpoint_savefile)
                # resume: an existing checkpoint replaces training of this epoch
                if os.path.exists(epoch_checkpoint_savefile):
                    model.load_parameters(epoch_checkpoint_savefile, ctx=ctx)
                    print("loaded checkpoint for epoch {}".format(epoch_id))
                    continue

            with Timer("epoch {}".format(epoch_id)):
                metric.reset()
                # loss summed over the batches since the last progress line
                step_loss = 0
                t_p = time.time()  # time keeping
                for batch_id, (token_ids, valid_length, segment_ids,
                               label) in enumerate(bert_dataloader):
                    with mx.autograd.record():

                        # load data to GPU
                        token_ids = token_ids.as_in_context(ctx)
                        valid_length = valid_length.as_in_context(ctx)
                        segment_ids = segment_ids.as_in_context(ctx)
                        label = label.as_in_context(ctx)

                        # forward computation
                        out = model(token_ids, segment_ids,
                                    valid_length.astype('float32'))
                        ls = loss_function(out, label).mean()

                    # backward computation
                    ls.backward()

                    # gradient clipping
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(params, 1)
                    # the loss is already the batch mean, hence update(1)
                    trainer.update(1)

                    step_loss += ls.asscalar()
                    metric.update([label], [out])
                    stats.append((metric.get()[1], ls.asscalar()))
                    if (batch_id + 1) % (log_interval) == 0:
                        print(
                            '[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f} - time {}'
                            .format(epoch_id, batch_id + 1, len(bert_dataloader),
                                    step_loss / log_interval, trainer.learning_rate,
                                    metric.get()[1],
                                    datetime.timedelta(seconds=(time.time() - t_p))))
                        t_p = time.time()
                        step_loss = 0

            # persist the parameters reached after this epoch
            if use_checkpoints:
                model.save_parameters(epoch_checkpoint_savefile)
            
    return stats

In [47]:
def predict(model, data_predict, ctx, metric, loss_function, batch_size=32):
    """Run ``model`` over ``data_predict`` and collect outputs batch by batch.

    The metric is reset first and then updated with every batch, so it holds
    the score over ``data_predict`` afterwards.

    Args:
        model: network called as ``model(token_ids, segment_ids, valid_length)``.
        data_predict: transformed dataset whose items unpack into
            ``(token_ids, valid_length, segment_ids, label)``.
        ctx: MXNet context the batches are moved to.
        metric: metric object that is reset and updated here.
        loss_function: loss applied to ``(out, label)``.
        batch_size: number of samples per batch.

    Returns:
        ``(all_predictions, cum_loss)`` — a list of ``(batch_id, label, out)``
        tuples and the sum of the per-batch mean losses.
    """
    loader = mx.gluon.data.DataLoader(data_predict, batch_size=batch_size)
    collected = []

    with Timer("prediction"):
        metric.reset()
        total_loss = 0
        for batch_id, batch in enumerate(tqdm(loader)):
            # move the four batch fields to the target device
            token_ids, valid_length, segment_ids, label = (
                field.as_in_context(ctx) for field in batch)

            # forward pass and mean loss of this batch
            out = model(token_ids, segment_ids, valid_length.astype('float32'))
            batch_loss = loss_function(out, label).mean()

            metric.update([label], [out])
            total_loss += batch_loss.asscalar()  # .sum() ?
            collected.append((batch_id, label, out))

    return collected, total_loss

In [16]:
def print_infos(vocabulary, data_train_raw, data_train):
    """Print the first sample before and after the BERT transform.

    Shows the two raw sentences and the raw label, the vocabulary with the
    ids of the [PAD], [CLS] and [SEP] tokens, and then the four fields of the
    transformed sample (token ids, valid length, segment ids, label).
    """
    sample_id = 0

    # raw sample: sentence a, sentence b, label
    # (1 means equivalent, 0 means not equivalent)
    raw_sample = data_train_raw[sample_id]
    for position in range(3):
        print(raw_sample[position])

    print('vocabulary used for tokenization = \n%s'%vocabulary)
    for special_token in ('[PAD]', '[CLS]', '[SEP]'):
        template = special_token + ' token id = %s'
        print(template % (vocabulary[special_token]))

    # transformed sample, fields in the order the transform emits them
    encoded_sample = data_train[sample_id]
    field_titles = ('token ids', 'valid length', 'segment ids', 'label')
    for position, title in enumerate(field_titles):
        template = title + ' = \n%s'
        print(template % encoded_sample[position])
    

def plot_train_stats(stats):
    """Plot accuracy and loss per batch as two stacked panels.

    Args:
        stats: sequence of ``(accuracy, loss)`` tuples as returned by
            ``train``. When empty, a message is printed and nothing is drawn.
    """
    if not stats:
        print("no stats to plot")
        return

    accuracies, losses = zip(*stats)
    batch_axis = np.arange(len(stats))  # one x position per recorded batch

    # upper panel: accuracy
    plt.subplot(2, 1, 1)
    plt.plot(batch_axis, accuracies)  # line styles: '-', 'o-', '.-'
    plt.title('Training BERTClassifier')
    plt.ylabel('Accuracy')

    # lower panel: loss
    plt.subplot(2, 1, 2)
    plt.plot(batch_axis, losses)
    plt.xlabel('Batches')
    plt.ylabel('Loss')

    plt.show()

### Evaluate

In [17]:
def heatconmat(y_test, y_pred):
    """Draw the confusion matrix of ``y_test`` vs. ``y_pred`` as a heatmap.

    The rows are labelled with the sorted unique values of ``y_test``.
    """
    sns.set_context('talk')
    plt.figure(figsize=(9, 6))

    matrix = confusion_matrix(y_test, y_pred)
    row_labels = sorted(np.unique(y_test))
    sns.heatmap(matrix,
                annot=True,
                fmt='d',
                cbar=False,
                cmap='gist_earth_r',
                yticklabels=row_labels)
    plt.show()


def report_training_results(y_test, y_pred, name=None, heatmap=True):
    """Print evaluation figures for predictions and return the F1 scores.

    Prints the confusion matrix (optionally also drawn as heatmap), the
    accuracy rounded to two decimals and the classification report.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
        name: optional model name shown in the report heading.
        heatmap: whether to draw the confusion matrix with ``heatconmat``.

    Returns:
        Dict with the keys 'macro' and 'micro' holding the respective
        F1 scores, each rounded to two decimals.
    """
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    if heatmap:
        heatconmat(y_test, y_pred)
    print()

    accuracy = round(accuracy_score(y_test, y_pred), 2)
    print('Accuracy: ', accuracy, '\n')  #

    heading_suffix = " for [{}]".format(name) if name else ""
    print('Report{}:'.format(heading_suffix))
    print(classification_report(y_test, y_pred))

    return {
        average: round(f1_score(y_pred=y_pred, y_true=y_test, average=average), 2)
        for average in ('macro', 'micro')
    }

### Within topic - Training and evaluating model 

In [None]:
# 1. Getting train and dev data
# Uses get_train_test_sets and within_traindev_df, which are defined in the
# XLNet part of this notebook — those cells have to be run first.
with Timer("1 - test/train split"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df)

In [None]:
# 2. setup
# Creates the classifier, its vocabulary and context, the dataset transform,
# the loss, the metric and the label list.
with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, transform, loss_function, metric, all_labels = setup_bert()

In [None]:
# 3. Build the raw and the transformed training dataset and print the first
# sample as a sanity check.
with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)
    print_infos(vocabulary, data_train_raw, data_train)

In [None]:
# 4. Train for two epochs, store the resulting parameters and plot the
# per-batch accuracy/loss. train() keeps its per-epoch checkpoints in its
# default checkpoint_dir ("data").
with Timer("4 - train model"):
    # train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=3)
    stats = train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=2)
    # model.save_parameters("data/same-side-classification/within-topic/bert.model.params")
    model.save_parameters("data/bert.model.params")

    plot_train_stats(stats)

In [None]:
# 5. Build the raw and the transformed dev dataset with the same transform as
# for training and print its first sample.
with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
    print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
# 6. Reload the stored parameters, predict on the dev set and report the results.
with Timer("6 - evaluate"):
    # model.load_parameters("data/same-side-classification/within-topic/bert.model.params", ctx=ctx)
    model.load_parameters("data/bert.model.params", ctx=ctx)
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function)
    # predict() resets and refills the metric, so this is the dev-set value
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier", heatmap=False)

### Train and evaluate each epoch

In [None]:
# Train one more epoch per pass and evaluate after each one.
# Every pass asks train() for epoch_id + 1 epochs. With checkpoints enabled
# (the default), train() loads the parameters of each epoch that already has
# a checkpoint file instead of training it, so only epochs without a
# checkpoint are actually trained.
for epoch_id in range(5):
    with Timer("4 - train model - {}".format(epoch_id)):
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=epoch_id + 1)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)):
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function)
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])
        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier", heatmap=False)

    # keep the most recent parameters on disk
    model.save_parameters("data/bert.model.params")

### Cross topic - Training and evaluating model 

In [None]:
# 1. Getting train and dev data
# Rebinds X_train, X_dev, y_train and y_dev to the cross-topic data; uses
# get_train_test_sets and cross_traindev_df from the XLNet part of this notebook.
with Timer("1 - test/train split"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df)

In [None]:
# 2. setup
# Creates a new classifier (and helpers) for the cross-topic run; this
# rebinds `model` and the other names from the within-topic section.
with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, transform, loss_function, metric, all_labels = setup_bert()

In [None]:
# 3. Build the raw and the transformed cross-topic training dataset and print
# the first sample as a sanity check.
with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)
    print_infos(vocabulary, data_train_raw, data_train)

In [None]:
# 4. Train the cross-topic model for two epochs, store the parameters and
# plot the per-batch accuracy/loss.
with Timer("4 - train model"):
    # The epoch checkpoints get their own directory. With train()'s default
    # checkpoint_dir="data" the checkpoint files written by the within-topic
    # run would be found, loaded and the corresponding epochs skipped, i.e.
    # the cross-topic model would not be trained on cross-topic data.
    cross_checkpoint_dir = "data/same-side-classification/cross-topic"
    os.makedirs(cross_checkpoint_dir, exist_ok=True)

    # train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=3)
    stats = train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=2,
                  checkpoint_dir=cross_checkpoint_dir)
    # model.save_parameters("data/same-side-classification/cross-topic/bert.model.params")
    model.save_parameters("data/bert.model.params")

    plot_train_stats(stats)

In [None]:
# 5. Build the raw and the transformed cross-topic dev dataset and print its
# first sample.
with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
    print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
# 6. Reload the stored parameters, predict on the cross-topic dev set and
# report the results.
# NOTE(review): "data/bert.model.params" is the same file the within-topic
# section writes to, so it holds whichever model was saved last.
with Timer("6 - evaluate"):
    # model.load_parameters("data/same-side-classification/cross-topic/bert.model.params", ctx=ctx)
    model.load_parameters("data/bert.model.params", ctx=ctx)
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function)
    # predict() resets and refills the metric, so this is the dev-set value
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier", heatmap=False)

In [None]:
# Train one more cross-topic epoch per pass and evaluate after each one.
# The epoch checkpoints live in a cross-topic directory: with train()'s
# default checkpoint_dir="data", the checkpoint files of the within-topic run
# would be loaded here and the corresponding epochs skipped.
cross_checkpoint_dir = "data/same-side-classification/cross-topic"
os.makedirs(cross_checkpoint_dir, exist_ok=True)

for epoch_id in range(5):
    with Timer("4 - train model - {}".format(epoch_id)):
        # epochs that already have a checkpoint in cross_checkpoint_dir are
        # restored by train(); the remaining ones are trained
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=epoch_id + 1,
                      checkpoint_dir=cross_checkpoint_dir)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)):
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function)
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])
        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier", heatmap=False)

    # keep the most recent parameters on disk
    model.save_parameters("data/bert.model.params")