# RATIO 2019 - Benchmarking Workshop

GluonNLP installation instructions: https://gluon-nlp.mxnet.io/install.html

```
pip install --upgrade 'mxnet>=1.3.0'
pip install gluonnlp
wget https://gluon-nlp.mxnet.io/_downloads/sentence_embedding.zip
unzip sentence_embedding.zip
ln -s sentence_embedding/bert bert
```

In [30]:
import datetime
import logging
import os
import random
import time
import warnings

import csv
import gluonnlp as nlp
import matplotlib.pyplot as plt
import mxnet as mx
import numpy as np
import pandas as pd
import seaborn as sns

from bert import *
from mxnet import gluon
from mxnet.gluon.data import Dataset, SimpleDataset
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.model_selection import train_test_split
from sklearn import utils
from tqdm import tqdm

In [31]:
# Silence every warning category for the whole session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Root logger: timestamp, level and message, at INFO verbosity.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

In [32]:
# Set repeatable random state: seed NumPy, Python's `random` and MXNet.
# Each library has its own generator, so each is seeded separately.
np.random.seed(100)
random.seed(100)
mx.random.seed(10000)

In [33]:
# Register `.progress_apply()` on pandas objects (progress-bar variant of `.apply()`),
# using the `tqdm` imported in the import cell.
# NOTE(review): the next cell rebinds `tqdm` and registers again — presumably that
# later registration is the one in effect; confirm.
tqdm.pandas()

In [34]:
# Make tqdm Jupyter friendly: from here on the name `tqdm` refers to the
# notebook (widget) progress bar instead of the one imported at the top.
from tqdm import tqdm_notebook as tqdm
# For `.progress_apply()` the registration is done through an *instance* here.
# Creating that instance is what leaves the single idle progress widget in this
# cell's output.
tqdm().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [35]:
class Timer:
    """Context manager that prints the wall-clock time spent inside its ``with`` block.

    Args:
        name: Optional label. When given, the report reads
            ``Time for [<name>]: <delta>``; otherwise ``Time: <delta>``.

    Attributes:
        time_start: ``time.time()`` value captured on entry (``None`` before entry).
        elapsed: ``datetime.timedelta`` measured on exit (``None`` until then).

    Exceptions raised inside the block are not suppressed; the timing line is
    printed either way.
    """

    def __init__(self, name=None):
        self.name = name
        self.time_start = None
        self.elapsed = None

    def __enter__(self):
        self.time_start = time.time()
        # Return the instance so that ``with Timer(...) as t`` binds the timer
        # (and therefore ``t.elapsed``) instead of None.
        return self

    def __exit__(self, *exc):
        self.elapsed = datetime.timedelta(seconds=(time.time() - self.time_start))
        if self.name:
            print(("Time for [{}]: {}".format(self.name, self.elapsed)))
        else:
            print(("Time: {}".format(self.elapsed)))
        # Falsy return value: never swallow an exception from the block.
        return False

# Task 1 - Same Side Classification

In [36]:
# Path templates for the two task variants; `{}` is filled with the split name
# ('training' or 'test') via `.format(...)` when the files are read.
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'

### Load within-topics and cross-topics data

In [37]:
# Load the cross-topic data. The training file is parsed as fully quoted CSV with
# backslash escaping (and no doubled-quote escaping); rows are indexed by `id`.
# NOTE(review): the test file is read with pandas' default quoting/escaping,
# unlike the training file — confirm the two files really differ in format.
with Timer("read cross"):
    cross_traindev_df = pd.read_csv(data_cross_path.format('training'),
                                    quotechar='"',
                                    quoting=csv.QUOTE_ALL,
                                    encoding='utf-8',
                                    escapechar='\\',
                                    doublequote=False,
                                    index_col='id')
    cross_test_df = pd.read_csv(data_cross_path.format('test'), index_col='id')

# Same procedure for the within-topic data.
with Timer("read within"):
    within_traindev_df = pd.read_csv(data_within_path.format('training'),
                                     quotechar='"',
                                     quoting=csv.QUOTE_ALL,
                                     encoding='utf-8',
                                     escapechar='\\',
                                     doublequote=False,
                                     index_col='id')
    within_test_df = pd.read_csv(data_within_path.format('test'), index_col='id')

Time for [read cross]: 0:00:00.860872
Time for [read within]: 0:00:00.830165


In [38]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    """Attach a coarse topic label to ``row`` under the key ``'tag'``.

    The ``'topic'`` value is lower-cased and stripped, then searched for the
    focus topics in priority order: ``'abortion'`` first, then
    ``'gay marriage'``. Rows matching neither get the string ``'NA'``.

    The row is modified in place and also returned, so the function can be
    used directly with ``DataFrame.apply(..., axis=1)``.
    """
    normalized_topic = row['topic'].lower().strip()

    label = 'NA'
    for keyword in ('abortion', 'gay marriage'):
        if keyword in normalized_topic:
            label = keyword
            break

    row['tag'] = label
    return row


# Tag every row of all four frames. Each frame is replaced by the result of the
# row-wise apply, which adds the `tag` column; each pass is timed separately.
with Timer("tag cross traindev"):
    cross_traindev_df = cross_traindev_df.progress_apply(add_tag, axis=1)
with Timer("tag cross test"):
    cross_test_df = cross_test_df.progress_apply(add_tag, axis=1)

with Timer("tag within traindev"):
    within_traindev_df = within_traindev_df.progress_apply(add_tag, axis=1)
with Timer("tag within test"):
    within_test_df = within_test_df.progress_apply(add_tag, axis=1)

HBox(children=(IntProgress(value=0, max=61048), HTML(value='')))


Time for [tag cross traindev]: 0:00:32.794460


HBox(children=(IntProgress(value=0, max=6163), HTML(value='')))


Time for [tag cross test]: 0:00:03.279922


HBox(children=(IntProgress(value=0, max=63903), HTML(value='')))


Time for [tag within traindev]: 0:00:34.059834


HBox(children=(IntProgress(value=0, max=3552), HTML(value='')))


Time for [tag within test]: 0:00:01.876847


### Get an overview about each dataset

In [39]:
# requires nltk  wordtokenize
# from nltk.tokenize import sent_tokenize, word_tokenize
# model uses BERT Tokenizer ...

def get_overview(df, task='same-side', class_name='is_same_side', with_length_stats=False):
    """Print summary statistics for one argument-pair dataset.

    Always printed: total row count, row count per ``tag`` (broken down by
    class when the class column exists), row count per class value, and the
    number of distinct ``argument1`` / ``argument2`` / combined argument texts.

    Args:
        df: DataFrame with columns ``tag``, ``argument1``, ``argument2`` and,
            optionally, the class column.
        task: Label printed in the header line.
        class_name: Name of the class column; class breakdowns are skipped
            when ``df`` has no such column (e.g. unlabeled test data).
        with_length_stats: When True, additionally tokenize every argument
            with NLTK and print min / max / mean length in words and in
            sentences. Off by default, which matches the previous behavior
            where this part was unreachable behind an early ``return``.
            NOTE(review): presumably needs NLTK's tokenizer data to be
            downloaded first — confirm.

    Returns:
        None. All results are printed.
    """
    total = len(df)
    print("Task: ", task)
    print('=' * 40, '\n')

    print('Total instances: ', total)
    print('\n')

    has_class = class_name in df.columns

    print('For each topic:')
    # Group by a scalar key rather than a one-element list: with a list key,
    # recent pandas versions yield 1-tuples as group names, which would print
    # as "('abortion',)" instead of "abortion".
    for tag, tag_df in df.groupby('tag'):
        print(tag, ': ', len(tag_df), ' instances')
        if has_class:
            for is_same_side, side_df in tag_df.groupby(class_name):
                print('\t\t', is_same_side, ': ', len(side_df), ' instances')
    print('\n')

    if has_class:
        print('For each class value:')
        for class_value, class_df in df.groupby(class_name):
            print(class_value, ': ', len(class_df), ' instances')
        print('\n')

    print('Unique argument1:', len(df['argument1'].unique()))
    print('Unique argument2:', len(df['argument2'].unique()))
    # argument1 texts first, then argument2 texts.
    arguments = np.concatenate([df['argument1'].values, df['argument2'].values])

    print('Unique total arguments:', len(set(list(arguments))), '\n')

    if not with_length_stats or len(arguments) == 0:
        # min()/max() below are undefined for an empty dataset.
        return

    # Imported lazily so the default (cheap) overview does not require NLTK.
    from nltk.tokenize import sent_tokenize, word_tokenize

    print('-' * 40, '\n')

    for heading, unit, tokenize in (('Words:', ' words', word_tokenize),
                                    ('Sentences:', ' sentences', sent_tokenize)):
        lengths = [len(tokenize(text)) for text in arguments]
        print(heading)
        print('\tshortest argument:', min(lengths), unit)
        print('\tlongest argument:', max(lengths), unit)
        print('\targument average length:', np.mean(lengths), unit)

In [None]:
# Print the dataset overview for the cross-topic training data (timed).
with Timer("overview cross"):
    get_overview(cross_traindev_df)

In [None]:
# Print the dataset overview for the within-topic training data (timed).
with Timer("overview within"):
    get_overview(within_traindev_df)

##### Count raw length

In [None]:
def compute_arg_len(row):
    """Add raw length features for both arguments of ``row``.

    Lengths are ``len()`` of the ``argument1`` / ``argument2`` values (character
    counts when they are strings). Written keys:

    * ``argument1_len``, ``argument2_len`` - the two lengths
    * ``argument12_len_diff`` - ``argument1_len - argument2_len``

    The row is updated in place and returned.
    """
    first_len = len(row['argument1'])
    second_len = len(row['argument2'])

    row['argument1_len'] = first_len
    row['argument2_len'] = second_len
    row['argument12_len_diff'] = first_len - second_len
    return row


# Add the raw-length columns to all four frames (row-wise, with progress bars).
cross_traindev_df = cross_traindev_df.progress_apply(compute_arg_len, axis=1)
within_traindev_df = within_traindev_df.progress_apply(compute_arg_len, axis=1)
cross_test_df = cross_test_df.progress_apply(compute_arg_len, axis=1)
within_test_df = within_test_df.progress_apply(compute_arg_len, axis=1)

In [None]:
# Summary statistics of the cross-topic training frame after adding raw lengths.
cross_traindev_df.describe()

In [None]:
# Summary statistics of the within-topic training frame after adding raw lengths.
within_traindev_df.describe()

##### Tokenize and count tokens

In [None]:
# Load the pretrained uncased BERT-base model on CPU. Only the vocabulary is kept
# (the model itself is discarded via `_`); it is used to build the tokenizer.
ctx = mx.cpu()
_, vocabulary = nlp.model.get_model('bert_12_768_12',
                                    dataset_name='book_corpus_wiki_en_uncased',
                                    pretrained=True, ctx=ctx, use_pooler=True,
                                    use_decoder=False, use_classifier=False)
bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
# `tokenizer` is the notebook-level name that tokenize_arguments() reads.
tokenizer = bert_tokenizer

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
# nltk.download('punkt')


# tokenizer from BERT
def tokenize_arguments(row):
    """Tokenize both arguments of ``row`` and store token-based length features.

    Uses the notebook-level ``tokenizer``. Written keys:

    * ``argument1_tokens``, ``argument2_tokens`` - tokenizer output
    * ``argument1_len``, ``argument2_len`` - number of tokens (these are the
      same keys ``compute_arg_len`` writes, so raw lengths are overwritten)
    * ``argument12_len_diff`` - token count difference (argument1 - argument2)

    The row is updated in place and returned.
    """
    # Tokenize and store each side in turn.
    row['argument1_tokens'] = tokenizer(row['argument1'])
    row['argument2_tokens'] = tokenizer(row['argument2'])

    # Token counts, read back from the values just stored.
    n_first = len(row['argument1_tokens'])
    row['argument1_len'] = n_first
    n_second = len(row['argument2_tokens'])
    row['argument2_len'] = n_second

    # Difference in token counts.
    row['argument12_len_diff'] = row['argument1_len'] - row['argument2_len']
    return row


# Tokenize all four frames; this replaces the length columns with token counts.
cross_traindev_df = cross_traindev_df.progress_apply(tokenize_arguments, axis=1)
within_traindev_df = within_traindev_df.progress_apply(tokenize_arguments, axis=1)
cross_test_df = cross_test_df.progress_apply(tokenize_arguments, axis=1)
within_test_df = within_test_df.progress_apply(tokenize_arguments, axis=1)

In [None]:
# Summary statistics of the cross-topic training frame, now with token counts.
cross_traindev_df.describe()

In [None]:
# Summary statistics of the within-topic training frame, now with token counts.
within_traindev_df.describe()

## Train model - Baseline

### Train/dev split (70% / 30%)

In [40]:
def get_train_test_sets(df, ratio=0.30, random_state=1):
    """Shuffle ``df`` and split it into a training part and a held-out part.

    Args:
        df: Frame containing the argument texts, their ids, the topic and the
            ``is_same_side`` label.
        ratio: Fraction of rows assigned to the held-out part.
        random_state: Seed passed to the splitter for a repeatable shuffle.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` frames hold the
        feature columns and the ``y`` frames hold the single label column.
    """
    feature_columns = ['argument1', 'argument2', 'argument1_id', 'argument2_id', 'topic']
    features = df[feature_columns]
    labels = df[['is_same_side']]

    parts = train_test_split(features,
                             labels,
                             test_size=ratio,
                             random_state=random_state,
                             shuffle=True)
    features_train, features_test, labels_train, labels_test = parts
    return features_train, features_test, labels_train, labels_test

### BERT

- https://gluon-nlp.mxnet.io/examples/sentence_embedding/bert.html

In [41]:
class MyBERTDataset(SimpleDataset):
    """In-memory dataset of ``[argument1, argument2, label]`` samples.

    Args:
        X: Frame with the columns ``argument1`` and ``argument2``.
        y: Optional frame with the column ``is_same_side``, joined to ``X`` on
            the index. When omitted, every sample's label is ``None``.

    Labels are the strings ``"1"`` (``is_same_side`` renders as ``"True"``)
    and ``"0"`` (anything else).
    """

    def __init__(self, X, y=None):
        self._X = X
        self._y = y
        super(MyBERTDataset, self).__init__(self._convert())

    def _convert(self):
        """Materialize all samples as a list of 3-element lists."""
        if self._y is None:
            # Unlabeled data: keep the text pair, leave the label empty.
            return [[row['argument1'], row['argument2'], None]
                    for _, row in self._X.iterrows()]

        # Labeled data: align features and labels on the shared index.
        labelled = self._X.merge(self._y, left_index=True, right_index=True)
        return [[row['argument1'],
                 row['argument2'],
                 "1" if str(row['is_same_side']) == "True" else "0"]
                for _, row in labelled.iterrows()]

    # for lazy retrieval?
    #
    # def __getitem__(self, idx):
    #     row_X = self._X.iloc[idx]
    #     row_y = self._y.iloc[idx]
    #     return [row_X['argument1'], row_X['argument2'], "1" if row_y['is_same_side'] else "0"]
    #
    # def __len__(self):
    #     return len(self._X)

In [None]:
# df = X_dev.merge(y_dev, left_index=True, right_index=True)
# allsamples = list()
# for _, row in df.iterrows():
#     allsamples.append("1" if row['is_same_side'] == "True" else "0")
# np.unique(allsamples)

**TODO**: write a custom `BERTDatasetTransform` that controls which part of a long argument is kept (e.g. several chunks, or only the last part) instead of the default truncation.

```python
transform = dataset.BERTDatasetTransform(bert_tokenizer, 512,
                                         labels=['0', '1'],
                                         label_dtype='int32',
                                         pad=True,
                                         pair=True)
```

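In the downloaded `bert/dataset.py`, around line 454 (original link, only valid on the author's local Jupyter server: http://localhost:9001/edit/bert/dataset.py):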
```python
# substitute with my own (e. g. last part, many parts etc.)
def __init__(...):
    self._bert_xform = BERTSentenceTransform(tokenizer, max_seq_length, pad=pad, pair=pair)
```
https://gluon-nlp.mxnet.io/master/_modules/gluonnlp/data/transforms.html#BERTSentenceTransform
```python
# substitute with my own (e. g. only last part (trim from start))
self._truncate_seq_pair(tokens_a, tokens_b, self._max_seq_length - 3)
```

https://mxnet.incubator.apache.org/_modules/mxnet/gluon/data/dataset.html#Dataset.transform

In [42]:
from gluonnlp.data import BERTSentenceTransform


class LastPartBERTSentenceTransform(BERTSentenceTransform):
    """Sentence transform that keeps the *last* part of over-long inputs.

    Identical to ``BERTSentenceTransform`` except for ``_truncate_seq_pair``:
    when a pair is too long, tokens are dropped from the front of the
    sequences, so the end of each argument survives.
    """

    def __init__(self, tokenizer, max_seq_length, pad=True, pair=True):
        super(LastPartBERTSentenceTransform, self).__init__(tokenizer, max_seq_length, pad=pad, pair=pair)

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncate a sequence pair in place so the total length is at most ``max_length``.

        Tokens are removed from the START of the lists. The heuristic is
        unchanged: one token at a time is taken from the currently longer
        sequence (from ``tokens_b`` on ties), because a token of a short
        sequence likely carries more information than one of a long sequence.

        Args:
            tokens_a: Token list of the first sequence (modified in place).
            tokens_b: Token list of the second sequence (modified in place).
            max_length: Maximum allowed ``len(tokens_a) + len(tokens_b)``.
        """
        keep_a, keep_b = len(tokens_a), len(tokens_b)

        # Simulate the one-token-at-a-time removal on the lengths only. The
        # `(keep_a or keep_b)` guard stops once both are empty, so a negative
        # max_length cannot loop forever or pop from an empty list.
        while keep_a + keep_b > max_length and (keep_a or keep_b):
            if keep_a > keep_b:
                keep_a -= 1
            else:
                keep_b -= 1

        # Apply each cut as one slice deletion; repeated pop(0) would shift the
        # whole list on every removed token.
        del tokens_a[:len(tokens_a) - keep_a]
        del tokens_b[:len(tokens_b) - keep_b]


# TODO: random trim ? --> bad probably
# TODO: segment-wise, e. g. 0 for normal, 1 for tokens after normal tokens, ...

In [43]:
class LastPartBERTDatasetTransform(dataset.BERTDatasetTransform):
    """Dataset transform whose sentence transform keeps the last part of long inputs.

    Constructs the base ``BERTDatasetTransform`` as usual, then replaces its
    ``_bert_xform`` with a ``LastPartBERTSentenceTransform`` built from the
    same tokenizer, length limit, ``pad`` and ``pair`` settings.
    """

    def __init__(self, tokenizer, max_seq_length, labels=None, pad=True, pair=True, label_dtype='float32'):
        base_options = dict(labels=labels, pad=pad, pair=pair, label_dtype=label_dtype)
        super(LastPartBERTDatasetTransform, self).__init__(tokenizer, max_seq_length, **base_options)

        # Swap in the front-truncating sentence transform.
        self._bert_xform = LastPartBERTSentenceTransform(
            tokenizer, max_seq_length, pad=pad, pair=pair)

In [44]:
def setup_bert(ctx=None, max_len=512):
    """Build the BERT classifier and everything needed to train/evaluate it.

    Args:
        ctx: MXNet context to place the model on. When omitted, the first GPU
            is used if MXNet reports at least one GPU, otherwise the CPU
            (previously ``mx.gpu(0)`` was hard-coded, which fails on CPU-only
            machines).
        max_len: Maximum sequence length handed to the dataset transform.
            The commented-out alternative in the original was 128 (noted
            there with batch size 32).

    Returns:
        Tuple ``(model, vocabulary, ctx, bert_tokenizer, transform,
        loss_function, metric, all_labels)``:

        * ``model`` - ``BERTClassifier`` with 2 classes on top of pretrained
          uncased BERT-base; only the classifier layer is freshly initialized
        * ``vocabulary`` - vocabulary of the pretrained model
        * ``ctx`` - the context actually used
        * ``bert_tokenizer`` - lower-casing ``BERTTokenizer`` on that vocabulary
        * ``transform`` - ``LastPartBERTDatasetTransform`` for sentence pairs
        * ``loss_function`` - softmax cross-entropy loss
        * ``metric`` - accuracy metric
        * ``all_labels`` - ``["0", "1"]``
    """
    if ctx is None:
        ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()

    bert_base, vocabulary = nlp.model.get_model('bert_12_768_12',
                                                 dataset_name='book_corpus_wiki_en_uncased',
                                                 pretrained=True, ctx=ctx, use_pooler=True,
                                                 use_decoder=False, use_classifier=False)
    print(bert_base)

    model = bert.BERTClassifier(bert_base, num_classes=2, dropout=0.1)
    # only need to initialize the classifier layer.
    model.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
    model.hybridize(static_alloc=True)

    # softmax cross entropy loss for classification
    loss_function = gluon.loss.SoftmaxCELoss()
    loss_function.hybridize(static_alloc=True)

    metric = mx.metric.Accuracy()

    # use the vocabulary from pre-trained model for tokenization
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
    # the labels for the two classes
    all_labels = ["0", "1"]
    # whether to transform the data as sentence pairs.
    # for single sentence classification, set pair=False
    pair = True
    transform = LastPartBERTDatasetTransform(bert_tokenizer, max_len,
                                             labels=all_labels,
                                             label_dtype='int32',
                                             pad=True,
                                             pair=pair)

    return model, vocabulary, ctx, bert_tokenizer, transform, loss_function, metric, all_labels

In [45]:
def transform_dataset(X, y, transform):
    """Wrap ``X``/``y`` in a ``MyBERTDataset`` and apply ``transform`` to it.

    Returns:
        ``(raw_dataset, transformed_dataset)`` - the untransformed text/label
        samples and the dataset produced by ``raw_dataset.transform(transform)``.
    """
    raw_dataset = MyBERTDataset(X, y)
    return raw_dataset, raw_dataset.transform(transform)


def predict_out_to_ys(all_predictions, all_labels):
    """Flatten batched prediction output into two aligned NumPy arrays.

    Args:
        all_predictions: Iterable of ``(batch_id, labels, outputs)`` triples as
            collected by ``predict``. For each batch the true labels are taken
            from the first row of ``labels.T`` (presumably labels are shaped
            ``(batch, 1)`` - TODO confirm) and the predicted class is the
            argmax of ``outputs`` along axis 1 (the class axis).
        all_labels: Unused at the moment; label ids are not mapped back to
            label names (see the TODO in the body).

    Returns:
        ``(y_true, y_pred)`` as NumPy arrays of label ids.
    """
    gold_ids, predicted_ids = [], []

    for _batch_id, label_batch, output_batch in all_predictions:
        gold_ids.extend(label_batch.T[0].asnumpy())
        # Class with the highest score per sample.
        predicted_ids.extend(np.argmax(output_batch, axis=1).asnumpy())
        # TODO: convert label_id to label?
        # predicted_ids.extend(all_labels[c] for c in ...)

    return np.array(gold_ids), np.array(predicted_ids)

In [46]:
def train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=3, checkpoint_dir="data", use_checkpoints=True, log_interval=100):
    """Fine-tune ``model`` on ``data_train`` and return per-batch statistics.

    With checkpoints enabled, an epoch whose checkpoint file
    (``bert.model.checkpoint<epoch>.params`` inside ``checkpoint_dir``)
    already exists is not trained again: its parameters are loaded and the
    epoch is skipped. Only model parameters are checkpointed; the trainer is
    created anew on every call, so optimizer state is not restored on resume.

    Args:
        model: Network to train; called as ``model(token_ids, segment_ids, valid_length)``.
        data_train: Transformed dataset yielding
            ``(token_ids, valid_length, segment_ids, label)`` samples.
        ctx: MXNet context the batches are moved to.
        metric: Metric object, reset at the start of every trained epoch.
        loss_function: Loss applied to ``(output, label)``.
        batch_size: Batch size for the bucket sampler.
        lr: Adam learning rate.
        num_epochs: Number of epochs (including skipped, checkpointed ones).
        checkpoint_dir: Directory for checkpoint files; ``None`` writes them
            to the current directory. Created if it does not exist.
        use_checkpoints: Enable loading/saving of per-epoch checkpoints.
        log_interval: Print a progress line every this many batches; a falsy
            value disables progress lines.

    Returns:
        List of ``(running_accuracy, batch_loss)`` tuples, one per trained
        batch. Skipped epochs contribute no entries.
    """
    if use_checkpoints and checkpoint_dir:
        # Create the directory up front: otherwise a missing directory would
        # only surface when the first checkpoint is written, i.e. after a
        # complete epoch of training.
        os.makedirs(checkpoint_dir, exist_ok=True)

    with Timer("setup training"):
        # Bucket samples by valid length (item[1]); this iterates the whole
        # transformed dataset once.
        train_sampler = nlp.data.FixedBucketSampler(
            lengths=[int(item[1]) for item in tqdm(data_train)],
            batch_size=batch_size,
            shuffle=True)
        bert_dataloader = mx.gluon.data.DataLoader(data_train,
                                                   batch_sampler=train_sampler)

        trainer = gluon.Trainer(model.collect_params(), 'adam', {
            'learning_rate': lr,
            'epsilon': 1e-9
        })

        # collect all differentiable parameters
        # grad_req == 'null' indicates no gradients are calculated (e.g. constant parameters)
        # the gradients for these params are clipped later
        params = [p for p in model.collect_params().values() if p.grad_req != 'null']

    with Timer("training"):
        stats = list()
        for epoch_id in range(num_epochs):
            if use_checkpoints:
                epoch_checkpoint_savefile = "bert.model.checkpoint{}.params".format(epoch_id)
                if checkpoint_dir is not None:
                    epoch_checkpoint_savefile = os.path.join(checkpoint_dir, epoch_checkpoint_savefile)
                if os.path.exists(epoch_checkpoint_savefile):
                    model.load_parameters(epoch_checkpoint_savefile, ctx=ctx)
                    print("loaded checkpoint for epoch {}".format(epoch_id))
                    continue

            with Timer("epoch {}".format(epoch_id)):
                metric.reset()
                step_loss = 0
                t_p = time.time()  # time keeping
                for batch_id, (token_ids, valid_length, segment_ids,
                               label) in enumerate(tqdm(bert_dataloader)):
                    with mx.autograd.record():

                        # load data to GPU
                        token_ids = token_ids.as_in_context(ctx)
                        valid_length = valid_length.as_in_context(ctx)
                        segment_ids = segment_ids.as_in_context(ctx)
                        label = label.as_in_context(ctx)

                        # forward computation
                        out = model(token_ids, segment_ids,
                                    valid_length.astype('float32'))
                        ls = loss_function(out, label).mean()

                    # backward computation
                    ls.backward()

                    # gradient clipping
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(params, 1)
                    trainer.update(1)

                    # Fetch the scalar loss once per batch and reuse it.
                    batch_loss = ls.asscalar()
                    step_loss += batch_loss
                    metric.update([label], [out])
                    stats.append((metric.get()[1], batch_loss))
                    if log_interval and (batch_id + 1) % log_interval == 0:
                        print(
                            '[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f} - time {}'
                            .format(epoch_id, batch_id + 1, len(bert_dataloader),
                                    step_loss / log_interval, trainer.learning_rate,
                                    metric.get()[1],
                                    datetime.timedelta(seconds=(time.time() - t_p))))
                        t_p = time.time()
                        step_loss = 0

            if use_checkpoints:
                model.save_parameters(epoch_checkpoint_savefile)

    return stats

In [47]:
def predict(model, data_predict, ctx, metric, loss_function, batch_size=32):
    """Run ``model`` over ``data_predict`` and collect outputs batch by batch.

    The metric is reset first and then updated with every batch, so after the
    call it reflects this prediction run. Samples are read in dataset order
    with fixed-size batches.

    Returns:
        ``(all_predictions, cum_loss)`` where ``all_predictions`` is a list of
        ``(batch_id, label, output)`` tuples and ``cum_loss`` is the sum of the
        per-batch mean losses (not averaged over batches).
    """
    loader = mx.gluon.data.DataLoader(data_predict, batch_size=batch_size)
    collected = []

    with Timer("prediction"):
        metric.reset()
        loss_total = 0
        for batch_no, (token_ids, valid_length, segment_ids,
                       label) in enumerate(tqdm(loader)):
            # Move the whole batch to the target device.
            token_ids, valid_length, segment_ids, label = (
                token_ids.as_in_context(ctx),
                valid_length.as_in_context(ctx),
                segment_ids.as_in_context(ctx),
                label.as_in_context(ctx),
            )

            # Forward pass and mean loss of this batch.
            out = model(token_ids, segment_ids, valid_length.astype('float32'))
            batch_loss = loss_function(out, label).mean()

            metric.update([label], [out])
            loss_total += batch_loss.asscalar()  # .sum() ?
            collected.append((batch_no, label, out))

    return collected, loss_total

In [48]:
def print_infos(vocabulary, data_train_raw, data_train):
    """Print the first sample before and after the BERT transform.

    Shows, in order: the two raw argument texts and the raw label ("1" means
    same side, "0" means not), the vocabulary with the ids of its special
    tokens, and the four fields of the transformed sample (token ids, valid
    length, segment ids, label).
    """
    sample_id = 0

    # Raw sample: sentence a, sentence b, label.
    for field in range(3):
        print(data_train_raw[sample_id][field])

    print('vocabulary used for tokenization = \n%s'%vocabulary)
    special_token_lines = (
        ('[PAD] token id = %s', '[PAD]'),
        ('[CLS] token id = %s', '[CLS]'),
        ('[SEP] token id = %s', '[SEP]'),
    )
    for template, token in special_token_lines:
        print(template%(vocabulary[token]))

    # Transformed sample, one field per line pair.
    transformed_lines = (
        'token ids = \n%s',
        'valid length = \n%s',
        'segment ids = \n%s',
        'label = \n%s',
    )
    for position, template in enumerate(transformed_lines):
        print(template%data_train[sample_id][position])
    

def plot_train_stats(stats):
    """Plot training accuracy and loss per batch as two stacked panels.

    Args:
        stats: Sequence of ``(accuracy, loss)`` pairs, one per batch, as
            returned by ``train``. When empty, a note is printed and nothing
            is plotted.
    """
    if not stats:
        print("no stats to plot")
        return

    batch_axis = np.arange(len(stats))  # arange/linspace
    accuracy_series, loss_series = zip(*stats)

    panels = ((1, accuracy_series, 'Accuracy'), (2, loss_series, 'Loss'))
    for position, series, y_label in panels:
        plt.subplot(2, 1, position)
        plt.plot(batch_axis, series)  # line style: '-', 'o-', '.-'
        if position == 1:
            # Title only on the upper panel.
            plt.title('Training BERTClassifier')
        else:
            # Shared x meaning, labelled once at the bottom.
            plt.xlabel('Batches')
        plt.ylabel(y_label)

    plt.show()

### Evaluate

In [49]:
def heatconmat(y_test, y_pred):
    """Show the confusion matrix of ``y_test`` vs ``y_pred`` as an annotated heatmap.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Rows are true labels and columns are predicted labels. Both axes carry the
    same tick labels: the sorted union of the labels occurring in either input.
    Previously only the y axis was labelled, from ``y_test`` alone, which does
    not match the matrix when ``y_pred`` contains a label absent from ``y_test``.
    """
    # confusion_matrix (without explicit `labels`) orders rows and columns by
    # the sorted labels that appear in y_true or y_pred; build the tick labels
    # from that same union. On equal values the union keeps y_test's element,
    # so labels render as before whenever y_pred adds nothing new.
    tick_labels = sorted(set(np.unique(y_test)) | set(np.unique(y_pred)))

    sns.set_context('talk')
    plt.figure(figsize=(9, 6))
    ax = sns.heatmap(confusion_matrix(y_test, y_pred),
                     annot=True,
                     fmt='d',
                     cbar=False,
                     cmap='gist_earth_r',
                     xticklabels=tick_labels,
                     yticklabels=tick_labels)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    plt.show()


def report_training_results(y_test, y_pred, name=None, heatmap=True):
    """Print an evaluation report and return the rounded F1 scores.

    Printed, in order: the confusion matrix (optionally also drawn as a
    heatmap), the accuracy rounded to two decimals, and the classification
    report, whose heading includes ``name`` when one is given.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        name: Optional label for the report heading.
        heatmap: Whether to also plot the confusion matrix.

    Returns:
        Dict with the keys ``'macro'`` and ``'micro'`` holding the respective
        F1 scores rounded to two decimals.
    """
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    if heatmap:
        heatconmat(y_test, y_pred)
    print()

    accuracy = round(accuracy_score(y_test, y_pred), 2)
    print('Accuracy: ', accuracy, '\n')  #

    heading_suffix = " for [{}]".format(name) if name else ""
    print('Report{}:'.format(heading_suffix))
    print(classification_report(y_test, y_pred))

    return {
        averaging: round(f1_score(y_pred=y_pred, y_true=y_test, average=averaging), 2)
        for averaging in ('macro', 'micro')
    }

### Within topic - Training and evaluating model 

In [21]:
# 1. Getting train and dev data
# 1. Split the within-topic training data into train and dev parts
#    (default split parameters of get_train_test_sets).
with Timer("1 - test/train split"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df)

Time for [1 - test/train split]: 0:00:00.025023


In [22]:
# 2. setup
# 2. Build model, tokenizer, transform, loss and metric.
#    Note: this rebinds the notebook-level `vocabulary`, `ctx` and `tokenizer`.
with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

BERTModel(
  (encoder): BERTEncoder(
    (dropout_layer): Dropout(p = 0.1, axes=())
    (layer_norm): BERTLayerNorm(eps=1e-12, axis=-1, center=True, scale=True, in_channels=768)
    (transformer_cells): HybridSequential(
      (0): BERTEncoderCell(
        (dropout_layer): Dropout(p = 0.1, axes=())
        (attention_cell): MultiHeadAttentionCell(
          (_base_cell): DotProductAttentionCell(
            (_dropout_layer): Dropout(p = 0.1, axes=())
          )
          (proj_query): Dense(768 -> 768, linear)
          (proj_key): Dense(768 -> 768, linear)
          (proj_value): Dense(768 -> 768, linear)
        )
        (proj): Dense(768 -> 768, linear)
        (ffn): BERTPositionwiseFFN(
          (ffn_1): Dense(768 -> 3072, linear)
          (activation): GELU()
          (ffn_2): Dense(3072 -> 768, linear)
          (dropout_layer): Dropout(p = 0.1, axes=())
          (layer_norm): BERTLayerNorm(eps=1e-12, axis=-1, center=True, scale=True, in_channels=768)
        )
        (la

In [None]:
# Show the structure of the full classifier (BERT encoder plus classification head).
print(model)

In [23]:
# 3. Wrap the training split in a dataset, attach the BERT transform and print
#    the first sample before/after the transform as a sanity check.
with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)
    print_infos(vocabulary, data_train_raw, data_train)

wanted fetuses are beloved "babies"; unwanted ones are "tissue" (inconsistent)
abortions are emotionally and psychologically unsafe.
1
vocabulary used for tokenization = 
Vocab(size=30522, unk="[UNK]", reserved="['[PAD]', '[CLS]', '[SEP]', '[MASK]']")
[PAD] token id = 1
[CLS] token id = 2
[SEP] token id = 3
token ids = 
[    2  2359 10768  5809  2229  2024 11419  1000 10834  1000  1025 18162
  3924  2024  1000  8153  1000  1006 20316  1007     3 11324  2015  2024
 14868  1998  8317  2135 25135  1012     3     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1   

In [24]:
# 4. Train for 5 epochs with per-epoch checkpoints, then store the final
#    parameters next to the checkpoints. Epochs that already have a checkpoint
#    are loaded instead of retrained, so `stats` (and the plot) only cover the
#    epochs actually trained in this run.
with Timer("4 - train model"):
    stats = train(model, data_train, ctx, metric, loss_function, batch_size=6, lr=5e-6, num_epochs=5, checkpoint_dir="data/within_traindev_epi512")
    model.save_parameters("data/within_traindev_epi512/bert.model.params")

    plot_train_stats(stats)

HBox(children=(IntProgress(value=0, max=44732), HTML(value='')))


Time for [setup training]: 0:03:10.299870
loaded checkpoint for epoch 0
loaded checkpoint for epoch 1
loaded checkpoint for epoch 2


HBox(children=(IntProgress(value=0, max=7459), HTML(value='')))

[Epoch 3 Batch 10/7459] loss=0.1763, lr=0.0000050, acc=0.967 - time 0:00:08.971009
[Epoch 3 Batch 20/7459] loss=0.0609, lr=0.0000050, acc=0.975 - time 0:00:08.602159
[Epoch 3 Batch 30/7459] loss=0.1823, lr=0.0000050, acc=0.972 - time 0:00:08.724515
[Epoch 3 Batch 40/7459] loss=0.3088, lr=0.0000050, acc=0.950 - time 0:00:08.868646
[Epoch 3 Batch 50/7459] loss=0.0103, lr=0.0000050, acc=0.960 - time 0:00:08.510683
[Epoch 3 Batch 60/7459] loss=0.1299, lr=0.0000050, acc=0.958 - time 0:00:08.820832
[Epoch 3 Batch 70/7459] loss=0.3124, lr=0.0000050, acc=0.947 - time 0:00:08.888049
[Epoch 3 Batch 80/7459] loss=0.1976, lr=0.0000050, acc=0.946 - time 0:00:08.689797
[Epoch 3 Batch 90/7459] loss=0.0339, lr=0.0000050, acc=0.952 - time 0:00:08.728526
[Epoch 3 Batch 100/7459] loss=0.1567, lr=0.0000050, acc=0.950 - time 0:00:08.842822
[Epoch 3 Batch 110/7459] loss=0.1737, lr=0.0000050, acc=0.950 - time 0:00:08.754740
[Epoch 3 Batch 120/7459] loss=0.1825, lr=0.0000050, acc=0.946 - time 0:00:08.784256
[

[Epoch 3 Batch 990/7459] loss=0.1781, lr=0.0000050, acc=0.931 - time 0:00:08.873673
[Epoch 3 Batch 1000/7459] loss=0.2120, lr=0.0000050, acc=0.930 - time 0:00:08.620039
[Epoch 3 Batch 1010/7459] loss=0.0488, lr=0.0000050, acc=0.931 - time 0:00:08.871709
[Epoch 3 Batch 1020/7459] loss=0.2275, lr=0.0000050, acc=0.931 - time 0:00:08.880643
[Epoch 3 Batch 1030/7459] loss=0.1650, lr=0.0000050, acc=0.931 - time 0:00:08.994270
[Epoch 3 Batch 1040/7459] loss=0.1589, lr=0.0000050, acc=0.931 - time 0:00:08.765680
[Epoch 3 Batch 1050/7459] loss=0.3426, lr=0.0000050, acc=0.930 - time 0:00:08.803807
[Epoch 3 Batch 1060/7459] loss=0.1855, lr=0.0000050, acc=0.930 - time 0:00:08.637885
[Epoch 3 Batch 1070/7459] loss=0.0905, lr=0.0000050, acc=0.931 - time 0:00:08.790016
[Epoch 3 Batch 1080/7459] loss=0.4114, lr=0.0000050, acc=0.930 - time 0:00:08.794393
[Epoch 3 Batch 1090/7459] loss=0.0769, lr=0.0000050, acc=0.931 - time 0:00:08.666419
[Epoch 3 Batch 1100/7459] loss=0.0316, lr=0.0000050, acc=0.931 - t

[Epoch 3 Batch 1960/7459] loss=0.3625, lr=0.0000050, acc=0.931 - time 0:00:09.062955
[Epoch 3 Batch 1970/7459] loss=0.3254, lr=0.0000050, acc=0.931 - time 0:00:08.736995
[Epoch 3 Batch 1980/7459] loss=0.3541, lr=0.0000050, acc=0.931 - time 0:00:08.799196
[Epoch 3 Batch 1990/7459] loss=0.0214, lr=0.0000050, acc=0.931 - time 0:00:08.733095
[Epoch 3 Batch 2000/7459] loss=0.2826, lr=0.0000050, acc=0.931 - time 0:00:08.679637
[Epoch 3 Batch 2010/7459] loss=0.1013, lr=0.0000050, acc=0.931 - time 0:00:08.810562
[Epoch 3 Batch 2020/7459] loss=0.3368, lr=0.0000050, acc=0.931 - time 0:00:08.845279
[Epoch 3 Batch 2030/7459] loss=0.1739, lr=0.0000050, acc=0.931 - time 0:00:08.820941
[Epoch 3 Batch 2040/7459] loss=0.0295, lr=0.0000050, acc=0.931 - time 0:00:08.873195
[Epoch 3 Batch 2050/7459] loss=0.3325, lr=0.0000050, acc=0.931 - time 0:00:08.814368
[Epoch 3 Batch 2060/7459] loss=0.3584, lr=0.0000050, acc=0.931 - time 0:00:08.844797
[Epoch 3 Batch 2070/7459] loss=0.2054, lr=0.0000050, acc=0.931 - 

[Epoch 3 Batch 2930/7459] loss=0.2460, lr=0.0000050, acc=0.930 - time 0:00:08.942460
[Epoch 3 Batch 2940/7459] loss=0.0583, lr=0.0000050, acc=0.930 - time 0:00:08.781915
[Epoch 3 Batch 2950/7459] loss=0.3282, lr=0.0000050, acc=0.930 - time 0:00:08.793145
[Epoch 3 Batch 2960/7459] loss=0.1178, lr=0.0000050, acc=0.930 - time 0:00:08.776520
[Epoch 3 Batch 2970/7459] loss=0.4883, lr=0.0000050, acc=0.930 - time 0:00:08.779602
[Epoch 3 Batch 2980/7459] loss=0.1700, lr=0.0000050, acc=0.930 - time 0:00:08.673522
[Epoch 3 Batch 2990/7459] loss=0.1021, lr=0.0000050, acc=0.930 - time 0:00:08.819955
[Epoch 3 Batch 3000/7459] loss=0.2265, lr=0.0000050, acc=0.930 - time 0:00:08.766195
[Epoch 3 Batch 3010/7459] loss=0.2253, lr=0.0000050, acc=0.930 - time 0:00:08.817812
[Epoch 3 Batch 3020/7459] loss=0.2445, lr=0.0000050, acc=0.930 - time 0:00:08.709071
[Epoch 3 Batch 3030/7459] loss=0.0927, lr=0.0000050, acc=0.930 - time 0:00:08.706741
[Epoch 3 Batch 3040/7459] loss=0.3159, lr=0.0000050, acc=0.930 - 

[Epoch 3 Batch 3900/7459] loss=0.1785, lr=0.0000050, acc=0.930 - time 0:00:08.967913
[Epoch 3 Batch 3910/7459] loss=0.1612, lr=0.0000050, acc=0.930 - time 0:00:08.773903
[Epoch 3 Batch 3920/7459] loss=0.1248, lr=0.0000050, acc=0.930 - time 0:00:08.772876
[Epoch 3 Batch 3930/7459] loss=0.1074, lr=0.0000050, acc=0.930 - time 0:00:08.877441
[Epoch 3 Batch 3940/7459] loss=0.2097, lr=0.0000050, acc=0.930 - time 0:00:08.883607
[Epoch 3 Batch 3950/7459] loss=0.1396, lr=0.0000050, acc=0.930 - time 0:00:08.701050
[Epoch 3 Batch 3960/7459] loss=0.2513, lr=0.0000050, acc=0.930 - time 0:00:08.956942
[Epoch 3 Batch 3970/7459] loss=0.1572, lr=0.0000050, acc=0.930 - time 0:00:08.810063
[Epoch 3 Batch 3980/7459] loss=0.2076, lr=0.0000050, acc=0.930 - time 0:00:08.782896
[Epoch 3 Batch 3990/7459] loss=0.0592, lr=0.0000050, acc=0.930 - time 0:00:08.692508
[Epoch 3 Batch 4000/7459] loss=0.1849, lr=0.0000050, acc=0.930 - time 0:00:08.853589
[Epoch 3 Batch 4010/7459] loss=0.2954, lr=0.0000050, acc=0.930 - 

[Epoch 3 Batch 5010/7459] loss=0.1708, lr=0.0000050, acc=0.931 - time 0:00:08.871618
[Epoch 3 Batch 5020/7459] loss=0.1119, lr=0.0000050, acc=0.931 - time 0:00:08.807742
[Epoch 3 Batch 5030/7459] loss=0.0780, lr=0.0000050, acc=0.931 - time 0:00:08.642095
[Epoch 3 Batch 5040/7459] loss=0.1854, lr=0.0000050, acc=0.931 - time 0:00:08.879799
[Epoch 3 Batch 5050/7459] loss=0.1727, lr=0.0000050, acc=0.931 - time 0:00:08.857484
[Epoch 3 Batch 5060/7459] loss=0.1066, lr=0.0000050, acc=0.931 - time 0:00:08.868065
[Epoch 3 Batch 5070/7459] loss=0.1816, lr=0.0000050, acc=0.931 - time 0:00:08.914725
[Epoch 3 Batch 5080/7459] loss=0.2494, lr=0.0000050, acc=0.931 - time 0:00:08.962128
[Epoch 3 Batch 5090/7459] loss=0.0014, lr=0.0000050, acc=0.931 - time 0:00:08.624103
[Epoch 3 Batch 5100/7459] loss=0.0627, lr=0.0000050, acc=0.931 - time 0:00:08.796347
[Epoch 3 Batch 5110/7459] loss=0.0550, lr=0.0000050, acc=0.931 - time 0:00:08.796208
[Epoch 3 Batch 5120/7459] loss=0.2582, lr=0.0000050, acc=0.931 - 

[Epoch 3 Batch 5980/7459] loss=0.2254, lr=0.0000050, acc=0.933 - time 0:00:08.811558
[Epoch 3 Batch 5990/7459] loss=0.0092, lr=0.0000050, acc=0.933 - time 0:00:08.654283
[Epoch 3 Batch 6000/7459] loss=0.0814, lr=0.0000050, acc=0.933 - time 0:00:08.746972
[Epoch 3 Batch 6010/7459] loss=0.3597, lr=0.0000050, acc=0.933 - time 0:00:08.888587
[Epoch 3 Batch 6020/7459] loss=0.0321, lr=0.0000050, acc=0.934 - time 0:00:08.706588
[Epoch 3 Batch 6030/7459] loss=0.0937, lr=0.0000050, acc=0.934 - time 0:00:08.663068
[Epoch 3 Batch 6040/7459] loss=0.0875, lr=0.0000050, acc=0.934 - time 0:00:09.074009
[Epoch 3 Batch 6050/7459] loss=0.2793, lr=0.0000050, acc=0.934 - time 0:00:08.699523
[Epoch 3 Batch 6060/7459] loss=0.1367, lr=0.0000050, acc=0.934 - time 0:00:08.803793
[Epoch 3 Batch 6070/7459] loss=0.0986, lr=0.0000050, acc=0.934 - time 0:00:08.699743


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[Epoch 4 Batch 5500/7459] loss=0.2507, lr=0.0000050, acc=0.950 - time 0:00:09.071339
[Epoch 4 Batch 5510/7459] loss=0.2557, lr=0.0000050, acc=0.950 - time 0:00:08.879209
[Epoch 4 Batch 5520/7459] loss=0.2944, lr=0.0000050, acc=0.950 - time 0:00:09.009009
[Epoch 4 Batch 5530/7459] loss=0.1607, lr=0.0000050, acc=0.950 - time 0:00:08.977364
[Epoch 4 Batch 5540/7459] loss=0.1216, lr=0.0000050, acc=0.950 - time 0:00:08.865316
[Epoch 4 Batch 5550/7459] loss=0.1895, lr=0.0000050, acc=0.950 - time 0:00:08.799721
[Epoch 4 Batch 5560/7459] loss=0.2944, lr=0.0000050, acc=0.950 - time 0:00:08.934952
[Epoch 4 Batch 5570/7459] loss=0.0779, lr=0.0000050, acc=0.950 - time 0:00:08.808002
[Epoch 4 Batch 5580/7459] loss=0.2503, lr=0.0000050, acc=0.950 - time 0:00:08.917144
[Epoch 4 Batch 5590/7459] loss=0.3469, lr=0.0000050, acc=0.950 - time 0:00:08.976044
[Epoch 4 Batch 5600/7459] loss=0.1034, lr=0.0000050, acc=0.950 - time 0:00:08.652513
[Epoch 4 Batch 5610/7459] loss=0.0810, lr=0.0000050, acc=0.950 - 

[Epoch 4 Batch 6470/7459] loss=0.1140, lr=0.0000050, acc=0.949 - time 0:00:09.015861
[Epoch 4 Batch 6480/7459] loss=0.1332, lr=0.0000050, acc=0.949 - time 0:00:08.771066
[Epoch 4 Batch 6490/7459] loss=0.1444, lr=0.0000050, acc=0.949 - time 0:00:08.712400
[Epoch 4 Batch 6500/7459] loss=0.0999, lr=0.0000050, acc=0.949 - time 0:00:08.970287
[Epoch 4 Batch 6510/7459] loss=0.1499, lr=0.0000050, acc=0.949 - time 0:00:08.770912
[Epoch 4 Batch 6520/7459] loss=0.1131, lr=0.0000050, acc=0.949 - time 0:00:08.937320
[Epoch 4 Batch 6530/7459] loss=0.0860, lr=0.0000050, acc=0.949 - time 0:00:08.668525
[Epoch 4 Batch 6540/7459] loss=0.4270, lr=0.0000050, acc=0.949 - time 0:00:08.972187
[Epoch 4 Batch 6550/7459] loss=0.3625, lr=0.0000050, acc=0.949 - time 0:00:08.849530
[Epoch 4 Batch 6560/7459] loss=0.0102, lr=0.0000050, acc=0.949 - time 0:00:08.675791
[Epoch 4 Batch 6570/7459] loss=0.3704, lr=0.0000050, acc=0.949 - time 0:00:09.000027
[Epoch 4 Batch 6580/7459] loss=0.2484, lr=0.0000050, acc=0.948 - 

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [25]:
# Step 5: push the dev split (X_dev, y_dev) through `transform` and print
# a sample of the result via print_infos for a quick sanity check.
with Timer("5 - prepare eval data"):
    dev_datasets = transform_dataset(X_dev, y_dev, transform)
    data_dev_raw, data_dev = dev_datasets
    print_infos(vocabulary, data_dev_raw, data_dev)

abortion opens the door to the sexual exploitation of women the existence of abortion gives men a little more of a safeguard against unintentionally impregnating a woman. as a result, men will be more aggressive in their sexual exploitation of women.
the fact that a child is likely to have a short life does not justify further shortening it:
0
vocabulary used for tokenization = 
Vocab(size=30522, unk="[UNK]", reserved="['[PAD]', '[CLS]', '[SEP]', '[MASK]']")
[PAD] token id = 1
[CLS] token id = 2
[SEP] token id = 3
token ids = 
[    2 11324  7480  1996  2341  2000  1996  4424 14427  1997  2308  1996
  4598  1997 11324  3957  2273  1037  2210  2062  1997  1037 28805  2114
  4895 18447  4765 19301  2135 17727  2890 16989  3436  1037  2450  1012
  2004  1037  2765  1010  2273  2097  2022  2062  9376  1999  2037  4424
 14427  1997  2308  1012     3  1996  2755  2008  1037  2775  2003  3497
  2000  2031  1037  2460  2166  2515  2025 16114  2582  2460  7406  2009
  1024     3     1     1     

In [26]:
# Step 6: score the dev set with the `model` currently held in memory.
with Timer("6 - evaluate"):
    # To evaluate a stored checkpoint instead, uncomment the next line:
    # model.load_parameters("data/within_traindev_epi512/bert.model.params", ctx=ctx)
    prediction_output = predict(model, data_dev, ctx, metric, loss_function)
    all_predictions, cum_loss = prediction_output

    # metric.get() is indexed at [1] for the accuracy value
    dev_accuracy = metric.get()[1]
    print("Accuracy:", dev_accuracy)

    # Convert the raw prediction output into true / predicted labels for the report.
    label_pair = predict_out_to_ys(all_predictions, all_labels)
    y_true, y_pred = label_pair
    report_training_results(y_true, y_pred, name="BERTClassifier - epi512", heatmap=False)

HBox(children=(IntProgress(value=0, max=600), HTML(value='')))


Time for [prediction]: 0:09:42.420674
Accuracy: 0.8983881904960618
Confusion Matrix:
[[8067  766]
 [1182 9156]]

Accuracy:  0.9 

Report for [BERTClassifier - epi512]:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      8833
           1       0.92      0.89      0.90     10338

    accuracy                           0.90     19171
   macro avg       0.90      0.90      0.90     19171
weighted avg       0.90      0.90      0.90     19171

Time for [6 - evaluate]: 0:09:42.655505


### Train and evaluate each epoch

In [None]:
# Train one epoch per iteration and evaluate on the dev set after each one.
#
# `model` is not re-initialised inside the loop, so every train() call continues
# from the weights left by the previous iteration. Passing `epoch_id + 1` as
# num_epochs would therefore accumulate 1 + 2 + 3 + 4 + 5 = 15 epochs and the
# "epoch N" evaluation would not describe a model trained for N epochs, hence
# num_epochs=1.
# NOTE(review): assumes train() performs `num_epochs` full passes per call -
# confirm against its definition.
for epoch_id in range(5):
    with Timer("4 - train model - {}".format(epoch_id)):
        # seq_len 512 run uses batch_size=6 (the shorter-sequence variant used batch_size=32)
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=6, lr=5e-6, num_epochs=1)  # seq_len: 512
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)):
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6)  # seq_len: 512
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])
        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - last part", heatmap=False)

    # Overwrites the same checkpoint every epoch; only the latest weights are kept.
    model.save_parameters("data/bert.model.params")

In [None]:
# NOTE(review): this cell repeats the train/evaluate loop of the preceding cell.
# Running both continues training the same `model` for five more epochs;
# drop one of the two cells if that is not intended.
#
# Train one epoch per iteration and evaluate on the dev set after each one.
# `model` is not re-initialised inside the loop, so every train() call continues
# from the weights left by the previous iteration. Passing `epoch_id + 1` as
# num_epochs would accumulate 1 + 2 + 3 + 4 + 5 = 15 epochs, hence num_epochs=1.
# NOTE(review): assumes train() performs `num_epochs` full passes per call -
# confirm against its definition.
for epoch_id in range(5):
    with Timer("4 - train model - {}".format(epoch_id)):
        # seq_len 512 run uses batch_size=6 (the shorter-sequence variant used batch_size=32)
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=6, lr=5e-6, num_epochs=1)  # seq_len: 512
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)):
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6)  # seq_len: 512
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])
        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - last part", heatmap=False)

    # Overwrites the same checkpoint every epoch; only the latest weights are kept.
    model.save_parameters("data/bert.model.params")

*May need to use **binary_cross_entropy**?* (Can I use a single label, or do I have to use "0" and "1"?)

### Cross-topic - Training and evaluating the model

In [27]:
# Step 1: split the cross-topic train/dev frame into train and dev parts.
with Timer("1 - test/train split"):
    cross_split = get_train_test_sets(cross_traindev_df)
    X_train, X_dev, y_train, y_dev = cross_split

Time for [1 - test/train split]: 0:00:00.010687


In [None]:
# Step 2: build the BERT model together with everything the later steps need.
with Timer("2 - setup BERT model"):
    bert_setup = setup_bert()
    (model, vocabulary, ctx, tokenizer,
     transform, loss_function, metric, all_labels) = bert_setup

In [None]:
# Step 3: push the training split through `transform` and print a sample
# of the result via print_infos.
with Timer("3 - prepare training data"):
    train_datasets = transform_dataset(X_train, y_train, transform)
    data_train_raw, data_train = train_datasets
    print_infos(vocabulary, data_train_raw, data_train)

In [None]:
# Step 4: fine-tune on the training data, store the weights, plot the stats.
with Timer("4 - train model"):
    # Hyper-parameters of this run (a commented-out variant used num_epochs=3).
    cross_train_params = dict(batch_size=32, lr=5e-6, num_epochs=2)
    stats = train(model, data_train, ctx, metric, loss_function, **cross_train_params)

    # Alternative checkpoint location that was considered:
    #   "data/same-side-classification/cross-topic/bert.model.params"
    model.save_parameters("data/bert.model.params")

    plot_train_stats(stats)

In [28]:
# Step 5: push the dev split (X_dev, y_dev) through `transform` and print
# a sample of the result via print_infos for a quick sanity check.
with Timer("5 - prepare eval data"):
    dev_datasets = transform_dataset(X_dev, y_dev, transform)
    data_dev_raw, data_dev = dev_datasets
    print_infos(vocabulary, data_dev_raw, data_dev)

alcoholism and drug-use are common after abortions.
uncertainty over whether fetuses are "life" should halt abortions.
1
vocabulary used for tokenization = 
Vocab(size=30522, unk="[UNK]", reserved="['[PAD]', '[CLS]', '[SEP]', '[MASK]']")
[PAD] token id = 1
[CLS] token id = 2
[SEP] token id = 3
token ids = 
[    2 25519  1998  4319  1011  2224  2024  2691  2044 11324  2015  1012
     3 12503  2058  3251 10768  5809  2229  2024  1000  2166  1000  2323
  9190 11324  2015  1012     3     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     

In [50]:
# Step 6: score the dev set with the `model` currently held in memory.
with Timer("6 - evaluate"):
    # To evaluate a stored checkpoint instead, uncomment one of:
    # model.load_parameters("data/same-side-classification/cross-topic/bert.model.params", ctx=ctx)
    # model.load_parameters("data/bert.model.params", ctx=ctx)
    prediction_output = predict(model, data_dev, ctx, metric, loss_function, batch_size=6)
    all_predictions, cum_loss = prediction_output

    # metric.get() is indexed at [1] for the accuracy value
    dev_accuracy = metric.get()[1]
    print("Accuracy:", dev_accuracy)

    # Convert the raw prediction output into true / predicted labels for the report.
    label_pair = predict_out_to_ys(all_predictions, all_labels)
    y_true, y_pred = label_pair
    report_training_results(y_true, y_pred, name="BERTClassifier within-model, cross-devset", heatmap=False)

HBox(children=(IntProgress(value=0, max=3053), HTML(value='')))


Time for [prediction]: 0:10:26.480018
Accuracy: 0.9274911274911275
Confusion Matrix:
[[8451  485]
 [ 843 8536]]

Accuracy:  0.93 

Report for [BERTClassifier within-model, cross-devset]:
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      8936
           1       0.95      0.91      0.93      9379

    accuracy                           0.93     18315
   macro avg       0.93      0.93      0.93     18315
weighted avg       0.93      0.93      0.93     18315

Time for [6 - evaluate]: 0:10:27.453049


In [None]:
# Train one epoch per iteration and evaluate on the dev set after each one.
#
# `model` is not re-initialised inside the loop, so every train() call continues
# from the weights left by the previous iteration. Passing `epoch_id + 1` as
# num_epochs would therefore accumulate 1 + 2 + 3 + 4 + 5 = 15 epochs and the
# "epoch N" evaluation would not describe a model trained for N epochs, hence
# num_epochs=1.
# NOTE(review): assumes train() performs `num_epochs` full passes per call -
# confirm against its definition.
for epoch_id in range(5):
    with Timer("4 - train model - {}".format(epoch_id)):
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=1)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)):
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function)
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])
        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier", heatmap=False)

    # Overwrites the same checkpoint every epoch; only the latest weights are kept.
    model.save_parameters("data/bert.model.params")