In [1]:
import warnings
warnings.filterwarnings('ignore')

import random
import os
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, autograd
import gluonnlp as nlp
from bert import *
from gluonnlp.data import TSVDataset

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from glob import glob

# Fix the seeds of every RNG the notebook relies on (Python, NumPy, MXNet).
random.seed(100)
np.random.seed(100)
mx.random.seed(10000)

# Run on the first GPU when one is listed, otherwise fall back to the CPU.
if mx.test_utils.list_gpus():
    ctx = mx.gpu(0)
else:
    ctx = mx.cpu()

In [2]:
# Input data directory; the Dataset class reads '<segment>.tsv' files from it.
ROOT_DIR = '/home/yaserkl/data/imdb/v1/'
# Output directory for model checkpoints and the confusion-matrix figure.
working_dir = "/home/yaserkl/working_dir/classification/gluon/bert/imdb/"
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs(working_dir, exist_ok=True)
# Checkpoint path prefix. os.path.join avoids the doubled '/' that plain string
# formatting produced, since working_dir already ends with a separator.
filename = os.path.join(working_dir, 'net.params')

In [3]:
# ---- data / model shape ----
max_len = 150                                   # maximum sequence length passed to the transform
n_classes = 2                                   # number of target classes
all_labels = list(map(str, range(n_classes)))   # label ids as strings: '0', '1', ...

# ---- optimisation ----
batch_size = 32
lr = 5e-6          # initial learning rate; halved whenever the dev loss does not improve
grad_clip = 1      # threshold handed to the global-norm gradient clipping

# ---- logging / stopping ----
log_interval = 4   # print training stats every `log_interval` batches
num_epochs = 10    # NOTE: not referenced by the training loop, which stops on patience only
max_patience = 5   # stop after this many consecutive evaluations without a dev-loss improvement

In [4]:
def print_results(y_true, y_pred):
    """Print macro-averaged scores and draw the confusion matrix.

    Prints, one value per line and in this order: F1, precision, recall
    (all with ``average="macro"``). Then renders the confusion matrix of
    ``y_true`` vs ``y_pred`` as an annotated heatmap with an inverted y axis.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    The matplotlib figure containing the heatmap.
    """
    for score_fn in (f1_score, precision_score, recall_score):
        print(score_fn(y_true, y_pred, average="macro"))

    fig, ax = plt.subplots(figsize=(19, 10))
    cm = confusion_matrix(y_true, y_pred)
    heatmap_ax = sns.heatmap(cm, annot=True, ax=ax)
    heatmap_ax.invert_yaxis()
    plt.xlabel("Predicted")
    plt.ylabel("True")
    return fig

class Dataset(TSVDataset):
    """Single-sentence classification dataset read from ``<root>/<segment>.tsv``.

    Column 0 of every row is kept as the text and column 1 as the label.

    Parameters
    ----------
    segment : str, default 'train'
        Dataset segment. Must be exactly one of 'train', 'dev' or 'test'.
    root : str, default ROOT_DIR
        Directory containing the ``train.tsv`` / ``dev.tsv`` / ``test.tsv`` files.
    n_classes : int, default 2
        Number of classes; labels are the strings '0' .. str(n_classes - 1).
    """
    def __init__(self, segment='train', root=ROOT_DIR, n_classes=2):
        self._supported_segments = ['train', 'dev', 'test']
        assert segment in self._supported_segments, 'Unsupported segment: %s'%segment
        path = os.path.join(root, '%s.tsv'%segment)
        # Field indices selected from each TSV row: text first, then label.
        A_IDX, LABEL_IDX = 0, 1
        fields = [A_IDX, LABEL_IDX]
        self.n_classes = n_classes
        super(Dataset, self).__init__(path, field_indices=fields)

    def get_labels(self):
        """Get classification label ids of the dataset.

        This was previously declared as a ``@staticmethod`` while reading
        ``self.n_classes``, which raised ``NameError`` on every call. It is an
        instance method so the per-instance class count is actually available.
        """
        return [str(label_id) for label_id in range(self.n_classes)]

In [5]:
# Options for the pre-trained encoder: keep the pooler, drop the decoder and
# classifier heads, and place the weights on `ctx`.
bert_model_options = dict(
    dataset_name='book_corpus_wiki_en_uncased',
    pretrained=True,
    ctx=ctx,
    use_pooler=True,
    use_decoder=False,
    use_classifier=False,
)
bert_base, vocabulary = nlp.model.get_model('bert_12_768_12', **bert_model_options)
print(bert_base)

BERTModel(
  (encoder): BERTEncoder(
    (dropout_layer): Dropout(p = 0.1, axes=())
    (layer_norm): BERTLayerNorm(in_channels=768, epsilon=1e-12)
    (transformer_cells): HybridSequential(
      (0): BERTEncoderCell(
        (dropout_layer): Dropout(p = 0.1, axes=())
        (attention_cell): MultiHeadAttentionCell(
          (_base_cell): DotProductAttentionCell(
            (_dropout_layer): Dropout(p = 0.1, axes=())
          )
          (proj_query): Dense(768 -> 768, linear)
          (proj_key): Dense(768 -> 768, linear)
          (proj_value): Dense(768 -> 768, linear)
        )
        (proj): Dense(768 -> 768, linear)
        (ffn): BERTPositionwiseFFN(
          (ffn_1): Dense(768 -> 3072, linear)
          (activation): GELU()
          (ffn_2): Dense(3072 -> 768, linear)
          (dropout_layer): Dropout(p = 0.1, axes=())
          (layer_norm): BERTLayerNorm(in_channels=768, epsilon=1e-12)
        )
        (layer_norm): BERTLayerNorm(in_channels=768, epsilon=1e-12)
   

In [6]:
# Build the three splits in one go; each reads '<segment>.tsv' under ROOT_DIR.
data_train, data_dev, data_test = (
    Dataset(segment, n_classes=n_classes) for segment in ('train', 'dev', 'test')
)

In [7]:
# Peek at one raw training example: field 0 is sentence a, field 1 its label.
sample_id = 0
sample = data_train[sample_id]
print(sample[0])
print(sample[1])

utterly pretentious nonsense the material is dull , dull , dull , and most of the cast would n't even have made understudies in allen 's earlier films and to have to listen to the unfunny will ferrell do his woody allen imitation makes me loathe the second rate \\( though mysteriously popular \\) ferrell even more it appears that the morose 70 year old allen should have knocked off work when the clock rang in a new century br br i truly tried to get involved in the film , but it was just impossible my snyapses could n't fire that slowly so , rather than doze off and kill the afternoon sleeping in an upright position i got up , left my wife and daughter in the theater , and went out to the car where i had a really good book to re read \\( george bailey 's great tome of 30 years ago , germans \\) the day turned out pretty well after all , no thanks to woody
0


In [8]:
# use the vocabulary from pre-trained model for tokenization
# (`tokenization` and `dataset` are names pulled in by `from bert import *`)
tokenizer = tokenization.FullTokenizer(vocabulary, do_lower_case=True)
# Per-example transform configured with the label list and max_len.
# The loops that consume the transformed data unpack each item as
# (token_ids, valid_length, segment_ids, label).
# NOTE(review): pair=False presumably selects single-sentence inputs — confirm
# against the ClassificationTransform implementation in the bert package.
transform = dataset.ClassificationTransform(tokenizer, all_labels, max_len, pair=False)

In [9]:
# Apply the BERT classification transform to every split.
# Gotcha: the names are rebound, so re-running this cell without re-running the
# dataset-construction cell applies `transform` to already-transformed data.
data_train, data_dev, data_test = (
    split.transform(transform) for split in (data_train, data_dev, data_test)
)
print('token ids = \n%s'%data_train[sample_id][0])

token ids = 
[    2 12580  3653  6528 20771 14652  1996  3430  2003 10634  1010 10634
  1010 10634  1010  1998  2087  1997  1996  3459  2052  1050  1005  1056
  2130  2031  2081  2104  3367 21041  2229  1999  5297  1005  1055  3041
  3152  1998  2000  2031  2000  4952  2000  1996  4895 11263 10695  2100
  2097 10768 14069  2079  2010 13703  5297 20017  3084  2033  8840  8988
  2063  1996  2117  3446  1032  1032  1006  2295 29239  2759  1032  1032
  1007 10768 14069  2130  2062  2009  3544  2008  1996 22822  9232  3963
  2095  2214  5297  2323  2031  6573  2125  2147  2043  1996  5119  8369
  1999  1037  2047  2301  7987  7987  1045  5621  2699  2000  2131  2920
  1999  1996  2143  1010  2021  2009  2001  2074  5263  2026  1055 17238
 29251  2015  2071  1050  1005  1056  2543  2008  3254  2061  1010  2738
  2084  2079  4371  2125  1998  3102  1996  5027  5777  1999  2019 10051
  2597  1045  2288  2039  1010     3]


In [10]:
# Training: shuffled batches; 'rollover' carries the final partial batch over
# to the next epoch so every training batch has exactly batch_size examples.
train_dataloader = mx.gluon.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, last_batch='rollover')
# Evaluation: every example must be scored exactly once per pass, so the final
# partial batch is kept ('rollover' would hold it back until the next pass).
dev_dataloader = mx.gluon.data.DataLoader(data_dev, batch_size=batch_size, shuffle=False, last_batch='keep')
# The test loader must iterate the test split (it previously wrapped data_dev).
test_dataloader = mx.gluon.data.DataLoader(data_test, batch_size=batch_size, shuffle=False, last_batch='keep')

In [11]:
# Running accuracy over the training batches, and the classification loss.
metric = mx.metric.Accuracy()
loss_function = gluon.loss.SoftmaxCELoss()
loss_function.hybridize(static_alloc=True)

# Classification model built around the pre-trained encoder. bert_base was
# loaded with pretrained=True, so only the classifier layer is initialized.
model = bert.BERTClassifier(bert_base, num_classes=n_classes, dropout=0.1)
classifier_init = mx.init.Normal(0.02)
model.classifier.initialize(init=classifier_init, ctx=ctx)
model.hybridize(static_alloc=True)

In [12]:
# Fine-tune the classifier with Adam and evaluate on the dev set after every
# pass over the training data. The learning rate is halved whenever the dev
# loss fails to improve, and training stops after `max_patience` consecutive
# non-improving evaluations.
trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr, 'epsilon': 1e-9})

# collect all differentiable parameters
# grad_req == 'null' indicates no gradients are calculated (e.g. constant parameters)
# the gradients for these params are clipped later
params = [p for p in model.collect_params().values() if p.grad_req != 'null']

train_step = 0    # total number of training batches processed so far
epoch_id = 0      # number of completed passes over train_dataloader
best_loss = None  # lowest mean dev loss seen so far (None until the first evaluation)
patience = 0      # consecutive evaluations without a dev-loss improvement
# NOTE: the loop is not bounded by num_epochs; it only ends through the
# patience check at the bottom (previously: for epoch_id in range(num_epochs)).
while True:
    metric.reset()
    step_loss = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
        # load data to GPU
        token_ids = token_ids.as_in_context(ctx)
        valid_length = valid_length.as_in_context(ctx)
        segment_ids = segment_ids.as_in_context(ctx)
        label = label.as_in_context(ctx)

        with autograd.record():
            # forward computation
            out = model(token_ids, segment_ids, valid_length.astype('float32'))
            ls = loss_function(out, label).mean()

        # backward computation
        ls.backward()

        # gradient clipping
        grads = [p.grad(c) for p in params for c in [ctx]]
        gluon.utils.clip_global_norm(grads, grad_clip)

        # parameter update
        # the loss was already averaged over the batch with .mean(), hence step(1)
        trainer.step(1)
        step_loss += ls.asscalar()
        metric.update([label], [out])
        if (batch_id + 1) % (log_interval) == 0:
            # step_loss accumulates over the last log_interval batches and is
            # reset after printing; the accuracy is cumulative since metric.reset()
            print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f}'
                         .format(epoch_id, batch_id + 1, len(train_dataloader),
                                 step_loss / log_interval,
                                 trainer.learning_rate, metric.get()[1]))
            step_loss = 0
        train_step +=1
    epoch_id+=1
    ########################
    #### RUN EVALUATION ####
    ########################
    dev_loss = []  # per-batch mean losses; replaced by their average below
    y_true = []
    y_pred = []
    # no autograd.record() here: this pass only computes outputs and losses
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(dev_dataloader):
        # load data to GPU
        token_ids = token_ids.as_in_context(ctx)
        valid_length = valid_length.as_in_context(ctx)
        segment_ids = segment_ids.as_in_context(ctx)
        label = label.as_in_context(ctx)
        out = model(token_ids, segment_ids, valid_length.astype('float32'))
        ls = loss_function(out, label).mean()
        dev_loss.append(ls.asscalar())
        probs = out.softmax()
        pred = nd.argmax(probs, axis=1).asnumpy()
        y_true.extend(list(np.reshape(label.asnumpy(), (-1))))
        y_pred.extend(pred)
    # average of the per-batch mean losses
    dev_loss = np.mean(dev_loss)
    f1 = f1_score(y_true, y_pred, average="macro")
    acc = accuracy_score(y_true, y_pred)
    print('EVALUATION ON DEV DATASET:')
    print('dev mean loss: {:.4f}, f1-score: {:.4f}, accuracy: {:0.4f}'.format(dev_loss, f1, acc))
    if best_loss is None or dev_loss < best_loss:
        # new best dev loss: write a checkpoint whose name embeds train_step
        # (a separate '*_best' file is written for every improvement)
        model.save_parameters('{}_{}_best'.format(filename, train_step))
        best_loss = dev_loss
        patience=0
    else:
        # no improvement: halve the learning rate and count the miss
        new_lr = trainer.learning_rate/2
        trainer.set_learning_rate(new_lr)
        # the printed patience value is the count before this miss is added
        print('patience #{}: reducing the lr to {}'.format(patience, new_lr))
        patience+=1
    if patience == max_patience:
        # out of patience: save the final (not necessarily best) weights and stop
        model.save_parameters('{}_{}'.format(filename, train_step))
        break

NameError: name 'epoch_id' is not defined

In [None]:
# Load the best checkpoint for evaluation.
# Training writes a new '<filename>_<train_step>_best' file every time the dev
# loss improves, so several files can match the pattern and glob() returns them
# in no particular order. The most recently written match is the one saved at
# the lowest dev loss of the latest run.
best_ckpts = glob('{}*best'.format(filename))
if not best_ckpts:
    raise FileNotFoundError('no checkpoint matching {}*best'.format(filename))
best_ckpt = max(best_ckpts, key=os.path.getmtime)
model = bert.BERTClassifier(bert_base, num_classes=n_classes, dropout=0.1)
model.load_parameters(best_ckpt, ctx=ctx)

In [None]:
# Gather gold labels and argmax predictions for every batch of test_dataloader.
y_true, y_pred = [], []
for batch_id, batch in enumerate(test_dataloader):
    # move all four fields of the batch onto the compute device
    token_ids, valid_length, segment_ids, label = (
        field.as_in_context(ctx) for field in batch
    )
    probs = model(token_ids, segment_ids, valid_length.astype('float32')).softmax()
    predicted = nd.argmax(probs, axis=1).asnumpy()
    # flatten the labels to 1-D before appending them
    y_true.extend(np.reshape(label.asnumpy(), (-1)))
    y_pred.extend(predicted)
assert len(y_true)==len(y_pred)

In [None]:
# Print the macro scores, draw the confusion matrix and store it next to the checkpoints.
cm_path = '{}/cm.png'.format(working_dir)
flat_true = np.reshape(y_true, (-1))
fig = print_results(flat_true, y_pred)
fig.savefig(cm_path)