In [1]:
import warnings
warnings.filterwarnings('ignore')

import random
import os
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, autograd
import gluonnlp as nlp
from bert import *
from gluonnlp.data import TSVDataset
from glob import glob
from os.path import expanduser

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# Seed every random number generator used by the notebook so runs are repeatable.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up; setting it here
# presumably only affects child processes, not this running kernel - confirm.
SEED = 100
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(SEED)
random.seed(SEED)
mx.random.seed(SEED)

# Compute device: first GPU if MXNet can see one, CPU otherwise.
if mx.test_utils.list_gpus():
    ctx = mx.gpu(0)
else:
    ctx = mx.cpu()

In [2]:
# Corpus to fine-tune on. Used below as the key into `nclasses` and as the
# last path component of the data and model directories.
dataset_name = 'imdb'

In [3]:
# Number of target classes for each supported dataset name.
nclasses = {
    "20ng": 20,
    "imdb": 2,
    "r8": 8,
    "r52": 52,
    "ohsumed_all": 23,
    "ohsumed_first": 23,
}

In [4]:
# Filesystem layout, rooted at the current user's home directory:
#   DATADIR     - directory the train/dev/test .tsv files are read from
#   working_dir - directory where checkpoints and the confusion-matrix image are written
#   filename    - base path of the parameter checkpoints (suffixes are appended at save time)
HOME = expanduser("~")
DATADIR = '{}/working_dir/classification/files/{}/'.format(HOME, dataset_name)
working_dir = "{}/working_dir/classification/models/bert/{}/".format(HOME, dataset_name)
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and the cell stays safe to re-run.
os.makedirs(working_dir, exist_ok=True)
# working_dir already ends with a separator; os.path.join avoids the doubled
# '//' that plain string formatting produced in the checkpoint path.
filename = os.path.join(working_dir, 'net.params')

In [5]:
# ---- network / optimisation hyper-parameters ----
max_len = 150                                   # maximum sequence length
n_classes = nclasses[dataset_name]              # number of classes of the selected dataset
all_labels = list(map(str, range(n_classes)))   # label names: "0" .. str(n_classes - 1)
batch_size = 32                                 # samples per batch
lr = 5e-6                                       # initial learning rate
grad_clip = 1                                   # gradient clipping value (global norm)
log_interval = 50                               # print training stats every `log_interval` batches
max_patience = 5                                # early-stopping budget checked after each dev evaluation

In [6]:
def print_results(y_true, y_pred):
    """Print classification metrics and draw the confusion matrix.

    Accuracy and macro-averaged F1, precision and recall are printed as
    percentages. The confusion matrix is rendered as an annotated heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the confusion-matrix heatmap, so the caller can
        display or save it.
    """
    print("Accuracy: {:2.2f}".format(100*accuracy_score(y_true, y_pred)))
    print("F1-Score: {:2.2f}".format(100*f1_score(y_true, y_pred, average="macro")))
    print("Precision: {:2.2f}".format(100*precision_score(y_true, y_pred, average="macro")))
    print("Recall: {:2.2f}".format(100*recall_score(y_true, y_pred, average="macro")))
    fig, ax = plt.subplots(figsize=(18, 10))
    # fmt='d' prints the integer counts verbatim; seaborn's default '.2g'
    # would show any count of 100 or more in scientific notation.
    sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt='d', ax=ax)
    # put class 0 at the bottom-left corner
    ax.invert_yaxis()
    # label through the Axes we own rather than pyplot's implicit "current axes"
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    return fig

class Dataset(TSVDataset):
    """One split of a text-classification corpus stored as ``<root>/<segment>.tsv``.

    Only the first two tab-separated columns of the file are read:
    column 0 is the text and column 1 is the label.

    Parameters
    ----------
    segment : str, default 'train'
        Dataset segment. Options are 'train', 'dev' and 'test'.
    root : str, default '.'
        Directory containing the train/dev/test ``.tsv`` files.
    n_classes : int, default 2
        Number of classes; stored on the instance and used by ``get_labels``.
    """
    def __init__(self, segment='train', root='.', n_classes=2):
        self._supported_segments = ['train', 'dev', 'test']
        assert segment in self._supported_segments, 'Unsupported segment: %s'%segment
        path = os.path.join(root, '%s.tsv'%segment)
        A_IDX, LABEL_IDX = 0, 1
        fields = [A_IDX, LABEL_IDX]
        self.n_classes=n_classes
        super(Dataset, self).__init__(path, field_indices=fields)

    # This was a @staticmethod that nevertheless read `self.n_classes`, so
    # every call raised NameError. It needs per-instance state, hence a
    # regular method.
    def get_labels(self):
        """Get classification label ids of the dataset.

        Returns
        -------
        list of str
            ``['0', '1', ..., str(n_classes - 1)]``.
        """
        return [str(i) for i in range(self.n_classes)]

In [7]:
# Load the three splits from <DATADIR>/<segment>.tsv, in train/dev/test order.
data_train, data_dev, data_test = (
    Dataset(root=DATADIR, segment=split_name, n_classes=n_classes)
    for split_name in ('train', 'dev', 'test')
)

In [8]:
# Pick one random training example and show its raw text and label.
sample_id = np.random.randint(0, len(data_train))
sample = data_train[sample_id]
print('<<<<TEXT>>>>')
print(sample[0])
print("<<<<LABEL>>>>")
print(sample[1])

<<<<TEXT>>>>
this superb film draws on a variety of talented actors and musicians at the top of their form levant , crosby , martin , rathbone , manone are completely at home in the story that apparently was supplied by billy wilder one would love to know more about how much he had to do with it , because it 's an exceptionally clever variation on the sterile master fertile servant tale nearly an allegory of the entertainment industry , run by dried up numskulls , but made into a vibrant world of art and play by an exploited underclass of nobodies and non wasps looking at the last six decades of music , tv , and film in the us , it 's hard not to see the underlying insights of this film as prophetic
<<<<LABEL>>>>
1


In [9]:
# Fetch the pre-trained BERT encoder together with its vocabulary.
# The pooler output is kept; the decoder and classifier heads are switched off
# (a task-specific classifier is attached in a later cell).
bert_options = dict(
    dataset_name='book_corpus_wiki_en_uncased',
    pretrained=True,
    ctx=ctx,
    use_pooler=True,
    use_decoder=False,
    use_classifier=False,
)
bert_base, vocabulary = nlp.model.get_model('bert_12_768_12', **bert_options)

In [10]:
# use the vocabulary from pre-trained model for tokenization
# (lower-casing is enabled, consistent with the *uncased* checkpoint loaded above)
tokenizer = tokenization.FullTokenizer(vocabulary, do_lower_case=True)
# Single-sentence (pair=False) classification transform over the label set
# `all_labels`.
# NOTE(review): presumably max_len is the length sequences are truncated/padded
# to - confirm against ClassificationTransform's signature.
transform = dataset.ClassificationTransform(tokenizer, all_labels, max_len, pair=False)

In [11]:
# Apply the BERT preprocessing transform to every split.
# NOTE: the names are rebound to the transformed datasets, so this cell is not
# idempotent - re-run the dataset-loading cell before re-running this one.
data_train, data_dev, data_test = (
    split.transform(transform) for split in (data_train, data_dev, data_test)
)
# first field of the previously sampled example, now as token ids
print('token ids = \n%s' % (data_train[sample_id][0],))

token ids = 
[    2  2023 21688  2143  9891  2006  1037  3528  1997 10904  5889  1998
  5389  2012  1996  2327  1997  2037  2433 24485  1010 14282  1010  3235
  1010  9350  2232 14417  1010  2158  5643  2024  3294  2012  2188  1999
  1996  2466  2008  4593  2001  8127  2011  5006 18463  2028  2052  2293
  2000  2113  2062  2055  2129  2172  2002  2018  2000  2079  2007  2009
  1010  2138  2009  1005  1055  2019 17077 12266  8386  2006  1996 25403
  3040 14946  7947  6925  3053  2019  2035 20265  2854  1997  1996  4024
  3068  1010  2448  2011  9550  2039 16371  5244  5283 12718  1010  2021
  2081  2046  1037 17026  2088  1997  2396  1998  2377  2011  2019 18516
  2104 26266  1997  2053  5092 18389  1998  2512 23146  2559  2012  1996
  2197  2416  5109  1997  2189  1010  2694  1010  1998  2143  1999  1996
  2149  1010  2009  1005  1055  2524  2025  2000  2156  1996 10318 20062
  1997  2023  2143  2004 12168     3]


In [12]:
# Training batches are reshuffled each epoch; with 'rollover' the samples of an
# incomplete final batch are carried over into the next epoch.
train_dataloader = mx.gluon.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, last_batch='rollover')
# Evaluation must score every sample exactly once, so the (possibly smaller)
# final batch is kept instead of being withheld by 'rollover'.
dev_dataloader = mx.gluon.data.DataLoader(data_dev, batch_size=batch_size, shuffle=False, last_batch='keep')
# The test loader must read data_test: it previously wrapped data_dev, so the
# reported "test" metrics were really dev metrics.
test_dataloader = mx.gluon.data.DataLoader(data_test, batch_size=batch_size, shuffle=False, last_batch='keep')

In [None]:
# Wrap the pre-trained encoder with a classification head sized for n_classes.
model = bert.BERTClassifier(bert_base, num_classes=n_classes, dropout=0.1)
# only need to initialize the classifier layer.
# (bert_base was loaded with pretrained=True, so only the new head has no weights yet)
model.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
model.hybridize(static_alloc=True)

# softmax cross entropy loss for classification
loss_function = gluon.loss.SoftmaxCELoss()
loss_function.hybridize(static_alloc=True)

# running training accuracy; reset at the start of every epoch by the training loop
metric = mx.metric.Accuracy()

In [None]:
# Fine-tune the classifier with Adam, evaluating on the dev set after every
# epoch. The checkpoint with the lowest dev loss is saved as '<filename>_best'.
# Each non-improving epoch halves the learning rate; training stops after
# `max_patience` consecutive non-improving epochs.
trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr, 'epsilon': 1e-9})

# collect all differentiable parameters
# grad_req == 'null' indicates no gradients are calculated (e.g. constant parameters)
# the gradients for these params are clipped later
params = [p for p in model.collect_params().values() if p.grad_req != 'null']

train_step = 0     # total number of parameter updates performed so far
epoch_id = 0
best_loss = None   # lowest dev loss seen so far (None until the first evaluation)
patience = 0       # consecutive epochs without a dev-loss improvement
while True:
    metric.reset()
    step_loss = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
        # load data to GPU
        token_ids = token_ids.as_in_context(ctx)
        valid_length = valid_length.as_in_context(ctx)
        segment_ids = segment_ids.as_in_context(ctx)
        label = label.as_in_context(ctx)

        with autograd.record():
            # forward computation
            out = model(token_ids, segment_ids, valid_length.astype('float32'))
            ls = loss_function(out, label).mean()

        # backward computation
        ls.backward()

        # gradient clipping
        grads = [p.grad(c) for p in params for c in [ctx]]
        gluon.utils.clip_global_norm(grads, grad_clip)

        # parameter update; the loss is already a batch mean, hence step(1)
        trainer.step(1)
        step_loss += ls.asscalar()
        metric.update([label], [out])
        if (batch_id + 1) % (log_interval) == 0:
            print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f}'
                         .format(epoch_id, batch_id + 1, len(train_dataloader),
                                 step_loss / log_interval,
                                 trainer.learning_rate, metric.get()[1]))
            step_loss = 0
        train_step +=1
    epoch_id+=1
    ########################
    #### RUN EVALUATION ####
    ########################
    # forward passes only: nothing in this loop is recorded by autograd
    dev_loss = []
    y_true = []
    y_pred = []
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(dev_dataloader):
        # load data to GPU
        token_ids = token_ids.as_in_context(ctx)
        valid_length = valid_length.as_in_context(ctx)
        segment_ids = segment_ids.as_in_context(ctx)
        label = label.as_in_context(ctx)
        # get logits and loss value
        out = model(token_ids, segment_ids, valid_length.astype('float32'))
        ls = loss_function(out, label).mean()
        dev_loss.append(ls.asscalar())
        probs = out.softmax()
        pred = nd.argmax(probs, axis=1).asnumpy()
        y_true.extend(list(np.reshape(label.asnumpy(), (-1))))
        y_pred.extend(pred)
    # mean of the per-batch mean losses
    dev_loss = np.mean(dev_loss)
    f1 = f1_score(y_true, y_pred, average="macro")
    acc = accuracy_score(y_true, y_pred)
    print('EVALUATION ON DEV DATASET:')
    print('dev mean loss: {:.4f}, f1-score: {:.4f}, accuracy: {:0.4f}'.format(dev_loss, f1, acc))
    if best_loss is None or dev_loss < best_loss:
        # new best: overwrite the single "best" checkpoint and reset the counter
        model.save_parameters('{}_best'.format(filename))
        best_loss = dev_loss
        print('dev best loss updated: {:.4f}'.format(best_loss))
        patience=0
    else:
        # Count this failed epoch *before* testing the budget. The previous
        # check-then-increment order only stopped on the (max_patience + 1)-th
        # consecutive failure.
        patience+=1
        if patience >= max_patience:
            # keep the final (non-best) weights too, tagged with the update count
            model.save_parameters('{}_{}'.format(filename, train_step))
            break
        new_lr = trainer.learning_rate/2
        trainer.set_learning_rate(new_lr)
        print('patience #{}: reducing the lr to {}'.format(patience, new_lr))

[Epoch 0 Batch 50/664] loss=0.6720, lr=0.0000050, acc=0.591
[Epoch 0 Batch 100/664] loss=0.5595, lr=0.0000050, acc=0.672
[Epoch 0 Batch 150/664] loss=0.4451, lr=0.0000050, acc=0.719
[Epoch 0 Batch 200/664] loss=0.3723, lr=0.0000050, acc=0.753
[Epoch 0 Batch 250/664] loss=0.3915, lr=0.0000050, acc=0.767
[Epoch 0 Batch 300/664] loss=0.3544, lr=0.0000050, acc=0.781
[Epoch 0 Batch 350/664] loss=0.3347, lr=0.0000050, acc=0.792
[Epoch 0 Batch 400/664] loss=0.3473, lr=0.0000050, acc=0.799
[Epoch 0 Batch 450/664] loss=0.3227, lr=0.0000050, acc=0.806
[Epoch 0 Batch 500/664] loss=0.3221, lr=0.0000050, acc=0.813
[Epoch 0 Batch 550/664] loss=0.3175, lr=0.0000050, acc=0.818
[Epoch 0 Batch 600/664] loss=0.3079, lr=0.0000050, acc=0.822
[Epoch 0 Batch 650/664] loss=0.3357, lr=0.0000050, acc=0.825
EVALUATION ON DEV DATASET:
dev mean loss: 0.3111, f1-score: 0.8740, accuracy: 0.8747
[Epoch 1 Batch 50/664] loss=0.2537, lr=0.0000050, acc=0.896
[Epoch 1 Batch 100/664] loss=0.2973, lr=0.0000050, acc=0.885
[E

In [None]:
# load the best pre-trained model for evaluation
# sorted() makes the pick deterministic should several files match the pattern
best_ckpts = sorted(glob('{}*best'.format(filename)))
if not best_ckpts:
    # fail with an actionable message instead of a bare IndexError
    raise FileNotFoundError(
        'no checkpoint matching {}*best was found; run the training cell first'.format(filename))
best_ckpt = best_ckpts[0]
# rebuild the classifier around the same encoder, then restore the saved weights
model = bert.BERTClassifier(bert_base, num_classes=n_classes, dropout=0.1)
model.load_parameters(best_ckpt, ctx=ctx)

In [None]:
# Run the reloaded model over test_dataloader, collecting the gold labels and
# the argmax predictions batch by batch.
y_true, y_pred = [], []
for batch_id, batch in enumerate(test_dataloader):
    # move every array of the batch onto the compute device
    token_ids, valid_length, segment_ids, label = (arr.as_in_context(ctx) for arr in batch)
    probs = model(token_ids, segment_ids, valid_length.astype('float32')).softmax()
    y_true.extend(label.asnumpy().reshape(-1))
    y_pred.extend(nd.argmax(probs, axis=1).asnumpy())
# one prediction per gold label
assert len(y_pred) == len(y_true)

In [None]:
# Print the metrics for the collected predictions and save the confusion-matrix
# figure inside the working directory.
flat_true = np.reshape(y_true, (-1))
fig = print_results(flat_true, y_pred)
cm_path = '{}/cm.png'.format(working_dir)
fig.savefig(cm_path)