In [1]:
import pandas as pd
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon, init
from mxnet.gluon import nn, rnn
import gluonnlp as nlp
import pkuseg
import multiprocessing as mp
import time
from d2l import try_gpu
import itertools
from sklearn.metrics import accuracy_score, f1_score
# fixed random number seed
np.random.seed(9102)
mx.random.seed(9102)

In [2]:
# Notebook-wide configuration: data locations, sample size, device and segmenter.
# All file names below are joined onto DATA_FOLDER by plain string concatenation.
DATA_FOLDER = 'data/'
TRAIN_DATA = 'train.csv'
# Pre-trained word-embedding file, loaded later with TokenEmbedding.from_file.
WORD_EMBED = 'sgns.weibo.bigram-char'
LABEL_FILE = 'train.label'
# Only the first N_ROWS rows of the training CSV are read.
N_ROWS=10000
# Compute device chosen by d2l's try_gpu() (presumably GPU if available, else CPU).
ctx = try_gpu()
# pkuseg word segmenter using the 'web' model; used to tokenize tweets.
seg = pkuseg.pkuseg(model_name='web')

In [3]:
# Read the first N_ROWS rows of the '|'-separated training file.
train_df = pd.read_csv(DATA_FOLDER+TRAIN_DATA, nrows=N_ROWS, sep='|')

In [4]:
# Build [tweet, label] pairs from the first two columns of the frame.
# Columns are selected by position with .iloc instead of `row[0]` / `row[1]`
# inside iterrows(): integer keys on a string-labelled Series rely on a
# deprecated positional fallback, and iterrows() builds one Series per row.
dataset = [[tweet, label]
           for tweet, label in zip(train_df.iloc[:, 0], train_df.iloc[:, 1])]
# Hold out 20% of the samples for validation.
train_dataset, valid_dataset = nlp.data.train_valid_split(dataset, .2)
len(train_dataset), len(valid_dataset)

(8000, 2000)

In [5]:
def tokenizer(x):
    """Segment the tweet of one ``(tweet, label)`` sample into words.

    The tweet is converted with ``str()`` unless it is exactly of type ``str``
    and is then passed to the notebook-level segmenter ``seg``.

    Returns:
        A ``(word_list, label)`` tuple; the label is passed through untouched.
    """
    text, label = x
    # Non-string cells are stringified so that seg.cut always receives text.
    text = text if type(text) is str else str(text)
    return seg.cut(text), label

def get_length(x):
    """Return the length of the first element of sample ``x`` as a float."""
    tokens = x[0]
    return float(len(tokens))

def to_word_list(dataset):
    """Tokenize every ``(tweet, label)`` sample and record its token count.

    Tokenization runs in a process pool through ``tokenizer``. The lengths are
    computed in the calling process: ``len()`` is far cheaper than pickling
    each token list to a worker and back.

    Args:
        dataset: iterable of ``(tweet, label)`` samples.

    Returns:
        ``(dataset, lengths)`` where both are ``gluon.data.ArrayDataset``
        objects: the tokenized ``(word_list, label)`` samples and the matching
        per-sample lengths (floats, as returned by ``get_length``).
    """
    start = time.time()
    with mp.Pool() as pool:
        # pool.map blocks until every sample has been processed and returns
        # the results in input order.
        tokenized = pool.map(tokenizer, dataset)
    dataset = gluon.data.ArrayDataset(tokenized)
    lengths = gluon.data.ArrayDataset([get_length(sample) for sample in tokenized])
    end = time.time()

    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset, lengths

# Tokenize both splits; the lengths are later used for bucketing the training set.
train_word_list, train_word_lengths = to_word_list(train_dataset)
valid_word_list, valid_word_lengths = to_word_list(valid_dataset)

Done! Tokenizing Time=1.01s, #Sentences=8000
Done! Tokenizing Time=0.47s, #Sentences=2000


In [6]:
# Token sequences of the training split only (labels dropped), flattened into
# one token list for counting.
train_seqs = [sample[0] for sample in train_word_list]
counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(train_seqs)))

# Vocabulary built from the training-token counts (max_size=40000).
vocab = nlp.Vocab(counter, max_size=40000)

# Load the custom pre-trained embedding file and attach it to the vocabulary.
embedding_weights = nlp.embedding.TokenEmbedding.from_file(file_path=DATA_FOLDER+WORD_EMBED)
vocab.set_embedding(embedding_weights)
print(vocab)

def token_to_idx(x):
    """Look up the tokens of a ``(tokens, label)`` sample in the global ``vocab``.

    Returns:
        ``(vocab[tokens], label)``; the label is passed through unchanged.
    """
    tokens, label = x[0], x[1]
    return vocab[tokens], label

# Convert every tokenized sample to (vocab[tokens], label) in worker processes.
# NOTE: this rebinds train_dataset / valid_dataset, which until now held the raw
# [tweet, label] pairs produced by the train/valid split.
with mp.Pool() as pool:
    train_dataset = pool.map(token_to_idx, train_word_list)
    valid_dataset = pool.map(token_to_idx, valid_word_list)

  .format(line_num, pretrained_file_path))


Vocab(size=17473, unk="<unk>", reserved="['<pad>', '<bos>', '<eos>']")


In [7]:
# Batching configuration read by get_dataloader().
batch_size = 64      # batch size for both the bucket sampler and the validation loader
bucket_num = 10      # passed to FixedBucketSampler as num_buckets
bucket_ratio = 0.5   # passed to FixedBucketSampler as ratio


def get_dataloader():
    """Build the training and validation ``DataLoader`` objects.

    Reads the notebook-level ``train_dataset``, ``valid_dataset``,
    ``train_word_lengths``, ``batch_size``, ``bucket_num`` and ``bucket_ratio``.
    Training batches are drawn by a shuffled ``FixedBucketSampler`` keyed on
    ``train_word_lengths``; validation batches are taken in order with a fixed
    batch size. The sampler statistics are printed as a side effect.

    Returns:
        ``(train_dataloader, valid_dataloader)``.
    """
    # Every sample is a (token_ids, label) pair: pad the ids along axis 0 and
    # stack the labels.
    pad_tokens = nlp.data.batchify.Pad(axis=0)
    stack_labels = nlp.data.batchify.Stack()
    collate = nlp.data.batchify.Tuple(pad_tokens, stack_labels)

    # Group samples into fixed buckets by length.
    bucket_sampler = nlp.data.sampler.FixedBucketSampler(
        train_word_lengths,
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=True)
    print(bucket_sampler.stats())

    loader_for_training = gluon.data.DataLoader(
        dataset=train_dataset,
        batch_sampler=bucket_sampler,
        batchify_fn=collate)
    loader_for_validation = gluon.data.DataLoader(
        dataset=valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        batchify_fn=collate)
    return loader_for_training, loader_for_validation

# Create both loaders once; they are reused by the training and validation loops.
train_dataloader, valid_dataloader = get_dataloader()

FixedBucketSampler:
  sample_num=8000, batch_num=55
  key=[14, 23, 32, 41, 50, 59, 68, 77, 86, 95]
  cnt=[5902, 942, 449, 255, 140, 110, 97, 57, 29, 19]
  batch_size=[217, 132, 95, 74, 64, 64, 64, 64, 64, 64]


In [8]:
# Sanity check: print only the first batch produced by the training loader.
for tweet, label in itertools.islice(train_dataloader, 1):
    print(tweet, label)


[[1.100e+02 5.000e+01 1.020e+02 ... 0.000e+00 0.000e+00 0.000e+00]
 [3.140e+03 5.000e+00 3.594e+03 ... 0.000e+00 0.000e+00 0.000e+00]
 [3.760e+02 3.122e+03 4.500e+01 ... 0.000e+00 0.000e+00 0.000e+00]
 ...
 [2.389e+03 1.070e+02 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.600e+01 5.475e+03 4.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [1.634e+03 2.100e+01 2.800e+01 ... 0.000e+00 0.000e+00 0.000e+00]]
<NDArray 217x14 @cpu_shared(0)> 
[ 3 30 39 10 47 23 26  0  3  0 68 29 42 38 35 34  3 31 12 16 29 20 24  3
 18 29 35 23  6 17 23 29  9 39  3 34 49  3 61 12 16 12 30 28 43 40  8 47
  6 29 12 30  3 29  0 64 67  9 16 58 23  0 23  3  1 12  8 35 43 16  1 16
  0  3  3 12 38 22  3 52 32 41 29 42 12 23  6 43 17  6 23  6  6  3 16  3
 14 31 11 24 23 36 12 28 41 35 17 55 12  3 36 29 14 35  3 30 23 41 31 14
 12 12 16 30 19 33 23 35 27 14 29 23 24  7 34 14 34  1 34  3 11 12 14 23
  6 14 23 23  0 52 23 12 30 10  0 12 10 14 16 16 34 12 31  6 29 12 62 13
 35 62 35 23 24 17  3 14  4 14  9  6  3 18 47 2

## Model construction
Self attention layer, weighted cross entropy, and whole model

In [9]:
# Custom self-attention layer.
# It computes A = softmax(W_2 * tanh(W_1 * H)) and applies A to H, where H holds
# the per-token features of the whole sentence.
class SelfAttention(nn.HybridBlock):
    """Multi-hop self-attention over a sequence of feature vectors.

    Args:
        att_unit: width of the hidden tanh projection (d_a in the formula above).
        att_hops: number of attention hops, i.e. rows of the attention matrix.
    """

    def __init__(self, att_unit, att_hops, **kwargs):
        super().__init__(**kwargs)
        with self.name_scope():
            # tanh(W_1 * H), applied position-wise because flatten=False
            self.ut_dense = nn.Dense(att_unit, activation='tanh', flatten=False)
            # W_2 * (...): one un-normalized score per hop and position
            self.et_dense = nn.Dense(att_hops, activation=None, flatten=False)

    def hybrid_forward(self, F, x):
        """Return ``(output, att)`` for input ``x``.

        ``F`` is the backend (ndarray or symbol) implementing the tensor ops.
        ``x`` is expected as [batch_size, seq_len, feature_width]; ``att`` is
        [batch_size, att_hops, seq_len] and ``output`` is
        [batch_size, att_hops, feature_width].
        """
        # [batch_size, seq_len, att_unit] -> [batch_size, seq_len, att_hops]
        scores = self.et_dense(self.ut_dense(x))
        # Put hops before positions so that softmax normalizes over seq_len.
        hop_major = F.transpose(scores, axes=(0, 2, 1))
        att = F.softmax(hop_major, axis=-1)
        # Attention-weighted sums of the input features, one per hop.
        return F.batch_dot(att, x), att
    
# Smoke test: build a layer with d_a = 20 attention units and 5 hops and print its structure.
print(SelfAttention(20, 5))

SelfAttention(
  (ut_dense): Dense(None -> 20, Activation(tanh))
  (et_dense): Dense(None -> 5, linear)
)


In [10]:
class WeightedSoftmaxCE(nn.HybridBlock):
    """Softmax cross-entropy with a per-class weight applied to the target.

    Args:
        sparse_label: if True, ``label`` holds class indices and is one-hot
            encoded with ``depth`` classes; otherwise it is used as given.
        from_logits: if True, ``pred`` is used as log-probabilities directly;
            otherwise ``log_softmax`` is applied over the last axis.
    """

    def __init__(self, sparse_label=True, from_logits=False,  **kwargs):
        super().__init__(**kwargs)
        with self.name_scope():
            self.sparse_label, self.from_logits = sparse_label, from_logits

    def hybrid_forward(self, F, pred, label, class_weight, depth=None):
        """Return the un-reduced loss, summed over the last axis only."""
        log_prob = pred if self.from_logits else F.log_softmax(pred, -1)

        target = label
        if self.sparse_label:
            # Flatten the indices, then expand them to one-hot rows.
            target = F.one_hot(F.reshape(label, shape=(-1, )), depth)

        # Scale each one-hot row by the weight of its class.
        weighted_target = F.broadcast_mul(target, class_weight)
        # No mean over the batch is taken here; callers reduce as needed.
        return -F.sum(log_prob * weighted_target, axis=-1)

In [11]:
class SelfAttentiveBiLSTM(nn.HybridBlock):
    """Embedding -> BiLSTM -> multi-hop self-attention -> dense classifier.

    Args:
        vocab_len: number of rows of the embedding table.
        embsize: embedding width.
        nhidden: LSTM hidden size (per direction).
        nlayers: number of stacked LSTM layers.
        natt_unit: hidden width of the attention layer.
        natt_hops: number of attention hops.
        nfc: width of the fully connected layer before the output layer.
        nclass: number of output classes.
        drop_prob: dropout probability used inside the LSTM and around the
            dense layers.
        pool_way: how the attention output is reduced; only ``'flatten'`` is
            implemented.
        prune_p, prune_q: when both are truthy, two extra dense layers are
            created for a future ``'prune'`` pooling mode (currently unused in
            the forward pass).
    """

    def __init__(self, vocab_len, embsize, nhidden, nlayers, natt_unit, natt_hops,
                 nfc, nclass,
                 drop_prob, pool_way, prune_p=None, prune_q=None, **kwargs):
        super(SelfAttentiveBiLSTM, self).__init__(**kwargs)
        with self.name_scope():
            self.embedding_layer = nn.Embedding(vocab_len, embsize)

            self.bilstm = rnn.LSTM(nhidden, num_layers=nlayers, dropout=drop_prob,
                                   bidirectional=True)

            self.att_encoder = SelfAttention(natt_unit, natt_hops)
            self.dense = nn.Dense(nfc, activation='tanh')
            # this layer is used to output the final class scores
            self.output_layer = nn.Dense(nclass)

            self.dense_p, self.dense_q = None, None
            if all([prune_p, prune_q]):
                self.dense_p = nn.Dense(prune_p, activation='tanh', flatten=False)
                self.dense_q = nn.Dense(prune_q, activation='tanh', flatten=False)

            self.drop_prob = drop_prob
            self.pool_way = pool_way

    def hybrid_forward(self, F, inp):
        """Return ``(output, att)``: class scores and the attention weights.

        Raises:
            NotImplementedError: if ``pool_way`` is anything but ``'flatten'``.
        """
        # inp_embed: [batch, seq_len, embed_size]
        inp_embed = self.embedding_layer(inp)
        # The LSTM expects time-major input, so swap the batch and time axes.
        h_output = self.bilstm(F.transpose(inp_embed, axes=(1, 0, 2)))
        # Back to batch-major for attention; output: [batch, att_hops, lstm_features]
        output, att = self.att_encoder(F.transpose(h_output, axes=(1, 0, 2)))

        # FIXME: only 'flatten' pooling is implemented. The 'mean' and 'prune'
        # variants (the latter would use dense_p / dense_q) are still to be written.
        if self.pool_way != 'flatten':
            # `raise NotImplemented` would itself fail with a TypeError because
            # NotImplemented is not an exception class.
            raise NotImplementedError(
                "pool_way %r is not supported; only 'flatten' is implemented"
                % (self.pool_way,))
        dense_input = F.Dropout(F.flatten(output), self.drop_prob)

        dense_out = self.dense(dense_input)
        output = self.output_layer(F.Dropout(dense_out, self.drop_prob))

        return output, att

In [12]:
vocab_len = len(vocab)
emsize = 300   # word embedding size
nhidden = 400    # lstm hidden_dim
nlayers = 4     # lstm layers
natt_unit = 400     # the hidden_units of attention layer
natt_hops = 20    # the channels of attention
nfc = 512  # last dense layer size
nclass = 72 # we have 72 emoji in total

drop_prob = 0.6
pool_way = 'flatten'    # the way to handle the attention output M
prune_p = None
prune_q = None

ctx = try_gpu()

# Set to True to start from a previously exported model instead of a fresh one.
# This replaces the former `try: assert(False) ... except:` construct, which
# used a bare except as control flow and would have hidden any real error.
USE_SAVED_MODEL = False

model = None
if USE_SAVED_MODEL:
    try:
        model = gluon.nn.SymbolBlock.imports("model/model-symbol.json", ['data'], \
                                             "model/model-0001.params", ctx=ctx)
        print('use saved model params to start')
    except Exception as err:
        # Fall back to a freshly initialized model, but say why.
        print('could not load saved model (%s)' % err)

if model is None:
    model = SelfAttentiveBiLSTM(vocab_len, emsize, nhidden, nlayers,
                                natt_unit, natt_hops, nfc, nclass,
                                drop_prob, pool_way, prune_p, prune_q)

    print('initialize a new model')
    model.initialize(init=init.Xavier(), ctx=ctx)
    model.hybridize()

    # Copy the pre-trained word vectors attached to the vocabulary into the
    # embedding layer ...
    model.embedding_layer.weight.set_data(vocab.embedding.idx_to_vec)
    # ... and freeze that layer so it receives no gradient updates.
    model.embedding_layer.collect_params().setattr('grad_req', 'null')

print(model)

initialize a new model
SelfAttentiveBiLSTM(
  (embedding_layer): Embedding(17473 -> 300, float32)
  (bilstm): LSTM(None -> 400, TNC, num_layers=4, dropout=0.6, bidirectional)
  (att_encoder): SelfAttention(
    (ut_dense): Dense(None -> 400, Activation(tanh))
    (et_dense): Dense(None -> 20, linear)
  )
  (dense): Dense(None -> 512, Activation(tanh))
  (output_layer): Dense(None -> 72, linear)
)


In [13]:
# Sanity check: display the pre-trained vector stored for the token 'i'.
vocab.embedding.idx_to_vec[vocab.embedding.token_to_idx['i']]


[-0.216152  0.111755  0.131167  0.108303 -0.56618   0.154915 -0.682917
  0.184372 -1.259019 -0.270754 -0.295431 -0.477648  0.313637 -0.249563
  0.464083 -0.177201  0.250866  0.63421  -0.023141 -0.083413 -0.086886
  0.373867 -0.120577  0.154108  0.075599  0.749676  0.064633  0.346573
 -0.375857  0.170967 -0.387877  0.621435  0.252638 -0.327384  0.03449
  0.59719   0.396766  0.550666 -0.658407 -0.525238  0.167532 -0.511287
 -0.360124 -0.815612 -0.511149 -0.866398  0.068793 -0.629899  0.036555
 -0.245903 -0.501821  0.222177 -0.887551 -0.059061  0.357666  0.444045
 -0.632446  0.706885  0.488229  0.459782  0.109316 -0.090775 -0.408769
  0.25539   0.630114 -0.136657 -0.541437  0.510262 -0.273591  0.137092
 -0.586211 -0.199848  0.066356  0.603941 -0.288794 -0.023497 -0.258354
  0.341849  0.427584 -0.001543  0.755926  0.718712 -1.017008  0.452808
  0.173271  0.29188   0.644698  0.49299   0.216398  0.517657  0.00933
  0.858918 -0.384057 -0.178975 -0.281533  1.395328  0.856061  0.560499
 -0.335

## Training helpers
Calculate loss, one epoch computation and top function for train and valid

In [14]:
def calculate_loss(x, y, model, loss, class_weight, penal_coeff, loss_name=None):
    """Run the model on one batch and return predictions and per-sample loss.

    The loss is the chosen criterion plus ``penal_coeff`` times an attention
    diversity penalty: the norm of ``A A^T - I`` taken over the last two axes,
    where ``A`` is the attention matrix returned by the model.

    Args:
        x: input batch passed to ``model``.
        y: labels for the batch.
        model: callable returning ``(pred, att)``.
        loss: loss block; called as ``loss(pred, y)`` for 'sce'/'l1'/'l2' and as
            ``loss(pred, y, class_weight, n_classes)`` for 'wsce'.
        class_weight: per-class weights, only used for 'wsce'.
        penal_coeff: coefficient of the diversity penalty.
        loss_name: one of 'sce', 'l1', 'l2', 'wsce'. When omitted, the
            notebook-level ``loss_name`` variable is used, which is what this
            function always did before the parameter existed.

    Returns:
        ``(pred, l)``: raw model predictions and the per-sample loss.

    Raises:
        NotImplementedError: for an unknown ``loss_name``.
    """
    if loss_name is None:
        # Backward compatibility for callers that rely on the global setting.
        loss_name = globals().get('loss_name')

    pred, att = model(x)
    # Same label conversion as before, but placed on the device of the
    # predictions rather than on the global `ctx`.
    y = nd.array(y.asnumpy().astype('int32')).as_in_context(pred.context)

    if loss_name in ('sce', 'l1', 'l2'):
        l = loss(pred, y)
    elif loss_name == 'wsce':
        l = loss(pred, y, class_weight, class_weight.shape[0])
    else:
        # NotImplemented is a constant, not an exception; raising it would
        # produce a TypeError.
        raise NotImplementedError('unsupported loss_name: %r' % (loss_name,))

    # Diversity penalty: discourages the attention hops from attending to the
    # same positions.
    gram = nd.batch_dot(att, nd.transpose(att, axes=(0, 2, 1)))
    diversity_penalty = gram - nd.eye(att.shape[1], ctx=att.context)
    l = l + penal_coeff * diversity_penalty.norm(axis=(1, 2))

    return pred, l

In [15]:
def _clip_global_grad_norm(model, clip, ctx):
    """Rescale all gradients of ``model`` in place so their global L2 norm is at most ``clip``."""
    grads = [p.data().grad for p in model.collect_params().values()]
    # Parameters without a gradient buffer (e.g. frozen ones) are skipped.
    grads = [g for g in grads if g is not None]

    norm = nd.array([0.0], ctx)
    for g in grads:
        norm += (g ** 2).sum()
    norm = norm.sqrt().asscalar()
    if norm > clip:
        for g in grads:
            g[:] *= clip / norm


def one_epoch(data_iter, model, loss, trainer, ctx, is_train, epoch,
              penal_coeff=0.0, clip=None, class_weight=None, loss_name='sce'):
    """Run one pass over ``data_iter`` and print loss, accuracy and weighted F1.

    In training mode each batch is recorded with autograd, back-propagated,
    optionally gradient-clipped and applied with ``trainer.step``; every third
    epoch the learning rate is multiplied by 0.9 after the summary is printed.
    In validation mode only the forward pass is run.

    Args:
        data_iter: iterable of ``(batch_x, batch_y)``.
        model, loss, class_weight, penal_coeff: forwarded to ``calculate_loss``.
        trainer: Gluon trainer; only used when ``is_train`` is True.
        ctx: device the batches are moved to.
        is_train: True for a training pass, False for validation.
        epoch: epoch number, used for logging and the learning-rate decay.
        clip: maximum global gradient norm, or None to disable clipping.
        loss_name: accepted for signature compatibility; it is not forwarded
            to ``calculate_loss`` here.

    Returns:
        None. If ``data_iter`` yields no batch, a notice is printed instead of
        the metrics (the averages would otherwise divide by zero).
    """
    loss_val = 0.
    total_pred = []
    total_true = []
    n_batch = 0

    for batch_x, batch_y in data_iter:
        batch_x = batch_x.as_in_context(ctx)
        batch_y = batch_y.as_in_context(ctx)

        if is_train:
            with autograd.record():
                batch_pred, l = calculate_loss(batch_x, batch_y, model,
                                               loss, class_weight, penal_coeff)

            # backward pass
            l.backward()

            # The gradient list is only collected when clipping is requested.
            if clip is not None:
                _clip_global_grad_norm(model, clip, ctx)

            # update params
            trainer.step(batch_x.shape[0])

        else:
            batch_pred, l = calculate_loss(batch_x, batch_y, model,
                                           loss, class_weight, penal_coeff)

        # Keep results for the metrics. Softmax is monotonic, so the argmax of
        # the raw scores already is the predicted class.
        batch_pred = nd.argmax(batch_pred, axis=1).asnumpy()
        batch_true = np.reshape(batch_y.asnumpy(), (-1, ))
        total_pred.extend(batch_pred.tolist())
        total_true.extend(batch_true.tolist())

        batch_loss = l.mean().asscalar()

        n_batch += 1
        loss_val += batch_loss

        # periodic progress report during training
        if is_train and n_batch % 400 == 0:
            print('epoch %d, batch %d, batch_train_loss %.4f, batch_train_acc %.3f' %
                  (epoch, n_batch, batch_loss, accuracy_score(batch_true, batch_pred)))

    if n_batch == 0:
        # Nothing was processed: avoid ZeroDivisionError and empty-array metrics.
        print('epoch %d: data_iter yielded no batches, nothing to report' % epoch)
        return

    # metrics over the whole pass
    F1 = f1_score(np.array(total_true), np.array(total_pred), average='weighted')
    acc = accuracy_score(np.array(total_true), np.array(total_pred))
    loss_val /= n_batch

    if is_train:
        print('epoch %d, learning_rate %.5f \n\t train_loss %.4f, acc_train %.3f, F1_train %.3f, ' %
              (epoch, trainer.learning_rate, loss_val, acc, F1))
        # decay the learning rate every third epoch
        if epoch % 3 == 0:
            trainer.set_learning_rate(trainer.learning_rate * 0.9)
    else:
        print('\t valid_loss %.4f, acc_valid %.3f, F1_valid %.3f, ' % (loss_val, acc, F1))

In [16]:
def train_valid(data_iter_train, data_iter_valid, model, loss, trainer, ctx, nepochs,
                penal_coeff=0.0, clip=None, class_weight=None, loss_name='sce'):
    """Alternate one training pass and one validation pass for ``nepochs`` epochs.

    Epochs are numbered from 1. After each epoch the elapsed wall-clock time
    and a separator line are printed. All other arguments are forwarded
    unchanged to ``one_epoch``.
    """
    # (iterator, is_train) pairs, in the order they run within an epoch
    phases = ((data_iter_train, True), (data_iter_valid, False))
    separator = "*" * 100

    for epoch in range(1, nepochs + 1):
        tic = time.time()
        for phase_iter, phase_is_train in phases:
            one_epoch(phase_iter, model, loss, trainer, ctx, phase_is_train,
                      epoch, penal_coeff, clip, class_weight, loss_name)
        toc = time.time()
        print('time %.2f sec' % (toc - tic))
        print(separator)

## Train
Now we will train this model. To handle data imbalance, we first set an estimated weight for each label.

In [17]:
from util import get_weight
# Per-class weights derived from the label file; used by the weighted loss.
weight_list = get_weight(DATA_FOLDER, LABEL_FILE)
print(weight_list)
class_weight = None
loss_name = 'wsce'    # one of 'sce', 'wsce', 'l1', 'l2'
optim = 'adam'
lr = 0.001
penal_coeff = 0.003   # coefficient of the attention diversity penalty
clip = .5             # maximum global gradient norm
nepochs = 20

trainer = gluon.Trainer(model.collect_params(), optim, {'learning_rate': lr})

if loss_name == 'sce':
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
elif loss_name == 'wsce':
    loss = WeightedSoftmaxCE()
    # the value of class_weight is obtained by counting data in advance. It can be seen as a hyperparameter.
    class_weight = nd.array(weight_list, ctx=ctx)
elif loss_name == 'l1':
    loss = gluon.loss.L1Loss()
elif loss_name == 'l2':
    loss = gluon.loss.L2Loss()
else:
    # Without this branch a typo in loss_name would leave `loss` undefined, or
    # silently reuse the one from an earlier run of this cell.
    raise ValueError('unknown loss_name: %r' % (loss_name,))

[3.0438551041853086, 2.1327178079706437, 2.6289386989030965, 1.3132616875182228, 5.865340939109901, 5.4786618184253655, 4.224614467097051, 5.801492131646614, 4.885536195487413, 4.542375563350704, 4.081807483487323, 5.189702164863072, 3.305837627261018, 4.878938941572088, 3.6444791357195254, 3.7205017060522403, 4.472592473586576, 4.093379015241236, 4.248802590468786, 3.0519071826380233, 5.122529615543214, 5.458262330069614, 4.908613508965677, 3.5998957691127362, 4.323533509839654, 5.63636542003083, 3.9929312237387604, 3.03816172414858, 3.838505771921529, 3.3860142787030894, 4.648330755306223, 4.366421424733982, 3.0369374921158596, 4.546694392249273, 4.857127296782221, 4.99100541119585, 4.843795600702158, 4.4413003436993606, 4.398435138826409, 4.204894865312225, 2.853687744438452, 5.250217403899569, 4.900510567088717, 4.997759625484381, 4.783218487698108, 5.317967644021719, 6.028575926328332, 4.697668606671044, 4.440563477981859, 6.461061349927445, 3.794757404347419, 6.040033882163419, 4

In [18]:
# train and valid
train_valid(train_dataloader, valid_dataloader, model, loss, \
            trainer, ctx, nepochs, penal_coeff=penal_coeff, \
            clip=clip, class_weight=class_weight, loss_name=loss_name)
token = str(round(time.time()))
model.export("model/model"+token, epoch=1)

  'precision', 'predicted', average, warn_for)


epoch 1, learning_rate 0.00100 
	 train_loss 16.6725, acc_train 0.063, F1_train 0.036, 
	 valid_loss 15.4185, acc_valid 0.081, F1_valid 0.016, 
time 9.86 sec
****************************************************************************************************
epoch 2, learning_rate 0.00100 
	 train_loss 16.3687, acc_train 0.064, F1_train 0.034, 
	 valid_loss 15.7555, acc_valid 0.044, F1_valid 0.016, 
time 10.00 sec
****************************************************************************************************
epoch 3, learning_rate 0.00100 
	 train_loss 16.0826, acc_train 0.073, F1_train 0.054, 
	 valid_loss 15.5848, acc_valid 0.105, F1_valid 0.041, 
time 10.02 sec
****************************************************************************************************
epoch 4, learning_rate 0.00090 
	 train_loss 15.9632, acc_train 0.069, F1_train 0.041, 
	 valid_loss 15.5278, acc_valid 0.062, F1_valid 0.012, 
time 10.02 sec
**************************************************************

In [52]:
# Load the test set. Unlike the training file it has no header row and is
# tab-separated, so its columns are labelled 0, 1, ...
TEST_DATA = 'test.csv'
# Collected predicted class ids; filled by the inference cell below.
predictions = []
test_df = pd.read_csv(DATA_FOLDER+TEST_DATA, header=None, sep='\t')
len(test_df)

200000

In [53]:
start = time.time()
# Start from an empty list so that re-running this cell does not append a
# second copy of the predictions.
predictions = []
for raw_tweet in test_df[1]:
    # Same str() cast as in tokenizer(): missing values are read as floats.
    text = raw_tweet if isinstance(raw_tweet, str) else str(raw_tweet)
    token = vocab[seg.cut(text)]
    # A list lookup yields a list, so an empty tweet gives [] rather than None.
    # Feeding a zero-length sequence to the LSTM is a likely trigger for the
    # cuDNN failure recorded below; fall back to index 0 (presumably <unk>).
    if not token:
        token = [0]
    inp = nd.array(token, ctx=ctx).reshape(1,-1)
    pred, _att = model(inp)
    pred = nd.argmax(pred, axis=-1).asscalar()
    predictions.append(int(pred))
    if len(predictions)%2000==0:
        # report the time taken by the last 2000 predictions
        ckpt = time.time()
        print('current pred len %d, time %.2fs' % (len(predictions), ckpt-start))
        start = ckpt
submit = pd.DataFrame({'Expected': predictions})
submit.to_csv('submission.csv', sep=',', index_label='ID')

current pred len 2000, time 30.26s
current pred len 4000, time 30.39s
current pred len 6000, time 30.38s
current pred len 8000, time 30.28s
current pred len 10000, time 30.28s
current pred len 12000, time 31.28s
current pred len 14000, time 31.20s
current pred len 16000, time 30.63s
current pred len 18000, time 30.59s
current pred len 20000, time 31.37s


MXNetError: [10:45:56] src/operator/./cudnn_rnn-inl.h:710: Check failed: e == CUDNN_STATUS_SUCCESS (8 vs. 0) cuDNN: CUDNN_STATUS_EXECUTION_FAILED

Stack trace returned 10 entries:
[bt] (0) /home/steven/miniconda3/envs/dl/lib/python3.7/site-packages/mxnet/libmxnet.so(+0x40ba6a) [0x7fd464d15a6a]
[bt] (1) /home/steven/miniconda3/envs/dl/lib/python3.7/site-packages/mxnet/libmxnet.so(+0x40c081) [0x7fd464d16081]
[bt] (2) /home/steven/miniconda3/envs/dl/lib/python3.7/site-packages/mxnet/libmxnet.so(+0x5644cc3) [0x7fd469f4ecc3]
[bt] (3) /home/steven/miniconda3/envs/dl/lib/python3.7/site-packages/mxnet/libmxnet.so(+0x5646ccb) [0x7fd469f50ccb]
[bt] (4) /home/steven/miniconda3/envs/dl/lib/python3.7/site-packages/mxnet/libmxnet.so(+0x2f88314) [0x7fd467892314]
[bt] (5) /home/steven/miniconda3/envs/dl/lib/python3.7/site-packages/mxnet/libmxnet.so(mxnet::imperative::PushOperator(mxnet::OpStatePtr const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, mxnet::DispatchMode)::{lambda(mxnet::RunContext, mxnet::engine::CallbackOnComplete)#3}::operator()(mxnet::RunContext, mxnet::engine::CallbackOnComplete) const+0x2f0) [0x7fd46767a7c0]
[bt] (6) /home/steven/miniconda3/envs/dl/lib/python3.7/site-packages/mxnet/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::imperative::PushOperator(mxnet::OpStatePtr const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, mxnet::DispatchMode)::{lambda(mxnet::RunContext)#4}>::_M_invoke(std::_Any_data const&, mxnet::RunContext)+0x26) [0x7fd46767ae36]
[bt] (7) /home/steven/miniconda3/envs/dl/lib/python3.7/site-packages/mxnet/libmxnet.so(+0x2cc15fd) [0x7fd4675cb5fd]
[bt] (8) /home/steven/miniconda3/envs/dl/lib/python3.7/site-packages/mxnet/libmxnet.so(+0x2cc15e7) [0x7fd4675cb5e7]
[bt] (9) /home/steven/miniconda3/envs/dl/lib/python3.7/site-packages/mxnet/libmxnet.so(+0x2cc15e7) [0x7fd4675cb5e7]

