In [14]:
import pandas as pd
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon, init
from mxnet.gluon import nn, rnn
import gluonnlp as nlp
import jieba
import multiprocessing as mp
import time
from d2l import try_gpu
import itertools
from sklearn.metrics import accuracy_score, f1_score
# fixed random number seed
np.random.seed(9102)
mx.random.seed(9102)

In [15]:
DATA_FOLDER = 'data/'
TRAIN_DATA = 'train.csv'
WORD_EMBED = 'sgns.weibo.word'
LABEL_FILE = 'train.label'
N_ROWS=50000
ctx = try_gpu()

In [16]:
train_df = pd.read_csv(DATA_FOLDER+TRAIN_DATA, sep='|')

In [17]:
dataset =[ [row[0], row[1]] for _, row in train_df.iterrows()]
train_dataset, valid_dataset = nlp.data.train_valid_split(dataset, .2)
len(train_dataset), len(valid_dataset)

(690531, 172633)

In [20]:
def tokenizer(x):
    tweet, label = x
    if type(tweet) != str:
        tweet = str(tweet)
    word_list = jieba.lcut(tweet)
    return word_list, label

def get_length(x):
    return float(len(x[0]))

def to_word_list(dataset):
    start = time.time()
    with mp.Pool() as pool:
        # Each sample is processed in an asynchronous manner.
        dataset = gluon.data.ArrayDataset(pool.map(tokenizer, dataset))
        lengths = gluon.data.ArrayDataset(pool.map(get_length, dataset))
    end = time.time()

    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset, lengths

train_word_list, train_word_lengths = to_word_list(train_dataset)
valid_word_list, valid_word_lengths = to_word_list(valid_dataset)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary .

Done! Tokenizing Time=19.46s, #Sentences=690531


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary .

Done! Tokenizing Time=6.19s, #Sentences=172633


In [21]:
train_seqs = [sample[0] for sample in train_word_list]
counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(train_seqs)))

vocab = nlp.Vocab(counter, max_size=40000)

# load customed pre-trained embedding
embedding_weights = nlp.embedding.TokenEmbedding.from_file(file_path=DATA_FOLDER+WORD_EMBED)
vocab.set_embedding(embedding_weights)
print(vocab)

def token_to_idx(x):
    return vocab[x[0]], x[1]

# A token index or a list of token indices is returned according to the vocabulary.
with mp.Pool() as pool:
    train_dataset = pool.map(token_to_idx, train_word_list)
    valid_dataset = pool.map(token_to_idx, valid_word_list)

  .format(line_num, pretrained_file_path))


Vocab(size=40004, unk="<unk>", reserved="['<pad>', '<bos>', '<eos>']")


In [22]:
batch_size = 128
bucket_num = 10
bucket_ratio = 0.5


def get_dataloader():
    # Construct the DataLoader Pad data, stack label and lengths
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0), \
                                          nlp.data.batchify.Stack())

    # in this example, we use a FixedBucketSampler,
    # which assigns each data sample to a fixed bucket based on its length.
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        train_word_lengths,
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=True)
    print(batch_sampler.stats())

    # train_dataloader
    train_dataloader = gluon.data.DataLoader(
        dataset=train_dataset,
        batch_sampler=batch_sampler,
        batchify_fn=batchify_fn)
    # valid_dataloader
    valid_dataloader = gluon.data.DataLoader(
        dataset=valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        batchify_fn=batchify_fn)
    return train_dataloader, valid_dataloader

train_dataloader, valid_dataloader = get_dataloader()

FixedBucketSampler:
  sample_num=690531, batch_num=1649
  key=[27, 47, 67, 87, 107, 127, 147, 167, 187, 207]
  cnt=[592849, 59408, 23322, 11861, 2847, 166, 47, 17, 7, 7]
  batch_size=[490, 281, 197, 152, 128, 128, 128, 128, 128, 128]


In [23]:
for tweet, label in train_dataloader:
    print(tweet, label)
    break


[[9.0950e+03 9.1750e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.2007e+04 6.0000e+00 2.0830e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [7.8350e+03 1.8485e+04 6.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [0.0000e+00 1.0000e+01 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [8.0000e+00 2.7000e+01 1.2264e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [7.3900e+02 5.0000e+00 4.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]]
<NDArray 490x27 @cpu_shared(0)> 
[10  3 68 25 50  6 66 30 59 43 30 49 52 18 35 35 38 15 63 19 19 27 67 35
 28 61 31 33 65 63 62 31 60 12 34  8 42 57 35 23 52 30 31 34 16 12  3 42
 23 54 34 31 23 34 23 10 45 44  3 12 23 34 40 13 23 27  3  3 30 20 62  1
  3 62 43 35 32 29  6 14 44 21 14 34 45  3 45 52 30 54 34 10 27  9 44 19
 35  6 14 23  9 35 21  3  3 14 16 46 48 34 30 35 62  7 16  3  3 35 18 50
  1 41 35 35 30 55 27  1 34 52 42  3 24  3 14 35 30 65 39 29 18 31 41 49
 56 71 17 27 35 15 28  6 39 30 23 35  3 50 18 29 12 62 35 47 14 12 41 53
 10 29 29 1

## Model contruction
Self attention layer, weighted cross entropy, and whole model

In [24]:
# custom attention layer
# in this class, we want to implement the operation:
# softmax(W_2 * tanh(W_1 * H))
# where H is the word embedding of the whole sentence, of shape (num_of_word, embed_size)
class SelfAttention(nn.HybridBlock):
    def __init__(self, att_unit, att_hops, **kwargs):
        super(SelfAttention, self).__init__(**kwargs)
        with self.name_scope():
            # this layer is tanh(w_1 * H), the att_unit corresponds to d_a in the essay
            self.ut_dense = nn.Dense(att_unit, activation='tanh', flatten=False)
            # this layer implements the multiple hops
            self.et_dense = nn.Dense(att_hops, activation=None, flatten=False)

    def hybrid_forward(self, F, x): # F is the backend which implements the tensor operation
        # x shape: [batch_size, seq_len, embedding_width]
        # ut shape: [batch_size, seq_len, att_unit]
        ut = self.ut_dense(x) # batch_size * seq_len [* embed_size * embed_size *] att_unit
        # et shape: [batch_size, seq_len, att_hops]
        et = self.et_dense(ut)# batch_size * seq_len [* att_unit * att_unit *] att_hops

        # att shape: [batch_size,  att_hops, seq_len]
        # softmax is performed along the seq_len dimension
        att = F.softmax(F.transpose(et, axes=(0, 2, 1)), axis=-1)
        # output shape [batch_size, att_hops, embedding_width]
        output = F.batch_dot(att, x)
        # output is the weighted matrix representation of the matrix
        # att is the weighted vector we use as attention
        return output, att
    
# d_a = 20, hops = 5
print(SelfAttention(20, 5))

SelfAttention(
  (ut_dense): Dense(None -> 20, Activation(tanh))
  (et_dense): Dense(None -> 5, linear)
)


In [25]:
class WeightedSoftmaxCE(nn.HybridBlock):
    def __init__(self, sparse_label=True, from_logits=False,  **kwargs):
        super(WeightedSoftmaxCE, self).__init__(**kwargs)
        with self.name_scope():
            self.sparse_label = sparse_label
            self.from_logits = from_logits

    def hybrid_forward(self, F, pred, label, class_weight, depth=None):
        if self.sparse_label:
            label = F.reshape(label, shape=(-1, ))
            label = F.one_hot(label, depth)
        if not self.from_logits:
            pred = F.log_softmax(pred, -1)

        weight_label = F.broadcast_mul(label, class_weight)
        loss = -F.sum(pred * weight_label, axis=-1)

        # return F.mean(loss, axis=0, exclude=True)
        return loss

In [26]:
class SelfAttentiveBiLSTM(nn.HybridBlock):
    def __init__(self, vocab_len, embsize, nhidden, nlayers, natt_unit, natt_hops, \
                 nfc, nclass, # these two params are not used currrently
                 drop_prob, pool_way, prune_p=None, prune_q=None, **kwargs):
        super(SelfAttentiveBiLSTM, self).__init__(**kwargs)
        with self.name_scope():
            # now we switch back to shared layers
            self.embedding_layer = nn.Embedding(vocab_len, embsize)
            
            self.bilstm = rnn.LSTM(nhidden, num_layers=nlayers, dropout=drop_prob, \
                                        bidirectional=True)
            
            self.att_encoder = SelfAttention(natt_unit, natt_hops)
            self.dense = nn.Dense(nfc, activation='tanh')
            # this layer is used to output the final class
            self.output_layer = nn.Dense(nclass)
            
            self.dense_p, self.dense_q = None, None
            if all([prune_p, prune_q]):
                self.dense_p = nn.Dense(prune_p, activation='tanh', flatten=False)
                self.dense_q = nn.Dense(prune_q, activation='tanh', flatten=False)

            self.drop_prob = drop_prob
            self.pool_way = pool_way

    def hybrid_forward(self, F, inp):
        # inp_embed size: [batch, seq_len, embed_size]
        inp_embed = self.embedding_layer(inp)
        # rnn requires the first dimension to be the time steps
        h_output = self.bilstm(F.transpose(inp_embed, axes=(1, 0, 2)))
        # att_output: [batch, att_hops, emsize]
        output, att = self.att_encoder(F.transpose(h_output, axes=(1, 0, 2)))
        '''
        FIXME: now this code will only work with flatten
        '''
        dense_input = None
        if self.pool_way == 'flatten':
            dense_input = F.Dropout(F.flatten(output), self.drop_prob)
        else:
            raise NotImplemented
        '''
        elif self.pool_way == 'mean':
            dense_input = F.Dropout(F.mean(att_output, axis=1), self.drop_prob)
        elif self.pool_way == 'prune' and all([self.dense_p, self.dense_q]):
            # p_section: [batch, att_hops, prune_p]
            p_section = self.dense_p(att_output)
            # q_section: [batch, emsize, prune_q]
            q_section = self.dense_q(F.transpose(att_output, axes=(0, 2, 1)))
            dense_input = F.Dropout(F.concat(F.flatten(p_section), F.flatten(q_section), dim=-1), self.drop_prob)
        '''
        dense_out = self.dense(dense_input)
        output = self.output_layer(F.Dropout(dense_out, self.drop_prob))

        return output, att

In [27]:
vocab_len = len(vocab)
emsize = 300   # word embedding size
nhidden = 300    # lstm hidden_dim
nlayers = 3     # lstm layers
natt_unit = 300     # the hidden_units of attention layer
natt_hops = 10    # the channels of attention
nfc = 256  # last dense layer size
nclass = 72 # we have 72 emoji in total

drop_prob = 0
pool_way = 'flatten'    # # The way to handle M
prune_p = None
prune_q = None

ctx = try_gpu()

try:
    assert(False)
    model = gluon.nn.SymbolBlock.imports("model/model-symbol.json", ['data'], \
                                         "model/model-0001.params", ctx=ctx)
    print('use saved model params to start')
except:
    model = SelfAttentiveBiLSTM(vocab_len, emsize, nhidden, nlayers,
                            natt_unit, natt_hops, nfc, nclass,
                            drop_prob, pool_way, prune_p, prune_q)

    print('initialize a new model')
    model.initialize(init=init.Xavier(), ctx=ctx)
    model.hybridize()

    # Attach a pre-trained glove word vector to the embedding layer
    model.embedding_layer.weight.set_data(vocab.embedding.idx_to_vec)
    # fixed the embedding layer
    model.embedding_layer.collect_params().setattr('grad_req', 'null')

print(model)

initialize a new model
SelfAttentiveBiLSTM(
  (embedding_layer): Embedding(40004 -> 300, float32)
  (bilstm): LSTM(None -> 300, TNC, num_layers=3, bidirectional)
  (att_encoder): SelfAttention(
    (ut_dense): Dense(None -> 300, Activation(tanh))
    (et_dense): Dense(None -> 10, linear)
  )
  (dense): Dense(None -> 256, Activation(tanh))
  (output_layer): Dense(None -> 72, linear)
)


In [28]:
vocab.embedding.idx_to_vec[vocab.embedding.token_to_idx['i']]


[ 0.021406  0.399445 -0.150773  0.416859 -0.173093 -0.460412 -0.09578
 -0.452269 -0.060334  0.178076  0.129666 -0.187627 -0.268714  0.281752
  0.672784  0.079613 -0.504229  0.056391 -0.041292 -0.359443  0.09827
 -0.278529 -0.140741  0.193164  0.061355 -0.310622 -0.198531 -0.223974
 -0.082908  0.320169 -0.182967 -0.212077  0.134077 -0.236404  0.189204
  0.0568    0.298494  0.59866  -0.32057  -0.242243 -0.044432  0.217198
  0.202773 -0.211469 -0.173185 -0.098174  0.112375  0.270286 -0.148778
 -0.42995  -0.418146  0.294171  0.087337  0.670113 -0.030841 -0.420761
 -0.209422 -0.622883 -0.143712 -0.551885 -0.10698   0.017491  0.247325
  0.006137 -0.380092 -0.164557 -0.417308 -0.579596  0.321489  0.255118
 -0.123747  0.029786  0.354533 -0.066662 -0.078275  0.371021 -0.162618
 -0.203096 -0.072074 -0.597944  0.238788 -0.080151  0.420655  0.035114
 -0.431164 -0.094002  0.175306  0.183954 -0.381618  0.202283 -0.009674
  0.05437  -0.123119  0.073714 -0.183245  0.02759   0.364248  0.242034
  0.409

## Training helpers
Calculate loss, one epoch computation and top function for train and valid

In [29]:
def calculate_loss(x, y, model, loss, class_weight, penal_coeff):
    pred, att = model(x)
    y = nd.array(y.asnumpy().astype('int32')).as_in_context(ctx)
    if loss_name in ['sce', 'l1', 'l2']:
        l = loss(pred, y)
    elif loss_name == 'wsce':
        l = loss(pred, y, class_weight, class_weight.shape[0])
    else:
        raise NotImplemented
    # penalty, now we have two att's
    diversity_penalty = nd.batch_dot(att, nd.transpose(att, axes=(0, 2, 1))) - \
                        nd.eye(att.shape[1], ctx=att.context)
    l = l + penal_coeff * diversity_penalty.norm(axis=(1, 2))

    return pred, l

In [30]:
def one_epoch(data_iter, model, loss, trainer, ctx, is_train, epoch,
              penal_coeff=0.0, clip=None, class_weight=None, loss_name='sce'):

    loss_val = 0.
    total_pred = []
    total_true = []
    n_batch = 0

    for batch_x, batch_y in data_iter:
        batch_x = batch_x.as_in_context(ctx)
        batch_y = batch_y.as_in_context(ctx)

        if is_train:
            with autograd.record():
                batch_pred, l = calculate_loss(batch_x, batch_y, model, \
                                               loss, class_weight, penal_coeff)

            # backward calculate
            l.backward()

            # clip gradient
            clip_params = [p.data() for p in model.collect_params().values()]
            if clip is not None:
                norm = nd.array([0.0], ctx)
                for param in clip_params:
                    if param.grad is not None:
                        norm += (param.grad ** 2).sum()
                norm = norm.sqrt().asscalar()
                if norm > clip:
                    for param in clip_params:
                        if param.grad is not None:
                            param.grad[:] *= clip / norm

            # update parmas
            trainer.step(batch_x.shape[0])

        else:
            batch_pred, l = calculate_loss(batch_x, batch_y, model, \
                                           loss, class_weight, penal_coeff)

        # keep result for metric
        batch_pred = nd.argmax(nd.softmax(batch_pred, axis=1), axis=1).asnumpy()
        batch_true = np.reshape(batch_y.asnumpy(), (-1, ))
        total_pred.extend(batch_pred.tolist())
        total_true.extend(batch_true.tolist())
        
        batch_loss = l.mean().asscalar()

        n_batch += 1
        loss_val += batch_loss

        # check the result of traing phase
        if is_train and n_batch % 400 == 0:
            print('epoch %d, batch %d, batch_train_loss %.4f, batch_train_acc %.3f' %
                  (epoch, n_batch, batch_loss, accuracy_score(batch_true, batch_pred)))

    # metric
    F1 = f1_score(np.array(total_true), np.array(total_pred), average='weighted')
    acc = accuracy_score(np.array(total_true), np.array(total_pred))
    loss_val /= n_batch

    if is_train:
        print('epoch %d, learning_rate %.5f \n\t train_loss %.4f, acc_train %.3f, F1_train %.3f, ' %
              (epoch, trainer.learning_rate, loss_val, acc, F1))
        # declay lr
        if epoch % 3 == 0:
            trainer.set_learning_rate(trainer.learning_rate * 0.9)
    else:
        print('\t valid_loss %.4f, acc_valid %.3f, F1_valid %.3f, ' % (loss_val, acc, F1))

In [31]:
def train_valid(data_iter_train, data_iter_valid, model, loss, trainer, ctx, nepochs,
                penal_coeff=0.0, clip=None, class_weight=None, loss_name='sce'):

    for epoch in range(1, nepochs+1):
        start = time.time()
        # train
        is_train = True
        one_epoch(data_iter_train, model, loss, trainer, ctx, is_train,
                  epoch, penal_coeff, clip, class_weight, loss_name)

        # valid
        is_train = False
        one_epoch(data_iter_valid, model, loss, trainer, ctx, is_train,
                  epoch, penal_coeff, clip, class_weight, loss_name)
        end = time.time()
        print('time %.2f sec' % (end-start))
        print("*"*100)

## Train
Now we will train this model. To handle data inbalance, we first set an estimated weight of the labels.

In [32]:
from util import get_weight
weight_list = get_weight(DATA_FOLDER, LABEL_FILE, 10)
class_weight = None
loss_name = 'wsce'
optim = 'adam'
lr = 0.001
penal_coeff = 0.003
clip = .5
nepochs = 5

trainer = gluon.Trainer(model.collect_params(), optim, {'learning_rate': lr})

if loss_name == 'sce':
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
elif loss_name == 'wsce':
    loss = WeightedSoftmaxCE()
    # the value of class_weight is obtained by counting data in advance. It can be seen as a hyperparameter.
    class_weight = nd.array(weight_list, ctx=ctx)
elif loss_name == 'l1':
    loss = gluon.loss.L1Loss()
elif loss_name == 'l2':
    loss = gluon.loss.L2Loss()

In [None]:
# train and valid
train_valid(train_dataloader, valid_dataloader, model, loss, \
            trainer, ctx, nepochs, penal_coeff=penal_coeff, \
            clip=clip, class_weight=class_weight, loss_name=loss_name)

epoch 1, batch 400, batch_train_loss 65.4778, batch_train_acc 0.096
epoch 1, batch 800, batch_train_loss 67.1403, batch_train_acc 0.092
epoch 1, batch 1200, batch_train_loss 67.1463, batch_train_acc 0.082
epoch 1, batch 1600, batch_train_loss 64.2306, batch_train_acc 0.143


  'precision', 'predicted', average, warn_for)


epoch 1, learning_rate 0.00100 
	 train_loss 66.8441, acc_train 0.102, F1_train 0.050, 
	 valid_loss 64.8300, acc_valid 0.118, F1_valid 0.055, 
time 413.33 sec
****************************************************************************************************


In [93]:
token = str(round(time.time()))
model.export("model/model"+token, epoch=1)

In [96]:
TEST_DATA = 'test.csv'
predictions = []
test_df = pd.read_csv(DATA_FOLDER+TEST_DATA, header=None, sep='\t')
len(test_df)

200000

In [97]:
start = time.time()
for _, tweet in test_df.iterrows():
    token = vocab[jieba.lcut(tweet[1])]
    inp = nd.array(token, ctx=ctx).reshape(1,-1)
    pred, _ = model(inp)
    pred = nd.argmax(pred, axis=-1).asscalar()
    predictions.append(int(pred))
    if len(predictions)%2000==0:
        ckpt = time.time()
        print('current pred len %d, time %.2fs' % (len(predictions), ckpt-start))
        start = ckpt
submit = pd.DataFrame({'Expected': predictions})
submit.to_csv('submission.csv', sep=',', index_label='ID')

current pred len 2000, time 13.13
current pred len 4000, time 13.16
current pred len 6000, time 13.20
current pred len 8000, time 13.00
current pred len 10000, time 13.12
current pred len 12000, time 13.20
current pred len 14000, time 13.19
current pred len 16000, time 12.84
current pred len 18000, time 13.03
current pred len 20000, time 13.09
current pred len 22000, time 13.18
current pred len 24000, time 13.10
current pred len 26000, time 13.20
current pred len 28000, time 13.34
current pred len 30000, time 13.49
current pred len 32000, time 13.19
current pred len 34000, time 13.31
current pred len 36000, time 13.19
current pred len 38000, time 13.18
current pred len 40000, time 13.13
current pred len 42000, time 13.13
current pred len 44000, time 13.06
current pred len 46000, time 13.21
current pred len 48000, time 13.10
current pred len 50000, time 13.06
current pred len 52000, time 13.01
current pred len 54000, time 13.26
current pred len 56000, time 13.22
current pred len 58000, 