In [1]:
import itertools
import multiprocessing as mp
import os
import re
import time
import warnings

import d2l
import gluonnlp as nlp
import jieba
import mxnet as mx
import numpy as np
import pandas as pd
import pkuseg
from d2l import try_gpu
from mxnet import nd, autograd, gluon, init
from mxnet.gluon import nn, rnn
from sklearn.metrics import accuracy_score, f1_score
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Fix the NumPy and MXNet random seeds so repeated runs draw the same random numbers.
np.random.seed(2333)
mx.random.seed(2333)

In [2]:
# File locations. DATA_FOLDER keeps its trailing slash because paths are built
# by plain string concatenation (DATA_FOLDER + <file name>).
DATA_FOLDER = 'data/'
TRAIN_DATA = 'train.csv'
WORD_EMBED = 'sgns.weibo.bigram-char'
LABEL_FILE = 'train.label'
# NOTE(review): N_ROWS is not referenced anywhere else in the visible notebook.
N_ROWS=1000
# Select the device with the same helper the model-setup cell uses, instead of
# hard-coding mx.gpu(0), so both cells always agree on the context.
ctx = try_gpu()
# NOTE(review): `seg` is not referenced anywhere else in the visible notebook
# (tokenization is done with jieba); kept in case other code relies on it.
seg = pkuseg.pkuseg(model_name='web')

In [3]:
# Read the '|'-separated training file and shuffle all of its rows in one chained expression.
train_df = pd.read_csv(DATA_FOLDER + TRAIN_DATA, sep='|').sample(frac=1)
train_df.head()

Unnamed: 0,tweet,label
302867,就喜欢你臭吧拉几的帅,13
293170,现在还能买到鎏金宝鉴么？,39
758944,笑尿你：病房按钮乱按的爆笑后果病房按钮乱按的爆笑后果 06集 12 lol:-) 第一季谂,34
10078,美图手机,3
488768,哈～哈哈～哈哈哈～啊哈哈哈哈哈～,45


In [4]:
# Build the [first column, second column] pairs in one vectorized step.
# DataFrame.iterrows() creates a Series per row, which is slow for a frame of
# this size, and positional `row[0]` lookups on a labelled Series are deprecated.
dataset = train_df.iloc[:, :2].values.tolist()
# Split off a validation part using gluonnlp's default settings.
train_dataset, valid_dataset = nlp.data.train_valid_split(dataset)
len(train_dataset), len(valid_dataset)

(820005, 43159)

In [5]:
def tokenizer(x):
    """Segment one ``(tweet, label)`` sample into a list of words with jieba.

    Parameters
    ----------
    x : tuple
        ``(tweet, label)``. ``tweet`` is normally a ``str``; other values
        (for example numbers parsed by pandas) are converted with ``str()``.

    Returns
    -------
    tuple
        ``(word_list, label)``. ``word_list`` is never empty: a missing tweet
        (``None`` or NaN) or a tweet that produces no tokens yields
        ``['<unk>']``.
    """
    tweet, label = x
    # A missing tweet arrives from pandas as float NaN. Treat it as "no text"
    # instead of tokenizing the literal string "nan" (and do not print from
    # worker processes). `tweet != tweet` is True only for NaN.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ['<unk>'], label
    if not isinstance(tweet, str):
        tweet = str(tweet)
    word_list = jieba.lcut(tweet)
    if len(word_list) == 0:
        word_list = ['<unk>']
    return word_list, label

def get_length(x):
    """Return the number of tokens of a ``(tokens, label)`` sample as a float."""
    tokens = x[0]
    n_tokens = len(tokens)
    return float(n_tokens)

def to_word_list(dataset):
    """Tokenize every sample of ``dataset`` in parallel and record its length.

    Parameters
    ----------
    dataset : sequence of ``(tweet, label)`` samples
        Each sample is passed to ``tokenizer``.

    Returns
    -------
    tuple
        ``(dataset, lengths)``: an ``ArrayDataset`` of ``(word_list, label)``
        samples and an ``ArrayDataset`` with the token count of each sample
        (as returned by ``get_length``), in the same order.
    """
    start = time.time()
    with mp.Pool() as pool:
        # pool.map blocks until all samples are tokenized and keeps input order.
        dataset = gluon.data.ArrayDataset(pool.map(tokenizer, dataset))
    # Counting tokens is trivial work: doing it in-process avoids shipping
    # every token list to a worker and back just to call len() on it.
    lengths = gluon.data.ArrayDataset([get_length(sample) for sample in dataset])
    end = time.time()

    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset, lengths

# Tokenize both splits; each call returns the tokenized samples together with
# the per-sample token counts.
train_word_list, train_word_lengths = to_word_list(train_dataset)
valid_word_list, valid_word_lengths = to_word_list(valid_dataset)

Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.420 seconds.
Prefix dict has been built succesfully.
Dumping model to file cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.514 seconds.
Prefix dict has been built succesfully.
Dumping model to file cache /tmp/jie

nan
Done! Tokenizing Time=24.44s, #Sentences=820005


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cac

nan


Loading model cost 1.927 seconds.
Prefix dict has been built succesfully.


Done! Tokenizing Time=3.22s, #Sentences=43159


In [6]:
# Token sequences of the training split only; the labels are dropped here.
train_seqs = [tokens for tokens, _ in train_word_list]

# Count token frequencies over the flattened training sequences and keep at
# most 200000 tokens in the vocabulary.
counter = nlp.data.count_tokens(
    list(itertools.chain.from_iterable(train_seqs)))
vocab = nlp.Vocab(counter, max_size=200000)

# Load the custom pre-trained embedding file and attach it to the vocabulary.
embedding_weights = nlp.embedding.TokenEmbedding.from_file(
    file_path=DATA_FOLDER + WORD_EMBED)
vocab.set_embedding(embedding_weights)
print(vocab)

Vocab(size=200004, unk="<unk>", reserved="['<pad>', '<bos>', '<eos>']")


In [7]:
def token_to_idx(x):
    """Map a ``(tokens, label)`` sample to ``(token indices, label)``.

    The lookup uses the notebook-level ``vocab``; the label is passed through
    unchanged.
    """
    tokens, label = x[0], x[1]
    return vocab[tokens], label

# Convert the tokens of both splits to vocabulary indices in worker processes.
# Both maps run while the pool is still open; the result order follows the inputs.
with mp.Pool() as pool:
    train_dataset, valid_dataset = [
        pool.map(token_to_idx, word_list)
        for word_list in (train_word_list, valid_word_list)
    ]

In [8]:
# Mini-batch size, plus the values passed as `num_buckets` and `ratio` to the
# FixedBucketSampler created in get_dataloader().
batch_size, bucket_num, bucket_ratio = 1024, 20, 0.1


def get_dataloader():
    """Create the training and validation DataLoaders.

    Reads the notebook-level ``train_dataset``, ``valid_dataset``,
    ``train_word_lengths``, ``batch_size``, ``bucket_num`` and ``bucket_ratio``.
    Training batches are drawn by a shuffled ``FixedBucketSampler`` built from
    the training lengths; validation batches are taken in order with a fixed
    batch size. The sampler statistics are printed as a side effect.

    Returns
    -------
    tuple
        ``(train_dataloader, valid_dataloader)``.
    """
    # Batchify: pad the token-index sequences along axis 0, stack the labels.
    pad_tokens = nlp.data.batchify.Pad(axis=0)
    stack_labels = nlp.data.batchify.Stack()
    batchify_fn = nlp.data.batchify.Tuple(pad_tokens, stack_labels)

    # Assign every training sample to a bucket according to its length.
    sampler_options = dict(
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=True,
    )
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        train_word_lengths, **sampler_options)
    print(batch_sampler.stats())

    train_dataloader = gluon.data.DataLoader(
        dataset=train_dataset,
        batch_sampler=batch_sampler,
        batchify_fn=batchify_fn,
    )
    valid_dataloader = gluon.data.DataLoader(
        dataset=valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        batchify_fn=batchify_fn,
    )
    return train_dataloader, valid_dataloader

# Build both loaders once; this also prints the bucket statistics of the training sampler.
train_dataloader, valid_dataloader = get_dataloader()

FixedBucketSampler:
  sample_num=820005, batch_num=632
  key=[15, 26, 37, 48, 59, 70, 81, 92, 103, 114, 125, 136, 147, 158, 169, 180, 191, 202, 213, 224]
  cnt=[573412, 124268, 51415, 27570, 17000, 11311, 8113, 5220, 1308, 205, 83, 48, 15, 11, 7, 7, 6, 2, 2, 2]
  batch_size=[1529, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]


In [9]:
# Sanity check: print only the first training batch (token indices and labels), then stop.
for tweet, label in train_dataloader:
    print(tweet, label)
    break


[[2.8000e+01 4.7000e+01 8.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [7.0900e+02 4.2430e+03 1.9200e+03 ... 3.3310e+03 4.5660e+03 3.6580e+03]
 [7.4600e+02 2.5000e+01 1.4900e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [1.2120e+03 5.3500e+02 9.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.4000e+01 8.5540e+03 9.0000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [3.3770e+03 5.0000e+00 1.0984e+04 ... 1.2000e+02 2.3700e+02 1.1000e+01]]
<NDArray 1529x15 @cpu_shared(0)> 
[ 3 31  3 ... 35 31 10]
<NDArray 1529 @cpu_shared(0)>


## Model construction
TextCNN model, weighted softmax cross-entropy loss, and the training/validation loop

In [10]:
class TextCNN(nn.Block):
    """Text classifier built from parallel 1-D convolutions over word embeddings.

    Two embedding tables of identical shape are looked up for every token and
    concatenated along the feature axis. The class itself treats both tables
    the same; freezing ``constant_embedding`` is left to the caller. Each
    convolution is followed by global max pooling over time, and the pooled
    features of all convolutions are concatenated, passed through dropout and
    projected to ``nclass`` outputs.

    Parameters
    ----------
    vocab_len : int
        Number of rows of both embedding tables.
    embed_size : int
        Embedding dimension of each table.
    kernel_sizes : sequence of int
        Kernel width of each convolution; must be non-empty.
    num_channels : sequence of int
        Output channels of each convolution; same length as ``kernel_sizes``.
    dropout : float
        Dropout rate applied before the output layer.
    nclass : int
        Number of output classes.

    Raises
    ------
    ValueError
        If ``kernel_sizes`` is empty or its length differs from
        ``num_channels`` (``zip`` would otherwise silently drop the extras).
    """
    def __init__(self, vocab_len, embed_size, kernel_sizes, num_channels, \
                 dropout, nclass, **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        if len(kernel_sizes) == 0 or len(kernel_sizes) != len(num_channels):
            raise ValueError(
                'kernel_sizes and num_channels must be non-empty and of equal '
                'length, got %d and %d' % (len(kernel_sizes), len(num_channels)))
        # Shortest sequence that every convolution can slide over at least once.
        self._min_len = max(kernel_sizes)
        self.embedding = nn.Embedding(vocab_len, embed_size)

        self.constant_embedding = nn.Embedding(vocab_len, embed_size)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Dense(nclass)
        self.pool = nn.GlobalMaxPool1D()
        self.convs = nn.Sequential()
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.add(nn.Conv1D(c, k, activation='relu'))

    def forward(self, inputs):
        """Compute class scores for a batch of token-index sequences.

        ``inputs`` is indexed as ``(batch, sequence)``. Sequences shorter than
        the widest kernel are right-padded with index 0 (the value the
        inference cell of this notebook also pads with) so that no convolution
        receives an input narrower than its kernel.
        """
        deficit = self._min_len - inputs.shape[1]
        if deficit > 0:
            filler = nd.zeros((inputs.shape[0], deficit),
                              ctx=inputs.context, dtype=inputs.dtype)
            inputs = nd.concat(inputs, filler, dim=1)

        embeddings = nd.concat(
            self.embedding(inputs), self.constant_embedding(inputs), dim=2)
        # Move the feature axis in front of the sequence axis for Conv1D.
        embeddings = embeddings.transpose((0, 2, 1))

        # One pooled feature vector per convolution, concatenated per sample.
        encoding = nd.concat(*[nd.flatten(
            self.pool(conv(embeddings))) for conv in self.convs], dim=1)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [19]:
# Hyper-parameters that are passed to TextCNN below.
vocab_len = len(vocab)
emsize = 300   # word embedding size
nclass = 72 # we have 72 emoji in total
drop_prob = 0.2
kernel_sizes = [2, 3, 4, 5]
nums_channels = [100, 100, 100, 100]

# NOTE(review): the settings below are not passed to TextCNN in this cell; they
# look like leftovers from an LSTM/attention model. Kept in case other code reads them.
nhidden = 400    # lstm hidden_dim
nlayers = 4     # lstm layers
natt_unit = 400     # the hidden_units of attention layer
natt_hops = 20    # the channels of attention
nfc = 256  # last dense layer size
pool_way = 'flatten'    # # The way to handle M
prune_p = None
prune_q = None

ctx = try_gpu()

model = TextCNN(vocab_len, emsize, kernel_sizes, nums_channels, drop_prob, nclass)
model.initialize(init.Xavier(), ctx=ctx)

print(model)
# Copy the vocabulary's embedding vectors into both tables, then switch off
# gradients for the "constant" copy.
model.embedding.weight.set_data(vocab.embedding.idx_to_vec)
model.constant_embedding.weight.set_data(vocab.embedding.idx_to_vec)
model.constant_embedding.collect_params().setattr('grad_req', 'null')

TextCNN(
  (embedding): Embedding(200004 -> 300, float32)
  (constant_embedding): Embedding(200004 -> 300, float32)
  (dropout): Dropout(p = 0.2, axes=())
  (decoder): Dense(None -> 72, linear)
  (pool): GlobalMaxPool1D(size=(1,), stride=(1,), padding=(0,), ceil_mode=True)
  (convs): Sequential(
    (0): Conv1D(None -> 100, kernel_size=(2,), stride=(1,), Activation(relu))
    (1): Conv1D(None -> 100, kernel_size=(3,), stride=(1,), Activation(relu))
    (2): Conv1D(None -> 100, kernel_size=(4,), stride=(1,), Activation(relu))
    (3): Conv1D(None -> 100, kernel_size=(5,), stride=(1,), Activation(relu))
  )
)


In [20]:
# Smoke test: run one made-up sequence of six token ids (shape 1 x 6) through the model.
tmp = nd.array([10, 20, 30, 40, 50, 60], ctx=ctx).reshape(1, -1)
model(tmp)


[[-1.4535143   0.4316371   0.03780118 -0.43824768  0.41816843 -1.0937376
  -0.71099544  0.16095117 -0.2091109  -0.42367667  1.634491    0.38694513
   1.4579254   1.0658729  -0.39729273  1.4154611   0.0408833   0.9625461
   0.91877794  0.43661913 -0.19778824 -0.8435166  -1.9327714   0.90318596
   0.75077903  0.7486939   1.0200762   1.2349734   2.3193169   1.1560069
   1.1089306   0.6726989   0.659776   -0.91031754  0.99988955 -0.06935319
  -1.7049602   0.74020696  1.9678237  -1.4590362  -1.4255147   1.3767172
  -0.9265099   0.81759036 -0.7621071   0.97436655 -1.5960187  -0.15850382
  -0.19329947 -0.36371315  0.3686909   0.14982761 -0.6566879  -0.92307013
   0.7295457   0.3880864   1.8115426  -0.93350893 -0.9633518   1.2313509
  -2.288724    0.3680066  -1.5167515   0.25544786  0.13798247  0.13479555
  -0.78266764  0.54013383 -0.19405475 -0.70071375  1.7750703   1.1349797 ]]
<NDArray 1x72 @gpu(0)>

In [21]:
class WeightedSoftmaxCE(nn.HybridBlock):
    """Softmax cross-entropy in which every class is scaled by its own weight.

    Parameters
    ----------
    sparse_label : bool
        When True, ``label`` holds class indices and is one-hot encoded with
        ``depth`` classes; otherwise it is used as given.
    from_logits : bool
        When True, ``pred`` is used as given; otherwise ``log_softmax`` is
        applied to it over the last axis.
    """
    def __init__(self, sparse_label=True, from_logits=False,  **kwargs):
        super(WeightedSoftmaxCE, self).__init__(**kwargs)
        with self.name_scope():
            self.from_logits = from_logits
            self.sparse_label = sparse_label

    def hybrid_forward(self, F, pred, label, class_weight, depth=None):
        """Return the per-sample weighted loss (not reduced over the batch)."""
        log_prob = pred if self.from_logits else F.log_softmax(pred, -1)

        if self.sparse_label:
            flat_label = F.reshape(label, shape=(-1, ))
            label = F.one_hot(flat_label, depth)

        # Scale each class's target entry by that class's weight.
        weighted_target = F.broadcast_mul(label, class_weight)
        return -F.sum(log_prob * weighted_target, axis=-1)

In [22]:
def calculate_loss(x, y, model, loss, class_weight):
    """Run the model on ``x`` and evaluate ``loss`` against the labels ``y``.

    The loss variant is chosen by the notebook-level ``loss_name`` (it is not a
    parameter of this function), and the labels are moved to the notebook-level
    ``ctx``.

    Parameters
    ----------
    x : NDArray
        Model input batch.
    y : NDArray
        Labels; round-tripped through NumPy with an int32 cast before use.
    model : callable
        Called as ``model(x)`` to obtain the predictions.
    loss : callable
        Called as ``loss(pred, y)`` for ``'sce'`` and as
        ``loss(pred, y, class_weight, class_weight.shape[0])`` for ``'wsce'``.
    class_weight : NDArray or None
        Per-class weights; only used for ``'wsce'``.

    Returns
    -------
    tuple
        ``(pred, l)``: the model output and the loss value.

    Raises
    ------
    NotImplementedError
        If ``loss_name`` is neither ``'sce'`` nor ``'wsce'``.
    """
    pred = model(x)
    y = nd.array(y.asnumpy().astype('int32')).as_in_context(ctx)
    if loss_name == 'sce':
        l = loss(pred, y)
    elif loss_name == 'wsce':
        l = loss(pred, y, class_weight, class_weight.shape[0])
    else:
        # `NotImplemented` is a constant, not an exception class: raising it
        # fails with a TypeError that hides the real problem.
        raise NotImplementedError('unsupported loss_name: %r' % (loss_name,))
    return pred, l

In [23]:
def one_epoch(data_iter, model, loss, trainer, ctx, is_train, epoch,
              clip=None, class_weight=None, loss_name='sce'):
    """Run one pass over ``data_iter`` and report loss, accuracy and weighted F1.

    Parameters
    ----------
    data_iter : iterable
        Yields ``(batch_x, batch_y)`` NDArray pairs.
    model, loss : callables
        Forwarded to ``calculate_loss``.
    trainer : gluon.Trainer
        Stepped once per batch when ``is_train`` is True; its learning rate is
        multiplied by 0.9 after every third training epoch.
    ctx : Context
        Device the batches are moved to.
    is_train : bool
        True for a training pass (gradients and parameter updates), False for
        an evaluation pass.
    epoch : int
        Epoch number, used for logging and for the learning-rate decay.
    clip : float or None
        When given, gradients are rescaled so that their global L2 norm does
        not exceed this value.
    class_weight : NDArray or None
        Forwarded to ``calculate_loss``.
    loss_name : str
        Accepted for signature compatibility only; ``calculate_loss`` is not
        given this argument.

    Returns
    -------
    tuple
        ``(mean batch loss, accuracy, weighted F1)`` of this pass.

    Raises
    ------
    ValueError
        If ``data_iter`` yields no batches.
    """
    loss_val = 0.
    total_pred = []
    total_true = []
    n_batch = 0

    for batch_x, batch_y in data_iter:
        batch_x = batch_x.as_in_context(ctx)
        batch_y = batch_y.as_in_context(ctx)

        if is_train:
            with autograd.record():
                batch_pred, l = calculate_loss(batch_x, batch_y, model,
                                               loss, class_weight)

            # backward pass
            l.backward()

            # Gradient clipping by global norm. The parameter list is only
            # collected when clipping is actually requested.
            if clip is not None:
                clip_params = [p.data() for p in model.collect_params().values()]
                norm = nd.array([0.0], ctx)
                for param in clip_params:
                    if param.grad is not None:
                        norm += (param.grad ** 2).sum()
                norm = norm.sqrt().asscalar()
                if norm > clip:
                    for param in clip_params:
                        if param.grad is not None:
                            param.grad[:] *= clip / norm

            # update parameters
            trainer.step(batch_x.shape[0])

        else:
            batch_pred, l = calculate_loss(batch_x, batch_y, model,
                                           loss, class_weight)

        # Keep predictions for the metrics. Softmax is monotonic, so the argmax
        # of the raw scores already is the predicted class.
        batch_pred = nd.argmax(batch_pred, axis=1).asnumpy()
        batch_true = np.reshape(batch_y.asnumpy(), (-1, ))
        total_pred.extend(batch_pred.tolist())
        total_true.extend(batch_true.tolist())

        batch_loss = l.mean().asscalar()

        n_batch += 1
        loss_val += batch_loss

        # periodic progress report during training
        if is_train and n_batch % 400 == 0:
            print('epoch %d, batch %d, batch_train_loss %.4f, batch_train_acc %.3f' %
                  (epoch, n_batch, batch_loss, accuracy_score(batch_true, batch_pred)))

    if n_batch == 0:
        raise ValueError('data_iter yielded no batches')

    # metrics over the whole pass
    F1 = f1_score(np.array(total_true), np.array(total_pred), average='weighted')
    acc = accuracy_score(np.array(total_true), np.array(total_pred))
    loss_val /= n_batch

    if is_train:
        print('epoch %d, learning_rate %.5f \n\t train_loss %.4f, acc_train %.3f, F1_train %.3f, ' %
              (epoch, trainer.learning_rate, loss_val, acc, F1))
        # decay the learning rate after every third epoch
        if epoch % 3 == 0:
            trainer.set_learning_rate(trainer.learning_rate * 0.9)
    else:
        print('\t valid_loss %.4f, acc_valid %.3f, F1_valid %.3f, ' % (loss_val, acc, F1))

    return loss_val, acc, F1

In [24]:
def train_valid(data_iter_train, data_iter_valid, model, loss, trainer, ctx, nepochs,
                clip=None, class_weight=None, loss_name='sce'):
    """Alternate one training pass and one validation pass for ``nepochs`` epochs.

    Every epoch calls ``one_epoch`` first on ``data_iter_train`` (training mode)
    and then on ``data_iter_valid`` (evaluation mode), and prints the wall-clock
    time of the epoch followed by a separator line. Epochs are numbered from 1.
    """
    separator = "*" * 100
    # (data iterator, is_train) for the two phases of every epoch, in order.
    phases = ((data_iter_train, True), (data_iter_valid, False))

    for epoch_idx in range(nepochs):
        epoch = epoch_idx + 1
        tic = time.time()
        for data_iter, is_train in phases:
            one_epoch(data_iter, model, loss, trainer, ctx, is_train,
                      epoch, clip, class_weight, loss_name)
        toc = time.time()
        print('time %.2f sec' % (toc - tic))
        print(separator)

In [25]:
from util import get_weight
# Class weights from the project helper; only used by the 'wsce' loss below.
weight_list = get_weight(DATA_FOLDER, LABEL_FILE)

class_weight = None
loss_name = 'sce'   # 'sce': plain softmax CE, 'wsce': class-weighted softmax CE
optim = 'adam'
# NOTE(review): `wd` is not passed to the Trainer below, so no weight decay is applied.
lr, wd = .001, .999
clip = None
nepochs = 5

trainer = gluon.Trainer(model.collect_params(), optim, {'learning_rate': lr})

if loss_name == 'sce':
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
elif loss_name == 'wsce':
    loss = WeightedSoftmaxCE()
    # the value of class_weight is obtained by counting data in advance. It can be seen as a hyperparameter.
    class_weight = nd.array(weight_list, ctx=ctx)
else:
    # Fail here instead of leaving `loss` undefined, or silently reusing a
    # loss object left over from an earlier run of this cell.
    raise ValueError("loss_name must be 'sce' or 'wsce', got %r" % (loss_name,))

In [None]:
# Show the device in use, then run the full training/validation schedule.
print(ctx)
train_valid(
    train_dataloader, valid_dataloader, model, loss, trainer, ctx, nepochs,
    clip=clip,
    class_weight=class_weight,
    loss_name=loss_name,
)

gpu(0)
epoch 1, batch 400, batch_train_loss 3.4517, batch_train_acc 0.155
epoch 1, learning_rate 0.00100 
	 train_loss 3.5593, acc_train 0.152, F1_train 0.101, 
	 valid_loss 3.4231, acc_valid 0.170, F1_valid 0.106, 
time 105.61 sec
****************************************************************************************************
epoch 2, batch 400, batch_train_loss 3.3264, batch_train_acc 0.178
epoch 2, learning_rate 0.00100 
	 train_loss 3.3224, acc_train 0.181, F1_train 0.128, 
	 valid_loss 3.3578, acc_valid 0.179, F1_valid 0.125, 
time 106.17 sec
****************************************************************************************************
epoch 3, batch 400, batch_train_loss 3.2211, batch_train_acc 0.206
epoch 3, learning_rate 0.00100 
	 train_loss 3.1981, acc_train 0.200, F1_train 0.148, 
	 valid_loss 3.3231, acc_valid 0.186, F1_valid 0.135, 
time 106.75 sec
****************************************************************************************************
epoch 4, batch 

In [46]:
# Make sure the target folder exists so that saving does not fail on a fresh
# checkout where "model/" has not been created yet.
os.makedirs("model", exist_ok=True)
model.save_parameters("model/textcnn.params")

In [49]:
# Rebuild the same architecture with a dropout rate of 0 and load the saved weights.
kernel_sizes = [2, 3, 4, 5]
nums_channels = [100, 100, 100, 100]
model = TextCNN(vocab_len, emsize, kernel_sizes, nums_channels, 0, nclass)
model.load_parameters('model/textcnn.params', ctx=ctx)

In [50]:
# The test file is read as tab-separated with no header row, so pandas numbers
# the columns 0, 1, ...
TEST_DATA = 'test.csv'
test_df = pd.read_csv(DATA_FOLDER + TEST_DATA, sep='\t', header=None)
predictions = []
len(test_df)

200000

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of the predictions to the ones from the previous run.
predictions = []
# Every convolution needs an input at least as long as its kernel.
min_len = max(kernel_sizes)

start = time.time()
for text in test_df[1]:
    # Reuse the training tokenizer so that non-string and empty tweets are
    # handled the same way as during training instead of crashing jieba.
    words, _ = tokenizer((text, None))
    token = vocab[words]
    if len(token) < min_len:
        token += [0.] * (min_len - len(token))
    inp = nd.array(token, ctx=ctx).reshape(1, -1)
    pred = model(inp)
    pred = nd.argmax(pred, axis=1).asscalar()
    predictions.append(int(pred))
    # progress report: time spent on the last 2000 tweets
    if len(predictions) % 2000 == 0:
        ckpt = time.time()
        print('current pred len %d, time %.2fs' % (len(predictions), ckpt - start))
        start = ckpt

# One predicted label per test row, written with the row number as the ID column.
submit = pd.DataFrame({'Expected': predictions})
submit.to_csv('submission.csv', sep=',', index_label='ID')

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/home/user_data/anaconda3/lib/python3.6/site-packages/jieba/__init__.py", line 152, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmp2d02kovo' -> '/tmp/jieba.cache'
ERROR:jieba:Dump cache file failed.
Traceback (most recent call last):
  File "/home/user_data/anaconda3/lib/python3.6/site-packages/jieba/__init__.py", line 152, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmp2d02kovo' -> '/tmp/jieba.cache'
Loading model cost 0.884 seconds.
DEBUG:jieba:Loading model cost 0.884 seconds.
Pr

current pred len 2000, time 6.17s
current pred len 4000, time 4.82s
current pred len 6000, time 4.99s
current pred len 8000, time 5.27s
current pred len 10000, time 5.15s
current pred len 12000, time 4.91s
current pred len 14000, time 5.08s
current pred len 16000, time 4.36s
current pred len 18000, time 4.03s
current pred len 20000, time 4.68s
current pred len 22000, time 5.15s
current pred len 24000, time 5.01s
current pred len 26000, time 5.03s
current pred len 28000, time 4.93s
current pred len 30000, time 4.76s
current pred len 32000, time 4.54s
current pred len 34000, time 4.81s
current pred len 36000, time 4.94s
current pred len 38000, time 5.04s
current pred len 40000, time 4.43s
current pred len 42000, time 4.07s
current pred len 44000, time 4.23s
current pred len 46000, time 4.22s
current pred len 48000, time 4.35s
current pred len 50000, time 4.71s
current pred len 52000, time 4.94s
current pred len 54000, time 5.11s
current pred len 56000, time 4.20s
current pred len 58000, 