In [23]:
import pandas as pd
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon, init
from mxnet.gluon import nn, rnn
import gluonnlp as nlp
import pkuseg
import multiprocessing as mp
import time
from d2l import try_gpu
import itertools
from sklearn.metrics import accuracy_score, f1_score
import d2l
# Seed NumPy and MXNet with one shared value so repeated runs draw the same
# random numbers.
SEED = 9102
np.random.seed(SEED)
mx.random.seed(SEED)

In [49]:
# Paths and run-wide objects shared by the cells below.
DATA_FOLDER = 'data/'  # prefix concatenated in front of every data file name
TRAIN_DATA = 'train.csv'  # training file, read with sep='|'
WORD_EMBED = 'sgns.weibo.bigram-char'  # pre-trained embedding file passed to TokenEmbedding.from_file
LABEL_FILE = 'train.label'  # NOTE(review): not referenced in the visible cells
N_ROWS=10000  # NOTE(review): not referenced in the visible cells; presumably a row cap for quick runs
ctx = try_gpu()  # device context, later passed to model.initialize and nd.array
seg = pkuseg.pkuseg(model_name='web')  # pkuseg segmenter created with model_name='web'

In [60]:
# Read the '|'-separated training file; its first two columns are consumed
# downstream as (tweet, label).
train_df = pd.read_csv(DATA_FOLDER+TRAIN_DATA, sep='|')

In [61]:
# Build [tweet, label] pairs from the first two columns. Slicing the frame
# positionally avoids creating one Series per row with iterrows() (slow on
# hundreds of thousands of rows) and the integer lookup row[0] on a Series
# that is indexed by column labels.
dataset = train_df.iloc[:, :2].values.tolist()
# Hold out 20% of the pairs for validation.
train_dataset, valid_dataset = nlp.data.train_valid_split(dataset, .2)
len(train_dataset), len(valid_dataset)

(690531, 172633)

In [62]:
def tokenizer(x):
    """Segment one (tweet, label) sample into words with the global ``seg``.

    The tweet is passed through ``str()`` first whenever it is not exactly a
    ``str`` (presumably to cope with NaN floats from empty CSV cells).

    Returns:
        A ``(word_list, label)`` tuple; the label is passed through unchanged.
    """
    text, label = x
    words = seg.cut(text if type(text) == str else str(text))
    return words, label

def get_length(x):
    """Return the number of tokens in a (word_list, label) sample as a float."""
    tokens = x[0]
    return float(len(tokens))

def to_word_list(dataset):
    """Tokenize every (tweet, label) sample and record each token count.

    Args:
        dataset: sequence of (tweet, label) samples accepted by ``tokenizer``.

    Returns:
        ``(dataset, lengths)``: two ``gluon.data.ArrayDataset`` objects holding
        the ``(word_list, label)`` samples and their token counts as floats,
        both in the same order as the input.
    """
    start = time.time()
    with mp.Pool() as pool:
        # pool.map blocks until every sample is tokenized; the work is spread
        # over worker processes and results keep the input order.
        tokenized = pool.map(tokenizer, dataset)
    # Counting tokens is much cheaper than pickling every token list to a
    # worker and back, so the lengths are computed in this process.
    lengths = gluon.data.ArrayDataset([get_length(sample) for sample in tokenized])
    dataset = gluon.data.ArrayDataset(tokenized)
    end = time.time()

    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset, lengths

# Tokenize both splits. train_word_lengths is later handed to the
# FixedBucketSampler that groups training samples by length.
train_word_list, train_word_lengths = to_word_list(train_dataset)
valid_word_list, valid_word_lengths = to_word_list(valid_dataset)

Done! Tokenizing Time=77.48s, #Sentences=690531
Done! Tokenizing Time=20.25s, #Sentences=172633


In [63]:
# Token frequencies are counted over the training split only.
train_seqs = [tokens for tokens, _label in train_word_list]
counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(train_seqs)))

# Index at most 40000 counted tokens (special tokens come on top of that).
vocab = nlp.Vocab(counter, max_size=40000)

# Attach the custom pre-trained word vectors to the vocabulary.
embedding_path = DATA_FOLDER + WORD_EMBED
embedding_weights = nlp.embedding.TokenEmbedding.from_file(file_path=embedding_path)
vocab.set_embedding(embedding_weights)
print(vocab)

  .format(line_num, pretrained_file_path))


Vocab(size=40004, unk="<unk>", reserved="['<pad>', '<bos>', '<eos>']")


In [64]:
def token_to_idx(x):
    """Map the word list of a (word_list, label) sample to vocabulary indices.

    Looks the words up in the global ``vocab`` and returns
    ``(indices, label)`` with the label unchanged.
    """
    words = x[0]
    label = x[1]
    return vocab[words], label

# Convert every tokenized sample to vocabulary indices using worker processes.
# NOTE: this rebinds train_dataset / valid_dataset, which until this cell held
# the raw (tweet, label) pairs produced by train_valid_split.
with mp.Pool() as pool:
    train_dataset = pool.map(token_to_idx, train_word_list)
    valid_dataset = pool.map(token_to_idx, valid_word_list)

In [65]:
# Batching settings read by get_dataloader().
batch_size = 64  # FixedBucketSampler batch_size and the validation batch size
bucket_num = 10  # passed as num_buckets to FixedBucketSampler
bucket_ratio = 0.5  # passed as ratio to FixedBucketSampler


def get_dataloader():
    """Build the training and validation ``DataLoader`` objects.

    Reads the module-level ``train_dataset``, ``valid_dataset``,
    ``train_word_lengths``, ``batch_size``, ``bucket_num`` and
    ``bucket_ratio``. Prints the bucket sampler statistics as a side effect.

    Returns:
        ``(train_dataloader, valid_dataloader)``.
    """
    # A sample is (token_ids, label): token ids are padded along axis 0 and
    # labels are stacked, so every batch is a pair of arrays.
    pad_tokens = nlp.data.batchify.Pad(axis=0)
    stack_labels = nlp.data.batchify.Stack()
    batchify_fn = nlp.data.batchify.Tuple(pad_tokens, stack_labels)

    # Training batches come from a FixedBucketSampler, which assigns each
    # sample to a fixed bucket according to its length.
    sampler_options = {
        'batch_size': batch_size,
        'num_buckets': bucket_num,
        'ratio': bucket_ratio,
        'shuffle': True,
    }
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        train_word_lengths, **sampler_options)
    print(batch_sampler.stats())

    train_dataloader = gluon.data.DataLoader(
        dataset=train_dataset, batch_sampler=batch_sampler, batchify_fn=batchify_fn)

    # Validation data is read in order, in plain fixed-size batches.
    valid_dataloader = gluon.data.DataLoader(
        dataset=valid_dataset, batch_size=batch_size, shuffle=False,
        batchify_fn=batchify_fn)

    return train_dataloader, valid_dataloader

# Build both loaders once; the call also prints the bucket sampler statistics.
train_dataloader, valid_dataloader = get_dataloader()

FixedBucketSampler:
  sample_num=690531, batch_num=3568
  key=[15, 28, 41, 54, 67, 80, 93, 106, 119, 132]
  cnt=[489999, 112913, 41388, 20449, 12212, 7963, 4669, 906, 29, 3]
  batch_size=[281, 150, 103, 78, 64, 64, 64, 64, 64, 64]


In [66]:
# Peek at the first training batch only (padded token ids and labels).
for tweet, label in itertools.islice(train_dataloader, 1):
    print(tweet, label)


[[0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [8.0000e+01 0.0000e+00 7.4000e+01 ... 7.8000e+01 0.0000e+00 0.0000e+00]
 [3.7659e+04 3.4900e+02 6.8550e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [1.3134e+04 4.0000e+00 2.4150e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.0000e+01 4.1000e+01 8.0000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [2.5410e+03 2.3750e+03 7.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]]
<NDArray 281x15 @cpu_shared(0)> 
[31 12 35 42 35  7  3 33 62 22 39 15 12  3 54 34  3 12 71  3  1 34 18  3
 17 29 16 29 46 35 14  0 35 52 69 40  8 47 12 50  3 14  3 50 41 34 15  3
 60 50 35 61 24 14 27 12 17  1 34  3 29 42 60 15 42 20 29 17 49 35 34 15
  3 12 35 31 58 45 24 54 49 34 29 62 12 36 12 35 51 34 52 34  2  6 29 12
 44 35 62 60  3 35 29 16 42 35 45 34 27 15  3 55 42 34 12  6 34 16 24 54
 24 62 55 29 18 29 17 29 62 14 23 53 39 57 29 46 27 35  3 54 42 14 32 67
 25 15  5 12 69 69 58 64 32 37 42 29 34 52 12 11 17 16  9 16 34 68 31 35
 23  3 30 2

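## Model construction
TextCNN model: two embedding layers (one of them frozen), parallel 1-D convolutions with global max pooling, and a dense classifier trained with softmax cross entropy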

In [67]:
class TextCNN(nn.Block):
    """Text classifier built from two embeddings and parallel 1-D convolutions.

    Token indices are looked up in two embedding tables of identical size
    (``embedding`` and ``constant_embedding``) and the results are
    concatenated along the last axis. One ReLU ``Conv1D`` per kernel size is
    applied, every convolution output is global-max-pooled, and the pooled
    features are concatenated, passed through dropout and projected by a
    dense layer with ``nclass`` outputs.

    Args:
        vocab_len: number of rows in each embedding table.
        embed_size: width of each embedding vector.
        kernel_sizes: kernel size of every convolution.
        num_channels: output channels of every convolution, paired with
            ``kernel_sizes`` by position.
        dropout: dropout rate applied before the dense layer.
        nclass: number of output units of the dense layer.
    """

    def __init__(self, vocab_len, embed_size, kernel_sizes, num_channels,
                 dropout, nclass, **kwargs):
        super().__init__(**kwargs)
        # Children are created in a fixed order; parameter naming and seeded
        # initialisation follow the registration order.
        self.embedding = nn.Embedding(vocab_len, embed_size)
        # Second table of the same size; this class does not freeze it itself.
        self.constant_embedding = nn.Embedding(vocab_len, embed_size)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Dense(nclass)
        self.pool = nn.GlobalMaxPool1D()
        self.convs = nn.Sequential()
        for channels, width in zip(num_channels, kernel_sizes):
            self.convs.add(nn.Conv1D(channels, width, activation='relu'))

    def forward(self, inputs):
        """Return the dense-layer outputs for a batch of token indices.

        Assumes ``inputs`` is a 2-D (batch, sequence) index array, since the
        embedded tensor is transposed with three axes - TODO confirm.
        """
        first = self.embedding(inputs)
        second = self.constant_embedding(inputs)
        stacked = nd.concat(first, second, dim=2)

        # Swap the last two axes so the embedding features sit on axis 1
        # before the 1-D convolutions.
        features = stacked.transpose((0, 2, 1))

        pooled = []
        for conv in self.convs:
            pooled.append(nd.flatten(self.pool(conv(features))))
        encoding = nd.concat(*pooled, dim=1)

        return self.decoder(self.dropout(encoding))

In [68]:
# Hyper-parameters and construction of the TextCNN model.
vocab_len = len(vocab)
emsize = 300   # word embedding size
nhidden = 400    # NOTE(review): LSTM hidden size; not used by the TextCNN built in this cell
nlayers = 4     # NOTE(review): LSTM layer count; not used by the TextCNN built in this cell
natt_unit = 400     # NOTE(review): attention hidden units; not used by the TextCNN built in this cell
natt_hops = 20    # NOTE(review): attention channels; not used by the TextCNN built in this cell
nfc = 256  # NOTE(review): last dense layer size; not passed to TextCNN in this cell
nclass = 72 # we have 72 emoji in total

drop_prob = 0.5
pool_way = 'flatten'    # NOTE(review): not used by the TextCNN built in this cell
prune_p = None
prune_q = None

# Same call as in the configuration cell; re-evaluated before the model is built.
ctx = try_gpu()

# One convolution per (kernel size, channel count) pair.
kernel_sizes, nums_channels = [2, 3, 4, 5], [100, 100, 100, 100]
model = TextCNN(vocab_len, emsize, kernel_sizes, nums_channels, drop_prob, nclass)
model.initialize(init.Xavier(), ctx=ctx)

print(model)
# Load the pre-trained vectors attached to the vocabulary into both embedding tables.
model.embedding.weight.set_data(vocab.embedding.idx_to_vec)
model.constant_embedding.weight.set_data(vocab.embedding.idx_to_vec)
# grad_req='null' on the second table keeps its vectors fixed during training.
model.constant_embedding.collect_params().setattr('grad_req', 'null')

TextCNN(
  (embedding): Embedding(40004 -> 300, float32)
  (constant_embedding): Embedding(40004 -> 300, float32)
  (dropout): Dropout(p = 0.5, axes=())
  (decoder): Dense(None -> 72, linear)
  (pool): GlobalMaxPool1D(size=(1,), stride=(1,), padding=(0,), ceil_mode=True)
  (convs): Sequential(
    (0): Conv1D(None -> 100, kernel_size=(2,), stride=(1,), Activation(relu))
    (1): Conv1D(None -> 100, kernel_size=(3,), stride=(1,), Activation(relu))
    (2): Conv1D(None -> 100, kernel_size=(4,), stride=(1,), Activation(relu))
    (3): Conv1D(None -> 100, kernel_size=(5,), stride=(1,), Activation(relu))
  )
)


In [None]:
# Train the TextCNN with Adam and softmax cross entropy, then persist the weights.
lr, num_epochs = 0.001, 20
trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr})
loss = gluon.loss.SoftmaxCrossEntropyLoss()
d2l.train(train_dataloader, valid_dataloader, model, loss, trainer, ctx, num_epochs)
# A timestamp suffix keeps successive runs from overwriting each other's files.
token = str(round(time.time()))
# TextCNN derives from nn.Block, which has no export() (that method belongs to
# HybridBlock), so the call would fail only after training has finished.
# save_parameters() works for any Block; reload with model.load_parameters().
model.save_parameters("model/model" + token + ".params")

training on gpu(0)
epoch 1, loss 3.6108, train acc 0.142, test acc 0.163, time 147.5 sec
epoch 2, loss 3.4957, train acc 0.160, test acc 0.169, time 86.3 sec


In [40]:
TEST_DATA = 'test.csv'  # test file name, concatenated with DATA_FOLDER below
predictions = []  # filled by the prediction loop with one class id per test row
# The test file is tab-separated and has no header row, so columns are numbered.
test_df = pd.read_csv(DATA_FOLDER+TEST_DATA, header=None, sep='\t')
len(test_df)

200000

In [41]:
# Predict one emoji class per test tweet and write the submission file.
# Reset the result list so re-running this cell does not append duplicates.
predictions = []
# The widest Conv1D kernel cannot slide over a shorter input, so short tweets
# are right-padded up to that width. Index 0 is used because it is the value
# the training batches were padded with (Pad() was created without pad_val).
min_len = max(kernel_sizes)
start = time.time()
for _, tweet in test_df.iterrows():
    text = tweet[1]
    # Same coercion as tokenizer(): non-string cells are converted with str().
    if type(text) != str:
        text = str(text)
    token_ids = vocab[seg.cut(text)]
    # A negative repeat count yields an empty list, so long tweets are unchanged.
    token_ids = token_ids + [0] * (min_len - len(token_ids))
    inp = nd.array(token_ids, ctx=ctx).reshape(1,-1)
    # TextCNN.forward returns a single score array, not a (scores, extra)
    # tuple, so it must not be unpacked into two names.
    pred = model(inp)
    pred = nd.argmax(pred, axis=-1).asscalar()
    predictions.append(int(pred))
    if len(predictions)%2000==0:
        # Report the time spent on the last 2000 tweets, then restart the clock.
        ckpt = time.time()
        print('current pred len %d, time %.2fs' % (len(predictions), ckpt-start))
        start = ckpt
submit = pd.DataFrame({'Expected': predictions})
submit.to_csv('submission.csv', sep=',', index_label='ID')

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.459 seconds.
DEBUG:jieba:Loading model cost 0.459 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


current pred len 2000, time 10.20s
current pred len 4000, time 9.77s
current pred len 6000, time 9.81s
current pred len 8000, time 9.60s
current pred len 10000, time 9.22s
current pred len 12000, time 9.45s
current pred len 14000, time 9.32s
current pred len 16000, time 9.08s
current pred len 18000, time 9.21s
current pred len 20000, time 9.23s
current pred len 22000, time 9.36s
current pred len 24000, time 9.30s
current pred len 26000, time 9.38s
current pred len 28000, time 9.82s
current pred len 30000, time 9.76s
current pred len 32000, time 9.79s
current pred len 34000, time 9.74s
current pred len 36000, time 9.75s
current pred len 38000, time 9.78s
current pred len 40000, time 9.63s
current pred len 42000, time 9.66s
current pred len 44000, time 9.57s
current pred len 46000, time 9.73s
current pred len 48000, time 9.63s
current pred len 50000, time 9.65s
current pred len 52000, time 9.55s
current pred len 54000, time 9.71s
current pred len 56000, time 9.76s
current pred len 58000,