In [1]:
import pandas as pd
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon, init
from mxnet.gluon import nn, rnn
import gluonnlp as nlp
import pkuseg
import multiprocessing as mp
import time
from d2l import try_gpu
import itertools
from sklearn.metrics import accuracy_score, f1_score
import d2l
import re
# fixed random number seed
np.random.seed(9102)
mx.random.seed(9102)

In [2]:
DATA_FOLDER = 'data/'
TRAIN_DATA = 'train.csv'
WORD_EMBED = 'sgns.weibo.bigram-char'
LABEL_FILE = 'train.label'
N_ROWS=1000
ctx = try_gpu()
seg = pkuseg.pkuseg(model_name='web')

In [3]:
train_df = pd.read_csv(DATA_FOLDER+TRAIN_DATA, sep='|')

In [4]:
dataset =[ [row[0], row[1]] for _, row in train_df.iterrows()]
train_dataset, valid_dataset = nlp.data.train_valid_split(dataset, .1)
len(train_dataset), len(valid_dataset)

(776847, 86317)

In [5]:
def tokenizer(x):
    tweet, label = x
    if type(tweet) != str:
        tweet = str(tweet)
    word_list = seg.cut(re.sub('\W+', ' ', tweet))
    if len(word_list)==0:
        word_list=['<unk>']
    return word_list, label

def get_length(x):
    return float(len(x[0]))

def to_word_list(dataset):
    start = time.time()
    with mp.Pool() as pool:
        # Each sample is processed in an asynchronous manner.
        dataset = gluon.data.ArrayDataset(pool.map(tokenizer, dataset))
        lengths = gluon.data.ArrayDataset(pool.map(get_length, dataset))
    end = time.time()

    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset, lengths

train_word_list, train_word_lengths = to_word_list(train_dataset)
valid_word_list, valid_word_lengths = to_word_list(valid_dataset)

Done! Tokenizing Time=-28725.97s, #Sentences=776847
Done! Tokenizing Time=9.68s, #Sentences=86317


In [6]:
train_seqs = [sample[0] for sample in train_word_list]
counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(train_seqs)))

vocab = nlp.Vocab(counter, max_size=200000)

# load customed pre-trained embedding
embedding_weights = nlp.embedding.TokenEmbedding.from_file(file_path=DATA_FOLDER+WORD_EMBED)
vocab.set_embedding(embedding_weights)
print(vocab)

  .format(line_num, pretrained_file_path))


Vocab(size=200004, unk="<unk>", reserved="['<pad>', '<bos>', '<eos>']")


In [7]:
def token_to_idx(x):
    return vocab[x[0]], x[1]

# A token index or a list of token indices is returned according to the vocabulary.
with mp.Pool() as pool:
    train_dataset = pool.map(token_to_idx, train_word_list)
    valid_dataset = pool.map(token_to_idx, valid_word_list)

In [8]:
batch_size = 1024
bucket_num = 10
bucket_ratio = 0.5


def get_dataloader():
    # Construct the DataLoader Pad data, stack label and lengths
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0), \
                                          nlp.data.batchify.Stack())

    # in this example, we use a FixedBucketSampler,
    # which assigns each data sample to a fixed bucket based on its length.
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        train_word_lengths,
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=True)
    print(batch_sampler.stats())

    # train_dataloader
    train_dataloader = gluon.data.DataLoader(
        dataset=train_dataset,
        batch_sampler=batch_sampler,
        batchify_fn=batchify_fn)
    # valid_dataloader
    valid_dataloader = gluon.data.DataLoader(
        dataset=valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        batchify_fn=batchify_fn)
    return train_dataloader, valid_dataloader

train_dataloader, valid_dataloader = get_dataloader()

FixedBucketSampler:
  sample_num=776847, batch_num=238
  key=[14, 27, 40, 53, 66, 79, 92, 105, 118, 131]
  cnt=[565720, 126529, 43158, 20664, 12429, 6854, 1455, 33, 4, 1]
  batch_size=[4790, 2484, 1676, 1265, 1024, 1024, 1024, 1024, 1024, 1024]


In [9]:
for tweet, label in train_dataloader:
    print(tweet, label)
    break


[[1.71000e+02 2.50000e+01 1.39600e+03 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [5.32400e+03 3.78450e+04 3.89800e+03 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [1.78900e+03 4.60000e+01 8.36000e+02 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 ...
 [5.11600e+03 1.71100e+03 3.90000e+01 ... 6.91000e+02 1.12000e+02
  0.00000e+00]
 [1.98129e+05 4.84700e+03 4.28190e+04 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [1.30000e+01 7.47000e+03 1.37000e+02 ... 0.00000e+00 0.00000e+00
  0.00000e+00]]
<NDArray 1676x40 @cpu_shared(0)> 
[40 17  3 ... 12 12 14]
<NDArray 1676 @cpu_shared(0)>


## Model contruction
Self attention layer, weighted cross entropy, and whole model

In [10]:
class TextCNN(nn.Block):
    def __init__(self, vocab_len, embed_size, kernel_sizes, num_channels, \
                 dropout, nclass, **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_len, embed_size)
        
        self.constant_embedding = nn.Embedding(vocab_len, embed_size)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Dense(nclass)
        self.pool = nn.GlobalMaxPool1D()
        self.convs = nn.Sequential()  
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.add(nn.Conv1D(c, k, activation='relu'))
    
    def forward(self, inputs):
        
        embeddings = nd.concat(
            self.embedding(inputs), self.constant_embedding(inputs), dim=2)
        embeddings = embeddings.transpose((0, 2, 1))
        
        encoding = nd.concat(*[nd.flatten(
            self.pool(conv(embeddings))) for conv in self.convs], dim=1)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [11]:
vocab_len = len(vocab)
emsize = 300   # word embedding size
nhidden = 400    # lstm hidden_dim
nlayers = 4     # lstm layers
natt_unit = 400     # the hidden_units of attention layer
natt_hops = 20    # the channels of attention
nfc = 256  # last dense layer size
nclass = 72 # we have 72 emoji in total

drop_prob = 0.2
pool_way = 'flatten'    # # The way to handle M
prune_p = None
prune_q = None

ctx = try_gpu()

kernel_sizes, nums_channels = [2, 3, 4, 5], [100, 100, 100, 100]
model = TextCNN(vocab_len, emsize, kernel_sizes, nums_channels, drop_prob, nclass)
model.initialize(init.Xavier(), ctx=ctx)

print(model)
model.embedding.weight.set_data(vocab.embedding.idx_to_vec)
model.constant_embedding.weight.set_data(vocab.embedding.idx_to_vec)
model.constant_embedding.collect_params().setattr('grad_req', 'null')

TextCNN(
  (embedding): Embedding(200004 -> 300, float32)
  (constant_embedding): Embedding(200004 -> 300, float32)
  (dropout): Dropout(p = 0.2, axes=())
  (decoder): Dense(None -> 72, linear)
  (pool): GlobalMaxPool1D(size=(1,), stride=(1,), padding=(0,), ceil_mode=True)
  (convs): Sequential(
    (0): Conv1D(None -> 100, kernel_size=(2,), stride=(1,), Activation(relu))
    (1): Conv1D(None -> 100, kernel_size=(3,), stride=(1,), Activation(relu))
    (2): Conv1D(None -> 100, kernel_size=(4,), stride=(1,), Activation(relu))
    (3): Conv1D(None -> 100, kernel_size=(5,), stride=(1,), Activation(relu))
  )
)


In [12]:
tmp = nd.array([10, 20, 30, 40, 50, 60], ctx=ctx).reshape(1, -1)
model(tmp)


[[-0.34600145  0.04017189  0.1369038  -0.7636087  -0.3934391   1.1943265
  -0.32672587  1.7432895   0.03964555  0.46450374 -1.0460553   0.16721234
   0.5969963  -0.75534594 -1.5392365  -0.65512806  1.1295297  -0.49703023
   1.754624   -1.0651437  -0.21764863 -0.8766225   0.60445654 -0.67168844
  -1.1860421  -0.72298837 -0.9003191  -1.3292121  -1.2939929  -0.173226
  -0.13076256  1.5140643   0.16108891  1.1098144  -0.00212728  1.5642483
  -0.65339553  0.7023618   0.42604226 -1.3614258   0.3992056   1.0475352
  -0.13890669  0.6355842   0.9506167  -0.8263583  -0.63210803 -0.88465756
   0.04642108  0.57851565  0.6982124  -0.46893287  1.6976713  -0.8701376
  -0.9612554   0.35169083  2.0739958  -0.27529195  1.3988712  -0.80950534
  -0.45686567  0.2837877   0.3057649  -0.69839436 -1.4799446  -0.5075748
  -1.0260262   0.20910491  0.48553443  0.78714824  1.9416616   0.32587254]]
<NDArray 1x72 @gpu(0)>

In [14]:
lr, wd, num_epochs = .001, .95, 3
# model.load_parameters("model/textcnn1560329753")
trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr})
loss = gluon.loss.SoftmaxCrossEntropyLoss()
d2l.train(train_dataloader, valid_dataloader, model, loss, trainer, ctx, num_epochs)
token = str(round(time.time()))
model.save_parameters("model/textcnn"+token+'.params')

training on gpu(0)
epoch 1, loss 3.3362, train acc 0.187, test acc 0.173, time 94.4 sec
epoch 2, loss 3.2507, train acc 0.205, test acc 0.173, time 95.2 sec
epoch 3, loss 3.1382, train acc 0.229, test acc 0.172, time 94.9 sec


In [18]:
kernel_sizes, nums_channels = [2, 3, 4, 5], [100, 100, 100, 100]
model = TextCNN(vocab_len, emsize, kernel_sizes, nums_channels, 0, nclass)
model.load_parameters('model/textcnn1560502613.params', ctx=ctx)

In [19]:
TEST_DATA = 'test.csv'
predictions = []
test_df = pd.read_csv(DATA_FOLDER+TEST_DATA, header=None, sep='\t')
len(test_df)

200000

In [20]:
start = time.time()
for _, tweet in test_df.iterrows():
    token = vocab[seg.cut(tweet[1])]
    if len(token)<5:
        token += [0.]*(5-len(token))
    inp = nd.array(token, ctx=ctx).reshape(1,-1)
    pred = model(inp)
    pred = nd.argmax(pred, axis=1).asscalar()
    predictions.append(int(pred))
    if len(predictions)%2000==0:
        ckpt = time.time()
        print('current pred len %d, time %.2fs' % (len(predictions), ckpt-start))
        start = ckpt
submit = pd.DataFrame({'Expected': predictions})
submit.to_csv('submission.csv', sep=',', index_label='ID')

current pred len 2000, time 4.67s
current pred len 4000, time 4.63s
current pred len 6000, time 4.68s
current pred len 8000, time 4.55s
current pred len 10000, time 4.66s
current pred len 12000, time 4.68s
current pred len 14000, time 4.63s
current pred len 16000, time 4.63s
current pred len 18000, time 4.65s
current pred len 20000, time 4.58s
current pred len 22000, time 4.67s
current pred len 24000, time 4.66s
current pred len 26000, time 4.57s
current pred len 28000, time 4.67s
current pred len 30000, time 4.67s
current pred len 32000, time 4.59s
current pred len 34000, time 4.67s
current pred len 36000, time 4.64s
current pred len 38000, time 4.63s
current pred len 40000, time 4.62s
current pred len 42000, time 4.66s
current pred len 44000, time 4.63s
current pred len 46000, time 4.65s
current pred len 48000, time 4.62s
current pred len 50000, time 4.65s
current pred len 52000, time 4.63s
current pred len 54000, time 4.83s
current pred len 56000, time 4.71s
current pred len 58000, 