In [1]:
import pandas as pd
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon, init
from mxnet.gluon import nn, rnn
import gluonnlp as nlp
import pkuseg
import multiprocessing as mp
import time
from d2l import try_gpu
import itertools
from sklearn.metrics import accuracy_score, f1_score
import d2l
# fixed random number seed
np.random.seed(9102)
mx.random.seed(9102)

In [2]:
DATA_FOLDER = 'data/'
TRAIN_DATA = 'train.csv'
WORD_EMBED = 'sgns.weibo.bigram-char'
LABEL_FILE = 'train.label'
N_ROWS=10000
ctx = try_gpu()
seg = pkuseg.pkuseg(model_name='web')

In [3]:
train_df = pd.read_csv(DATA_FOLDER+TRAIN_DATA, sep='|')

In [4]:
dataset =[ [row[0], row[1]] for _, row in train_df.iterrows()]
train_dataset, valid_dataset = nlp.data.train_valid_split(dataset, .1)
len(train_dataset), len(valid_dataset)

(776847, 86317)

In [5]:
def tokenizer(x):
    tweet, label = x
    if type(tweet) != str:
        tweet = str(tweet)
    word_list = seg.cut(tweet)
    return word_list, label

def get_length(x):
    return float(len(x[0]))

def to_word_list(dataset):
    start = time.time()
    with mp.Pool() as pool:
        # Each sample is processed in an asynchronous manner.
        dataset = gluon.data.ArrayDataset(pool.map(tokenizer, dataset))
        lengths = gluon.data.ArrayDataset(pool.map(get_length, dataset))
    end = time.time()

    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset, lengths

train_word_list, train_word_lengths = to_word_list(train_dataset)
valid_word_list, valid_word_lengths = to_word_list(valid_dataset)

Done! Tokenizing Time=75.32s, #Sentences=776847
Done! Tokenizing Time=10.19s, #Sentences=86317


In [6]:
train_seqs = [sample[0] for sample in train_word_list]
counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(train_seqs)))

vocab = nlp.Vocab(counter, max_size=100000)

# load customed pre-trained embedding
embedding_weights = nlp.embedding.TokenEmbedding.from_file(file_path=DATA_FOLDER+WORD_EMBED)
vocab.set_embedding(embedding_weights)
print(vocab)

  .format(line_num, pretrained_file_path))


Vocab(size=100004, unk="<unk>", reserved="['<pad>', '<bos>', '<eos>']")


In [7]:
def token_to_idx(x):
    return vocab[x[0]], x[1]

# A token index or a list of token indices is returned according to the vocabulary.
with mp.Pool() as pool:
    train_dataset = pool.map(token_to_idx, train_word_list)
    valid_dataset = pool.map(token_to_idx, valid_word_list)

In [8]:
batch_size = 256
bucket_num = 10
bucket_ratio = 0.5


def get_dataloader():
    # Construct the DataLoader Pad data, stack label and lengths
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0), \
                                          nlp.data.batchify.Stack())

    # in this example, we use a FixedBucketSampler,
    # which assigns each data sample to a fixed bucket based on its length.
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        train_word_lengths,
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=True)
    print(batch_sampler.stats())

    # train_dataloader
    train_dataloader = gluon.data.DataLoader(
        dataset=train_dataset,
        batch_sampler=batch_sampler,
        batchify_fn=batchify_fn)
    # valid_dataloader
    valid_dataloader = gluon.data.DataLoader(
        dataset=valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        batchify_fn=batchify_fn)
    return train_dataloader, valid_dataloader

train_dataloader, valid_dataloader = get_dataloader()

FixedBucketSampler:
  sample_num=776847, batch_num=1158
  key=[22, 35, 48, 61, 74, 87, 100, 113, 126, 139]
  cnt=[637662, 69980, 31150, 17042, 11028, 7082, 2748, 149, 3, 3]
  batch_size=[808, 508, 370, 291, 256, 256, 256, 256, 256, 256]


In [10]:
for tweet, label in train_dataloader:
    print(tweet, label)
    break


[[3.7000e+01 3.1490e+03 1.0100e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.3000e+01 8.0100e+02 9.4320e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.1071e+04 5.0000e+01 1.1030e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [4.5330e+03 0.0000e+00 8.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.4000e+01 4.1000e+01 4.5000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.0000e+01 1.7000e+02 1.2600e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00]]
<NDArray 508x35 @cpu_shared(0)> 
[39 53  6 29 31 52 12 37 44 33 31 14 52 44 23 31 62 23 34 12 45 62 12 35
 36 29  3 35 11 52  3 49 37  8 14 60 12  3 55  6 29 12 13 10 45 59 59 35
 14 54 22 35 49 60 47 36 12  6 14  4 48 14 39 35  3 31 35 71 16 16 65 29
 32  3  4 20  2 35 47 14  9 11 11 24 58 23 29 18 35 43 19 17 38  2 34 29
 28 29 69 29 38 23 12  3 13 34 49 28 14 29 14 60 64 55  2 29 31 28 45 12
  3 24 41 23 29 10 66 30  3  3 27 15 40 36 20 31 10 19 22 27 62 35 12 29
  3 29 14 43  1 27 14 36  6 11 46 39 27 32  3  7 16 30  3  3 58 29 48  3
 29 29  3 6

TypeError: 'DataLoader' object is not subscriptable

## Model contruction
Self attention layer, weighted cross entropy, and whole model

In [12]:
class TextCNN(nn.Block):
    def __init__(self, vocab_len, embed_size, kernel_sizes, num_channels, \
                 dropout, nclass, **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_len, embed_size)
        
        self.constant_embedding = nn.Embedding(vocab_len, embed_size)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Dense(nclass)
        self.pool = nn.GlobalMaxPool1D()
        self.convs = nn.Sequential()  
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.add(nn.Conv1D(c, k, activation='relu'))
    
    def forward(self, inputs):
        
        embeddings = nd.concat(
            self.embedding(inputs), self.constant_embedding(inputs), dim=2)
        
        embeddings = embeddings.transpose((0, 2, 1))
        
        encoding = nd.concat(*[nd.flatten(
            self.pool(conv(embeddings))) for conv in self.convs], dim=1)
        
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [13]:
vocab_len = len(vocab)
emsize = 300   # word embedding size
nhidden = 400    # lstm hidden_dim
nlayers = 4     # lstm layers
natt_unit = 400     # the hidden_units of attention layer
natt_hops = 20    # the channels of attention
nfc = 256  # last dense layer size
nclass = 72 # we have 72 emoji in total

drop_prob = 0.5
pool_way = 'flatten'    # # The way to handle M
prune_p = None
prune_q = None

ctx = try_gpu()

kernel_sizes, nums_channels = [2, 3, 4, 5], [200, 200, 200, 200]
model = TextCNN(vocab_len, emsize, kernel_sizes, nums_channels, drop_prob, nclass)
model.initialize(init.Xavier(), ctx=ctx)

print(model)
model.embedding.weight.set_data(vocab.embedding.idx_to_vec)
model.constant_embedding.weight.set_data(vocab.embedding.idx_to_vec)
model.constant_embedding.collect_params().setattr('grad_req', 'null')

TextCNN(
  (embedding): Embedding(100004 -> 300, float32)
  (constant_embedding): Embedding(100004 -> 300, float32)
  (dropout): Dropout(p = 0.5, axes=())
  (decoder): Dense(None -> 72, linear)
  (pool): GlobalMaxPool1D(size=(1,), stride=(1,), padding=(0,), ceil_mode=True)
  (convs): Sequential(
    (0): Conv1D(None -> 200, kernel_size=(2,), stride=(1,), Activation(relu))
    (1): Conv1D(None -> 200, kernel_size=(3,), stride=(1,), Activation(relu))
    (2): Conv1D(None -> 200, kernel_size=(4,), stride=(1,), Activation(relu))
    (3): Conv1D(None -> 200, kernel_size=(5,), stride=(1,), Activation(relu))
  )
)


In [20]:
lr, num_epochs = 0.0001, 2
model.load_parameters("model/textcnn1560254639")
trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr})
loss = gluon.loss.SoftmaxCrossEntropyLoss()
d2l.train(valid_dataloader, valid_dataloader, model, loss, trainer, ctx, num_epochs)
token = str(round(time.time()))
model.save_parameters("model/textcnn"+token)

training on gpu(0)
epoch 1, loss 3.3426, train acc 0.187, test acc 0.205, time 47.3 sec
epoch 2, loss 3.3065, train acc 0.192, test acc 0.210, time 48.9 sec


In [21]:
# token = str(round(time.time()))
# model.save_parameters("model/model"+token)

In [21]:
TEST_DATA = 'test.csv'
predictions = []
test_df = pd.read_csv(DATA_FOLDER+TEST_DATA, header=None, sep='\t')
len(test_df)

200000

In [22]:
start = time.time()
for _, tweet in test_df.iterrows():
    token = vocab[seg.cut(tweet[1])]
    if len(token)<5:
        token += [0.]*(5-len(token))
    inp = nd.array(token, ctx=ctx).reshape(1,-1)
    pred = model(inp)
    pred = nd.argmax(pred, axis=1).asscalar()
    predictions.append(int(pred))
    if len(predictions)%2000==0:
        ckpt = time.time()
        print('current pred len %d, time %.2fs' % (len(predictions), ckpt-start))
        start = ckpt
submit = pd.DataFrame({'Expected': predictions})
submit.to_csv('submission.csv', sep=',', index_label='ID')

current pred len 2000, time 4.71s
current pred len 4000, time 4.62s
current pred len 6000, time 4.69s
current pred len 8000, time 4.69s
current pred len 10000, time 4.50s
current pred len 12000, time 4.59s
current pred len 14000, time 4.43s
current pred len 16000, time 4.69s
current pred len 18000, time 4.45s
current pred len 20000, time 4.54s
current pred len 22000, time 4.52s
current pred len 24000, time 4.45s
current pred len 26000, time 4.46s
current pred len 28000, time 4.52s
current pred len 30000, time 4.43s
current pred len 32000, time 4.55s
current pred len 34000, time 4.38s
current pred len 36000, time 4.63s
current pred len 38000, time 4.66s
current pred len 40000, time 4.57s
current pred len 42000, time 4.59s
current pred len 44000, time 4.55s
current pred len 46000, time 4.50s
current pred len 48000, time 4.73s
current pred len 50000, time 4.50s
current pred len 52000, time 4.48s
current pred len 54000, time 4.37s
current pred len 56000, time 4.35s
current pred len 58000, 