In [1]:
import pandas as pd
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon, init
from mxnet.gluon import nn, rnn
import gluonnlp as nlp
import pkuseg
import multiprocessing as mp
import time
from d2l import try_gpu
import itertools
from sklearn.metrics import accuracy_score, f1_score
import d2l
# fixed random number seed
np.random.seed(9102)
mx.random.seed(9102)

In [4]:
DATA_FOLDER = 'data/'
TRAIN_DATA = 'train.csv'
WORD_EMBED = 'sgns.weibo.bigram-char'
LABEL_FILE = 'train.label'
N_ROWS=1000
ctx = try_gpu()
seg = pkuseg.pkuseg(model_name='web')

In [5]:
train_df = pd.read_csv(DATA_FOLDER+TRAIN_DATA, sep='|', nrows=N_ROWS)

In [6]:
dataset =[ [row[0], row[1]] for _, row in train_df.iterrows()]
train_dataset, valid_dataset = nlp.data.train_valid_split(dataset, .01)
len(train_dataset), len(valid_dataset)

(990, 10)

In [7]:
def tokenizer(x):
    tweet, label = x
    if type(tweet) != str:
        tweet = str(tweet)
    word_list = seg.cut(tweet)
    return word_list, label

def get_length(x):
    return float(len(x[0]))

def to_word_list(dataset):
    start = time.time()
    with mp.Pool() as pool:
        # Each sample is processed in an asynchronous manner.
        dataset = gluon.data.ArrayDataset(pool.map(tokenizer, dataset))
        lengths = gluon.data.ArrayDataset(pool.map(get_length, dataset))
    end = time.time()

    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset, lengths

train_word_list, train_word_lengths = to_word_list(train_dataset)
valid_word_list, valid_word_lengths = to_word_list(valid_dataset)

Done! Tokenizing Time=0.29s, #Sentences=990
Done! Tokenizing Time=0.30s, #Sentences=10


In [8]:
train_seqs = [sample[0] for sample in train_word_list]
counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(train_seqs)))

vocab = nlp.Vocab(counter, max_size=100000)

# load customed pre-trained embedding
embedding_weights = nlp.embedding.TokenEmbedding.from_file(file_path=DATA_FOLDER+WORD_EMBED)
vocab.set_embedding(embedding_weights)
print(vocab)

  .format(line_num, pretrained_file_path))


Vocab(size=4100, unk="<unk>", reserved="['<pad>', '<bos>', '<eos>']")


In [9]:
def token_to_idx(x):
    return vocab[x[0]], x[1]

# A token index or a list of token indices is returned according to the vocabulary.
with mp.Pool() as pool:
    train_dataset = pool.map(token_to_idx, train_word_list)
    valid_dataset = pool.map(token_to_idx, valid_word_list)

In [10]:
batch_size = 256
bucket_num = 10
bucket_ratio = 0.5


def get_dataloader():
    # Construct the DataLoader Pad data, stack label and lengths
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0), \
                                          nlp.data.batchify.Stack())

    # in this example, we use a FixedBucketSampler,
    # which assigns each data sample to a fixed bucket based on its length.
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        train_word_lengths,
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=True)
    print(batch_sampler.stats())

    # train_dataloader
    train_dataloader = gluon.data.DataLoader(
        dataset=train_dataset,
        batch_sampler=batch_sampler,
        batchify_fn=batchify_fn)
    # valid_dataloader
    valid_dataloader = gluon.data.DataLoader(
        dataset=valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        batchify_fn=batchify_fn)
    return train_dataloader, valid_dataloader

train_dataloader, valid_dataloader = get_dataloader()

FixedBucketSampler:
  sample_num=990, batch_num=10
  key=[12, 21, 30, 39, 48, 57, 66, 75, 84, 93]
  cnt=[650, 167, 64, 42, 15, 16, 20, 5, 8, 3]
  batch_size=[992, 566, 396, 305, 256, 256, 256, 256, 256, 256]


In [11]:
for tweet, label in train_dataloader:
    print(tweet, label)
    break


[[3.067e+03 6.490e+02 2.322e+03 ... 4.007e+03 8.000e+00 0.000e+00]
 [3.500e+01 2.510e+02 2.000e+01 ... 2.400e+01 8.000e+00 0.000e+00]
 [1.339e+03 1.950e+02 1.051e+03 ... 9.900e+01 9.800e+01 0.000e+00]
 ...
 [3.400e+01 1.030e+02 4.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.544e+03 3.815e+03 5.000e+02 ... 9.090e+02 1.300e+01 0.000e+00]
 [5.200e+01 1.829e+03 5.800e+01 ... 0.000e+00 0.000e+00 0.000e+00]]
<NDArray 42x39 @cpu_shared(0)> 
[40 31 38  8 35 20 30 15 45 27 24  3 53 56 23 51 29 16 36  3  3 59 13 34
 18 32  6 22 14 19  3 22 35 54 24 30 33  2 30 34  2  3]
<NDArray 42 @cpu_shared(0)>


## Model contruction
Self attention layer, weighted cross entropy, and whole model

In [19]:
class TextCNN(nn.Block):
    def __init__(self, vocab_len, embed_size, kernel_sizes, num_channels, \
                 dropout, nclass, **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_len, embed_size)
        
        self.constant_embedding = nn.Embedding(vocab_len, embed_size)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Dense(nclass)
        self.pool = nn.GlobalMaxPool1D()
        self.convs = nn.Sequential()  
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.add(nn.Conv1D(c, k, activation='relu'))
    
    def forward(self, inputs):
        
        embeddings = nd.concat(
            self.embedding(inputs), self.constant_embedding(inputs), dim=2)
        print(embeddings.shape)
        embeddings = embeddings.transpose((0, 2, 1))
        
        encoding = nd.concat(*[nd.flatten(
            self.pool(conv(embeddings))) for conv in self.convs], dim=1)
        print(encoding.shape)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [20]:
vocab_len = len(vocab)
emsize = 300   # word embedding size
nhidden = 400    # lstm hidden_dim
nlayers = 4     # lstm layers
natt_unit = 400     # the hidden_units of attention layer
natt_hops = 20    # the channels of attention
nfc = 256  # last dense layer size
nclass = 72 # we have 72 emoji in total

drop_prob = 0.5
pool_way = 'flatten'    # # The way to handle M
prune_p = None
prune_q = None

ctx = try_gpu()

kernel_sizes, nums_channels = [2, 3, 4, 5], [200, 200, 200, 200]
model = TextCNN(vocab_len, emsize, kernel_sizes, nums_channels, drop_prob, nclass)
model.initialize(init.Xavier(), ctx=ctx)

print(model)
model.embedding.weight.set_data(vocab.embedding.idx_to_vec)
model.constant_embedding.weight.set_data(vocab.embedding.idx_to_vec)
model.constant_embedding.collect_params().setattr('grad_req', 'null')

TextCNN(
  (embedding): Embedding(4100 -> 300, float32)
  (constant_embedding): Embedding(4100 -> 300, float32)
  (dropout): Dropout(p = 0.4, axes=())
  (decoder): Dense(None -> 72, linear)
  (pool): GlobalMaxPool1D(size=(1,), stride=(1,), padding=(0,), ceil_mode=True)
  (convs): Sequential(
    (0): Conv1D(None -> 200, kernel_size=(2,), stride=(1,), Activation(relu))
    (1): Conv1D(None -> 200, kernel_size=(3,), stride=(1,), Activation(relu))
    (2): Conv1D(None -> 200, kernel_size=(4,), stride=(1,), Activation(relu))
    (3): Conv1D(None -> 200, kernel_size=(5,), stride=(1,), Activation(relu))
  )
)


In [21]:
tmp = nd.array([10, 20, 30, 40, 50, 60], ctx=ctx).reshape(1, -1)
model(tmp)

(1, 6, 600)
(1, 800)



[[ 0.798947   -1.5127401   0.5445767  -0.522098    1.3970379  -0.3428049
  -1.0130728  -0.07241816  0.9931398   0.31141812 -1.3905609   0.04244024
   0.94099516 -0.6591962   1.4808002   0.25050783 -0.13716994  0.61912596
   0.06574339  0.43878537  1.0778569  -0.9727909   1.5894302  -0.01894315
  -0.37659562 -0.87707716  0.9530885   0.08174714 -0.3447823   0.59718174
   2.8320794   0.25457725 -1.1092602   0.4659843  -1.7563459   0.9833747
  -0.45883682  0.13550195  0.6052164   1.3967222   0.9361794   0.15929052
  -0.27935922  0.85601705  1.2627268   0.43748006 -0.99077004  2.2483907
  -1.9119489   0.45857954  1.1022046   0.12607937  1.167786   -0.41126722
   1.1529258   1.0230607  -0.63536763 -0.4965821  -0.73578906 -1.4766121
   0.5688625   1.7369577   1.5292891  -0.8065958   0.7418031   0.76949096
   1.734976    0.69301844  1.078259   -0.18670827 -0.992385    0.31959945]]
<NDArray 1x72 @gpu(0)>

In [19]:
lr, wd, num_epochs = .001, .01, 8
# model.load_parameters("model/textcnn1560329753")
trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr, 'wd': wd})
loss = gluon.loss.SoftmaxCrossEntropyLoss()
d2l.train(train_dataloader, valid_dataloader, model, loss, trainer, ctx, num_epochs)
token = str(round(time.time()))
model.save_parameters("model/textcnn"+token+'.params')

training on gpu(0)
epoch 1, loss 3.6126, train acc 0.140, test acc 0.138, time 127.6 sec
epoch 2, loss 3.5836, train acc 0.144, test acc 0.144, time 128.1 sec
epoch 3, loss 3.5826, train acc 0.145, test acc 0.144, time 125.5 sec
epoch 4, loss 3.5820, train acc 0.144, test acc 0.140, time 126.8 sec
epoch 5, loss 3.5815, train acc 0.144, test acc 0.142, time 128.8 sec
epoch 6, loss 3.5817, train acc 0.144, test acc 0.138, time 130.0 sec
epoch 7, loss 3.5823, train acc 0.144, test acc 0.142, time 128.1 sec
epoch 8, loss 3.5810, train acc 0.145, test acc 0.142, time 122.9 sec


In [None]:
# token = str(round(time.time()))
# model.save_parameters("model/model"+token)

In [None]:
TEST_DATA = 'test.csv'
predictions = []
test_df = pd.read_csv(DATA_FOLDER+TEST_DATA, header=None, sep='\t')
len(test_df)

In [None]:
start = time.time()
for _, tweet in test_df.iterrows():
    token = vocab[seg.cut(tweet[1])]
    if len(token)<5:
        token += [0.]*(5-len(token))
    inp = nd.array(token, ctx=ctx).reshape(1,-1)
    pred = model(inp)
    pred = nd.argmax(pred, axis=1).asscalar()
    predictions.append(int(pred))
    if len(predictions)%2000==0:
        ckpt = time.time()
        print('current pred len %d, time %.2fs' % (len(predictions), ckpt-start))
        start = ckpt
submit = pd.DataFrame({'Expected': predictions})
submit.to_csv('submission.csv', sep=',', index_label='ID')