In [1]:
import warnings
warnings.filterwarnings('ignore')
from gensim.models import FastText
from tqdm import tqdm_notebook
import numpy as np
import cntk as C

In [2]:
"""
Configuration
"""
# --- Paths / run identity ---------------------------------------------------
embedding_path = '../embedding/fasttext_384_15_5_50_sg0.bin'  # FastText model loaded by Model.__init__
train_data, valid_data = '../data/train.ctf', '../data/dev.ctf'
test_data = 'test'   # dataset name; inference() expands it to '../data/<name>.ctf'
version = 'train'    # appears in checkpoint, log and answer-file names

# --- Network size -----------------------------------------------------------
embedding_dim = 384
hidden_dim = 256
dropout = 0.2

# --- Optimisation and schedule ----------------------------------------------
learning_rate = 5
minibatch_size = valid_minibatch_size = 1024
epoch_size = 4000000      # samples trained between two checkpoints
max_epochs = 300
num_validation = 10000    # validation stops once this count is reached
log_freq = 2000

In [3]:
"""
CUDNN approach rnn stack
"""
from cntk.layers.blocks import _INFERRED
def OptimizedRnnStack(hidden_dim, num_layers=1, recurrent_op='lstm', bidirectional=True,
                      use_cudnn=True, name=''):
    """Return a callable that applies ``C.optimized_rnnstack`` to its input.

    A single weight parameter is created here (input dimension inferred,
    initialised with ``C.glorot_uniform()``) and captured by the returned
    callable, so every application of that callable reuses the same weights.

    Args:
        hidden_dim: hidden size passed to ``C.optimized_rnnstack``.
        num_layers: number of stacked recurrent layers.
        recurrent_op: recurrent cell type, e.g. ``'lstm'``.
        bidirectional: whether the stack is bidirectional.
        use_cudnn: must be true; the non-cuDNN path is not implemented.
        name: name given to the created CNTK node.

    Raises:
        NotImplementedError: if ``use_cudnn`` is false.
    """
    if not use_cudnn:
        raise NotImplementedError

    weights = C.parameter(_INFERRED + (hidden_dim,), init=C.glorot_uniform())

    def apply_stack(x):
        return C.optimized_rnnstack(x, weights, hidden_dim, num_layers, bidirectional,
                                    recurrent_op=recurrent_op, name=name)

    return apply_stack

In [4]:
"""
Step function used to calculate sequence length
"""
def plus1(x, y):
    """Fold step that increments the accumulator ``x`` by one.

    Used with ``C.layers.Fold`` to count the steps of a sequence. ``y`` only
    contributes ``y - y`` (i.e. zeros), presumably so the result stays tied to
    the shape/axes of ``y`` -- TODO confirm against CNTK's Fold requirements.
    """
    zero_like_y = y - y
    incremented = x + 1
    return incremented + zero_like_y

In [5]:
"""
Model Class
"""
from collections import defaultdict
class Model:
    """CNTK graph that scores whether two word sequences belong together.

    Both inputs (``c1``, ``c2``) are one-hot word sequences. They are embedded
    with pretrained FastText vectors and encoded by a shared bidirectional RNN;
    ``c2`` then attends over ``c1``. The score is the cosine between the
    mean-pooled ``c2`` encoding and the mean-pooled attention output.

    Configuration is read from the module-level settings ``embedding_path``,
    ``embedding_dim``, ``hidden_dim`` and ``dropout``.
    """

    def __init__(self):
        self.embedding_path = embedding_path
        self.fmodel = FastText.load(self.embedding_path)
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # +1: index 0 is reserved for out-of-vocabulary words (see self.vocab).
        self.word_dim = len(self.fmodel.wv.vocab) + 1
        # Unknown words fall back to index 0 through defaultdict(int).
        self.vocab = defaultdict(int)
        self.dropout = dropout
        self.use_cudnn = True

        # Known words are numbered from 1 in the FastText vocabulary's order.
        for index, word in enumerate(self.fmodel.wv.vocab, start=1):
            self.vocab[word] = index

    def embed(self):
        """Return a function that multiplies one-hot input by the embedding matrix.

        The matrix is a trainable ``C.parameter`` initialised from the FastText
        vectors; row 0 (out-of-vocabulary) stays all zeros initially.
        """
        vec = np.zeros((self.word_dim, self.embedding_dim), dtype=np.float32)

        for vocab in self.fmodel.wv.vocab:
            vec[self.vocab[vocab]] = self.fmodel.wv[vocab]

        embedding = C.parameter(shape=vec.shape, init=vec)

        def func(context):
            return C.times(context, embedding)

        return func

    def input_layer(self, c1w, c2w):
        """Embed and encode both inputs with one shared embedding + RNN.

        The encoder is built once on a placeholder and cloned with
        ``CloneMethod.share`` so that ``c1w`` and ``c2w`` use the same
        parameters. Returns a block whose two outputs are the encoded
        ``c1w`` and ``c2w``.
        """
        c1w_ph = C.placeholder()
        c2w_ph = C.placeholder()

        input_words = C.placeholder(shape=(self.word_dim))

        embedded = self.embed()(input_words)
        # Honour self.use_cudnn here as well, consistent with attention_layer.
        processed = OptimizedRnnStack(self.hidden_dim,
                       num_layers=1, bidirectional=True, use_cudnn=self.use_cudnn,
                       name='input_rnn')(embedded)

        c1_processed = processed.clone(C.CloneMethod.share, {input_words: c1w_ph})
        c2_processed = processed.clone(C.CloneMethod.share, {input_words: c2w_ph})

        return C.as_block(
            C.combine([c1_processed, c2_processed]),
            [(c1w_ph, c1w), (c2w_ph, c2w)],
            'input_layer',
            'input_layer'
        )

    def attention_layer(self, c1, c2, layer):
        """Let every step of ``c1`` attend over the whole of ``c2``.

        Inside the block ``c1`` is bound to ``p_processed`` (the attending
        sequence) and ``c2`` to ``q_processed`` (the attended sequence).
        ``layer`` is used as a name prefix for the RNN node and the block.

        In the shape comments below, d == self.hidden_dim.
        """
        q_processed = C.placeholder(shape=(2*self.hidden_dim,))
        p_processed = C.placeholder(shape=(2*self.hidden_dim,))

        # Unpack q into a padded tensor plus a validity mask.
        qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

        wq = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wp = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wg = C.parameter(shape=(8*self.hidden_dim, 8*self.hidden_dim), init=C.glorot_uniform())
        v = C.parameter(shape=(2*self.hidden_dim, 1), init=C.glorot_uniform())

        # seq[tensor[1 x 2d]], one row per step of p
        wpt = C.reshape(C.times(p_processed, wp), (-1, 2*self.hidden_dim))

        # q_len x 2d
        wqt = C.reshape(C.times(qvw, wq), (-1, 2*self.hidden_dim))

        # seq[tensor[q_len]]: additive attention scores
        S = C.reshape(C.times(C.tanh(C.sequence.broadcast_as(wqt, p_processed) + wpt), v), (-1))

        qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, p_processed)

        # seq[tensor[q_len]]: padded positions get a very negative score
        S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))

        # seq[tensor[q_len]]
        A = C.softmax(S, axis=0)

        # seq[tensor[2d]]: attention-weighted sum of q
        swap_qvw = C.swapaxes(qvw)
        cq = C.reshape(C.reduce_sum(A * C.sequence.broadcast_as(swap_qvw, A), axis=1), (-1))

        # seq[tensor[8d]]: four 2d pieces, matching wg's (8d, 8d) shape.
        # NOTE(review): the fourth feature is cq * cq -- confirm this is intended.
        uc_concat = C.splice(p_processed, cq, p_processed * cq, cq * cq)

        # seq[tensor[8d]]
        gt = C.tanh(C.times(uc_concat, wg))

        # seq[tensor[8d]]
        uc_concat_star = gt * uc_concat

        # seq[tensor[2d]] presumably (bidirectional stack with hidden size d) -- TODO confirm
        vp = C.layers.Sequential([
            C.layers.Dropout(self.dropout),
            OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                use_cudnn=self.use_cudnn, name=layer+'_attention_rnn')])(uc_concat_star)

        return C.as_block(
            vp,
            [(p_processed, c1), (q_processed, c2)],
            'attention_layer_' + layer,
            'attention_layer_' + layer)

    def model(self):
        """Build the full graph.

        Returns:
            (cos, loss, acc): the cosine score, the binary cross-entropy of
            ``sigmoid(cos)`` against input ``y``, and a node that is 1 where
            the thresholded prediction equals ``y``.
        """
        c1_axis = C.Axis.new_unique_dynamic_axis('c1_axis')
        c2_axis = C.Axis.new_unique_dynamic_axis('c2_axis')
        b = C.Axis.default_batch_axis()

        c1 = C.input_variable(self.word_dim, dynamic_axes=[b, c1_axis], name='c1')
        c2 = C.input_variable(self.word_dim, dynamic_axes=[b, c2_axis], name='c2')

        y = C.input_variable(1, dynamic_axes=[b], name='y')

        c1_processed, c2_processed = self.input_layer(c1, c2).outputs
        # c2 is the attending sequence, c1 the attended one.
        att_context = self.attention_layer(c2_processed, c1_processed, 'attention')

        # Sequence lengths, obtained by folding plus1 over each sequence.
        c2_len = C.layers.Fold(plus1)(c2_processed)
        att_len = C.layers.Fold(plus1)(att_context)

        # Cosine between the two mean-pooled sequences.
        cos = C.cosine_distance(C.sequence.reduce_sum(c2_processed)/c2_len,
                                C.sequence.reduce_sum(att_context)/att_len)

        # NOTE(review): a cosine lies in [-1, 1], so sigmoid(cos) is confined to
        # roughly [0.27, 0.73]; confirm this bounded probability is intended.
        prob = C.sigmoid(cos)
        is_context = C.greater(prob, 0.5)

        loss = C.losses.binary_cross_entropy(prob, y)
        acc = C.equal(is_context, y)

        return cos, loss, acc

In [6]:
"""
Read ctf format to minibatch source, input function
"""
def deserialize(func, ctf_path, model, randomize=True, repeat=True, is_test=False):
    """Create a CTF minibatch source and the matching input map for ``func``.

    Args:
        func: CNTK function whose arguments named 'c1', 'c2' (and 'y') are bound.
        ctf_path: path of the CTF file to read.
        model: object providing ``word_dim``, the size of the sparse word streams.
        randomize: forwarded to ``C.io.MinibatchSource``.
        repeat: sweep the data indefinitely when true, exactly once otherwise.
        is_test: when true the 'y' stream is neither declared nor mapped.

    Returns:
        (mb_source, input_map) where ``input_map`` maps each argument of
        ``func`` to the stream of the same name.
    """
    stream_names = ('c1', 'c2') if is_test else ('c1', 'c2', 'y')

    definitions = {}
    for stream in stream_names:
        if stream == 'y':
            # The label is a dense scalar.
            definitions[stream] = C.io.StreamDef('y', shape=1, is_sparse=False)
        else:
            # Word streams are sparse vectors of size word_dim.
            definitions[stream] = C.io.StreamDef(stream, shape=model.word_dim, is_sparse=True)

    sweeps = C.io.INFINITELY_REPEAT if repeat else 1
    mb_source = C.io.MinibatchSource(
        C.io.CTFDeserializer(ctf_path, C.io.StreamDefs(**definitions)),
        randomize=randomize,
        max_sweeps=sweeps)

    input_map = {
        argument_by_name(func, stream): getattr(mb_source.streams, stream)
        for stream in stream_names
    }
    return mb_source, input_map

In [7]:
"""
Helper function used to map the variable
"""
def argument_by_name(func, name):
    """Return the single argument of ``func`` whose ``.name`` equals ``name``.

    Raises:
        ValueError: if no argument, or more than one argument, has that name.
    """
    matches = [candidate for candidate in func.arguments if candidate.name == name]
    if not matches:
        raise ValueError('no matching names in arguments')
    if len(matches) > 1:
        raise ValueError('multiple matching names in arguments')
    return matches[0]

In [8]:
"""
Trainer
"""
def train():
    model = Model()
    z, loss, acc = model.model()
    
    progress_writers = [C.logging.ProgressPrinter(
                            num_epochs = max_epochs,
                            freq = log_freq,
                            tag = 'Training',
                            log_to_file = 'log/log_' + version)]
    
    lr = C.learning_parameter_schedule(learning_rate, minibatch_size=None, epoch_size=None)
    learner = C.adadelta(z.parameters, lr)
    trainer = C.Trainer(z, (loss, acc), learner, progress_writers)
    
    mb_source, input_map = deserialize(loss, train_data, model)
    mb_valid, valid_map = deserialize(loss, valid_data, model)
    
    try:
        trainer.restore_from_checkpoint('../model/' + version)
    except Exception:
        print('No checkpoint.')
    
    for epoch in range(max_epochs):
        num_seq = 0        
        # validation
        with tqdm_notebook(total=num_validation) as valid_progress_bar:
            while True:
                data = mb_valid.next_minibatch(minibatch_size, input_map=valid_map)
                if not data:
                    break
                trainer.test_minibatch(data)
                num_seq += len(data)
                valid_progress_bar.update(len(data))
                if num_seq >= num_validation:
                    break
            trainer.summarize_test_progress()
            
        num_seq = 0
        # train
        with tqdm_notebook(total=epoch_size) as progress_bar:
            while True:
                data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                progress_bar.update(trainer.previous_minibatch_sample_count)
                if num_seq >= epoch_size:
                    break
            trainer.summarize_training_progress()
            trainer.save_checkpoint('../model/' + version + '/' + str(epoch)) 

In [9]:
# train()

In [10]:
"""
Inference
"""
def inference(data, checkpoint=12):
    """Score every row of ``'../data/<data>.ctf'`` with a saved model.

    Args:
        data: dataset name; the file read is ``'../data/{}.ctf'.format(data)``.
            If the name contains ``'test'`` 3000 results are expected,
            otherwise ``num_validation``.
        checkpoint: name of the checkpoint under ``'../model/<version>/'``
            to load (defaults to 12, the previously hard-coded value).

    Returns:
        list with the model's first output (the cosine score) for every row,
        in file order.

    Raises:
        AssertionError: if the number of results differs from the expected total.
    """
    # Model() is only needed for word_dim when declaring the input streams.
    p = Model()
    model = C.load_model('../model/' + version + '/' + str(checkpoint))

    cos = model.outputs[0]
    loss = C.as_composite(model.outputs[1].owner)

    mb_test, map_test = deserialize(loss, '../data/{}.ctf'.format(data), p,
                                    randomize=False, repeat=False, is_test=True)

    total_samples = 3000 if 'test' in data else num_validation

    results = []
    with tqdm_notebook(total=total_samples) as progress_bar:
        while True:
            # Separate name so the `data` argument is not shadowed.
            minibatch = mb_test.next_minibatch(minibatch_size, input_map=map_test)
            if not minibatch:
                break
            out = model.eval(minibatch, outputs=[cos])
            results.extend(out)
            # Advance by the number of results; len(minibatch) would be the
            # number of streams in the dict, not the number of rows.
            progress_bar.update(len(out))
    assert(len(results) == total_samples)
    return results

In [11]:
# Run the saved model over '../data/<test_data>.ctf' and keep the returned scores.
results = inference(test_data)

HBox(children=(IntProgress(value=0, max=3000), HTML(value='')))




In [12]:
# Take the arg-max within each run of 6 consecutive scores
# (presumably six candidate answers per question -- TODO confirm).
predict = [
    np.argmax(results[start:start + 6])
    for start in range(0, len(results), 6)
]

In [13]:
# Tokenized questions; each row is indexed below as [0] and [prediction + 1].
# NOTE(review): the printed rows look like variable-length token lists. If this
# file stores an object array, recent NumPy releases require allow_pickle=True
# here -- confirm, and only enable it for a trusted file.
questions = np.load('../data/tokenized_questions.npy')

In [14]:
# Show entry 0 of each question row next to the entry chosen by the model
# (predictions are offset by one because entry 0 is printed separately).
for index, p in enumerate(predict):
    row = questions[index]
    chosen = row[p + 1]
    print(row[0], chosen)

['媽', '給你', '送錢', '錢包', '來', '啦', '來', '你', '看', '一下', '是不', '不是', '這', '個', '對', '就是', '這', '個', '你', '在', '哪裡', '找到', '它', '的'] ['早上', '你', '爸爸', '在', '車上', '找到', '的', '一定', '是', '前天', '你', '放學', '的', '時候', '掉', '在', '車上', '了']
['古人', '說', '三日', '日不', '讀書', '面目', '面目可憎', '可憎', '我', '覺得', '我', '最近', '可能', '臉色', '太難', '難看', '了'] ['所以', '想', '回復', '我', '昔日', '面貌', '姣好', '的', '樣子']
['你', '說', '我們', '做', '父母', '的', '最', '擔心', '的', '就是', '這', '個'] ['只是', '小孩', '自己', '的', '興趣', '不能', '得到', '發展', '他們', '的', '心', '裡', '可能', '也會', '會很', '悶', '喔']
['是不', '不是', '因為', '以前', '你', '的', '書包', '太重', '了', '所以', '你', '想要', '發明', '這', '個'] ['這', '個', '應該', '也是', '其', '中', '之', '一', '的', '原因']
['那', '可', '不見得', '見得', '有錢', '能', '使', '鬼', '推磨'] ['真的', '嗎', '什', '麼', '禮物']
['真的', '嗎', '當然', '是', '真的', '囉', '如果', '你不', '相信', '的', '話', '可以', '問', '姜', '老師', '啊'] ['是', '啊', '這', '點', '老師', '也', '可以', '證明', '喔']
['你', '今天', '是要', '介紹', '哪', '一', '位', '人物', '讓我', '我們', '認識', '呢', '是', '和', '戲劇', '有關', '喔', '特別'

In [15]:
# Write the predictions as '<version>_answer.csv' with an 'Id,Answer' header.
answer_path = version + '_answer.csv'
with open(answer_path, 'w', encoding='utf-8') as out:
    out.write('Id,Answer\n')
    for idx, p in enumerate(predict):
        out.write('{},{}\n'.format(idx, p))