In [1]:
%load_ext autoreload
%autoreload 2

import data
import config
import tensorflow as tf
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import tensor_array_ops

In [2]:
buckets = data.load_data("train_ids.enc", "train_ids.dec")

Bucketing conversation number 9999
Bucketing conversation number 19999
Bucketing conversation number 29999
Bucketing conversation number 39999
Bucketing conversation number 49999
Bucketing conversation number 59999
Bucketing conversation number 69999
Bucketing conversation number 79999
Bucketing conversation number 89999
Bucketing conversation number 99999
Bucketing conversation number 109999
Bucketing conversation number 119999
Bucketing conversation number 129999
Bucketing conversation number 139999
Bucketing conversation number 149999
Bucketing conversation number 159999
Bucketing conversation number 169999
Bucketing conversation number 179999
Bucketing conversation number 189999


In [35]:
print("BUCKETS: ", config.BUCKETS)
print("ENC_VOCAB: ", config.ENC_VOCAB)
print("DEC_VOCAB: ", config.DEC_VOCAB)
print("HIDDEN_UNITS:", config.HIDDEN_SIZE)
print("LR:", config.LR)

BUCKETS:  [(8, 10), (12, 14), (16, 19)]
ENC_VOCAB:  24474
DEC_VOCAB:  24683
HIDDEN_UNITS: 256
LR: 0.001


In [4]:
list(map(lambda i: i[1], buckets[0][:10]))

[[2, 3751, 3],
 [2, 8, 172, 97, 31, 3476, 9, 3],
 [2, 21, 1130, 9, 3],
 [2, 10, 71, 231, 8, 71, 4, 3],
 [2, 49, 36, 864, 19, 986, 18803, 9, 3],
 [2, 99, 5, 15, 73, 4, 3],
 [2, 45, 32, 37, 20, 3],
 [2, 34, 4, 3],
 [2, 8, 216, 376, 65, 52, 14, 3],
 [2, 1715, 4, 3]]

In [5]:
import abc


class ChatBotModelBase(metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def encode(self):
        raise NotImplemented
    
    @abc.abstractmethod
    def decode(self, enc_outputs, enc_final_state):
        raise NotImplemented
    
    @abc.abstractmethod
    def create_loss(self):
        raise NotImplemented

In [39]:
class BasicChatBotModel(ChatBotModelBase):
    def __init__(self, batch_size=64):
        self.source_seq_tensor = tf.placeholder(tf.int32, shape=[None, batch_size], name="source_seq_tensor") # [Time, Batch]
        self.target_seq_tensor = tf.placeholder(tf.int32, shape=[None, batch_size], name="target_seq_tensor") # [Time, Batch]
        self.target_length = tf.placeholder(tf.int32, shape=(), name="target_length")
        self.decoder_seq_length = tf.placeholder(tf.int32, shape=(batch_size,), name="decoder_seq_length")
        self.global_step = tf.contrib.framework.get_global_step()
        self.batch_size = batch_size
    
    def build(self):
        enc_outputs, enc_final_state = self.encode()
        self.final_outputs, final_state = self.decode(enc_outputs, enc_final_state)
        self.train_op = self.create_loss()
    
    def encode(self):
        with tf.variable_scope('encoder') as scope:
            scope.set_initializer(tf.random_uniform_initializer(-0.1, 0.1))
            
            W = tf.get_variable(name="W", shape=[config.ENC_VOCAB, config.HIDDEN_SIZE], dtype=tf.float32)
            source_embedded = tf.nn.embedding_lookup(W, self.source_seq_tensor)
            
            # DropoutWrapper, LSTM, ...
            # v1.2
            # before v1.2, use tf.contrib.rnn.rnn_cell
            cell = tf.nn.rnn_cell.GRUCell(num_units=config.HIDDEN_SIZE)
            
            enc_outputs, enc_final_state = tf.nn.dynamic_rnn(cell=cell, inputs=source_embedded, time_major=True, dtype=tf.float32)
            
            return enc_outputs, enc_final_state

    def decode(self, enc_outputs, enc_final_state):
        with tf.variable_scope('decoder') as scope:
            scope = tf.get_variable_scope()
            scope.set_initializer(tf.random_uniform_initializer(-0.1, 0.1))

            W = tf.get_variable(
                name="W",
                shape=[config.DEC_VOCAB, config.HIDDEN_SIZE],
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            target_embedded = tf.nn.embedding_lookup(W, self.target_seq_tensor)

            cell = tf.nn.rnn_cell.GRUCell(num_units=config.HIDDEN_SIZE)
            print("target_embedded.get_shape(): ", target_embedded.get_shape())
            print("enc_final_state.get_shape(), ", enc_final_state.get_shape())

            def condition(time, all_outputs, inputs, states):
                return time < self.target_length - 1
                # return tf.reduce_all(self.decoder_length_tensor > time)

            def body(time, all_outputs, inputs, states):
                dec_outputs, dec_state = cell(inputs=inputs, state=states)
                output_logits = tf.contrib.layers.fully_connected(inputs=dec_outputs, num_outputs=config.DEC_VOCAB,
                                                                  activation_fn=None)
                all_outputs = all_outputs.write(time, output_logits)

                output_label = tf.arg_max(output_logits, dimension=1)
                next_input = tf.nn.embedding_lookup(W, output_label)
                next_input.set_shape((self.batch_size, config.HIDDEN_SIZE))

                return time + 1, all_outputs, next_input, dec_state

            output_ta = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                     size=0,
                                                     dynamic_size=True,
                                                     element_shape=(self.batch_size, config.DEC_VOCAB))

            res = control_flow_ops.while_loop(
                condition,
                body,
                loop_vars=[0, output_ta, target_embedded[0], enc_final_state],
            )
            final_outputs = res[1].stack()
            final_state = res[3]
        return final_outputs, final_state
    
    def create_loss(self):
        with tf.variable_scope('loss') as scope:
            print("self.final_outputs, ", self.final_outputs.get_shape())
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.final_outputs, labels=self.target_seq_tensor[1:])
            mask = tf.sequence_mask(self.decoder_seq_length, self.target_length - 1)
            print(losses.get_shape())
            losses = losses * tf.transpose(tf.to_float(mask), (1, 0))
            self.loss = tf.reduce_sum(losses) / tf.to_float(tf.reduce_sum(self.decoder_seq_length -1))
            print(self.loss.get_shape())
            
            self.optimizer = tf.train.AdamOptimizer(learning_rate=config.LR)
            trainables = tf.trainable_variables()
            self.grads = self.optimizer.compute_gradients(self.loss, trainables)
            train_op = self.optimizer.apply_gradients(self.grads, global_step=self.global_step)
        return train_op

In [65]:
class AttentionChatBotModel(BasicChatBotModel):
    def decode(self, enc_outputs, enc_final_state):
        with tf.variable_scope('decoder') as scope:
            scope = tf.get_variable_scope()
            scope.set_initializer(tf.random_uniform_initializer(-0.1, 0.1))

            W = tf.get_variable(
                name="W",
                shape=[config.DEC_VOCAB, config.HIDDEN_SIZE],
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            target_embedded = tf.nn.embedding_lookup(W, self.target_seq_tensor)

            cell = tf.nn.rnn_cell.GRUCell(num_units=config.HIDDEN_SIZE)
            print("target_embedded.get_shape(): ", target_embedded.get_shape())
            print("enc_final_state.get_shape(), ", enc_final_state.get_shape())

            def condition(time, all_outputs, inputs, states):
                return time < self.target_length - 1
                # return tf.reduce_all(self.decoder_length_tensor > time)

            def body(time, all_outputs, inputs, states):
                cell_state_input = tf.contrib.layers.fully_connected(inputs=states, num_outputs=config.HIDDEN_SIZE, activation_fn=None)
                dec_outputs, dec_state = cell(inputs=inputs, state=cell_state_input)
                
                ## attention score
                att_key = tf.contrib.layers.fully_connected(inputs=enc_outputs, num_outputs=config.CONTEXT_SIZE, activation_fn=None)
                att_query = tf.contrib.layers.fully_connected(inputs=dec_outputs, num_outputs=config.CONTEXT_SIZE, activation_fn=None)
                scores = tf.reduce_sum(att_key * tf.expand_dims(att_query, 0), [2])
                scores_normalized = tf.nn.softmax(scores, dim=0)
                
                ## context
                context = tf.reduce_sum(enc_outputs * tf.expand_dims(scores_normalized, 2), [0], name="context")
                
                projection_input = tf.concat([dec_outputs, context], 1)
                
                output_logits = tf.contrib.layers.fully_connected(inputs=projection_input, num_outputs=config.DEC_VOCAB,
                                                                  activation_fn=None)
                all_outputs = all_outputs.write(time, output_logits)

                output_label = tf.arg_max(output_logits, dimension=1)
                next_input = tf.nn.embedding_lookup(W, output_label)
                next_input.set_shape((self.batch_size, config.HIDDEN_SIZE))

                return time + 1, all_outputs, next_input, tf.concat([dec_state, context], 1)

            output_ta = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                     size=0,
                                                     dynamic_size=True,
                                                     element_shape=(self.batch_size, config.DEC_VOCAB))

            res = control_flow_ops.while_loop(
                condition,
                body,
                loop_vars=[0,
                           output_ta, target_embedded[0],
                           tf.concat(
                               [enc_final_state,
                               tf.zeros(dtype=tf.float32, shape=(self.batch_size, config.CONTEXT_SIZE))], 1)],
            )
            final_outputs = res[1].stack()
            final_state = res[3]
        return final_outputs, final_state

In [37]:
test_buckets, data_buckets, train_buckets_scale = get_buckets()

Bucketing conversation number 9999
Bucketing conversation number 19999
Bucketing conversation number 9999
Bucketing conversation number 19999
Bucketing conversation number 29999
Bucketing conversation number 39999
Bucketing conversation number 49999
Bucketing conversation number 59999
Bucketing conversation number 69999
Bucketing conversation number 79999
Bucketing conversation number 89999
Bucketing conversation number 99999
Bucketing conversation number 109999
Bucketing conversation number 119999
Bucketing conversation number 129999
Bucketing conversation number 139999
Bucketing conversation number 149999
Bucketing conversation number 159999
Bucketing conversation number 169999
Bucketing conversation number 179999
Bucketing conversation number 189999
Number of samples in each bucket:
 [37961, 34335, 31129]
Bucket scale:
 [0.3670389170896785, 0.6990186125211506, 1.0]


In [66]:
import numpy as np
import random
from util import get_buckets

tf.reset_default_graph()

model = AttentionChatBotModel(batch_size=config.BATCH_SIZE)
model.build()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(3):
        rand = random.random()
        bucket_id = min([i for i in range(len(train_buckets_scale))
                    if train_buckets_scale[i] > rand])
        encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                data_buckets[bucket_id], bucket_id, batch_size=config.BATCH_SIZE)
        decoder_lens = np.sum(np.transpose(np.array(decoder_masks), (1, 0)), axis=1)
        loss_res, _ = sess.run([model.loss, model.train_op], feed_dict={
            model.source_seq_tensor: encoder_inputs,
            model.target_seq_tensor: decoder_inputs,
            model.target_length: config.BUCKETS[bucket_id][1],
            model.decoder_seq_length: decoder_lens
        })
        print("Step {}: loss - {}".format(step, loss_res))

    

target_embedded.get_shape():  (?, 64, 256)
enc_final_state.get_shape(),  (?, 256)
self.final_outputs,  (?, 64, 24683)
(?, 64)
()
Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x130306be0>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
Step 0: loss - 11.440528869628906
Step 1: loss - 11.018695831298828
Step 2: loss - 11.42657470703125


10.113870026547724