In [1]:
from Utils import *
from Models import *

Using TensorFlow backend.


In [2]:
from gensim.models import KeyedVectors
import sentencepiece as spm

# --- Run configuration -------------------------------------------------------
path = "/work/"
train_data = "100K_QD_ml15"

max_len = 15
enablePadding = True

# --- Sub-word tokenizer and pretrained BPE vectors ----------------------------
sp = spm.SentencePieceProcessor()
sp.Load('/work/data/bpe/en.wiki.bpe.op50000.model')

bpe = KeyedVectors.load_word2vec_format(
    "/work/data/bpe/en.wiki.bpe.op50000.d200.w2v.bin",
    binary=True,
)

# Vocabulary layout after this line:
#   index 0        -> '' (empty string, used for padding)
#   index 1..N     -> the pretrained BPE tokens, in their original order
#   index N+1, N+2 -> '<sos>', '<eos>'
bpe.index2word = [''] + bpe.index2word + ['<sos>', '<eos>']
nb_words = len(bpe.index2word)

# Token -> index lookup (word2index).
bpe_dict = {token: idx for idx, token in enumerate(bpe.index2word)}

# Embedding matrix rows follow the vocabulary layout above: one zero row for
# the padding token, the pretrained vectors, then two zero rows for the
# '<sos>' / '<eos>' tokens that have no pretrained vector.
embedding_matrix = np.concatenate([
    np.zeros((1, bpe.vector_size)),
    bpe.vectors,
    np.zeros((2, bpe.vector_size)),
])

# Embedding layer initialised from the matrix above and left trainable.
embedding_layer = Embedding(
    nb_words,
    embedding_matrix.shape[-1],
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=True,
)

# test_set = []
# for i in ["MayFlower", "JuneFlower", "JulyFlower", "sts", "quora", "para"]:
#     df, qrel = get_test_data(i, path)
#     q_ = parse_texts_bpe(df.q.tolist(), sp, bpe_dict, max_len, enablePadding)
#     d_ = parse_texts_bpe(df.d.tolist(), sp, bpe_dict, max_len, enablePadding)
#     test_set.append([q_, d_, qrel, df, i])

In [3]:
from Models import *

class VariationalAutoEncoder():
    """GRU-based (optionally conditional) variational auto-encoder for token sequences.

    The query sequence is embedded, encoded by a GRU and batch-normalised; two
    Dense heads map the encoder state to the mean and log-variance of the
    latent distribution, a latent vector is sampled from it, projected back to
    the hidden size and used as the initial state of a decoder GRU whose
    outputs go through a softmax over the vocabulary.

    Parameters
    ----------
    nb_words : int
        Vocabulary size (embedding input dimension and softmax width).
    max_len : int
        Length of every input / decoder sequence.
    embedding_matrix : array
        Initial embedding weights; its last axis is the embedding size.
    dim : sequence
        ``dim[0]`` is the GRU hidden size, ``dim[1]`` the latent size.
    optimizer : optional
        Optimizer passed to ``compile``. When ``None`` (default) a fresh
        ``Adam()`` is created for this instance, so optimizer state is never
        shared between separately constructed models.
    kl_rate : float
        Stored on the instance; not read anywhere in this class.
    keep_rate_word_dropout : float
        Probability of keeping a token in ``word_dropout``.
    enableKL : bool
        When true the model takes an extra per-sample KL weight input and is
        compiled with a separate annealed KL loss on a second output.
    enableCond : bool
        When true the encoder state and the sampled latent are concatenated
        with the document encoding and the label input.

    Attributes set by ``build``: ``model`` (trainable model) and ``encoder``
    (query -> normalised encoder state).
    """

    def __init__(self, nb_words, max_len, embedding_matrix, dim, optimizer=None, kl_rate=0.01, keep_rate_word_dropout=0.5, enableKL=True, enableCond=False):

        self.dim = dim
        self.nb_words = nb_words
        self.max_len = max_len
        self.embedding_matrix = embedding_matrix
        # A default of `Adam()` in the signature would be evaluated once and
        # shared by every instance; build a new one per instance instead.
        self.optimizer = Adam() if optimizer is None else optimizer
        self.keep_rate_word_dropout = keep_rate_word_dropout
        self.kl_rate = kl_rate
        self.enableKL = enableKL
        self.enableCond = enableCond

        self.hidden_dim = self.dim[0]
        self.latent_dim = self.dim[1]

        self.build()

    def build(self):
        """Create the layers and compile ``self.model`` and ``self.encoder``.

        Model inputs, in order: query, decoder input, then (document, label)
        when ``enableCond`` is set, then the KL weight when ``enableKL`` is set.
        """

        query_inputs = Input(shape=(self.max_len,))
        doc_inputs = Input(shape=(self.max_len,))
        label_inputs = Input(shape=(1,))
        kl_inputs = Input(shape=(1,))

        encoder_embedding = Embedding(self.nb_words,
                                        self.embedding_matrix.shape[-1],
                                        weights=[self.embedding_matrix],
                                        input_length=self.max_len,
                                        mask_zero=True,
                                        name="q_embeding_layer",
                                        trainable=True)

        doc_encoder_embedding = Embedding(self.nb_words,
                                        self.embedding_matrix.shape[-1],
                                        weights=[self.embedding_matrix],
                                        input_length=self.max_len,
                                        mask_zero=True,
                                        trainable=True)

        # One BatchNormalization instance is applied to both encoder states.
        norm = BatchNormalization()

        encoder_lstm = GRU(self.hidden_dim, name="q_gru")
        doc_encoder_lstm = GRU(self.hidden_dim)

        state = norm(encoder_lstm(encoder_embedding(query_inputs)))
        doc_state = norm(doc_encoder_lstm(doc_encoder_embedding(doc_inputs)))

        # Encoder conditioning. Previously an unconditional `state_z = state`
        # followed the `if`, silently discarding the concatenation.
        if self.enableCond:
            state_z = merge([state, doc_state, label_inputs], mode="concat")
        else:
            state_z = state

        self.mean = Dense(self.latent_dim)
        self.var = Dense(self.latent_dim)

        self.state_mean = self.mean(state_z)
        # Treated as a log-variance by `sampling` and by the KL terms below.
        self.state_var = self.var(state_z)

        state_z = Lambda(self.sampling, name="kl")([self.state_mean, self.state_var])

        # Decoder conditioning: append the document encoding and label to z.
        if self.enableCond:
            state_z = merge([state_z, doc_state, label_inputs], mode="concat")

        decoder_inputs = Input(shape=(self.max_len,))

        self.latent2hidden = Dense(self.hidden_dim)
        self.decoder_lstm = GRU(self.hidden_dim, return_sequences=True, name="dec_gru")
        self.decoder_dense = Dense(self.nb_words, activation='softmax', name="rec")

        # The decoder reuses the query embedding layer for its input tokens.
        rec_outputs = self.decoder_dense(self.decoder_lstm(encoder_embedding(decoder_inputs) , initial_state=self.latent2hidden(state_z)))

        # Every Input that feeds an output must be listed, otherwise Keras
        # reports a disconnected graph (this was the case for
        # enableKL + enableCond, which omitted the document and label inputs).
        inputs = [query_inputs, decoder_inputs]
        if self.enableCond:
            inputs += [doc_inputs, label_inputs]

        if self.enableKL:

            def kl_annealing_loss(x, x_):
                # Targets/predictions of the second output are ignored; the
                # loss is the per-sample KL divergence scaled by the KL weight
                # fed through `kl_inputs`.
                kl_loss = - 0.5 * K.sum(1 + self.state_var - K.square(self.state_mean) - K.exp(self.state_var), axis=-1)
                # Flatten the (batch, 1) weight so the product stays (batch,)
                # instead of broadcasting to (batch, batch).
                return K.flatten(kl_inputs) * kl_loss

            inputs.append(kl_inputs)
            self.model = Model(inputs, [rec_outputs, state_z])
            self.model.compile(optimizer=self.optimizer, loss=['sparse_categorical_crossentropy', kl_annealing_loss])

        else:
            self.model = Model(inputs, [rec_outputs])
            self.model.compile(optimizer=self.optimizer, loss=self.vae_loss, metrics=[self.rec_loss, self.kl_loss])

        self.encoder = Model(query_inputs, state)

    def vae_loss(self, x, x_decoded_onehot):
        """Reconstruction cross-entropy plus the (unweighted) KL term."""
        return self.rec_loss(x, x_decoded_onehot) + self.kl_loss(x, x_decoded_onehot)

    def kl_loss(self, y_true, y_pred):
        """KL divergence between the latent posterior and a unit Gaussian.

        Both arguments are ignored; they exist so the method can be used as a
        Keras loss/metric. The value is a mean over all elements.
        """
        kl_loss = - 0.5 * K.mean(1 + self.state_var - K.square(self.state_mean) - K.exp(self.state_var))
        return kl_loss

    def rec_loss(self, y_true, y_pred):
        """Sparse categorical cross-entropy between targets and decoder output."""
        return objectives.sparse_categorical_crossentropy(y_true, y_pred)

    def name(self):
        """Return "cvae" when conditional, else "vae_kl" / "vae" depending on enableKL."""
        if self.enableCond:
            return "cvae"
        return "vae" if not self.enableKL else "vae_kl"

    def word_dropout(self, x, unk_token):
        """Return a copy of ``x`` with non-zero entries randomly replaced by ``unk_token``.

        Each non-zero entry is kept with probability
        ``self.keep_rate_word_dropout``; zero entries (padding) are never
        touched and ``x`` itself is not modified. ``x`` must be a 2-D array
        (the loop unpacks row and column indices).
        """
        # NOTE(review): this seeds numpy's global RNG, while the draws below
        # come from `random.random()`. If `random` is the stdlib module the
        # seed does not make the mask reproducible — confirm which is intended.
        np.random.seed(0)
        x_ = np.copy(x)
        rows, cols = np.nonzero(x_)
        for r, c in zip(rows, cols):
            if random.random() <= self.keep_rate_word_dropout:
                continue
            x_[r][c] = unk_token

        # Return only after every position has been visited. The return used
        # to sit inside the loop, so the method stopped at the first replaced
        # token and returned None when nothing was replaced.
        return x_

    def sampling(self, args):
        """Reparameterisation trick: draw z = mean + exp(log_var / 2) * eps, eps ~ N(0, 1)."""
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=(K.shape(z_mean)[0], K.shape(z_mean)[1]), mean=0.,
                                  stddev=1)
        return z_mean + K.exp(z_log_var / 2) * epsilon

In [4]:
from keras.engine.topology import Layer
from keras.layers import Conv2DTranspose
import keras.backend as K
class Conv1DTranspose(Layer):
    """1-D transposed convolution implemented on top of ``Conv2DTranspose``.

    The input gets a dummy axis inserted at position 1, is passed through a
    ``Conv2DTranspose`` whose kernel and stride are 1 along that dummy axis,
    and the dummy axis is then sliced away again.

    Parameters
    ----------
    filters : int
        Number of output filters, forwarded to ``Conv2DTranspose``.
    kernel_size : int
        Kernel width along the sequence axis; used as ``(1, kernel_size)``.
    strides : int
        Stride along the sequence axis; used as ``(1, strides)``.
    *args, **kwargs
        Forwarded unchanged to ``Conv2DTranspose`` (not to ``Layer``).
    """

    def __init__(self, filters, kernel_size, strides=1, *args, **kwargs):
        self._filters = filters
        self._kernel_size = (1, kernel_size)
        self._strides = (1, strides)
        self._args, self._kwargs = args, kwargs
        super(Conv1DTranspose, self).__init__()

    def build(self, input_shape):
        """Build the inner expand -> Conv2DTranspose -> squeeze pipeline."""
        self._model = Sequential()
        self._model.add(Lambda(lambda x: K.expand_dims(x,axis=1), batch_input_shape=input_shape))
        self._model.add(Conv2DTranspose(self._filters,
                                        kernel_size=self._kernel_size,
                                        strides=self._strides,
                                        *self._args, **self._kwargs))
        self._model.add(Lambda(lambda x: x[:,0]))
        # Expose the inner model's weights as this layer's own weights.
        # Without this the wrapping layer reports no weights, so an outer
        # model would neither train nor save the convolution kernel and bias.
        self.trainable_weights = self._model.trainable_weights
        self.non_trainable_weights = self._model.non_trainable_weights
        super(Conv1DTranspose, self).build(input_shape)

    def call(self, x):
        """Apply the inner pipeline to ``x``."""
        return self._model(x)

    def compute_output_shape(self, input_shape):
        """Delegate shape inference to the inner model."""
        return self._model.compute_output_shape(input_shape)

In [51]:
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model

# Multi-width convolutional encoder: one Conv1D branch per kernel width, each
# globally max-pooled, with the pooled branches concatenated into one vector.
convs = []
filter_sizes = [2,3,4,5]

# The sequence length must match the `input_length` the shared
# `embedding_layer` was created with, so use `max_len` rather than a literal.
sequence_input = Input(shape=(max_len,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

for fsz in filter_sizes:
    # 25 filters per branch; `filters` / `kernel_size` are the Keras 2 names
    # for the deprecated `nb_filter` / `filter_length` arguments.
    l_conv = Conv1D(filters=25, kernel_size=fsz, activation='relu')(embedded_sequences)
    l_pool = GlobalMaxPooling1D()(l_conv)
    convs.append(l_pool)

# Concatenate the pooled branches along the feature axis
# (len(filter_sizes) * 25 features per sample).
x = Merge(mode='concat', concat_axis=1)(convs)

# x = Conv1DTranspose(filters=128, kernel_size=1)(x)
# softmax = Dense(5555, activation="softmax")
preds = x
model = Model(sequence_input, preds)
# NOTE(review): the output is the raw concatenated feature vector (no softmax
# head), so this loss/metric pair only matters if the model is actually fit.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])



In [52]:
# Smoke test: run a batch of 5 random 0/1 token-id sequences of length 15
# through the model and display the shape of the result.
model.predict(
    np.random.randint(low=0, high=2, size=(5, 15))
).shape

(5, 100)

In [35]:
# Scratch arithmetic. NOTE(review): presumably the number of filters per
# kernel width needed to reach 128 features over 4 widths — confirm or delete.
128/4

32.0