# Generating Sentences from a Continuous Space
### Samuel R. Bowman, Luke Vilnis, Oriol Vinyals, Andrew M. Dai, Rafal Jozefowicz, Samy Bengio

In [3]:
from Models import *
from Utils import *
from FastModels import *
import warnings
# warnings.filterwarnings('ignore')

In [4]:
max_len = 10
enablePadding = True
nb_words= len(bpe_dict)

df_may, qrel_may = get_test_data("MayFlower", "/work/")
df_june, qrel_june = get_test_data("JuneFlower", "/work/")
df_july, qrel_july = get_test_data("JulyFlower", "/work/")

q_may = parse_texts_bpe(df_may.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_may = parse_texts_bpe(df_may.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_june = parse_texts_bpe(df_june.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_june = parse_texts_bpe(df_june.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_july = parse_texts_bpe(df_july.q.tolist(), sp, bpe_dict, max_len, enablePadding, "pre")
d_july = parse_texts_bpe(df_july.d.tolist(), sp, bpe_dict, max_len, enablePadding, "pre")

test_set = [[q_may, d_may, qrel_may, df_may, "MayFlower"], [q_june, d_june, qrel_june, df_june, "JuneFlower"], [q_july, d_july, qrel_july, df_july, "JulyFlower"]]


b'Skipping line 19898: expected 6 fields, saw 8\nSkipping line 20620: expected 6 fields, saw 8\nSkipping line 38039: expected 6 fields, saw 8\n'


In [5]:
from gensim.models import KeyedVectors
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('/work/data/bpe/en.wiki.bpe.op50000.model')
bpe = KeyedVectors.load_word2vec_format("/work/data/bpe/en.wiki.bpe.op50000.d200.w2v.bin", binary=True)
bpe.index2word = [''] + bpe.index2word # add empty string
nb_words = len(bpe.index2word)
# word2index
bpe_dict = {bpe.index2word[i]: i for i in range(len(bpe.index2word))}
# construct embedding_matrix
embedding_matrix = np.concatenate([np.zeros((1, bpe.vector_size)), bpe.vectors]) # add zero vector for empty string (i.e. used for padding)

embedding_layer = Embedding(nb_words,
                    embedding_matrix.shape[-1],
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=True)

In [71]:
df = pd.read_csv("/work/data/train_data/30M_QD_lower2.txt", nrows=100000, names=["label", "q", "d"], sep="\t", header=None, error_bad_lines=False)

In [72]:
q_df = parse_texts_bpe(df.q.tolist(), sp, bpe_dict, max_len, enablePadding)
# np.save("/work/data/train_data/30M_QD_lower2.2.npy", q_df)
# x = np.load("/work/data/train_data/30M_QD_lower2.txt.npy")

In [73]:
class SeqVAE(object):
    
    def __init__(self, nb_words, max_len, emb, dim, optimizer=RMSprop(), word_dropout_prob=0.5, kl_weight=0):
        self.dim = dim
        self.nb_words = nb_words
        self.max_len = max_len
        self.emb = emb
        self.optimizer = optimizer
        self.kl_weight = kl_weight
        self.word_dropout_prob = word_dropout_prob
        
        self.build()
        
    def build(self):
                
#       Encoder
        
        e_input = Input(shape=(self.max_len,))
        e_mask = Masking(mask_value=0)
        e_emb = self.emb
        e_lstm = GRU(self.dim[0], return_state=True)
        
#         h, state_h, state_c = e_lstm(e_emb(e_mask(e_input)))#         
        _, h = e_lstm(e_emb(e_mask(e_input)))

        
        mean = Dense(self.dim[1])
        var = Dense(self.dim[1])
        
#         self.state_h_mean = mean(state_h)
#         self.state_h_log_var = var(state_h)
        
#         self.state_c_mean = mean(state_c)
#         self.state_c_log_var = var(state_c)
        
        self.h_mean = mean(h)
        self.h_log_var = var(h)
        
#         state_h_z = Lambda(self.sampling)([self.state_h_mean, self.state_h_log_var])     
#         state_c_z = Lambda(self.sampling)([self.state_c_mean, self.state_c_log_var])
        z = Lambda(self.sampling)([self.h_mean, self.h_log_var])

#         self.encoder = Model(e_input, self.state_h_mean)
        self.encoder = Model(e_input, self.h_mean)
        self.encoder2 = Model(e_input, z)

        
#       Decoder

        d_input = Input(shape=(self.max_len,))
        d_latent2hidden = Dense(self.dim[0])
        d_lstm = GRU(self.dim[0], return_sequences=True)
        d_output2vocab = TimeDistributed(Dense(self.nb_words, activation="softmax"))


#         state_h_z = d_latent2hidden(state_h_z)
#         state_c_z = d_latent2hidden(state_c_z)
        h_z = d_latent2hidden(z)
        
        d_embed_input = e_emb(Dropout(self.word_dropout_prob)(d_input))
#         d_embed_input = e_emb(d_input)

#         outputs = d_lstm(d_embed_input, initial_state=[state_h_z, state_c_z])        
        outputs = d_lstm(d_embed_input, initial_state=[h_z])

        pred = d_output2vocab(outputs)

        
        
#       VAE model
        self.model = Model(inputs=[e_input, d_input], outputs=[pred, pred])
        self.model.compile(optimizer=self.optimizer, loss=["sparse_categorical_crossentropy", self.kl_loss])




    def name(self):
        
        return "seqvae"
    
    def kl_loss(self, x, x_):
#         x = K.flatten(x)
#         x_ = K.flatten(x_)
#         x = tf.cast(x, tf.int32)

#         rec_loss = objectives.sparse_categorical_crossentropy(x,x_)

#         state_h_kl_loss = - 0.5 * K.sum(1 + self.state_h_log_var - K.square(self.state_h_mean) - K.exp(self.state_h_log_var), axis=-1)
#         state_c_kl_loss = - 0.5 * K.sum(1 + self.state_c_log_var - K.square(self.state_c_mean) - K.exp(self.state_c_log_var), axis=-1)
        
#         return (self.kl_weight * state_h_kl_loss) + (self.kl_weight * state_c_kl_loss)
        kl_loss = - 0.5 * K.sum(1 + self.h_log_var - K.square(self.h_mean) - K.exp(self.h_log_var), axis=-1)
        return (self.kl_weight * kl_loss) 

    def sampling(self, args):
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=(K.shape(z_mean)[0], self.dim[1]), mean=0.,\
                                  stddev=1)
    
        return z_mean + K.exp(z_log_var / 2) * epsilon 
    
    def nosampling(self, args):
        z_mean, z_log_var = args
        return z_mean + K.exp(z_log_var / 2)


In [74]:
def eval(epoch, logs):
#     print(logs)
    loss = logs.get("loss")
    val_loss = logs.get("val_loss")
    
    q = run.encoder.predict(q_july)
    d = run.encoder.predict(d_july)
    
    cosine = CosineSim(q.shape[-1])
    pred = cosine.model.predict([q, d])
    print(loss, val_loss, auc(qrel_july, pred.flatten()))

In [75]:
class KL_Annealing(Callback):
    
    def __init__(self, run):
        super(KL_Annealing, self).__init__()
        
        self.run = run
        self.kl_inc_rate = 1 / 5000. # set the annealing rate for KL loss
        self.cos_inc_rate = 1
        self.max_cos_weight = 150.
        self.max_kl_weight = 1.

    def on_epoch_end(self, epoch, logs=None):
        self.run.kl_weight = min(self.run.kl_weight + self.kl_inc_rate, self.max_kl_weight)
        self.run.model.compile(optimizer=self.run.optimizer, loss=["sparse_categorical_crossentropy", self.run.kl_loss])



In [None]:
run = SeqVAE(nb_words, max_len, embedding_layer, [300, 200], optimizer=RMSprop(), word_dropout_prob=0.5, kl_weight=0.0001)

In [None]:

x_train = [q_df, q_df]
y_train = [np.expand_dims(q_df, axis=-1), np.expand_dims(q_df, axis=-1)]
# tmp = np.concatenate([q_july, d_july])
# x_train = [tmp, tmp]
# y_train = [np.expand_dims(tmp, axis=-1), np.expand_dims(tmp, axis=-1)]
# callbacks = [EarlyStopping(verbose=1, patience=5), ReduceLROnPlateau(verbose=1, patience=3),LambdaCallback(on_epoch_end=eval), TQDMNotebookCallback()]
callbacks = [EarlyStopping(verbose=1, patience=5), ReduceLROnPlateau(verbose=1, patience=3), KL_Annealing(run), TQDMNotebookCallback()]
# callbacks =[]
# run.kl_weight = 0
run.model.fit(x_train, y_train, validation_split=0.2, verbose=0, batch_size=256, epochs=50, callbacks=callbacks)


In [None]:
def generate_output(model, bpe, x, nrows=None, idx=None):

    gen_x = np.argmax(model.predict(x), axis=-1) if idx == None else np.argmax(model.predict([x,x])[idx], axis=-1)

    bleu = []
    results = ""
    for i, k in zip(gen_x, x):
        gen_x = " ".join([bpe.index2word[t] for t in i])
        real_x = " ".join([bpe.index2word[t] for t in k])

        bleu.append(sentence_bleu(real_x, gen_x))
        
        real_x = real_x.replace("▁the", "")
        real_x = real_x.replace("▁","")
        gen_x = gen_x.replace("▁the", "")
        gen_x = gen_x.replace("▁","")
        if nrows != None:
            if nrows == 0:
                break
            else:
                nrows = nrows - 1
        
        results = results + "%s : %s\n\n" % (real_x, gen_x)
    print("BLEU: %.4f" % np.mean(bleu))
    print(results)

generate_output(run.model, bpe, q_july,idx=0)

In [64]:
q = run.encoder.predict(q_july)
d = run.encoder.predict(d_july)

cosine = CosineSim(q.shape[-1])
pred = cosine.model.predict([q, d])
print(auc(qrel_july, pred.flatten()))

0.5259400920634215


In [65]:
for i in range(10):
    q = run.encoder2.predict(q_july)
    d = run.encoder2.predict(d_july)

    cosine = CosineSim(q.shape[-1])
    pred = cosine.model.predict([q, d])
    print(auc(qrel_july, pred.flatten()))

0.5594298294380573
0.5548266583646512
0.5579176766216727
0.5607863194645201
0.5666347928573017
0.552069203228891
0.5657897662834397
0.5624986101536614
0.5639218128043764
0.5621650470324001


In [66]:
w2v = Sequential()
w2v.add(bpe.get_keras_embedding(train_embeddings=True))
w2v.add(GlobalMaxPooling1D())

In [67]:
q_gen = w2v.predict(np.argmax(run.model.predict([q_july,q_july]), axis=-1)[0])
d_gen = w2v.predict(np.argmax(run.model.predict([d_july, d_july]), axis=-1)[0])

In [68]:
cosine = CosineSim(q_gen.shape[-1])
pred2 = cosine.model.predict([q_gen, d_gen])

In [69]:
print(auc(qrel_july, pred2.flatten()))

0.5128421801685606
