
# Generating Sentences from a Continuous Space
### Samuel R. Bowman, Luke Vilnis, Oriol Vinyals, Andrew M. Dai, Rafal Jozefowicz, Samy Bengio

In [1]:
from Models import *
from Utils import *
from FastModels import *
import warnings
from livelossplot import PlotLossesKeras
# warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
from gensim.models import KeyedVectors
import sentencepiece as spm

max_len = 15
enablePadding = True

sp = spm.SentencePieceProcessor()
sp.Load('/work/data/bpe/en.wiki.bpe.op50000.model')
bpe = KeyedVectors.load_word2vec_format("/work/data/bpe/en.wiki.bpe.op50000.d200.w2v.bin", binary=True)
bpe.index2word = [''] + bpe.index2word + ['<sos>'] + ['<eos>']  # add empty string 
nb_words = len(bpe.index2word)
# word2index
bpe_dict = {bpe.index2word[i]: i for i in range(len(bpe.index2word))}
# construct embedding_matrix
embedding_matrix = np.concatenate([np.zeros((1, bpe.vector_size)), bpe.vectors, np.zeros((2, bpe.vector_size))]) # add zero vector for empty string (i.e. used for padding)

embedding_layer = Embedding(nb_words,
                    embedding_matrix.shape[-1],
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=True)

In [50]:
from keras.layers import multiply
inp1 = Input((15,))
inp2 = Input((15,3,))

a = embedding_layer(inp1)
b = embedding_layer(inp2)

x = multiply([a, b])
x = Activation("softmax")(x)
run = Model([inp1, inp2], x)

ValueError: Dimensions must be equal, but are 15 and 3 for 'multiply_1/mul' (op: 'Mul') with input shapes: [?,15,200], [?,15,3,200].

In [45]:
x1 = np.random.randint(100, size=(1,15))
x2 = np.random.randint(100, size=(1,15,3))

run.predict([x1, x2]).shape

(1, 15, 3)

In [33]:
x1

array([[37, 49, 40, 95, 11, 56, 63, 14, 32, 65, 65, 36, 25, 76, 72]])

In [41]:
neg_samples = []
for i in x1:
    for j in i:
        neg = np.random.randint(100, size=100)
        while j in neg:
            neg = np.random.randint(100, size=100)
        neg_samples.append(neg)
neg_samples = np.array(neg_samples)

In [40]:
neg_samples.shape

(1, 100)

In [4]:
model = Sequential([embedding_layer, Bidirectional(GRU(10, return_sequences=True)), Dense(100, activation="softmax")])

In [51]:
model.predict(q_july).shape

(426, 15, 100)

In [26]:
[{"loss_weights": {"yfake": 1e-2, "yreal": 1e-2, "xpred": 1}}] * 2

[{'loss_weights': {'xpred': 1, 'yfake': 0.01, 'yreal': 0.01}},
 {'loss_weights': {'xpred': 1, 'yfake': 0.01, 'yreal': 0.01}}]

In [23]:
inputs = Input((max_len,200,))
dense = Dense(nb_words, use_bias=False, weights=[embedding_matrix.T], activation="linear", trainable=True)
h = Dense(1, activation="sigmoid")
model = Model(inputs, h(dense(inputs)))
model.compile(Adam(1), "binary_crossentropy")
# model = Sequential([embedding_layer, LSTM(200, return_sequences=True)])

In [24]:
x = np.random.randint(nb_words, size=(nb_words,max_len,200))
y = np.random.randint(2, size=(nb_words,max_len,1))
model.fit(x, y, verbose=2, epochs=1)

Epoch 1/1
 - 17s - loss: 7.9639


<keras.callbacks.History at 0x7faf0c70cef0>

In [None]:
def train(self, path, train_data, batch_size):

    x_train, y_train, valid, fake = self.get_training_data(path, train_data)

    max_val_loss = float("inf") 

    for i in range(100):
        latent_fake = run.gs_encoder.predict(x_train[0])
        latent_real = np.random.normal(size=(len(x_train[0]), self.latent_dim))

        d_loss_real = run.discriminator.fit(latent_real, valid, batch_size=batch_size, verbose=1)
        d_loss_fake = run.discriminator.fit(latent_fake, fake, batch_size=batch_size, verbose=1)


        d_loss = 0.5 * np.add(d_loss_real.history['loss'], d_loss_fake.history['loss'])
        d_acc = 0.5 * np.add(d_loss_real.history['acc'], d_loss_fake.history['acc'])
        print(d_loss, d_acc)

        hist = self.model.fit(x_train, y_train,
                                        shuffle=True,
                                        verbose=1,
                                        batch_size=batch_size,
                                        validation_split=0.2,
                                        callbacks=[EarlyStopping()]
                                        )
        if max_val_loss < hist.history["val_loss"][0]:
            break
        else:
            max_val_loss = hist.history["val_loss"][0]
    may_ndcg, june_ndcg, july_auc, quora_auc, para_auc, sts_pcc = evaluate(self.encoder, test_set)
    print(may_ndcg, june_ndcg, july_auc, quora_auc, para_auc, sts_pcc)

In [25]:
model.get_weights()[0] == embedding_matrix.T

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [26]:
x = model.predict(np.random.randint(nb_words, size=(20,max_len,200)))

In [43]:
x.shape

(20, 15, 50512)

In [9]:
embedding_matrix.shape

(50512, 200)

In [10]:
x = np.random.randint(200, size=(200))

In [11]:
x

array([173,  18, 168, 133, 139, 168, 179, 189,   4, 123,  79, 121,  83,
        45,  87,  97,  40, 153, 157,  65, 198, 199,  68,   0, 153,  48,
       193,  85,  69,  62, 130,  99,   2, 142, 122, 169,  16, 126,  24,
        34,  33, 101, 135, 142, 149, 138, 149,  49,  72,  46,  37, 179,
        10, 124,  16, 161, 138, 106,  78,  58,  52,  87,   3, 132, 110,
        66, 148,  60,  54,   4, 154,  10,  29,  77, 145, 154, 156,  54,
       104,  65, 105, 136,  70,  73, 197, 184, 118, 142, 117,   5, 134,
        21,  39,  37, 162, 117, 176, 141, 103, 181, 105,  28, 155,  98,
       196,  43, 149,  54,  62, 159,  95,  46,  76,  31, 118,   0,  70,
       128,  18,  71, 189, 182, 182, 187,  26, 103,  38, 100,  26,  59,
        98,   0, 152, 194, 155, 127, 109,  84, 153,  98,  35, 130, 123,
        66, 102, 122,  87,   4, 117, 164, 172, 106, 160,  61,  42,  55,
       147, 105,  16, 156, 136, 190,  42,  42,  46,  94,  49,  24,  32,
         9,  14, 153, 198, 147, 141,  82, 135, 111,  33,   6,  9

In [20]:
enablePadding = True
df_may, qrel_may = get_test_data("MayFlower", "/work/")
df_june, qrel_june = get_test_data("JuneFlower", "/work/")
df_july, qrel_july = get_test_data("JulyFlower", "/work/")

q_may = parse_texts_bpe(df_may.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_may = parse_texts_bpe(df_may.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_june = parse_texts_bpe(df_june.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_june = parse_texts_bpe(df_june.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_july = parse_texts_bpe(df_july.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_july = parse_texts_bpe(df_july.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_july_ = parse_texts_bpe(df_july.q.tolist(), sp, bpe_dict, max_len, enablePadding, "post")
d_july_ = parse_texts_bpe(df_july.d.tolist(), sp, bpe_dict, max_len, enablePadding, "post")

test_set = [[q_may, d_may, qrel_may, df_may, "MayFlower"], [q_june, d_june, qrel_june, df_june, "JuneFlower"], [q_july, d_july, qrel_july, df_july, "JulyFlower"]]


b'Skipping line 19898: expected 6 fields, saw 8\nSkipping line 20620: expected 6 fields, saw 8\nSkipping line 38039: expected 6 fields, saw 8\n'


In [4]:
path = "/work/"
test_set = []
for i in ["MayFlower", "JuneFlower", "JulyFlower", "sts", "quora", "para"]:
    df, qrel = get_test_data(i, path)
    q_ = parse_texts_bpe(df.q.tolist(), sp, bpe_dict, max_len, enablePadding)
    d_ = parse_texts_bpe(df.d.tolist(), sp, bpe_dict, max_len, enablePadding)
    test_set.append([q_, d_, qrel, df, i])

b'Skipping line 19898: expected 6 fields, saw 8\nSkipping line 20620: expected 6 fields, saw 8\nSkipping line 38039: expected 6 fields, saw 8\n'


In [9]:
w2v = Sequential([embedding_layer, GlobalAveragePooling1D()])

In [21]:
q_ = w2v.predict(q_july, batch_size=1024)
d_ = w2v.predict(d_july, batch_size=1024)
cosine = CosineSim(q_.shape[-1])

pred = cosine.model.predict([q_, d_]).flatten()

In [36]:
pred

array([ 0.7406662 ,  0.85557044,  0.78561455,  0.9508481 ,  0.21879916,
        0.7453836 ,  0.80263525,  0.8800487 ,  0.8917499 ,  0.5874681 ,
        0.6540834 ,  0.92953557,  0.5811623 ,  0.9591745 ,  0.6000247 ,
        0.8475549 ,  0.9185499 ,  0.8128013 ,  0.7412871 ,  0.923996  ,
        0.9300424 ,  0.9498619 ,  0.87161493,  0.60895574,  0.9269576 ,
        0.24144852,  0.21983142,  0.7962915 ,  0.66414773,  0.8485445 ,
        0.8836608 ,  0.9419743 ,  0.972886  ,  0.83557457,  0.85681504,
        0.8955547 ,  0.9416139 ,  0.94760853,  0.7242776 ,  0.87324125,
        0.8219911 ,  0.87939245,  0.87306607, -0.15584849,  0.9455508 ,
        0.9145586 ,  0.9175409 ,  0.72205913,  0.62718993,  0.82563776,
        0.49371317,  0.6480255 ,  0.20071238,  0.7714883 ,  0.8624506 ,
        0.8891364 ,  0.8943575 ,  0.91400164,  0.92129666,  0.8619352 ,
        0.82246464,  0.8945058 ,  0.7497718 ,  0.65425056,  0.76147807,
        0.94014007,  0.8038611 ,  0.8885384 ,  0.92338496,  0.85

In [35]:
len(qrel_july[qrel_july == 1]), len(qrel_july[qrel_july == 0])

(193, 233)

In [10]:
evaluate(w2v, test_set)

(0.5140063870919466,
 0.8463482465188383,
 0.518468278147168,
 0.7317146600481685,
 0.9301704561504908,
 0.4440271653122348)

In [3]:
def eval_july(run):
    q_ = run.predict(q_july)
    d_ = run.predict(d_july)
    cosine = CosineSim(q_.shape[-1])

    pred = cosine.model.predict([q_, d_])
    print(auc(qrel_july, pred.flatten()))

In [4]:
df = pd.read_csv("/work/data/train_data/30M_QD_lower2.txt", nrows=10000, names=["label", "q", "d"], sep="\t", header=None, error_bad_lines=False)
# df = df[df.label == 1]

# df = pd.read_csv("/work/data/train_data/QueryLog", nrows=100000, names=["q"], sep="\t", header=None, error_bad_lines=False)

In [7]:
enablePadding = True
q_df = parse_texts_bpe(df.q.tolist(), sp, bpe_dict, max_len, enablePadding, "post")
d_df = parse_texts_bpe(df.d.tolist(), sp, bpe_dict, max_len, enablePadding, "post")

In [467]:
# np.save("/work/data/train_data/1M_QD.q.npy", q_df)
# np.save("/work/data/train_data/1M_QD.d.npy", d_df)
# np.save("/work/data/train_data/1M_QD.label.npy", df.label.values)

In [19]:
q = np.load("/work/data/train_data/100K_QD_ml15.q.npy")[:10000]
qdi = np.load("/work/data/train_data/100K_QD_ml15.q.di.npy")[:10000]
qdo = np.load("/work/data/train_data/100K_QD_ml15.q.do.npy")[:10000]

d = np.load("/work/data/train_data/100K_QD_ml15.d.npy")[:10000]
ddi = np.load("/work/data/train_data/100K_QD_ml15.d.di.npy")[:10000]
ddo = np.load("/work/data/train_data/100K_QD_ml15.d.do.npy")[:10000]


In [7]:
def get_tokens(df):
    tokens = set([])
    for i in df:
        for j in i:
            tokens.add(j)
    return tokens

july_tokens = get_tokens(np.concatenate([q_july, d_july]))
june_tokens = get_tokens(np.concatenate([q_june, d_june]))
may_tokens = get_tokens(np.concatenate([q_may, d_may]))

train_tokens = get_tokens(q_df)
for i in [july_tokens, june_tokens, may_tokens]:
    print("Train %d, Test %d, Overlap %d" % (len(train_tokens), len(i), len(train_tokens.intersection(i))))

Train 29279, Test 1523, Overlap 1464
Train 29279, Test 10049, Overlap 9274
Train 29279, Test 10561, Overlap 9854


In [22]:
K.clear_session()
kl_rate = 0.01
keep_rate_word_dropout = 0.75
run = Seq2Seq(nb_words, max_len, embedding_matrix, [200,100], Adam(), kl_rate, keep_rate_word_dropout, mode=1, enableKL=True, enableS2S=False)



In [31]:
for i in range(30):
    kl_weight = 1
    x_train = [q, qdi] + [np.array([kl_weight] * len(q))]
    y_train = [np.expand_dims(qdo, axis=-1), np.ones(len(qdo))]
    hist = run.model.fit(x_train, y_train, verbose=2,batch_size=256)
    print(evaluate(run.encoder, test_set))

Epoch 1/1
 - 14s - loss: 1.3279 - rec_loss: 1.3278 - kl_loss: 1.1052e-04
(0.49573213269410454, 0.8271802074919363, 0.5322777913673864, 0, 0, 0)
Epoch 1/1
 - 14s - loss: 1.3310 - rec_loss: 1.3309 - kl_loss: 1.4369e-04
(0.49700126026727803, 0.8284703563578426, 0.5154217349729814, 0, 0, 0)
Epoch 1/1
 - 14s - loss: 1.3319 - rec_loss: 1.3317 - kl_loss: 1.5625e-04
(0.4986811292663033, 0.8284686959008363, 0.5466432431230404, 0, 0, 0)
Epoch 1/1
 - 14s - loss: 1.3318 - rec_loss: 1.3316 - kl_loss: 1.5749e-04
(0.4971709287759919, 0.8299626597306875, 0.5374146634348106, 0, 0, 0)
Epoch 1/1
 - 14s - loss: 1.3317 - rec_loss: 1.3315 - kl_loss: 1.4919e-04
(0.4973130620148586, 0.8284320371335961, 0.5194912050523695, 0, 0, 0)
Epoch 1/1
 - 14s - loss: 1.3300 - rec_loss: 1.3298 - kl_loss: 1.4954e-04
(0.4972599356660459, 0.8286235417083805, 0.5486446218506082, 0, 0, 0)
Epoch 1/1
 - 14s - loss: 1.3291 - rec_loss: 1.3289 - kl_loss: 1.3506e-04
(0.49860879214391884, 0.8302246223549468, 0.5365696368609486, 0, 0,

In [1206]:
generate_output(run.model, bpe, [q_july, dec_input],  idx=0)

BLEU: 0.6552
              should you get rental car insurance	what is of of of of              
               police radar how it works	what is of                 
                native tribes of arizona	what of of                 
             how long spaghetti sauce in refriger ator	what of of                 
                clayton county in ga	what of of                 
         when is gay pride parade in nyc  2 0 <unk>	what is of of of of of             
             easy chicken breast recipes cro ck pot	what of                  
            what to do in g ta  <unk>	what is is is of of of of            
                dancing with  stars	what of of                 
              leonardo dic ap rio wife name	what of                  
                 is doge dead	what of of                 
                super class in java	what of of                 
                  hungry hearts	                   
             phar rell williams desp icable me soundtrack	         

In [29]:
%%time


CPU times: user 29.4 s, sys: 4.32 s, total: 33.7 s
Wall time: 6.2 s


(0.4977366469106477, 0.8276762859992753, 0.5304098378883231, 0, 0, 0)

In [5]:
class SeqVAE(object):
    
    def __init__(self, nb_words, max_len, embedding_matrix, dim, optimizer=RMSprop(), keep_rate_word_dropout=0.5, kl_weight=0, mode=1):
        self.dim = dim
        self.nb_words = nb_words
        self.max_len = max_len
        self.embedding_matrix = embedding_matrix
        self.optimizer = optimizer
        self.kl_weight = kl_weight
        self.keep_rate_word_dropout = keep_rate_word_dropout
        self.enableKL = True if kl_weight == 0 else False
        self.mode = mode
        self.build()
        
    def build(self):
                
        e_input = Input(shape=(self.max_len,))
        self.kl_input = Input(shape=(1,))

        embedding_layer = Embedding(self.nb_words,
                            self.embedding_matrix.shape[-1],
                            weights=[self.embedding_matrix],
                            input_length=self.max_len,
                            mask_zero=False,
                            trainable=True if self.mode == 1 else False)

        e_lstm = GRU(self.dim[0], return_state=True)
        
        _, h = e_lstm(embedding_layer(e_input))

        
        mean = Dense(self.dim[1])
        var = Dense(self.dim[1])
        
        self.h_mean = mean(h)
        self.h_log_var = var(h)
        
        z = Lambda(self.sampling)([self.h_mean, self.h_log_var])

        self.encoder = Model(e_input, self.h_mean)

        d_input = Input(shape=(self.max_len,))
        d_latent2hidden = Dense(self.dim[0], activation='relu')
        d_lstm = GRU(self.dim[0], return_sequences=True)

        softmax_layer = Dense(self.nb_words, activation="softmax")
        d_output2vocab = TimeDistributed(softmax_layer, name="rec")
        d_output2vocab_kl = TimeDistributed(softmax_layer, name="kl")
        
        dec_embedding_layer = Embedding(self.nb_words,
                                    self.embedding_matrix.shape[-1],
                                    weights=[self.embedding_matrix],
                                    input_length=self.max_len,
                                    mask_zero=False,
                                    trainable=True)

        h_z = d_latent2hidden(z)
        
        d_embed_input = dec_embedding_layer(Masking(mask_value=0.0)(d_input))
        outputs = d_lstm(d_embed_input, initial_state=[h_z])

        pred = d_output2vocab(outputs)
        pred_kl = d_output2vocab_kl(outputs)
        
        
        
#       VAE model
        self.model = Model(inputs=[e_input, d_input], outputs=[pred, pred_kl])
        self.model.compile(optimizer=self.optimizer, loss=["sparse_categorical_crossentropy", self.kl_loss])

    
    def name(self):
        
        return "seqvae_kl_%d" % self.mode if self.enableKL else "seqvae_%d" % self.mode
    
    def kl_loss(self, x, x_):
            kl_loss = - 0.5 * K.sum(1 + self.h_log_var - K.square(self.h_mean) - K.exp(self.h_log_var), axis=-1)
            return (self.kl_weight * kl_loss) 

    def word_dropout(self, x, unk_token):

        x_ = np.copy(x)
        rows, cols = np.nonzero(x_)
        for r, c in zip(rows, cols):
            if random.random() <= self.keep_rate_word_dropout:
                continue
            x_[r][c] = unk_token

        return x_
    

    def sampling(self, args):
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=(K.shape(z_mean)[0], self.dim[1]), mean=0.,\
                                  stddev=1)
    
        return z_mean + K.exp(z_log_var / 2) * epsilon 
    

In [30]:
K.clear_session()
run = SeqVAE(nb_words, max_len, embedding_matrix, [200,200], optimizer=Adam(), kl_weight=0, mode=1)


In [26]:
eval_july(run.encoder)

0.5588961284440392


In [7]:
q_df

array([[    0,     0,     0, ...,  7461,  4212,     5],
       [    0,     0,     0, ..., 43852,  2966,  2936],
       [    0,     0,     0, ...,  1365,    89,  6772],
       ...,
       [    0,     0,     0, ..., 50509,  7513,  1184],
       [    0,     0,     0, ...,     4,  8875, 39913],
       [    0,     0,     0, ..., 28624, 24255,   727]], dtype=int32)

In [31]:
y = np.expand_dims(q_df_, axis=-1)
dec_input = pad_sequences(pad_sequences(q_df_, maxlen=max_len+1, value=bpe_dict['<sos>']), maxlen=max_len, truncating="post")
for i in range(100):
    run.model.fit([q_df,dec_input] , [y, y], verbose=2, epochs=1, batch_size=256)
#     if i % 20 == 0 and i != 0:
#         kl_rate = 0.001
#         run.kl_weight = min(run.kl_weight + kl_rate, 1)
#         run.model.compile(optimizer=run.optimizer, loss=["sparse_categorical_crossentropy", run.kl_loss])
    eval_july(run.encoder)

Epoch 1/1
 - 139s - loss: 2.5997 - rec_loss: 2.5997 - kl_loss: 0.0000e+00
0.5040583513086793
Epoch 1/1
 - 139s - loss: 1.8012 - rec_loss: 1.8012 - kl_loss: 0.0000e+00
0.49567479819431165
Epoch 1/1
 - 138s - loss: 1.5472 - rec_loss: 1.5472 - kl_loss: 0.0000e+00
0.4622072983610932
Epoch 1/1
 - 138s - loss: 1.3042 - rec_loss: 1.3042 - kl_loss: 0.0000e+00
0.4338989081367164
Epoch 1/1
 - 137s - loss: 1.0799 - rec_loss: 1.0799 - kl_loss: 0.0000e+00
0.41601992483711003
Epoch 1/1
 - 138s - loss: 0.8800 - rec_loss: 0.8800 - kl_loss: 0.0000e+00
0.40216593653405674
Epoch 1/1
 - 139s - loss: 0.7109 - rec_loss: 0.7109 - kl_loss: 0.0000e+00
0.39854121728301717
Epoch 1/1
 - 137s - loss: 0.5709 - rec_loss: 0.5709 - kl_loss: 0.0000e+00
0.3932264448842536
Epoch 1/1
 - 139s - loss: 0.4600 - rec_loss: 0.4600 - kl_loss: 0.0000e+00
0.3870444083702106
Epoch 1/1
 - 138s - loss: 0.3749 - rec_loss: 0.3749 - kl_loss: 0.0000e+00
0.38637728212768796
Epoch 1/1
 - 138s - loss: 0.3086 - rec_loss: 0.3086 - kl_loss: 0.

KeyboardInterrupt: 

In [14]:
d_dec_input = pad_sequences(pad_sequences(d_july_, maxlen=max_len+1, value=bpe_dict['<sos>']), maxlen=max_len, truncating="post")


In [21]:
generate_output(run.model, bpe, [q_july, dec_input], idx=0)

BLEU: 0.6941
                cord of wood dimensions	 of  name                
                 history of b	how of justification                 
               states which legal ized marijuana	 of  credit                
                largest all ig tor	 moor stream java                
               characteristics of observ ant people	how of  in                
            what does f ** * you mean ?	how does f ** * you ? ?            
                 california adventure park	how condor stations                 
                 william howard taft	 of                  
              how many yards to a mile	 is get a your your              
             crime rate in su w anee ga	how does in o men ? ?             
                snap fitness membership prices	how of in treatment                
               mid s omer murders cast	how s omer murders cast               
             easy chicken breast recipes cro ck pot	how is in in in mail              
             olym

In [10]:
def generate_output(model, bpe, x, nrows=100000, idx=None):

    gen_x = np.argmax(model.predict(x), axis=-1) if idx == None else np.argmax(model.predict(x)[idx], axis=-1)
    bleu = []
    results = []
    count = 0
    for i, k in zip(gen_x, x[0]):
        gen_x = " ".join([bpe.index2word[t] for t in i])
        real_x = " ".join([bpe.index2word[t] for t in k])

        bleu.append(sentence_bleu(real_x, gen_x))
        
        real_x = real_x.replace("▁the", "")
        real_x = real_x.replace("▁","")
        gen_x = gen_x.replace("▁the", "")
        gen_x = gen_x.replace("▁","")
        
        if count > nrows:
            break
        count = count + 1
        results.append("%s\t%s" % (real_x, gen_x))
        
    
    print("BLEU: %.4f" % np.mean(bleu))
    idx = np.arange(len(results))
    shuffle(idx)
    for i in idx[:nrows]:
        print(results[i])

In [42]:
# import os
# os.environ["THEANO_FLAGS"] = "mode=FAST_COMPILE,device=cpu,floatX=float32"

import matplotlib as mpl

# This line allows mpl to run with no DISPLAY defined
mpl.use('Agg')

from keras.layers import Dense, Reshape, Flatten, Input, merge
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras_adversarial.legacy import l1l2
import keras.backend as K
import pandas as pd
import numpy as np
from keras_adversarial.image_grid_callback import ImageGridCallback

from keras_adversarial import AdversarialModel, fix_names, n_choice
from keras_adversarial import AdversarialOptimizerSimultaneous, normal_latent_sampling
from keras.layers import LeakyReLU, Activation
import os
import numpy as np
from keras.datasets import mnist


def mnist_process(x):
    x = x.astype(np.float32) / 255.0
    return x


def mnist_data():
    (xtrain, ytrain), (xtest, ytest) = mnist.load_data()
    return mnist_process(xtrain), mnist_process(xtest)


def model_generator(latent_dim, input_shape, hidden_dim=512, reg=lambda: l1l2(1e-7, 0)):
    return Sequential([
        Dense(hidden_dim, name="generator_h1", input_dim=latent_dim, W_regularizer=reg()),
        LeakyReLU(0.2),
        Dense(hidden_dim, name="generator_h2", W_regularizer=reg()),
        LeakyReLU(0.2),
        Dense(np.prod(input_shape), name="generator_x_flat", W_regularizer=reg()),
        Activation('sigmoid'),
        Reshape(input_shape, name="generator_x")],
        name="generator")


def model_encoder(latent_dim, input_shape, hidden_dim=512, reg=lambda: l1l2(1e-7, 0)):
    x = Input(input_shape, name="x")
    h = Flatten()(x)
    h = Dense(hidden_dim, name="encoder_h1", W_regularizer=reg())(h)
    h = LeakyReLU(0.2)(h)
    h = Dense(hidden_dim, name="encoder_h2", W_regularizer=reg())(h)
    h = LeakyReLU(0.2)(h)
    mu = Dense(latent_dim, name="encoder_mu", W_regularizer=reg())(h)
    log_sigma_sq = Dense(latent_dim, name="encoder_log_sigma_sq", W_regularizer=reg())(h)
    z = merge([mu, log_sigma_sq], mode=lambda p: p[0] + K.random_normal(K.shape(p[0])) * K.exp(p[1] / 2),
              output_shape=lambda p: p[0])
    return Model(x, z, name="encoder")


def model_discriminator(latent_dim, output_dim=1, hidden_dim=512,
                        reg=lambda: l1l2(1e-7, 1e-7)):
    z = Input((latent_dim,))
    h = z
    h = Dense(hidden_dim, name="discriminator_h1", W_regularizer=reg())(h)
    h = LeakyReLU(0.2)(h)
    h = Dense(hidden_dim, name="discriminator_h2", W_regularizer=reg())(h)
    h = LeakyReLU(0.2)(h)
    y = Dense(output_dim, name="discriminator_y", activation="sigmoid", W_regularizer=reg())(h)
    return Model(z, y)


def example_aae(path, adversarial_optimizer):
    # z \in R^100
    latent_dim = 100
    # x \in R^{28x28}
    input_shape = (28, 28)

    # generator (z -> x)
    generator = model_generator(latent_dim, input_shape)
    # encoder (x ->z)
    encoder = model_encoder(latent_dim, input_shape)
    # autoencoder (x -> x')
    autoencoder = Model(encoder.inputs, generator(encoder(encoder.inputs)))
    # discriminator (z -> y)
    discriminator = model_discriminator(latent_dim)

    # assemple AAE
    x = encoder.inputs[0]
    z = encoder(x)
    xpred = generator(z)
    zreal = normal_latent_sampling((latent_dim,))(x)
    yreal = discriminator(zreal)
    yfake = discriminator(z)
    aae = Model(x, fix_names([xpred, yfake, yreal], ["xpred", "yfake", "yreal"]))

    # build adversarial model
    generative_params = generator.trainable_weights + encoder.trainable_weights
    model = AdversarialModel(base_model=aae,
                             player_params=[generative_params, discriminator.trainable_weights],
                             player_names=["generator", "discriminator"])
    model.adversarial_compile(adversarial_optimizer=adversarial_optimizer,
                              player_optimizers=[Adam(1e-4, decay=1e-4), Adam(1e-3, decay=1e-4)],
                              loss={"yfake": "binary_crossentropy", "yreal": "binary_crossentropy",
                                    "xpred": "mean_squared_error"},
                              player_compile_kwargs=[{"loss_weights": {"yfake": 1e-2, "yreal": 1e-2, "xpred": 1}}] * 2)

    # load mnist data
    xtrain, xtest = mnist_data()

    # callback for image grid of generated samples
    def generator_sampler():
        zsamples = np.random.normal(size=(10 * 10, latent_dim))
        return generator.predict(zsamples).reshape((10, 10, 28, 28))

    generator_cb = ImageGridCallback(os.path.join(path, "generated-epoch-{:03d}.png"), generator_sampler)

    # callback for image grid of autoencoded samples
    def autoencoder_sampler():
        xsamples = n_choice(xtest, 10)
        xrep = np.repeat(xsamples, 9, axis=0)
        xgen = autoencoder.predict(xrep).reshape((10, 9, 28, 28))
        xsamples = xsamples.reshape((10, 1, 28, 28))
        samples = np.concatenate((xsamples, xgen), axis=1)
        return samples

    autoencoder_cb = ImageGridCallback(os.path.join(path, "autoencoded-epoch-{:03d}.png"), autoencoder_sampler)

    # train network
    # generator, discriminator; pred, yfake, yreal
    n = xtrain.shape[0]
    y = [xtrain, np.ones((n, 1)), np.zeros((n, 1)), xtrain, np.zeros((n, 1)), np.ones((n, 1))]
    ntest = xtest.shape[0]
    ytest = [xtest, np.ones((ntest, 1)), np.zeros((ntest, 1)), xtest, np.zeros((ntest, 1)), np.ones((ntest, 1))]
    history = model.fit(x=xtrain, y=y, validation_data=(xtest, ytest), callbacks=[generator_cb, autoencoder_cb],
                        nb_epoch=100, batch_size=256, verbose=2)



def main():
    example_aae("output/aae", AdversarialOptimizerSimultaneous())


main()

Train on 60000 samples, validate on 10000 samples
Epoch 1/100
 - 22s - loss: 0.2440 - generator_loss: 0.1262 - generator_xpred_loss: 0.0937 - generator_yfake_loss: 1.1283 - generator_yreal_loss: 0.9786 - discriminator_loss: 0.1178 - discriminator_xpred_loss: 0.0937 - discriminator_yfake_loss: 0.6023 - discriminator_yreal_loss: 0.6644 - val_loss: 0.1780 - val_generator_loss: 0.0908 - val_generator_xpred_loss: 0.0650 - val_generator_yfake_loss: 0.9078 - val_generator_yreal_loss: 0.7378 - val_discriminator_loss: 0.0872 - val_discriminator_xpred_loss: 0.0650 - val_discriminator_yfake_loss: 0.5917 - val_discriminator_yreal_loss: 0.6972
Epoch 2/100
 - 19s - loss: 0.1694 - generator_loss: 0.0881 - generator_xpred_loss: 0.0605 - generator_yfake_loss: 0.9969 - generator_yreal_loss: 0.9262 - discriminator_loss: 0.0813 - discriminator_xpred_loss: 0.0605 - discriminator_yfake_loss: 0.6218 - discriminator_yreal_loss: 0.6256 - val_loss: 0.1583 - val_generator_loss: 0.0852 - val_generator_xpred_loss:

In [51]:
# Build model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding

filter_sizes = (3, 8)
num_filters = 10
hidden_dims = 200

input_shape = (max_len,)

model_input = Input(shape=input_shape)

z = embedding_layer(model_input)



# Convolutional block
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

# z = Dense(hidden_dims, activation="relu")(z)
# model_output = Dense(1, activation="sigmoid")(z)
model_output = z

model = Model(model_input, model_output)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [53]:
model.predict(np.random.randint(20, size=(10,15))).shape

(10, 100)