# Generating Sentences from a Continuous Space
### Samuel R. Bowman, Luke Vilnis, Oriol Vinyals, Andrew M. Dai, Rafal Jozefowicz, Samy Bengio

In [1]:
from Models import *
from Utils import *
from FastModels import *
import warnings
from livelossplot import PlotLossesKeras
# warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
from gensim.models import KeyedVectors
import sentencepiece as spm

max_len = 20
enablePadding = True

sp = spm.SentencePieceProcessor()
sp.Load('/work/data/bpe/en.wiki.bpe.op50000.model')
bpe = KeyedVectors.load_word2vec_format("/work/data/bpe/en.wiki.bpe.op50000.d200.w2v.bin", binary=True)
bpe.index2word = [''] + bpe.index2word + ['<sos>'] + ['<eos>']  # add empty string 
nb_words = len(bpe.index2word)
# word2index
bpe_dict = {bpe.index2word[i]: i for i in range(len(bpe.index2word))}
# construct embedding_matrix
embedding_matrix = np.concatenate([np.zeros((1, bpe.vector_size)), bpe.vectors, np.zeros((2, bpe.vector_size))]) # add zero vector for empty string (i.e. used for padding)

embedding_layer = Embedding(nb_words,
                    embedding_matrix.shape[-1],
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=True)

In [3]:
enablePadding = True
df_may, qrel_may = get_test_data("MayFlower", "/work/")
df_june, qrel_june = get_test_data("JuneFlower", "/work/")
df_july, qrel_july = get_test_data("JulyFlower", "/work/")

q_may = parse_texts_bpe(df_may.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_may = parse_texts_bpe(df_may.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_june = parse_texts_bpe(df_june.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_june = parse_texts_bpe(df_june.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_july = parse_texts_bpe(df_july.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_july = parse_texts_bpe(df_july.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_july_ = parse_texts_bpe(df_july.q.tolist(), sp, bpe_dict, max_len, enablePadding, "post")
d_july_ = parse_texts_bpe(df_july.d.tolist(), sp, bpe_dict, max_len, enablePadding, "post")

test_set = [[q_may, d_may, qrel_may, df_may, "MayFlower"], [q_june, d_june, qrel_june, df_june, "JuneFlower"], [q_july, d_july, qrel_july, df_july, "JulyFlower"]]


b'Skipping line 19898: expected 6 fields, saw 8\nSkipping line 20620: expected 6 fields, saw 8\nSkipping line 38039: expected 6 fields, saw 8\n'


In [4]:
def eval_july(run):
    q_ = run.predict(q_july)
    d_ = run.predict(d_july)
    cosine = CosineSim(q_.shape[-1])

    pred = cosine.model.predict([q_, d_])
    print(auc(qrel_july, pred.flatten()))

In [27]:
df = pd.read_csv("/work/data/train_data/30M_QD_lower2.txt", nrows=100000, names=["label", "q", "d"], sep="\t", header=None, error_bad_lines=False)
# df = df[df.label == 1]

# df = pd.read_csv("/work/data/train_data/QueryLog", nrows=100000, names=["q"], sep="\t", header=None, error_bad_lines=False)

In [28]:
enablePadding = True
q_df = parse_texts_bpe(df.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_df = parse_texts_bpe(df.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_df_ = parse_texts_bpe(df.q.tolist(), sp, bpe_dict, max_len, enablePadding, "post")
d_df_ = parse_texts_bpe(df.d.tolist(), sp, bpe_dict, max_len, enablePadding, "post")

In [467]:
# np.save("/work/data/train_data/1M_QD.q.npy", q_df)
# np.save("/work/data/train_data/1M_QD.d.npy", d_df)
# np.save("/work/data/train_data/1M_QD.label.npy", df.label.values)

In [7]:
def get_tokens(df):
    tokens = set([])
    for i in df:
        for j in i:
            tokens.add(j)
    return tokens

july_tokens = get_tokens(np.concatenate([q_july, d_july]))
june_tokens = get_tokens(np.concatenate([q_june, d_june]))
may_tokens = get_tokens(np.concatenate([q_may, d_may]))

train_tokens = get_tokens(q_df)
for i in [july_tokens, june_tokens, may_tokens]:
    print("Train %d, Test %d, Overlap %d" % (len(train_tokens), len(i), len(train_tokens.intersection(i))))

Train 29279, Test 1523, Overlap 1464
Train 29279, Test 10049, Overlap 9274
Train 29279, Test 10561, Overlap 9854


In [255]:
# q_df = np.load("/work/data/train_data/QueryQueryLog.q.npy")
# d_df = np.load("/work/data/train_data/QueryQueryLog.d.npy")

In [450]:
# back = np.concatenate([q_july_, d_july_])
# front = np.concatenate([q_july, d_july])

# dropout_x_train = run.word_dropout(back, bpe_dict['<unk>'])
# x_train = [front, dropout_x_train]
# # x_train = [q_july]
# y_train_ = np.expand_dims(back, axis=-1)
# y_train = [y_train_, y_train_]

In [5]:
# K.clear_session()
# run = SeqGAN(nb_words, max_len, embedding_matrix, [512, 200], optimizer=Adam(), keep_rate_word_dropout=0.0, kl_weight=1)

# classifier = Classifier(nb_words, max_len, embedding_matrix, [200, 200], optimizer=Adam(), keep_rate_word_dropout=0.0, kl_weight=1)
# output is bad
# run = VAE_LSTM(nb_words, max_len, embedding_matrix, [200,200], optimizer=Adam(), kl_weight=0)
# callbacks = [ PlotLossesKeras()]
# PlotLossesKeras(), TQDMNotebookCallback()

In [1131]:
class AdversarialAutoencoder(object):
    
    def __init__(self, nb_words, max_len, embedding_matrix, dim, comp_topk=None, ctype=None, epsilon_std=1.0, optimizer=Adadelta(lr=2.), kl_weight=0, PoolMode="max", trainMode=1):
        self.dim = dim
        self.embedding_matrix = embedding_matrix
        self.comp_topk = comp_topk
        self.ctype = ctype
        self.epsilon_std = epsilon_std
        self.max_len = max_len
        self.nb_words = nb_words
        self.optimizer = optimizer
        self.kl_weight = kl_weight
        self.enableKL = True if kl_weight == 0 else False
        self.PoolMode = PoolMode
        self.trainMode = trainMode
        
        
        self.model, self.train_encoder, self.encoder, self.discriminator = self.build()
        
        if self.trainMode == 1:
            for layer in self.discriminator.layers:
                layer.trainable=False
        
        self.discriminator.compile(loss='binary_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy'])
        
        

        self.model.compile(loss=['sparse_categorical_crossentropy', "binary_crossentropy"],
#             loss_weights=[0.7, 0.3],
            optimizer=optimizer)

    def name(self):
        return "aae_%s_%d" % (self.PoolMode, self.trainMode)

    def build(self):
        
        e_input = Input(shape=(self.max_len,), name="enc_input")
        e_input2 = Input(shape=(self.max_len,), name="enc_input2")

        emb = Embedding(self.nb_words,
                            self.embedding_matrix.shape[-1],
                            weights=[self.embedding_matrix],
                            input_length=self.max_len,
                            mask_zero=True,
                            trainable=True)
        
        rnn = GRU(self.dim[0], name='lstm_1', return_sequences=False, return_state=True)

        h = emb(e_input)
        _, h = rnn(h)
        mu = Dense(self.dim[1])
        log_var = Dense(self.dim[1])
        h_mu = mu(h)
        h_log_var = log_var(h)
        z = merge([h_mu, h_log_var],
                mode=lambda p: p[0] + K.random_normal(K.shape(p[0])) * K.exp(p[1] / 2),
                output_shape=lambda p: p[0])
        
        
        h2 = emb(e_input2)
        _, h2 = rnn(h2)
        h2_mu = mu(h2)
        
        
        d_input = Input(shape=(self.max_len,), name="dec_input")
        
        d_latent2hidden = Dense(self.dim[0], activation='relu')
        d_lstm = GRU(self.dim[0], return_sequences=True)
        dec_embedding_layer = Embedding(self.nb_words,
                                    self.embedding_matrix.shape[-1],
#                                     weights=[self.embedding_matrix],
                                    input_length=self.max_len,
                                    mask_zero=True,
                                    trainable=True)

        softmax_layer = Dense(self.nb_words, activation="softmax")
        d_output2vocab = TimeDistributed(softmax_layer, name="rec")
        
        
        h_z = d_latent2hidden(z)
        
        d_embed_input = dec_embedding_layer(d_input)
        outputs = d_lstm(d_embed_input, initial_state=[h_z])
        x_ = d_output2vocab(outputs)
        
        
        
        

        z_input = Input(shape=(self.dim[1],))
        h1 = Dense(200, input_dim=self.dim[1])
        fc = Dense(1, activation="sigmoid", name="dis")

        pred = fc(h1(z_input))
        
        discriminator = Model(z_input, pred)
        
        
        
        gan_pred = fc(h1(z))
        
        train_encoder = Model(e_input, z)
        encoder = Model(e_input, h_mu)
        
        vae = Model([e_input, d_input], [x_, gan_pred])
        
#         vae = Model([e_input, d_input], [pred])
#         train_encoder = Model(e_input, z)
#         encoder = Model(e_input, mu)
        
    
        return vae, train_encoder, encoder, discriminator


In [1192]:
K.clear_session()

run = AdversarialAutoencoder(nb_words, max_len, embedding_matrix, [200,200], optimizer=Adam(), trainMode=2)

In [1193]:
# valid = np.ones((len(q_df), 1))
# fake = np.zeros((len(q_df), 1))

# enc_train = q_df
# dec_train = pad_sequences(pad_sequences(q_df_, maxlen=max_len+1, value=bpe_dict['<sos>']), maxlen=max_len, truncating="post")

# x_train = [enc_train, dec_train]
# y_train = [np.expand_dims(q_df_, axis=-1), valid]

# July Data
valid = np.ones((len(q_july) * 2, 1))
fake = np.zeros((len(q_july) * 2, 1))

enc_train = np.concatenate([q_july, d_july])

q_tmp = pad_sequences(pad_sequences(q_july_, maxlen=max_len+1, value=bpe_dict['<sos>']), maxlen=max_len, truncating="post")
d_tmp = pad_sequences(pad_sequences(d_july_, maxlen=max_len+1, value=bpe_dict['<sos>']), maxlen=max_len, truncating="post")
dec_train = np.concatenate([q_tmp, d_tmp])

x_train = [enc_train, dec_train]
y_train = [np.expand_dims(np.concatenate([q_july_, d_july_]), axis=-1), valid]

In [1194]:
latent_fake = run.train_encoder.predict(enc_train)
latent_real = np.random.normal(size=(len(enc_train), 200))

In [1217]:
for layer in run.discriminator.layers:
    layer.trainable=True
        
run.discriminator.compile(loss='binary_crossentropy',
    optimizer=run.optimizer,
    metrics=['accuracy'])



for i in range(10):
    
    d_loss_real = run.discriminator.train_on_batch(latent_real, valid)
    d_loss_fake = run.discriminator.train_on_batch(latent_fake, fake)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
    print(d_loss)


[0.63396007 0.6414319 ]
[0.6177918  0.66021127]
[0.6132539 0.6672536]
[0.61203563 0.6637324 ]
[0.6123042 0.6684272]
[0.6120887  0.66784036]
[0.61181974 0.65786386]
[0.6119311  0.66138494]
[0.6116909 0.6666666]
[0.6113531 0.665493 ]


In [1204]:
def set_trainable(run, isTrainable):

    for layer in run.discriminator.layers:
        layer.trainable=isTrainable

    run.discriminator.compile(loss='binary_crossentropy',
        optimizer=run.optimizer,
        metrics=['accuracy'])

In [1205]:
K.clear_session()
run = AdversarialAutoencoder(nb_words, max_len, embedding_matrix, [200,200], optimizer=Adam(), trainMode=2)

for epoch in range(20):
    
    latent_fake = run.train_encoder.predict(enc_train)
    latent_real = np.random.normal(size=(len(enc_train), 200))
    
    d_loss_real = run.discriminator.train_on_batch(latent_real, valid)
    d_loss_fake = run.discriminator.train_on_batch(latent_fake, fake)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    g_loss = run.model.fit(x_train, y_train, verbose=0, batch_size=256, validation_split=0.2)
    g_loss = g_loss.history
    # Plot the progress
    print ("%d [D loss: %f, acc: %.2f%%] [Rec loss: %f, GAN loss: %f] [VRec loss: %f, VGAN loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss["rec_loss"][-1], g_loss["dis_loss"][-1], g_loss["val_rec_loss"][-1], g_loss["val_dis_loss"][-1]))
    eval_july(run.encoder)

0 [D loss: 0.908930, acc: 49.59%] [Rec loss: 10.821520, GAN loss: 0.805989] [VRec loss: 10.787205, VGAN loss: 0.658835]
0.5508683759923503
1 [D loss: 0.963634, acc: 41.37%] [Rec loss: 10.750865, GAN loss: 0.542718] [VRec loss: 10.672027, VGAN loss: 0.417450]
0.5605861815917632
2 [D loss: 1.060488, acc: 36.56%] [Rec loss: 10.565256, GAN loss: 0.393388] [VRec loss: 10.346186, VGAN loss: 0.209007]
0.5610531699615291
3 [D loss: 1.405959, acc: 30.52%] [Rec loss: 9.994340, GAN loss: 0.223600] [VRec loss: 9.196527, VGAN loss: 0.097199]
0.5455091285107518
4 [D loss: 2.331551, acc: 30.16%] [Rec loss: 8.336987, GAN loss: 0.104999] [VRec loss: 7.519493, VGAN loss: 0.063864]
0.5367920122751229
5 [D loss: 4.397253, acc: 29.69%] [Rec loss: 6.794015, GAN loss: 0.056852] [VRec loss: 7.616348, VGAN loss: 0.056566]
0.532177722431008
6 [D loss: 5.959632, acc: 30.46%] [Rec loss: 6.505024, GAN loss: 0.045374] [VRec loss: 7.552958, VGAN loss: 0.033389]
0.5368364873579576
7 [D loss: 5.435437, acc: 29.93%] [R

In [None]:
K.clear_session()
run = AdversarialAutoencoder(nb_words, max_len, embedding_matrix, [200,200], optimizer=Adam(), trainMode=1)

In [1212]:
latent_fake = run.train_encoder.predict(enc_train)
latent_real = np.random.normal(size=(len(enc_train), 200))
for epoch in range(10):
    
    set_trainable(run, True)
    
    d_loss_real = run.discriminator.train_on_batch(latent_real, valid)
    d_loss_fake = run.discriminator.train_on_batch(latent_fake, fake)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
    print(d_loss)
    set_trainable(run, False)

# for epoch in range(20):
    
#     g_loss = run.model.fit(x_train, y_train, verbose=2, batch_size=32, validation_split=0.2)
#     g_loss = g_loss.history
#     # Plot the progress
# #     print ("%d [D loss: %f, acc: %.2f%%] [Rec loss: %f, GAN loss: %f] [VRec loss: %f, VGAN loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss["rec_loss"][-1], g_loss["dis_loss"][-1], g_loss["val_rec_loss"][-1], g_loss["val_dis_loss"][-1]))
#     eval_july(run.encoder)

[0.7884649  0.49530518]
[0.7381919  0.53638494]
[0.70274484 0.5674883 ]
[0.6782027 0.592723 ]
[0.66414547 0.6109155 ]
[0.65782106 0.6203052 ]
[0.65528643 0.62558687]
[0.6552018  0.62147886]
[0.65696955 0.6279343 ]
[0.6604165 0.6285211]


In [1206]:
generate_output(run.model, bpe, [q_july, dec_input],  idx=0)

BLEU: 0.6552
              should you get rental car insurance	what is of of of of              
               police radar how it works	what is of                 
                native tribes of arizona	what of of                 
             how long spaghetti sauce in refriger ator	what of of                 
                clayton county in ga	what of of                 
         when is gay pride parade in nyc  2 0 <unk>	what is of of of of of             
             easy chicken breast recipes cro ck pot	what of                  
            what to do in g ta  <unk>	what is is is of of of of            
                dancing with  stars	what of of                 
              leonardo dic ap rio wife name	what of                  
                 is doge dead	what of of                 
                super class in java	what of of                 
                  hungry hearts	                   
             phar rell williams desp icable me soundtrack	         

In [1208]:
evaluate(run.encoder, test_set)

(0.4922502268948714, 0.8163040468865793, 0.5537147812937802)

In [5]:
class SeqVAE(object):
    
    def __init__(self, nb_words, max_len, embedding_matrix, dim, optimizer=RMSprop(), keep_rate_word_dropout=0.5, kl_weight=0, mode=1):
        self.dim = dim
        self.nb_words = nb_words
        self.max_len = max_len
        self.embedding_matrix = embedding_matrix
        self.optimizer = optimizer
        self.kl_weight = kl_weight
        self.keep_rate_word_dropout = keep_rate_word_dropout
        self.enableKL = True if kl_weight == 0 else False
        self.mode = mode
        self.build()
        
    def build(self):
                
        e_input = Input(shape=(self.max_len,))
        self.kl_input = Input(shape=(1,))

        embedding_layer = Embedding(self.nb_words,
                            self.embedding_matrix.shape[-1],
                            weights=[self.embedding_matrix],
                            input_length=self.max_len,
                            mask_zero=False,
                            trainable=True if self.mode == 1 else False)

        e_lstm = GRU(self.dim[0], return_state=True)
        
        _, h = e_lstm(embedding_layer(e_input))

        
        mean = Dense(self.dim[1])
        var = Dense(self.dim[1])
        
        self.h_mean = mean(h)
        self.h_log_var = var(h)
        
        z = Lambda(self.sampling)([self.h_mean, self.h_log_var])

        self.encoder = Model(e_input, self.h_mean)

        d_input = Input(shape=(self.max_len,))
        d_latent2hidden = Dense(self.dim[0], activation='relu')
        d_lstm = GRU(self.dim[0], return_sequences=True)

        softmax_layer = Dense(self.nb_words, activation="softmax")
        d_output2vocab = TimeDistributed(softmax_layer, name="rec")
        d_output2vocab_kl = TimeDistributed(softmax_layer, name="kl")
        
        dec_embedding_layer = Embedding(self.nb_words,
                                    self.embedding_matrix.shape[-1],
                                    weights=[self.embedding_matrix],
                                    input_length=self.max_len,
                                    mask_zero=False,
                                    trainable=True)

        h_z = d_latent2hidden(z)
        
        d_embed_input = dec_embedding_layer(Masking(mask_value=0.0)(d_input))
        outputs = d_lstm(d_embed_input, initial_state=[h_z])

        pred = d_output2vocab(outputs)
        pred_kl = d_output2vocab_kl(outputs)
        
        
        
#       VAE model
        self.model = Model(inputs=[e_input, d_input], outputs=[pred, pred_kl])
        self.model.compile(optimizer=self.optimizer, loss=["sparse_categorical_crossentropy", self.kl_loss])

    
    def name(self):
        
        return "seqvae_kl_%d" % self.mode if self.enableKL else "seqvae_%d" % self.mode
    
    def kl_loss(self, x, x_):
            kl_loss = - 0.5 * K.sum(1 + self.h_log_var - K.square(self.h_mean) - K.exp(self.h_log_var), axis=-1)
            return (self.kl_weight * kl_loss) 

    def word_dropout(self, x, unk_token):

        x_ = np.copy(x)
        rows, cols = np.nonzero(x_)
        for r, c in zip(rows, cols):
            if random.random() <= self.keep_rate_word_dropout:
                continue
            x_[r][c] = unk_token

        return x_
    

    def sampling(self, args):
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=(K.shape(z_mean)[0], self.dim[1]), mean=0.,\
                                  stddev=1)
    
        return z_mean + K.exp(z_log_var / 2) * epsilon 
    

In [30]:
K.clear_session()
run = SeqVAE(nb_words, max_len, embedding_matrix, [200,200], optimizer=Adam(), kl_weight=0, mode=1)


In [26]:
eval_july(run.encoder)

0.5588961284440392


In [29]:
q_df

array([[    0,     0,     0, ...,  7461,  4212,     5],
       [    0,     0,     0, ..., 43852,  2966,  2936],
       [    0,     0,     0, ...,  1365,    89,  6772],
       ...,
       [    0,     0,     0, ...,     0,     0,  5890],
       [    0,     0,     0, ...,     0,  1662,  2522],
       [    0,     0,     0, ...,     0,     0,  2673]], dtype=int32)

In [31]:
y = np.expand_dims(q_df_, axis=-1)
dec_input = pad_sequences(pad_sequences(q_df_, maxlen=max_len+1, value=bpe_dict['<sos>']), maxlen=max_len, truncating="post")
for i in range(100):
    run.model.fit([q_df,dec_input] , [y, y], verbose=2, epochs=1, batch_size=256)
#     if i % 20 == 0 and i != 0:
#         kl_rate = 0.001
#         run.kl_weight = min(run.kl_weight + kl_rate, 1)
#         run.model.compile(optimizer=run.optimizer, loss=["sparse_categorical_crossentropy", run.kl_loss])
    eval_july(run.encoder)

Epoch 1/1
 - 139s - loss: 2.5997 - rec_loss: 2.5997 - kl_loss: 0.0000e+00
0.5040583513086793
Epoch 1/1
 - 139s - loss: 1.8012 - rec_loss: 1.8012 - kl_loss: 0.0000e+00
0.49567479819431165
Epoch 1/1
 - 138s - loss: 1.5472 - rec_loss: 1.5472 - kl_loss: 0.0000e+00
0.4622072983610932
Epoch 1/1
 - 138s - loss: 1.3042 - rec_loss: 1.3042 - kl_loss: 0.0000e+00
0.4338989081367164
Epoch 1/1
 - 137s - loss: 1.0799 - rec_loss: 1.0799 - kl_loss: 0.0000e+00
0.41601992483711003
Epoch 1/1
 - 138s - loss: 0.8800 - rec_loss: 0.8800 - kl_loss: 0.0000e+00
0.40216593653405674
Epoch 1/1
 - 139s - loss: 0.7109 - rec_loss: 0.7109 - kl_loss: 0.0000e+00
0.39854121728301717
Epoch 1/1
 - 137s - loss: 0.5709 - rec_loss: 0.5709 - kl_loss: 0.0000e+00
0.3932264448842536
Epoch 1/1
 - 139s - loss: 0.4600 - rec_loss: 0.4600 - kl_loss: 0.0000e+00
0.3870444083702106
Epoch 1/1
 - 138s - loss: 0.3749 - rec_loss: 0.3749 - kl_loss: 0.0000e+00
0.38637728212768796
Epoch 1/1
 - 138s - loss: 0.3086 - rec_loss: 0.3086 - kl_loss: 0.

KeyboardInterrupt: 

In [14]:
d_dec_input = pad_sequences(pad_sequences(d_july_, maxlen=max_len+1, value=bpe_dict['<sos>']), maxlen=max_len, truncating="post")


In [21]:
generate_output(run.model, bpe, [q_july, dec_input], idx=0)

BLEU: 0.6941
                cord of wood dimensions	 of  name                
                 history of b	how of justification                 
               states which legal ized marijuana	 of  credit                
                largest all ig tor	 moor stream java                
               characteristics of observ ant people	how of  in                
            what does f ** * you mean ?	how does f ** * you ? ?            
                 california adventure park	how condor stations                 
                 william howard taft	 of                  
              how many yards to a mile	 is get a your your              
             crime rate in su w anee ga	how does in o men ? ?             
                snap fitness membership prices	how of in treatment                
               mid s omer murders cast	how s omer murders cast               
             easy chicken breast recipes cro ck pot	how is in in in mail              
             olym

In [10]:
def generate_output(model, bpe, x, nrows=100000, idx=None):

    gen_x = np.argmax(model.predict(x), axis=-1) if idx == None else np.argmax(model.predict(x)[idx], axis=-1)
    bleu = []
    results = []
    count = 0
    for i, k in zip(gen_x, x[0]):
        gen_x = " ".join([bpe.index2word[t] for t in i])
        real_x = " ".join([bpe.index2word[t] for t in k])

        bleu.append(sentence_bleu(real_x, gen_x))
        
        real_x = real_x.replace("▁the", "")
        real_x = real_x.replace("▁","")
        gen_x = gen_x.replace("▁the", "")
        gen_x = gen_x.replace("▁","")
        
        if count > nrows:
            break
        count = count + 1
        results.append("%s\t%s" % (real_x, gen_x))
        
    
    print("BLEU: %.4f" % np.mean(bleu))
    idx = np.arange(len(results))
    shuffle(idx)
    for i in idx[:nrows]:
        print(results[i])