In [1]:
from Models import *
from Utils import *
from FastModels import *
import warnings
from livelossplot import PlotLossesKeras
# warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
from gensim.models import KeyedVectors
import sentencepiece as spm

# Every query/document is tokenised to exactly `max_len` sub-word ids.
max_len = 20
enablePadding = True

# SentencePiece model used to split raw text into BPE pieces.
sp = spm.SentencePieceProcessor()
sp.Load('/work/data/bpe/en.wiki.bpe.op50000.model')

# Pre-trained sub-word vectors in binary word2vec format.
bpe = KeyedVectors.load_word2vec_format(
    "/work/data/bpe/en.wiki.bpe.op50000.d200.w2v.bin",
    binary=True,
)

# Vocabulary layout: '' (padding) at index 0, the pre-trained pieces, then
# the two sequence markers at the very end.
bpe.index2word = [''] + bpe.index2word + ['<sos>', '<eos>']
nb_words = len(bpe.index2word)

# Reverse lookup: piece -> row index.
bpe_dict = {piece: row for row, piece in enumerate(bpe.index2word)}

# Embedding rows follow the vocabulary layout above: one zero row for the
# padding entry, the pre-trained vectors, two zero rows for <sos>/<eos>.
embedding_matrix = np.concatenate([
    np.zeros((1, bpe.vector_size)),
    bpe.vectors,
    np.zeros((2, bpe.vector_size)),
])

embedding_layer = Embedding(
    nb_words,
    embedding_matrix.shape[-1],
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=True,
)

In [3]:
enablePadding = True

# Each test set comes back as a frame with `q` / `d` text columns plus the
# label vector that is later handed to `auc`.
df_may, qrel_may = get_test_data("MayFlower", "/work/")
df_june, qrel_june = get_test_data("JuneFlower", "/work/")
df_july, qrel_july = get_test_data("JulyFlower", "/work/")

# Tokenise the query column first, then the document column, with the default
# padding side of `parse_texts_bpe`.
q_may, d_may = (
    parse_texts_bpe(column.tolist(), sp, bpe_dict, max_len, enablePadding)
    for column in (df_may.q, df_may.d)
)
q_june, d_june = (
    parse_texts_bpe(column.tolist(), sp, bpe_dict, max_len, enablePadding)
    for column in (df_june.q, df_june.d)
)
q_july, d_july = (
    parse_texts_bpe(column.tolist(), sp, bpe_dict, max_len, enablePadding)
    for column in (df_july.q, df_july.d)
)

# Same July data, tokenised with the "post" option instead.
q_july_, d_july_ = (
    parse_texts_bpe(column.tolist(), sp, bpe_dict, max_len, enablePadding, "post")
    for column in (df_july.q, df_july.d)
)

test_set = [
    [q_may, d_may, qrel_may, df_may, "MayFlower"],
    [q_june, d_june, qrel_june, df_june, "JuneFlower"],
    [q_july, d_july, qrel_july, df_july, "JulyFlower"],
]


b'Skipping line 19898: expected 6 fields, saw 8\nSkipping line 20620: expected 6 fields, saw 8\nSkipping line 38039: expected 6 fields, saw 8\n'


In [4]:
def eval_july(run, doc_run=None):
    """Return the AUC of an encoder (pair) on the JulyFlower test set.

    Queries (`q_july`) are embedded with `run`. Documents (`d_july`) are
    embedded with `doc_run` when it is given, otherwise with `run`, which is
    the original single-encoder behaviour. Two-tower models whose document
    side has its own weights should pass their document encoder as `doc_run`;
    otherwise the documents are scored with the query tower.

    Args:
        run: object with a Keras-style `predict` used for the queries.
        doc_run: optional object with `predict` used for the documents.

    Returns:
        Whatever `auc(qrel_july, scores)` returns for the flattened pair scores.
    """
    if doc_run is None:
        doc_run = run

    q_ = run.predict(q_july)
    d_ = doc_run.predict(d_july)

    # CosineSim comes from the star imports; presumably it wraps a small model
    # that scores two vectors of the given width - TODO confirm.
    cosine = CosineSim(q_.shape[-1])

    pred = cosine.model.predict([q_, d_])
    return auc(qrel_july, pred.flatten())

In [5]:
# First 1M rows of the tab-separated (label, query, document) training file;
# malformed rows are skipped rather than aborting the read.
df = pd.read_csv(
    "/work/data/train_data/30M_QD_lower2.txt",
    sep="\t",
    header=None,
    names=["label", "q", "d"],
    nrows=1000000,
    error_bad_lines=False,
)

# Query column first, then document column, default padding side.
q_df, d_df = (
    parse_texts_bpe(column.tolist(), sp, bpe_dict, max_len, enablePadding)
    for column in (df.q, df.d)
)

# Same text tokenised with the "post" option.
q_df_, d_df_ = (
    parse_texts_bpe(column.tolist(), sp, bpe_dict, max_len, enablePadding, "post")
    for column in (df.q, df.d)
)

In [32]:
# K.clear_session()
# Hidden and latent widths are both 200.
hidden_dim = latent_dim = 200
optimizer = Adam()

# GRU-based DSSM with one negative document and separate query/document towers.
dssm = DSSM(
    hidden_dim,
    latent_dim,
    1,
    nb_words,
    max_len,
    embedding_matrix,
    optimizer=optimizer,
    enableLSTM=True,
    enableSeparate=True,
)


In [1]:
# One fit() call consumes a chunk of 100 mini-batches of 256 pairs.
step = 100 * 256
for epoch in range(10):
    for i in range(0, len(q_df), step):
        q, d = q_df[i: i + step], d_df[i: i + step]

        # Negatives are the chunk's own documents in shuffled order.
        idx = np.arange(len(q))
        shuffle(idx)

        x_train = [q, d, d[idx]]
        # One-hot target: column 0 (the paired document) is always the positive.
        y_train = np.tile([1.0, 0.0], (len(q), 1))

        hist = dssm.model.fit(x_train, y_train, verbose=0, batch_size=256)
        print("Epoch %d, Loss %.4f, AUC %.4f" % (
            epoch,
            hist.history['loss'][-1],
            eval_july(dssm.encoder),
        ))

        # Stop the epoch once the chunk that starts past row 100000 is done.
        if i > 100000:
            break

- Train DSSM_LSTM on 1M Q-D paired data
- Train SeqVAE on 1M Q data
- Train SeqVAE on 1M D data
- Initialise DSSM_LSTM using the SeqVAEs

In [86]:
# Reset backend state before the next model is built. `K` arrives through the
# star imports and is presumably keras.backend - TODO confirm. Models created
# before this call may no longer be usable afterwards.
K.clear_session()


In [14]:
class SeqVAE(object):
    """Sequence auto-encoder with a sampled Gaussian latent code.

    One embedding layer and one GRU are shared by the encoder and the decoder.
    The encoder's final GRU state is projected to a mean and a log-variance,
    and a latent vector is sampled from them. That vector, after a ReLU
    projection, is the initial state of the same GRU when it reads the decoder
    input.

    Attributes set by `build`:
        model: maps `[encoder_ids, decoder_ids]` to a per-step softmax over the
            vocabulary; compiled with sparse categorical cross-entropy.
        encoder: maps `encoder_ids` to the latent mean.
        h_mean, h_log_var: latent mean / log-variance tensors.
        kl_input: an extra `(1,)` input tensor, not connected to `model`.

    NOTE(review): the compiled loss is reconstruction only. `kl_loss`,
    `kl_weight`, `kl_input` and `enableKL` are kept for callers but are not
    wired into `model`.

    Args:
        nb_words: vocabulary size (embedding rows and softmax width).
        max_len: length of the encoder and decoder id sequences.
        embedding_matrix: initial embedding weights; its last axis is the
            embedding width.
        dim: indexable pair; `dim[0]` is the GRU width, `dim[1]` the latent
            width.
        optimizer: Keras optimizer; a fresh `Adam()` is created when omitted.
        keep_rate_word_dropout: threshold used by `word_dropout`.
        kl_weight: multiplier applied inside `kl_loss`.
        enableKL: only affects the string returned by `name`.
    """

    def __init__(self, nb_words, max_len, embedding_matrix, dim, optimizer=None, keep_rate_word_dropout=0.5, kl_weight=0, enableKL=False):
        self.dim = dim
        self.nb_words = nb_words
        self.max_len = max_len
        self.embedding_matrix = embedding_matrix
        # A default of `Adam()` in the signature is evaluated once, when the
        # class is defined, and the same optimizer object would then be shared
        # by every instance. Create it per instance instead.
        self.optimizer = Adam() if optimizer is None else optimizer
        self.kl_weight = kl_weight
        self.keep_rate_word_dropout = keep_rate_word_dropout
        self.enableKL = enableKL
        self.build()

    def build(self):
        """Create the shared layers and assemble `encoder` and `model`."""
        e_input = Input(shape=(self.max_len,))
        self.kl_input = Input(shape=(1,))

        # Shared by encoder and decoder; id 0 is masked.
        embedding_layer = Embedding(self.nb_words,
                                    self.embedding_matrix.shape[-1],
                                    weights=[self.embedding_matrix],
                                    input_length=self.max_len,
                                    mask_zero=True,
                                    trainable=True)

        # Shared recurrent layer: returns the full output sequence and the
        # final state.
        gru = GRU(self.dim[0], return_state=True, return_sequences=True)

        # Encoder: only the final state is used.
        _, h = gru(embedding_layer(e_input))

        mean = Dense(self.dim[1])
        var = Dense(self.dim[1])

        self.h_mean = mean(h)
        self.h_log_var = var(h)

        z = Lambda(self.sampling)([self.h_mean, self.h_log_var])

        self.encoder = Model(e_input, self.h_mean)

        # Decoder: reuses the embedding and the GRU, seeded with the projected
        # latent sample.
        d_input = Input(shape=(self.max_len,))
        d_latent2hidden = Dense(self.dim[0], activation='relu')

        softmax_layer = Dense(self.nb_words, activation="softmax")
        d_output2vocab = TimeDistributed(softmax_layer, name="rec")

        h_z = d_latent2hidden(z)

        d_embed_input = embedding_layer(d_input)
        outputs, _ = gru(d_embed_input, initial_state=[h_z])

        pred = d_output2vocab(outputs)

        # Trainable model: reconstruction loss only (see class docstring).
        self.model = Model(inputs=[e_input, d_input], outputs=[pred])
        self.model.compile(optimizer=self.optimizer, loss=["sparse_categorical_crossentropy"])

    def name(self):
        """Return "seqvae_kl" when `enableKL` is set, else "seqvae_<kl_weight>"."""
        return "seqvae_kl" if self.enableKL else "seqvae_%d" % self.kl_weight

    def kl_loss(self, x, x_):
        """Return the weighted KL term of the latent Gaussian vs. a unit Gaussian.

        Uses `h_mean` / `h_log_var` and sums over the last axis; both arguments
        are ignored.
        """
        kl_loss = - 0.5 * K.sum(1 + self.h_log_var - K.square(self.h_mean) - K.exp(self.h_log_var), axis=-1)
        return (self.kl_weight * kl_loss)

    def word_dropout(self, x, unk_token):
        """Return a copy of 2-D id array `x` with some non-zero ids replaced.

        Each non-zero entry is kept when a fresh `random.random()` draw is
        `<= keep_rate_word_dropout`, otherwise it becomes `unk_token`. Zero
        entries are never touched and `x` itself is not modified.
        """
        x_ = np.copy(x)
        rows, cols = np.nonzero(x_)
        for r, c in zip(rows, cols):
            if random.random() > self.keep_rate_word_dropout:
                x_[r, c] = unk_token

        return x_

    def sampling(self, args):
        """Draw `z = mean + exp(log_var / 2) * eps` with `eps ~ N(0, 1)`.

        Args:
            args: pair `(z_mean, z_log_var)` of tensors; the noise has shape
                `(batch, dim[1])`.
        """
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=(K.shape(z_mean)[0], self.dim[1]),
                                  mean=0., stddev=1)

        return z_mean + K.exp(z_log_var / 2) * epsilon


In [15]:
# SeqVAE with a 200-unit GRU and a 200-d latent code.
run = SeqVAE(
    nb_words=nb_words,
    max_len=max_len,
    embedding_matrix=embedding_matrix,
    dim=[200, 200],
    optimizer=Adam(),
    kl_weight=1,
)


In [25]:
# One fit() call consumes a chunk of 100 mini-batches of 256 queries.
step = 100 * 256
for epoch in range(5):
    for i in range(0, len(q_df), step):

        enc_input = q_df[i: i + step]
        dec_output = q_df_[i: i + step]
        # Teacher-forcing input: the target shifted right by one step. The inner
        # call grows each row to max_len + 1 by adding one <sos> (at the front,
        # with Keras' default pre-padding); the outer call cuts the last
        # position off again.
        dec_input = pad_sequences(pad_sequences(dec_output, maxlen=max_len+1, value=bpe_dict['<sos>']), maxlen=max_len, truncating="post")

        x_train = [enc_input, dec_input]
        # The sparse cross-entropy target needs a trailing axis of size 1.
        # Built directly, without a temporary named `_`, which would overwrite
        # IPython's "last output" variable.
        y_train = [np.expand_dims(dec_output, axis=-1)]

        hist = run.model.fit(x_train, y_train, verbose=2, batch_size=256, validation_split=0.3)
        print("Epoch %d, Loss %.4f" % (epoch, hist.history['loss'][-1]))

        # Stop the epoch once the chunk that starts past row 200000 is done.
        if i > 200000:
            break

Train on 17920 samples, validate on 7680 samples
Epoch 1/1
 - 34s - loss: 1.6718 - val_loss: 2.6919
Epoch 0, Loss 1.6718
Train on 17920 samples, validate on 7680 samples
Epoch 1/1
 - 34s - loss: 1.6074 - val_loss: 2.6538
Epoch 0, Loss 1.6074
Train on 17920 samples, validate on 7680 samples
Epoch 1/1
 - 34s - loss: 1.5992 - val_loss: 2.6229
Epoch 0, Loss 1.5992
Train on 17920 samples, validate on 7680 samples
Epoch 1/1
 - 34s - loss: 1.5779 - val_loss: 2.5381
Epoch 0, Loss 1.5779
Train on 17920 samples, validate on 7680 samples
Epoch 1/1
 - 34s - loss: 1.5336 - val_loss: 2.6094
Epoch 0, Loss 1.5336
Train on 17920 samples, validate on 7680 samples
Epoch 1/1
 - 34s - loss: 1.5128 - val_loss: 2.5215
Epoch 0, Loss 1.5128
Train on 17920 samples, validate on 7680 samples
Epoch 1/1
 - 34s - loss: 1.5019 - val_loss: 2.5332
Epoch 0, Loss 1.5019
Train on 17920 samples, validate on 7680 samples
Epoch 1/1


KeyboardInterrupt: 

In [26]:
# Weights of model layers 2 and 3, which the variable names treat as the
# embedding and the recurrent layer - TODO confirm the indices against
# run.model.summary().
embedding_weights, rnn_weights = (
    run.model.layers[layer_index].get_weights() for layer_index in (2, 3)
)

In [27]:
# Persist the embedding weights (NumPy appends ".npy" to the file name).
np.save(file="/work/workspace/vae.emb", arr=embedding_weights)

In [28]:
# Persist the recurrent-layer weights (NumPy appends ".npy" to the file name).
np.save(file="/work/workspace/vae.gpu", arr=rnn_weights)

In [11]:
# Reload the weights saved from the trained SeqVAE.
embedding_weights = np.load("/work/workspace/vae.emb.npy")
# The recurrent weight list presumably holds arrays of different shapes, in
# which case NumPy stored it as a pickled object array. Since NumPy 1.16.3
# such files only load with allow_pickle=True. This file is produced by this
# notebook; do not enable pickle for files from untrusted sources.
rnn_weights = np.load("/work/workspace/vae.gpu.npy", allow_pickle=True)

In [21]:
class DSSM():
    """Siamese / two-tower ranking model trained with a softmax over cosine scores.

    A query, one positive document and `num_negatives` negative documents are
    each embedded and reduced to a vector, either by a GRU (`enableLSTM=True`)
    or by global max/average pooling. The cosine similarities between the query
    and every document are scaled by a learned factor and passed through a
    softmax; output column 0 corresponds to the positive document.

    Attributes:
        model: trainable model over `[query, pos_doc] + neg_docs`, compiled
            with categorical cross-entropy.
        encoder: maps a query id sequence to its vector (query tower).
        doc_encoder: maps a document id sequence to its vector. With
            `enableSeparate=True` this is the document tower; otherwise it
            shares all layers with `encoder`.

    Args:
        hidden_dim: GRU width.
        latent_dim: stored on the instance; not used when building the graph.
        num_negatives: number of negative-document inputs.
        nb_words: vocabulary size.
        max_len: length of every input id sequence.
        embedding_matrix: initial embedding weights (required).
        optimizer: optimizer handed to `compile`.
        enableLSTM: use the GRU encoder (and mask id 0) instead of pooling.
        enableSeparate: give documents their own embedding and GRU.
        PoolMode: "max" for max pooling, anything else for average pooling.

    Raises:
        ValueError: if `embedding_matrix` is None.
    """

    def __init__(self, hidden_dim=300, latent_dim=128, num_negatives=1, nb_words=50005, max_len=10, embedding_matrix=None, optimizer=None, enableLSTM=False, enableSeparate=False, PoolMode="max"):

        # Fail early with a clear message instead of an AttributeError on
        # `None.shape` further down.
        if embedding_matrix is None:
            raise ValueError("DSSM requires an embedding_matrix")

        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_negatives = num_negatives
        self.nb_words = nb_words
        self.max_len = max_len
        self.enableSeparate = enableSeparate
        self.enableLSTM = enableLSTM
        self.PoolMode = PoolMode

        # Input tensors holding the query, positive (clicked) document, and negative (unclicked) documents.
        # Every input is a fixed-length sequence of `max_len` token ids.
        query = Input(shape = (self.max_len,))
        pos_doc = Input(shape = (self.max_len,))
        neg_docs = [Input(shape = (self.max_len,)) for j in range(self.num_negatives)]

        # Id 0 is masked only on the recurrent path.
        embed_layer = Embedding(nb_words,
                    embedding_matrix.shape[-1],
                    weights=[embedding_matrix],
                    input_length=max_len,
                    mask_zero=bool(self.enableLSTM),
                    trainable=True)

        # Query-side recurrent encoder (a GRU despite the layer name).
        q_rnn = GRU(hidden_dim, name='lstm_1', return_sequences=False)

        if enableSeparate:
            # Document tower with its own embedding and GRU weights.
            d_embed_layer = Embedding(nb_words,
                    embedding_matrix.shape[-1],
                    weights=[embedding_matrix],
                    input_length=max_len,
                    mask_zero=bool(self.enableLSTM),
                    trainable=True)
            d_rnn = GRU(hidden_dim, name="lstm_2")

        Pooling = GlobalMaxPooling1D() if self.PoolMode == "max" else GlobalAveragePooling1D()

        if self.enableLSTM:
            query_sem = q_rnn(embed_layer(query))
            pos_doc_sem = q_rnn(embed_layer(pos_doc)) if not enableSeparate else d_rnn(d_embed_layer(pos_doc))
            neg_doc_sems = [q_rnn(embed_layer(neg_doc)) for neg_doc in neg_docs] if not enableSeparate else [d_rnn(d_embed_layer(neg_doc)) for neg_doc in neg_docs]
        else:
            query_sem = Pooling(embed_layer(query))
            pos_doc_sem = Pooling(embed_layer(pos_doc)) if not enableSeparate else Pooling(d_embed_layer(pos_doc))
            neg_doc_sems = [Pooling(embed_layer(neg_doc)) for neg_doc in neg_docs] if not enableSeparate else [Pooling(d_embed_layer(neg_doc)) for neg_doc in neg_docs]

        # This layer calculates the cosine similarity between the semantic representations of
        # a query and a document.
        R_Q_D_p = dot([query_sem, pos_doc_sem], axes = 1, normalize = True) # See equation (4).
        R_Q_D_ns = [dot([query_sem, neg_doc_sem], axes = 1, normalize = True) for neg_doc_sem in neg_doc_sems] # See equation (4).

        concat_Rs = concatenate([R_Q_D_p] + R_Q_D_ns)
        concat_Rs = Reshape((self.num_negatives + 1, 1))(concat_Rs)

        # In this step, we multiply each R(Q, D) value by gamma. In the paper, gamma is
        # described as a smoothing factor for the softmax function, and it's set empirically
        # on a held-out data set. We're going to learn gamma's value by pretending it's
        # a single 1 x 1 kernel (initialised to 1).
        weight = np.array([1]).reshape(1, 1, 1)
        with_gamma = Convolution1D(1, 1, padding = "same", input_shape = (self.num_negatives + 1, 1), activation = "linear", use_bias = False, weights = [weight])(concat_Rs) # See equation (5).
        with_gamma = Reshape((self.num_negatives + 1, ))(with_gamma)

        # Finally, we use the softmax function to calculate P(D+|Q).
        prob = Activation("softmax")(with_gamma) # See equation (5).

        # We now have everything we need to define our model.
        self.model = Model(inputs = [query, pos_doc] + neg_docs, outputs = prob)
        self.model.compile(optimizer = optimizer, loss = "categorical_crossentropy")

        # Query tower.
        self.encoder = Model(inputs=query, outputs=query_sem)
        # Document tower, so documents can be embedded with the weights they
        # were trained with when the towers are separate.
        self.doc_encoder = Model(inputs=pos_doc, outputs=pos_doc_sem)

    def name(self):
        """Return a short identifier encoding the encoder type and tower setup."""
        if self.enableLSTM:
            return "dssm_lstm2" if self.enableSeparate else "dssm_lstm"
        else:
            return "dssm2_%s" % self.PoolMode if self.enableSeparate else "dssm_%s" % self.PoolMode


In [29]:
# GRU-based DSSM with separate query/document towers and one negative.
dssm2 = DSSM(
    hidden_dim=200,
    latent_dim=20,
    num_negatives=1,
    nb_words=nb_words,
    max_len=max_len,
    embedding_matrix=embedding_matrix,
    optimizer=Adam(),
    enableLSTM=True,
    enableSeparate=True,
)


In [None]:
# Copy the pretrained SeqVAE weights into both towers of dssm2. Layers 3/4
# receive the embedding weights and layers 5/6 the recurrent weights - the
# indices rely on the model's layer ordering; verify with dssm2.model.summary().
for layer_index, pretrained in (
    (3, embedding_weights),
    (4, embedding_weights),
    (5, rnn_weights),
    (6, rnn_weights),
):
    dssm2.model.layers[layer_index].set_weights(pretrained)

In [None]:
# One fit() call consumes a chunk of 100 mini-batches of 256 pairs.
step = 100 * 256
for epoch in range(10):
    for i in range(0, len(q_df), step):
        q, d = q_df[i: i + step], d_df[i: i + step]

        # Negatives are the chunk's own documents in shuffled order.
        idx = np.arange(len(q))
        shuffle(idx)

        x_train = [q, d, d[idx]]
        # One-hot target: column 0 (the paired document) is always the positive.
        y_train = np.tile([1.0, 0.0], (len(q), 1))

        hist = dssm2.model.fit(x_train, y_train, verbose=0, batch_size=256)
        # NOTE(review): only the query-side encoder is handed to eval_july, so
        # the July documents are embedded by the query tower even though dssm2
        # was built with enableSeparate=True.
        print("Epoch %d, Loss %.4f, AUC %.4f" % (
            epoch,
            hist.history['loss'][-1],
            eval_july(dssm2.encoder),
        ))

        # Stop the epoch once the chunk that starts past row 100000 is done.
        if i > 100000:
            break

Epoch 0, Loss 0.4660, AUC 0.5194
Epoch 0, Loss 0.3933, AUC 0.5115
Epoch 0, Loss 0.3627, AUC 0.5084
Epoch 0, Loss 0.3421, AUC 0.5242
Epoch 0, Loss 0.3219, AUC 0.5251
Epoch 1, Loss 0.3008, AUC 0.5256
