In [1]:
from keras.layers import Bidirectional, merge, Flatten, dot, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.losses import mse, binary_crossentropy
from keras.layers.advanced_activations import ELU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import backend as K
from keras.models import Model
from scipy import spatial
import tensorflow as tf
import pandas as pd
import numpy as np
import codecs
import csv
import os
import scipy.sparse
import random
import itertools
import math
from tqdm import tqdm

Using TensorFlow backend.


In [9]:
TRAIN_DATA_FILE = '/data/chzho/deepqts/train_data/unifiedclick/join_oneyearsample_2B_training_all_top10'
batch_size = 1000
MAX_SEQUENCE_LENGTH = 7
MAX_NB_WORDS = 1000000
max_features = 50000

In [4]:
%%time
num_read_row = 10000000
df = pd.read_csv(TRAIN_DATA_FILE, sep="\t", usecols=[0,1,3], names=['label', 'q', 'd'], header=None , error_bad_lines=False, nrows=num_read_row)
df = df.dropna()

CPU times: user 1min 26s, sys: 6.18 s, total: 1min 32s
Wall time: 1min 32s


In [4]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

count_vect = CountVectorizer(max_features=max_features)
x_train = count_vect.fit_transform(df.q.tolist() + df.d.tolist())
tf_transformer = TfidfTransformer().fit(x_train)
x_train = tf_transformer.transform(x_train)
y_train = df.label.values

CPU times: user 25.3 s, sys: 760 ms, total: 26.1 s
Wall time: 26.1 s


In [56]:
x_train

<2000000x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 7974826 stored elements in Compressed Sparse Row format>

In [5]:
q_train = x_train[:len(df)]
d_train = x_train[len(df):]

In [None]:
# %%time
# sample_num = 100000
# sup_x_train = np.concatenate((q_train[:sample_num].todense(), d_train[:sample_num].todense()), axis=1)
# sup_y_train = y_train[:sample_num]

In [24]:
class VAE():
    def __init__(self, latent_dim, hidden_dim, feature_num):
        
        self.latent_dim = latent_dim
        self.hidden_dim = hidden_dim
        self.feature_num = feature_num
    
    def transform(self, docs):
        return self.encoder.predict(docs)
    
    def build(self):
        
        def sampling(args):
            
            """Reparameterization trick by sampling fr an isotropic unit Gaussian.
            # Arguments:
                args (tensor): mean and log of variance of Q(z|X)
            # Returns:
                z (tensor): sampled latent vector
            """
            z_mean, z_log_var = args
            batch = K.shape(z_mean)[0]
            dim = K.int_shape(z_mean)[1]
            # by default, random_normal has mean=0 and std=1.0
            epsilon = K.random_normal(shape=(batch, dim))
            return z_mean + K.exp(0.5 * z_log_var) * epsilon
        
        # VAE model = encoder + decoder
        # build encoder model
        inputs = Input(shape=(self.feature_num, ), name='encoder_input')
        x = Flatten()(embed(inputs))
        x = Dense(self.hidden_dim, activation='relu')(x)
        z_mean = Dense(self.latent_dim, name='z_mean')(x)
        z_log_var = Dense(self.latent_dim, name='z_log_var')(x)

        # use reparameterization trick to push the sampling out as input
        # note that "output_shape" isn't necessary with the TensorFlow backend
        z = Lambda(sampling, output_shape=(self.latent_dim,), name='z')([z_mean, z_log_var])

        # instantiate encoder model
        self.encoder = Model(inputs, z, name='encoder')

        # build decoder model
        latent_inputs = Input(shape=(self.latent_dim, ), name='z_sampling')
        x = Dense(self.hidden_dim, activation='relu')(latent_inputs)
        outputs = Dense(self.feature_num, activation='sigmoid')(x)

        # instantiate decoder model
        self.decoder = Model(latent_inputs, outputs, name='decoder')


        # instantiate VAE model
        outputs = self.decoder(self.encoder(inputs))
        self.model = Model(inputs, outputs, name='vae_mlp')
        
        reconstruction_loss = binary_crossentropy(inputs,
                                                  outputs)
        reconstruction_loss *= self.feature_num
        kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5
        vae_loss = K.mean(reconstruction_loss + kl_loss)
        self.model.add_loss(vae_loss)
        self.model.compile(optimizer='adam')
        

In [25]:
vae = VAE(200,1400, 7)
vae.build()



In [8]:
%%time
sample_num = 100000
uns_q_train = q_train[sample_num:]
uns_d_train = d_train[sample_num:]

CPU times: user 124 ms, sys: 88 ms, total: 212 ms
Wall time: 211 ms


In [60]:
uns_q_train

<900000x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 2571627 stored elements in Compressed Sparse Row format>

In [53]:
%%time
x = q_train[:100000]
batch_size = 64

for ep in range(1):

    for i in tqdm(range(math.ceil(uns_q_train.shape[0]/batch_size))):
        batch_q = uns_q_train[i*batch_size:(i+1)*batch_size].todense()
        batch_d = uns_d_train[i*batch_size:(i+1)*batch_size].todense()

        vae.model.train_on_batch(batch_d, [])
    
    train_mlp()
    evaluate()

    


100%|██████████| 1563/1563 [01:35<00:00, 16.36it/s]
100%|██████████| 1250/1250 [01:58<00:00, 10.51it/s]
100%|██████████| 2000/2000 [00:39<00:00, 50.58it/s]
  0%|          | 2/1563 [00:00<01:50, 14.15it/s]

0.508125012428


100%|██████████| 1563/1563 [01:35<00:00, 16.29it/s]
100%|██████████| 1250/1250 [01:57<00:00, 10.66it/s]
100%|██████████| 2000/2000 [00:38<00:00, 52.79it/s]
  0%|          | 2/1563 [00:00<01:42, 15.24it/s]

0.51920001292


100%|██████████| 1563/1563 [01:35<00:00, 16.30it/s]
100%|██████████| 1250/1250 [01:56<00:00, 10.69it/s]
100%|██████████| 2000/2000 [00:39<00:00, 50.41it/s]
  0%|          | 2/1563 [00:00<01:44, 14.93it/s]

0.524500013161


100%|██████████| 1563/1563 [01:35<00:00, 16.28it/s]
100%|██████████| 1250/1250 [01:57<00:00, 10.46it/s]
100%|██████████| 2000/2000 [00:39<00:00, 50.72it/s]
  0%|          | 2/1563 [00:00<01:47, 14.53it/s]

0.527920013409


100%|██████████| 1563/1563 [01:35<00:00, 16.39it/s]
100%|██████████| 1250/1250 [01:56<00:00, 10.70it/s]
100%|██████████| 2000/2000 [00:39<00:00, 50.90it/s]
  0%|          | 2/1563 [00:00<02:05, 12.41it/s]

0.532216680278


100%|██████████| 1563/1563 [01:35<00:00, 16.43it/s]
100%|██████████| 1250/1250 [01:55<00:00, 10.71it/s]
100%|██████████| 2000/2000 [00:39<00:00, 50.84it/s]
  0%|          | 2/1563 [00:00<01:58, 13.20it/s]

0.535585728075


100%|██████████| 1563/1563 [01:35<00:00, 16.36it/s]
100%|██████████| 1250/1250 [01:56<00:00, 10.34it/s]
100%|██████████| 2000/2000 [00:38<00:00, 51.44it/s]
  0%|          | 2/1563 [00:00<01:43, 15.03it/s]

0.538343763873


100%|██████████| 1563/1563 [01:35<00:00, 16.37it/s]
100%|██████████| 1250/1250 [01:56<00:00, 10.69it/s]
100%|██████████| 2000/2000 [00:39<00:00, 51.10it/s]
  0%|          | 2/1563 [00:00<02:08, 12.17it/s]

0.540800014019


100%|██████████| 1563/1563 [01:35<00:00, 16.31it/s]
100%|██████████| 1250/1250 [01:56<00:00, 10.68it/s]
100%|██████████| 2000/2000 [00:39<00:00, 50.99it/s]
  0%|          | 2/1563 [00:00<01:56, 13.45it/s]

0.543050014093


100%|██████████| 1563/1563 [01:35<00:00, 16.37it/s]
100%|██████████| 1250/1250 [01:56<00:00, 10.69it/s]
100%|██████████| 2000/2000 [00:38<00:00, 51.47it/s]

0.54491819594
CPU times: user 34min 58s, sys: 11min 21s, total: 46min 20s
Wall time: 41min 59s





In [31]:
import keras
from keras import backend as K
from keras.models import Sequential,  Model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Merge, Flatten, Dropout, GlobalAveragePooling1D
from keras.constraints import maxnorm
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from time import time
import sys

def mlp(latent_dim):

    que_input = Input(shape=(latent_dim,))
    doc_input = Input(shape=(latent_dim,))

    concat = merge([que_input, doc_input], mode="concat")

    d1 = Dense(512, activation='relu')
    d2 = Dense(256, activation='relu')
    d3 = Dense(128, activation='relu')
    d4 = Dense(64, activation='relu')
    d5 = Dense(32, activation='relu')
    d6 = Dense(1, activation='sigmoid')

    out = d6(d5(d4(d3(d2(d1(concat))))))

    model = Model(input=[que_input, doc_input], output=out)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model


In [29]:
test_num = 20000
sup_q_train = q_train[:sample_num-test_num]
sup_d_train = d_train[:sample_num-test_num]
sup_y_train = y_train[:sample_num-test_num]

sup_q_test = q_train[sample_num-test_num:sample_num]
sup_d_test = d_train[sample_num-test_num:sample_num]
sup_y_test = y_train[sample_num-test_num:sample_num]


In [28]:
from sklearn.model_selection import train_test_split



In [63]:
%%time

def train_mlp(model):

    batch_size = 64

    for i in tqdm(range(math.ceil(sup_q_train.shape[0]/batch_size))):
        batch_q = sup_q_train[i*batch_size:(i+1)*batch_size].todense()
        batch_d = sup_d_train[i*batch_size:(i+1)*batch_size].todense()
        batch_y = sup_y_train[i*batch_size:(i+1)*batch_size]

    #   encode inputs

        enc_q = vae.encoder.predict(batch_q)
        enc_d = vae.encoder.predict(batch_d)

        loss = model.train_on_batch([enc_q, enc_d], batch_y)
#     print("\r Loss:{:.3f}".format(loss), end='')
    


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 74.9 µs


In [66]:
mean_avg = []
def evaluate_vae(model):
    batch_size = 10
    for i in tqdm(range(math.ceil(sup_q_test.shape[0]/batch_size))):
        batch_q = sup_q_test[i*batch_size:(i+1)*batch_size].todense()
        batch_d = sup_d_test[i*batch_size:(i+1)*batch_size].todense()
        batch_y = sup_y_test[i*batch_size:(i+1)*batch_size]

        enc_q = vae.encoder.predict(batch_q)
        enc_d = vae.encoder.predict(batch_d)

        avg = model.evaluate([enc_q, enc_d], batch_y, verbose=0)
        mean_avg.append(avg[1])
    print(np.average(mean_avg))

In [74]:
def train_mlp(model):

    batch_size = 1000

    for i in tqdm(range(math.ceil(sup_q_train.shape[0]/batch_size))):
        batch_q = sup_q_train[i*batch_size:(i+1)*batch_size].todense()
        batch_d = sup_d_train[i*batch_size:(i+1)*batch_size].todense()
        batch_y = sup_y_train[i*batch_size:(i+1)*batch_size]

        loss = model.train_on_batch([batch_q, batch_d], batch_y)

def evaluate_mlp(model):
    batch_size = 10
    for i in tqdm(range(math.ceil(sup_q_test.shape[0]/batch_size))):
        batch_q = sup_q_test[i*batch_size:(i+1)*batch_size].todense()
        batch_d = sup_d_test[i*batch_size:(i+1)*batch_size].todense()
        batch_y = sup_y_test[i*batch_size:(i+1)*batch_size]

        avg = model.evaluate([batch_q, batch_d], batch_y, verbose=0)
        mean_avg.append(avg[1])
    print(np.average(mean_avg))

In [75]:
tf_idf = mlp(50000)
train_mlp(tf_idf)
evaluate_mlp(tf_idf)

  name=name)

  0%|          | 0/80 [00:00<?, ?it/s][A
100%|██████████| 80/80 [01:15<00:00,  1.05it/s]
100%|██████████| 2000/2000 [00:41<00:00, 48.03it/s]

0.655575016832





In [None]:
df.q.tolist() + df.d.tolist()

In [10]:
len(df.q.tolist())

9999994

In [11]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(MAX_NB_WORDS)
tokenizer.fit_on_texts(df.q.tolist() + df.d.tolist())
word_index = tokenizer.word_index #the dict values start from 1 so this is fine with zeropadding
index2word = {v: k for k, v in word_index.items()}
print('Found %s unique tokens' % len(word_index))
NB_WORDS = (min(tokenizer.num_words, len(word_index)) + 1 ) #+1 for zero padding
print('Number of Vocab: %d' % NB_WORDS)

Found 3090780 unique tokens
Number of Vocab: 1000001


In [12]:
import pickle

# saving
with open('/home/t-jamano/data/10M_query_title_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
# loading
with open('/home/t-jamano/data/10M_query_title_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [13]:
sequences = tokenizer.texts_to_sequences(df.q.tolist())
data_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

d_sequences = tokenizer.texts_to_sequences(df.d.tolist())
d_data_train = pad_sequences(d_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [15]:
np.save('/home/t-jamano/data/10M_query_token', data_train)
np.save('/home/t-jamano/data/10M_title_token', d_data_train)
np.save('/home/t-jamano/data/10M_labels', df.label.values)

In [102]:
# np.load('/home/t-jamano/data/10M_query_token.npy')[0]

array([ 0,  0,  0,  0, 14, 42,  2], dtype=int32)

In [19]:
GLOVE_EMBEDDING = '/home/t-jamano/data/glove/glove.6B.50d.txt'
embeddings_index = {}
f = open(GLOVE_EMBEDDING, encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

EMBEDDING_DIM = 50

glove_embedding_matrix = np.zeros((NB_WORDS, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < NB_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be the word embedding of 'unk'.
            glove_embedding_matrix[i] = embedding_vector
        else:
            glove_embedding_matrix[i] = embeddings_index.get('unk')
print('Null word embeddings: %d' % np.sum(np.sum(glove_embedding_matrix, axis=1) == 0))

Found 400000 word vectors.


In [21]:
np.save('/home/t-jamano/data/10M_glove_embedding_matrix', glove_embedding_matrix)


In [2]:
glove_embedding_matrix = np.load('/home/t-jamano/data/10M_glove_embedding_matrix.npy')

In [3]:
NB_WORDS = 1000000 + 1
EMBEDDING_DIM = 50
MAX_SEQUENCE_LENGTH = 7
embed = Embedding(NB_WORDS, EMBEDDING_DIM, weights=[glove_embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False)

In [4]:
def w2v(input_dim):
    
    que_input = Input(shape=(input_dim,))
    doc_input = Input(shape=(input_dim,))
    
    embed_q = Flatten()(embed(que_input))
    embed_d = Flatten()(embed(doc_input))
    
    

    concat = merge([que_input, doc_input], mode="concat")

    d1 = Dense(512, activation='relu')
    d2 = Dense(256, activation='relu')
    d3 = Dense(128, activation='relu')
    d4 = Dense(64, activation='relu')
    d5 = Dense(32, activation='relu')
    d6 = Dense(1, activation='sigmoid')

    out = d6(d5(d4(d3(d2(d1(concat))))))

    model = Model(input=[que_input, doc_input], output=out)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

    

In [5]:
w2v_model = w2v(MAX_SEQUENCE_LENGTH)

  name=name)


In [6]:
query = np.load('/home/t-jamano/data/10M_query_token.npy')
title = np.load('/home/t-jamano/data/10M_title_token.npy')
label = np.load('/home/t-jamano/data/10M_labels.npy')

In [10]:
w2v_model.fit([query, title], label, verbose=2, batch_size=256, epochs=2, validation_split=0.33)

Train on 6699995 samples, validate on 3299999 samples
Epoch 1/2
 - 166s - loss: 6.0979 - acc: 0.5319 - val_loss: 0.6804 - val_acc: 0.5304
Epoch 2/2
 - 165s - loss: 0.6922 - acc: 0.5307 - val_loss: 0.6913 - val_acc: 0.5304


<keras.callbacks.History at 0x7fc9e0aafb38>

In [15]:
sup_sample_num = 1000000
sup_query = query[:sup_sample_num]
sup_title = title[:sup_sample_num]
sup_label = label[:sup_sample_num]

uns_doc = np.concatenate((query[sup_sample_num:], title[sup_sample_num:]))
uns_label = label[sup_sample_num:]



In [12]:
NB_WORDS = 1000000 + 1
EMBEDDING_DIM = 50
MAX_SEQUENCE_LENGTH = 7

In [19]:
batch_size = 256
max_len = MAX_SEQUENCE_LENGTH
emb_dim = EMBEDDING_DIM
latent_dim = 200
intermediate_dim = 64
epsilon_std = 1.0
num_sampled=500
act = ELU()

#y = Input(batch_shape=(None, max_len, NB_WORDS))
x = Input(batch_shape=(None, max_len))
x_embed = embed(x)
h = LSTM(intermediate_dim, return_sequences=False, recurrent_dropout=0.2)(x_embed)
# h = Dropout(0.2)(h)
# h = Dense(intermediate_dim, activation='linear')(h)
# h = act(h)
# h = Dropout(0.2)(h)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
                              stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# we instantiate these layers separately so as to reuse them later
repeated_context = RepeatVector(max_len)
decoder_h = LSTM(intermediate_dim, return_sequences=True, recurrent_dropout=0.2)
decoder_mean = TimeDistributed(Dense(NB_WORDS, activation='linear'))#softmax is applied in the seq2seqloss by tf
h_decoded = decoder_h(repeated_context(z))
x_decoded_mean = decoder_mean(h_decoded)


# placeholder loss
def zero_loss(y_true, y_pred):
    return K.zeros_like(y_pred)

# Custom VAE loss layer
class CustomVariationalLayer(Layer):
    def __init__(self, **kwargs):
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)
        self.target_weights = tf.constant(np.ones((batch_size, max_len)), tf.float32)

    def vae_loss(self, x, x_decoded_mean):
        #xent_loss = K.sum(metrics.categorical_crossentropy(x, x_decoded_mean), axis=-1)
        labels = tf.cast(x, tf.int32)
        xent_loss = K.sum(tf.contrib.seq2seq.sequence_loss(x_decoded_mean, labels, 
                                                     weights=self.target_weights,
                                                     average_across_timesteps=False,
                                                     average_across_batch=False), axis=-1)
                                                     #softmax_loss_function=softmax_loss_f), axis=-1)#, uncomment for sampled softmax
        kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
        return K.mean(xent_loss + kl_loss)

    def call(self, inputs):
        x = inputs[0]
        x_decoded_mean = inputs[1]
        print(x.shape, x_decoded_mean.shape)
        loss = self.vae_loss(x, x_decoded_mean)
        self.add_loss(loss, inputs=inputs)
        # we don't use this output, but it has to have the correct shape:
        return K.ones_like(x)

loss_layer = CustomVariationalLayer()([x, x_decoded_mean])
vae = Model(x, [loss_layer])
# opt = Adam(lr=0.01) #SGD(lr=1e-2, decay=1e-6, momentum=0.9, nesterov=True)
vae.compile(optimizer='adam', loss=[zero_loss])
# vae.summary()

# build a model to project sentences on the latent space
encoder = Model(x, z_mean)

(?, 7) (?, 7, 1000001)


In [34]:
uns_doc[:10]

array([[     0,      0,      0,      0,   4221,  30633, 350544],
       [     0,      0,      0,   1962,    710,     40,   5543],
       [     0,      0,      0,      0,      0,      0, 891415],
       [     0,      0,      0,      0,      0,      0,   1669],
       [     0,      0,      0,  16191,   2921,     11,     10],
       [     0,      0,      0,   2063,    421, 177638,     40],
       [     0,      0,      0,      0,      0,    720, 891416],
       [     0,      0,      0,      0,     81,     36,      2],
       [     0,      0,      0,      0,      0,   6288,   9574],
       [     0,      0,      0,      0,    584,  91076,    383]], dtype=int32)

In [28]:
vae.model.fit(uns_doc, verbose=2, batch_size=256, epochs=2, validation_split=0.2)

Train on 14399990 samples, validate on 3599998 samples
Epoch 1/2
 - 326s - loss: -2.1132e+06 - val_loss: -2.2566e+06
Epoch 2/2
 - 327s - loss: -2.1133e+06 - val_loss: -2.2566e+06


<keras.callbacks.History at 0x7fc4731e9c50>

In [33]:
vae_mlp_model = mlp(latent_dim)
enc_q = vae.encoder.predict(query)
enc_d = vae.encoder.predict(title)
vae_mlp_model.fit([enc_q, enc_d], label, verbose=2, batch_size=256, epochs=2, validation_split=0.33)

  name=name)


Train on 6699995 samples, validate on 3299999 samples
Epoch 1/2
 - 195s - loss: 0.6913 - acc: 0.5306 - val_loss: 0.6913 - val_acc: 0.5304
Epoch 2/2
 - 194s - loss: 0.6913 - acc: 0.5307 - val_loss: 0.6913 - val_acc: 0.5304


<keras.callbacks.History at 0x7fc472c37f60>