In [2]:
from keras.layers import Bidirectional, merge, dot, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.losses import mse, binary_crossentropy
from keras.layers.advanced_activations import ELU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import backend as K
from keras.models import Model
from scipy import spatial
import tensorflow as tf
import pandas as pd
import numpy as np
import codecs
import csv
import os
import scipy.sparse
import random
import itertools
import math
from tqdm import tqdm

In [3]:
# --- data source -----------------------------------------------------------
# Tab-separated training file; read later with pandas.
TRAIN_DATA_FILE = '/data/chzho/deepqts/train_data/unifiedclick/join_oneyearsample_2B_training_all_top10'

# --- vocabulary / sequence limits ------------------------------------------
# Upper bound on the bag-of-words vocabulary passed to CountVectorizer.
max_features = 50000
# NOTE(review): the two limits below are not read anywhere in the visible
# cells -- presumably left over from a tokenizer-based variant; confirm.
MAX_NB_WORDS = 100000
MAX_SEQUENCE_LENGTH = 7

# --- training ---------------------------------------------------------------
# Initial mini-batch size; a later training cell rebinds this name to 64.
batch_size = 1000

In [None]:
%%time
# Parse at most `num_read_row` lines of the tab-separated training file.
# File columns 0, 1 and 3 are exposed as `label`, `q` and `d`; malformed
# lines are skipped instead of raising, and any row with a missing value in
# one of the three columns is dropped.
num_read_row = 1000000
df = pd.read_csv(
    TRAIN_DATA_FILE,
    sep="\t",
    header=None,
    usecols=[0, 1, 3],
    names=['label', 'q', 'd'],
    nrows=num_read_row,
    error_bad_lines=False,
).dropna()

CPU times: user 7.3 s, sys: 29.1 s, total: 36.4 s
Wall time: 45.8 s


In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Fit a single vocabulary over queries and documents together. The corpus is
# "all queries, then all documents", so row i of the matrix is query i and
# row len(df) + i is the document paired with it.
count_vect = CountVectorizer(max_features=max_features)
x_train = count_vect.fit_transform(df["q"].tolist() + df["d"].tolist())

# Re-weight the raw term counts with TF-IDF (fitted on the same matrix).
tf_transformer = TfidfTransformer()
x_train = tf_transformer.fit_transform(x_train)

# Labels stay in DataFrame row order, i.e. aligned with the query rows.
y_train = df["label"].values

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [None]:
# Undo the "queries then documents" stacking used when vectorising:
# the first len(df) rows are queries, the remaining rows are documents.
q_train, d_train = x_train[:len(df)], x_train[len(df):]

In [None]:
# %%time
# sample_num = 100000
# sup_x_train = np.concatenate((q_train[:sample_num].todense(), d_train[:sample_num].todense()), axis=1)
# sup_y_train = y_train[:sample_num]

In [None]:
class VAE():
    """Variational autoencoder with one dense hidden layer on each side.

    Call `build()` before `transform()`: `build()` is what creates the
    `encoder`, `decoder` and `model` attributes.
    """

    def __init__(self, latent_dim, hidden_dim, feature_num):
        """Remember the layer sizes; no Keras objects are created here.

        # Arguments:
            latent_dim: width of the latent vector z
            hidden_dim: width of the hidden Dense layer (encoder and decoder)
            feature_num: width of the input / reconstructed vector
        """
        self.latent_dim, self.hidden_dim, self.feature_num = (
            latent_dim, hidden_dim, feature_num)

    def transform(self, docs):
        """Return the encoder's prediction for `docs`.

        The encoder's output tensor is the sampled latent vector z, so the
        result includes the random draw made by the sampling layer.
        """
        encoded = self.encoder.predict(docs)
        return encoded

    def build(self):
        """Create encoder, decoder and the end-to-end VAE, then compile it.

        Sets `self.encoder`, `self.decoder` and `self.model`. The training
        objective (reconstruction + KL) is attached with `add_loss`, so the
        model is compiled without a `loss` argument.
        """

        def sampling(args):
            """Reparameterization trick by sampling from an isotropic unit Gaussian.
            # Arguments:
                args (tensor): mean and log of variance of Q(z|X)
            # Returns:
                z (tensor): sampled latent vector
            """
            mean, log_var = args
            n_rows = K.shape(mean)[0]
            n_cols = K.int_shape(mean)[1]
            # K.random_normal defaults to mean=0, std=1.0
            noise = K.random_normal(shape=(n_rows, n_cols))
            sigma = K.exp(0.5 * log_var)
            return mean + sigma * noise

        # ---- encoder: features -> hidden -> (z_mean, z_log_var) -> z ----
        inputs = Input(shape=(self.feature_num, ), name='encoder_input')
        enc_hidden = Dense(self.hidden_dim, activation='relu')(inputs)
        z_mean = Dense(self.latent_dim, name='z_mean')(enc_hidden)
        z_log_var = Dense(self.latent_dim, name='z_log_var')(enc_hidden)
        # The random draw lives inside a Lambda layer so it is part of the
        # graph; "output_shape" isn't necessary with the TensorFlow backend.
        z = Lambda(sampling, output_shape=(self.latent_dim,), name='z')([z_mean, z_log_var])
        self.encoder = Model(inputs, z, name='encoder')

        # ---- decoder: z -> hidden -> reconstructed features ----
        latent_inputs = Input(shape=(self.latent_dim, ), name='z_sampling')
        dec_hidden = Dense(self.hidden_dim, activation='relu')(latent_inputs)
        reconstruction = Dense(self.feature_num, activation='sigmoid')(dec_hidden)
        self.decoder = Model(latent_inputs, reconstruction, name='decoder')

        # ---- end-to-end model: decoder applied to the encoder's output ----
        vae_outputs = self.decoder(self.encoder(inputs))
        self.model = Model(inputs, vae_outputs, name='vae_mlp')

        # Cross-entropy between input and reconstruction, scaled by the
        # number of features.
        reconstruction_loss = binary_crossentropy(inputs, vae_outputs) * self.feature_num
        # KL term built from the encoder's mean / log-variance tensors.
        kl_loss = K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) * -0.5
        self.model.add_loss(K.mean(reconstruction_loss + kl_loss))
        self.model.compile(optimizer='adam')

        

In [None]:
# Size the VAE's input layer from the vectorised data instead of repeating
# the literal 50000: CountVectorizer keeps *at most* `max_features` terms,
# so the matrix can be narrower than that cap, and a hard-coded width would
# then fail with a shape mismatch as soon as real batches are fed in.
vae = VAE(latent_dim=32, hidden_dim=500, feature_num=x_train.shape[1])
vae.build()

In [101]:
# Presumably a smoke test: fit the compiled VAE on one batch of random 0/1
# vectors of the right width. No targets are passed because the loss was
# attached to the model with add_loss.
x_ = np.random.randint(0, 2, size=(batch_size, vae.feature_num))
vae.model.fit(x_, verbose=1, batch_size=batch_size)

Epoch 1/1


<keras.callbacks.History at 0x7f86c7ba14a8>

In [20]:
%%time
# The first `sample_num` rows are used for the supervised set elsewhere in
# the notebook; every row after that forms the unsupervised split.
sample_num = 100000
uns_q_train, uns_d_train = q_train[sample_num:], d_train[sample_num:]

CPU times: user 84 ms, sys: 436 ms, total: 520 ms
Wall time: 518 ms


In [111]:
%%time
# `x` is only used for its row count (it fixes the number of steps), and
# `batch_size` is rebound to 64 here; the later training cell reads both.
x = q_train[:100000]
batch_size = 64

# Unsupervised VAE training on document vectors.
# NOTE(review): the step count comes from `x` (100000 supervised rows) while
# the batches are sliced from `uns_d_train`, so only the first ~100000
# unsupervised documents are seen -- confirm this is intended.
for i in tqdm(range(math.ceil(x.shape[0]/batch_size))):
    # Densify one mini-batch at a time; the full matrix stays sparse.
    # (The query batch used to be densified here as well but was never
    # used, so that wasted work has been removed.)
    batch_d = uns_d_train[i*batch_size:(i+1)*batch_size].todense()

    # The loss was attached with add_loss, so there are no targets to pass.
    vae.model.train_on_batch(batch_d, [])

    


100%|██████████| 1563/1563 [01:44<00:00, 14.90it/s]

CPU times: user 1min 18s, sys: 22.7 s, total: 1min 41s
Wall time: 1min 44s





In [None]:
%%time
# Supervised subset: the first `sample_num` (query, document) pairs.
sample_num = 100000
sup_y_train = y_train[:sample_num]
# Each row is the query vector followed by its document vector.
# NOTE(review): both halves are densified in full; if each has 50000
# columns this is a 100000 x 100000 dense matrix (tens of GB) -- confirm
# the machine can hold it before running.
sup_x_train = np.concatenate(
    (q_train[:sample_num].todense(), d_train[:sample_num].todense()),
    axis=1,
)

In [None]:
import keras
from keras import backend as K
from keras.models import Sequential,  Model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Merge, Flatten, Dropout, GlobalAveragePooling1D
from keras.constraints import maxnorm
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from time import time
import sys

# que_input = Input(shape=(max_features,))
# doc_input = Input(shape=(max_features,))

# The classifier consumes VAE-encoded vectors, so both inputs have the
# VAE's latent width. (`latent_dim` was previously referenced without ever
# being defined, which raises NameError on a fresh kernel.)
latent_dim = vae.latent_dim

que_input = Input(shape=(latent_dim,))
doc_input = Input(shape=(latent_dim,))

# Query and document codes side by side: width 2 * latent_dim.
concat = merge([que_input, doc_input], mode="concat")

# Funnel-shaped MLP ending in a single sigmoid unit.
# The first layer no longer passes `input_dim=input_dim`: that name was never
# defined, and the layer takes its input width from the tensor it is applied to.
d1 = Dense(512, activation='relu')
d2 = Dense(256, activation='relu')
d3 = Dense(128, activation='relu')
d4 = Dense(64, activation='relu')
d5 = Dense(32, activation='relu')
d6 = Dense(1, activation='sigmoid')

out = d6(d5(d4(d3(d2(d1(concat))))))


model2 = Model(input=[que_input, doc_input], output=out)

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Supervised training of the relevance classifier on VAE-encoded pairs.
# The previous version called `model.fit(small_x_train, small_y_train, ...)`,
# none of which is defined anywhere in the notebook, and ignored the batches
# it had just built. Train `model2` on those batches instead.
n_sup = x.shape[0]  # rows in the supervised range
for i in tqdm(range(math.ceil(n_sup/batch_size))):
    # Clamp the final batch so it does not run past the supervised rows.
    start, stop = i*batch_size, min((i+1)*batch_size, n_sup)
    batch_q = q_train[start:stop].todense()
    batch_d = d_train[start:stop].todense()

    # Encode both sides with the VAE encoder; labels are row-aligned with
    # the query/document rows because all three come from `df` in order.
    model2.train_on_batch(
        [vae.transform(batch_q), vae.transform(batch_d)],
        y_train[start:stop],
    )