In [1]:
from keras.layers import Bidirectional, merge, dot, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.layers.advanced_activations import ELU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import backend as K
from keras.models import Model
from scipy import spatial
import tensorflow as tf
import pandas as pd
import numpy as np
import codecs
import csv
import os
import scipy.sparse
import random
import itertools



Using TensorFlow backend.


In [2]:
# --- Data source ------------------------------------------------------------
# Tab-separated training file; the cells below read columns 0, 1 and 3 from it.
TRAIN_DATA_FILE = '/data/chzho/deepqts/train_data/unifiedclick/join_oneyearsample_2B_training_all_top10'

# --- Batching ---------------------------------------------------------------
# Used as the Keras `batch_size` in the model.fit calls further down.
batch_size = 1000

# --- Vocabulary / sequence limits -------------------------------------------
# `max_features` caps the CountVectorizer vocabulary and sizes the VDSH input.
max_features = 50000
# NOTE(review): the two limits below are not referenced by any visible cell.
MAX_NB_WORDS = 100000
MAX_SEQUENCE_LENGTH = 7

In [3]:
%%time
# Number of leading rows of the training file to load into memory.
num_read_row = 1000000

# Options shared by both read_csv spellings below: tab-separated, no header row,
# keep file columns 0, 1 and 3 and name them label / q / d.
_read_options = dict(sep="\t", usecols=[0, 1, 3], names=['label', 'q', 'd'],
                     header=None, nrows=num_read_row)
try:
    # pandas >= 1.3: `on_bad_lines='warn'` skips malformed lines with a warning,
    # which is what `error_bad_lines=False` (with its default warning) used to do.
    df = pd.read_csv(TRAIN_DATA_FILE, on_bad_lines='warn', **_read_options)
except TypeError:
    # Older pandas does not know `on_bad_lines`; fall back to the legacy keyword.
    df = pd.read_csv(TRAIN_DATA_FILE, error_bad_lines=False, **_read_options)

# Drop rows with a missing label, query or document. After this, `len(df)` may
# be smaller than `num_read_row`.
df = df.dropna()

CPU times: user 8.68 s, sys: 656 ms, total: 9.34 s
Wall time: 9.33 s


In [4]:
def text_generator(TRAIN_DATA_FILE, batch_size):
    """Stream the query and document texts of a training file in chunks.

    The file is read as tab-separated text without a header row; file columns
    0, 1 and 3 are kept and named ``label``, ``q`` and ``d``. Malformed lines
    are skipped with a warning.

    Parameters
    ----------
    TRAIN_DATA_FILE : str
        Path of the tab-separated training file.
    batch_size : int
        Number of rows read per chunk.

    Yields
    ------
    list
        For each chunk, all ``q`` values followed by all ``d`` values of that
        chunk (so up to ``2 * batch_size`` items). Missing values are not
        filtered out.
    """
    read_options = dict(
        chunksize=batch_size,  # a chunksize alone already makes read_csv return an iterator
        sep="\t",
        usecols=[0, 1, 3],
        names=['label', 'q', 'd'],
        header=None,
    )
    try:
        # pandas >= 1.3 spelling; 'warn' matches `error_bad_lines=False` with
        # its default warning (skip the line, emit a warning).
        reader = pd.read_csv(TRAIN_DATA_FILE, on_bad_lines='warn', **read_options)
    except TypeError:
        # Older pandas does not accept `on_bad_lines`; use the legacy keyword.
        reader = pd.read_csv(TRAIN_DATA_FILE, error_bad_lines=False, **read_options)

    for chunk in reader:
        yield chunk['q'].tolist() + chunk['d'].tolist()

In [5]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Labels, in the same row order as the queries/documents vectorised below.
y_train = df.label.values

# Bag-of-words counts over all queries followed by all documents, so rows
# [0, len(df)) are queries and rows [len(df), 2*len(df)) are documents.
count_vect = CountVectorizer(max_features=max_features)
x_train = count_vect.fit_transform(df.q.tolist() + df.d.tolist())

# Re-weight the counts with TF-IDF; the fitted transformer is kept for reuse.
tf_transformer = TfidfTransformer()
tf_transformer.fit(x_train)
x_train = tf_transformer.transform(x_train)

CPU times: user 24.4 s, sys: 1.04 s, total: 25.5 s
Wall time: 25.5 s


In [6]:
# x_train stacks all query rows first and all document rows second, so the
# boundary between the two halves is the number of rows in df.
num_pairs = len(df)
q_train, d_train = x_train[:num_pairs], x_train[num_pairs:]

Split the data: 90% is used as unsupervised data and 10% as supervised data

In [7]:
%%time
# Number of leading (query, document) pairs reserved for supervised training.
sample_num = 100000
# Each supervised row is [query features | document features]: the first
# `sample_num` rows of both sparse matrices are densified and joined column-wise.
# NOTE(review): this materialises a dense matrix with sample_num rows and
# (q columns + d columns) columns, which can be very large — consider whether
# it can stay sparse.
sup_x_train = np.concatenate((q_train[:sample_num].todense(), d_train[:sample_num].todense()), axis=1)
# Labels for the same leading rows.
sup_y_train = y_train[:sample_num]


CPU times: user 12.6 s, sys: 55.2 s, total: 1min 7s
Wall time: 1min 7s


In [8]:
# %%time
# uns_x_train = np.concatenate((q_train[sample_num:].todense(), d_train[sample_num:].todense()))
# uns_y_train = uns_x_train

In [9]:
# A __future__ import has to be the first statement of a module; keeping it at
# the top lets this cell also run when the notebook is exported as a script.
from __future__ import print_function

import tensorflow as tf
import numpy as np
# NOTE(review): star imports; `get_session` and `VDSH` used below presumably
# come from these two project modules — confirm, and prefer explicit names.
from utils import *
from VDSH import *


# Size of the latent code; passed to VDSH here and used for the Keras inputs later.
latent_dim = 32
sess = get_session("0") # choose the GPU and how much memory in percentage that we need
model = VDSH(sess, latent_dim, max_features)

# create an optimizer
learning_rate = 0.001
decay_rate = 0.96
decay_step = 10000
# Global step counter; incremented by the optimizer on every minimize() run.
step = tf.Variable(0, trainable=False)
# Staircase schedule: the rate is multiplied by `decay_rate` every `decay_step` steps.
lr = tf.train.exponential_decay(learning_rate,
                                step,
                                decay_step,
                                decay_rate,
                                staircase=True, name="lr")

my_optimizer = tf.train.AdamOptimizer(learning_rate=lr) \
                     .minimize(model.cost, global_step=step)

# Initialise all TF variables (model weights, step counter, optimizer slots).
init = tf.global_variables_initializer()
model.sess.run(init)

In [None]:
%%time
# Everything after the first `sample_num` pairs is kept for unsupervised training.
unsupervised_rows = slice(sample_num, None)
uns_q_train = q_train[unsupervised_rows]
uns_d_train = d_train[unsupervised_rows]

CPU times: user 112 ms, sys: 88 ms, total: 200 ms
Wall time: 196 ms


In [None]:
%%time
total_epoch = 25
kl_weight = 0.
kl_inc = 1 / 5000. # set the annealing rate for KL loss

# Take the number of training pairs from the data instead of hard-coding it:
# rows dropped by dropna() make the matrices shorter than a fixed 900000, and
# indexing past the end would raise an IndexError.
num_uns_pairs = min(uns_q_train.shape[0], uns_d_train.shape[0])


def _vdsh_train_step(bow_row, kl_weight, keep_prob=0.9):
    """Run one optimizer step on a single sparse bag-of-words row; return its loss."""
    doc = bow_row.todense()
    # Column indices of the words present in this row.
    word_indice = np.where(doc > 0)[1]
    _, loss = model.sess.run((my_optimizer, model.cost),
                             feed_dict={model.input_bow: doc.reshape((-1, max_features)),
                                        model.input_bow_idx: word_indice,
                                        model.kl_weight: kl_weight,
                                        model.keep_prob: keep_prob})
    return loss


for epoch in range(total_epoch):
    epoch_loss = []
    # Running sum of the epoch's losses, so the average shown in the progress
    # line does not re-scan the whole (growing) list every 50 steps.
    epoch_loss_total = 0.0
    for i in range(num_uns_pairs):
        # Train on the query first, then on the document of the same pair.
        for bow_matrix in (uns_q_train, uns_d_train):
            loss = _vdsh_train_step(bow_matrix[i], kl_weight)

            # Anneal the KL weight linearly up to 1.0, one increment per step.
            kl_weight = min(kl_weight + kl_inc, 1.0)
            epoch_loss.append(loss)
            epoch_loss_total += float(loss)

        if i % 50 == 0:
            print("\rEpoch:{}/{} {}/{}: Loss:{:.3f} AvgLoss:{:.3f}"
                  .format(epoch+1, total_epoch, i, num_uns_pairs, loss,
                          epoch_loss_total / len(epoch_loss)), end='')

Epoch:4/25 291050/900000: Loss:19.924 AvgLoss:48.38781

### Encode queries and documents for testing

In [None]:
# Encode the supervised subset (the first `sample_num` pairs) with the model's
# transform(). `model` must still be the VDSH instance here; a later cell
# rebinds the name `model` to a Keras Sequential network.
supervised_rows = slice(None, sample_num)
enc_q = model.transform(q_train[supervised_rows].todense())
enc_d = model.transform(d_train[supervised_rows].todense())

In [63]:
# Convert both sets of codes to NumPy arrays so they can be fed to Keras below.
enc_q, enc_d = np.array(enc_q), np.array(enc_d)

In [64]:
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
# fix random seed for reproducibility
# NOTE(review): this seeds NumPy's global RNG only; whether it also covers the
# Keras/TensorFlow weight initialisation depends on the backend — confirm.
np.random.seed(7)

In [65]:
# create model
# The network sees a query vector and a document vector side by side, so its
# input width is the sum of the two feature widths.
input_dim = q_train.shape[1] + d_train.shape[1]

# Funnel of ReLU layers ending in a single sigmoid unit (binary classifier).
# NOTE: this rebinds `model`, replacing the VDSH object created in an earlier cell.
model = Sequential([
    Dense(512, activation='relu', input_dim=input_dim),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [33]:
# Compile model
# Binary cross-entropy matches the single sigmoid output; accuracy is reported
# alongside the loss during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [34]:
# Train the dense baseline on the concatenated [query | document] TF-IDF rows.
# The previous names `small_x_train` / `small_y_train` are not defined anywhere
# in this notebook; `sup_x_train` / `sup_y_train` are the supervised arrays built
# above, and their `sample_num` rows match the 80000/20000 split in the output.
# The last 20% of the rows are held out for validation.
model.fit(sup_x_train, sup_y_train, batch_size=batch_size, validation_split=0.2, verbose=2)

Train on 80000 samples, validate on 20000 samples
Epoch 1/1
 - 166s - loss: 0.6172 - acc: 0.6273 - val_loss: 0.5975 - val_acc: 0.6575


<keras.callbacks.History at 0x7f24187b4ef0>

In [35]:
# Class-balance check: how many labels are non-zero.
np.count_nonzero(y_train)

53080

In [36]:
# Total number of labels, for comparison with the non-zero count above.
len(y_train)

100000

In [38]:
# NOTE(review): neither `random` nor `p` is used by the read below, which loads
# the first 50 rows of the file rather than a random 1% sample — confirm intent.
import random
p = 0.01  # 1% of the lines
# Same column selection/naming as the main load: file columns 0, 1, 3 -> label, q, d.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas in favour of `on_bad_lines`.
random_df = pd.read_csv(TRAIN_DATA_FILE, sep="\t", usecols=[0,1,3], names=['label', 'q', 'd'], header=None , error_bad_lines=False, nrows=50)


In [77]:
import keras
from keras import backend as K
from keras.models import Sequential,  Model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Merge, Flatten, Dropout, GlobalAveragePooling1D
# Keras 2 replacement for the deprecated `merge(..., mode="concat")`.
from keras.layers import concatenate
from keras.constraints import maxnorm
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from time import time
import sys

# Alternative inputs for training directly on the TF-IDF features:
# que_input = Input(shape=(max_features,))
# doc_input = Input(shape=(max_features,))

# Two-input classifier over the latent codes: one latent_dim-wide vector for
# the query and one for the document.
que_input = Input(shape=(latent_dim,))
doc_input = Input(shape=(latent_dim,))

# Join query and document features along the last axis (the same axis the old
# `merge(mode="concat")` used by default).
concat = concatenate([que_input, doc_input])

# Same funnel as the Sequential baseline. No `input_dim` is given on the first
# layer: in the functional API the input width comes from the tensor it is
# called on, and the previous value described the TF-IDF baseline, not these inputs.
d1 = Dense(512, activation='relu')
d2 = Dense(256, activation='relu')
d3 = Dense(128, activation='relu')
d4 = Dense(64, activation='relu')
d5 = Dense(32, activation='relu')
d6 = Dense(1, activation='sigmoid')

out = d6(d5(d4(d3(d2(d1(concat))))))


# Keras 2 keyword names (`inputs` / `outputs`); the singular forms are deprecated.
model2 = Model(inputs=[que_input, doc_input], outputs=out)

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  name=name)


TF-IDF features

In [22]:
# Train the two-input classifier on the dense TF-IDF rows of the supervised subset.
# NOTE(review): these rows are as wide as the vectorizer vocabulary, so this run
# presumably used the commented-out `Input(shape=(max_features,))` inputs in the
# model2 cell rather than the latent_dim-wide ones — confirm before re-running.
model2.fit([q_train[:sample_num].todense(), d_train[:sample_num].todense()], sup_y_train, batch_size=batch_size, validation_split=0.2, verbose=2)

Train on 80000 samples, validate on 20000 samples
Epoch 1/1
 - 179s - loss: 0.6191 - acc: 0.6275 - val_loss: 0.6011 - val_acc: 0.6517


<keras.callbacks.History at 0x7f6f7b610ba8>

VAE 

In [80]:
# Train the two-input classifier on the encoded queries/documents (enc_q, enc_d)
# for 10 epochs, holding out the last 20% of the rows for validation.
# NOTE(review): in the recorded output below, accuracy stays at 0.5312 (val 0.5293)
# for every epoch, i.e. the classifier is not learning from these codes —
# worth investigating the encoder output before drawing conclusions.
model2.fit([enc_q, enc_d], sup_y_train, batch_size=batch_size, validation_split=0.2, verbose=2, epochs=10)

Train on 80000 samples, validate on 20000 samples
Epoch 1/10
 - 1s - loss: 0.6912 - acc: 0.5312 - val_loss: 0.6915 - val_acc: 0.5293
Epoch 2/10
 - 1s - loss: 0.6913 - acc: 0.5312 - val_loss: 0.6915 - val_acc: 0.5293
Epoch 3/10
 - 1s - loss: 0.6912 - acc: 0.5312 - val_loss: 0.6914 - val_acc: 0.5293
Epoch 4/10
 - 1s - loss: 0.6912 - acc: 0.5312 - val_loss: 0.6915 - val_acc: 0.5293
Epoch 5/10
 - 1s - loss: 0.6912 - acc: 0.5312 - val_loss: 0.6914 - val_acc: 0.5293
Epoch 6/10
 - 1s - loss: 0.6912 - acc: 0.5312 - val_loss: 0.6915 - val_acc: 0.5293
Epoch 7/10
 - 1s - loss: 0.6912 - acc: 0.5312 - val_loss: 0.6914 - val_acc: 0.5293
Epoch 8/10
 - 1s - loss: 0.6912 - acc: 0.5312 - val_loss: 0.6915 - val_acc: 0.5293
Epoch 9/10
 - 1s - loss: 0.6912 - acc: 0.5312 - val_loss: 0.6916 - val_acc: 0.5293
Epoch 10/10
 - 0s - loss: 0.6912 - acc: 0.5312 - val_loss: 0.6914 - val_acc: 0.5293


<keras.callbacks.History at 0x7f39d8033cf8>