In [3]:
from keras.layers import GlobalAveragePooling1D, merge, Bidirectional, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.layers.advanced_activations import ELU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import backend as K
from keras.models import Model
from scipy import spatial
import tensorflow as tf
import pandas as pd
import numpy as np
import codecs
import csv
import os
from sklearn import metrics

import os
# CUDA_VISIBLE_DEVICES limits which GPUs CUDA-based libraries can see;
# "0" exposes only the first device to this kernel.
# NOTE(review): this is set after `import tensorflow` above — presumably it
# still takes effect because no GPU session exists yet; confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [4]:
%%time
import pandas as pd

# Parsing options common to every file below: tab-separated, no header row,
# malformed lines are dropped instead of aborting the read.
_TSV_KWARGS = dict(sep="\t", header=None, error_bad_lines=False)

# --- Training pairs: columns 0, 1 and 3 become label / q / d -----------------
TRAIN_DATA_FILE = '/data/chzho/deepqts/train_data/unifiedclick/join_oneyearsample_2B_training_all_top10'
num_read_row = 1000000  # only the first `num_read_row` rows are loaded
df = pd.read_csv(
    TRAIN_DATA_FILE,
    usecols=[0, 1, 3],
    names=['label', 'q', 'd'],
    nrows=num_read_row,
    **_TSV_KWARGS
).dropna()

# --- First test set: adds a `market` column, keeps only en-US rows -----------
TEST_DATA_FILE = '/data/chzho/deepqts/test_data/uhrs/unified/uhrs_do_10'
df_qd = pd.read_csv(
    TEST_DATA_FILE,
    usecols=[0, 1, 3, 5],
    names=['label', 'q', 'd', 'market'],
    **_TSV_KWARGS
).dropna()
df_qd = df_qd.loc[df_qd.market == "en-US"]

# --- Second test set: columns are already ordered q / d / label --------------
# TEST_DATA_FILE is deliberately re-pointed here, as in the original cell.
TEST_DATA_FILE = '/data/chzho/deepqts/test_data/julyflower/julyflower_original.tsv'
df_qq = pd.read_csv(
    TEST_DATA_FILE,
    names=['q', 'd', 'label'],
    **_TSV_KWARGS
).dropna()

CPU times: user 7.16 s, sys: 504 ms, total: 7.66 s
Wall time: 7.66 s


In [81]:
# Corpus used to fit the tokenizer: the q column then the d column of each
# frame, in the order df_qd, df_qq, df.
texts = [
    text
    for frame in (df_qd, df_qq, df)
    for column in (frame.q, frame.d)
    for text in column.tolist()
]

In [83]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

EMBEDDING_DIM = 50
MAX_SEQUENCE_LENGTH = 5

# Build the vocabulary from every query/document string collected in `texts`.
tokenizer = Tokenizer(500000)
tokenizer.fit_on_texts(texts)

# Word ids start at 1, so id 0 stays free for zero padding.
word_index = tokenizer.word_index
index2word = dict((idx, word) for word, idx in word_index.items())
print('Found %s unique tokens' % len(word_index))

# Vocabulary size is capped by the tokenizer limit; the extra slot is the
# zero-padding id.
NB_WORDS = 1 + min(tokenizer.nb_words, len(word_index))
print('Number of Vocab: %d' % NB_WORDS)



Found 711013 unique tokens
Number of Vocab: 500001


In [90]:
def _to_padded_ids(text_column):
    """Turn a pandas text column into a padded array of word ids.

    Each text is converted to word ids with the fitted `tokenizer`, then
    passed through `pad_sequences` with `maxlen=MAX_SEQUENCE_LENGTH`.
    """
    id_lists = tokenizer.texts_to_sequences(text_column.tolist())
    return pad_sequences(id_lists, maxlen=MAX_SEQUENCE_LENGTH)


# Labelled pairs from df_qq.
q_train_qq = _to_padded_ids(df_qq.q)
d_train_qq = _to_padded_ids(df_qq.d)
y_train_qq = df_qq.label.values

# Pairs from df, encoded without their labels.
uns_q = _to_padded_ids(df.q)
uns_d = _to_padded_ids(df.d)


In [8]:
GLOVE_EMBEDDING = '/home/t-jamano/data/glove/glove.6B.50d.txt'

# Parse the GloVe text file: each line is a word followed by its coefficients.
# The `with` block guarantees the handle is closed even if a line fails to parse.
embeddings_index = {}
with open(GLOVE_EMBEDDING, encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to index (values[0] would raise IndexError).
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# Fallback vector for vocabulary words that GloVe does not contain.
# Looked up once rather than on every miss.
unk_vector = embeddings_index.get('unk')

# Row i holds the vector for word id i; row 0 (padding) stays all-zero.
glove_embedding_matrix = np.zeros((NB_WORDS, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= NB_WORDS:
        continue
    # Words not found in the embedding index get the embedding of 'unk'.
    embedding_vector = embeddings_index.get(word, unk_vector)
    if embedding_vector is not None:
        glove_embedding_matrix[i] = embedding_vector
    # else: 'unk' itself is missing from the file; leave the row as zeros
    # instead of writing a missing (None) vector into the matrix.
print('Null word embeddings: %d' % np.sum(np.sum(glove_embedding_matrix, axis=1) == 0))

Found 400000 word vectors.
Null word embeddings: 1


In [85]:
class W2V():
    """Averaged-word-embedding baseline for scoring a (q, d) pair.

    Both inputs go through one shared Embedding layer, are mean-pooled over
    the sequence axis, concatenated, and fed to a single sigmoid unit. The
    compiled Keras model is exposed as `self.model`.

    Parameters
    ----------
    input_dim : int
        Number of word ids per input sequence.
    emb_dim : int
        Size of each embedding vector.
    nb_words : int
        Number of rows in the embedding table.
    weights : array-like or None, optional
        Initial embedding matrix. When None, the Embedding layer is created
        without a `weights` argument.
    """
    def __init__(self, input_dim, emb_dim, nb_words, weights=None):

        q_input = Input(shape=(input_dim,))
        d_input = Input(shape=(input_dim,))

        # Use an identity check: `weights != None` on a numpy array compares
        # elementwise, and the resulting array cannot be used as an `if`
        # condition.
        emb_kwargs = {'input_length': input_dim}
        if weights is not None:
            emb_kwargs['weights'] = [weights]
        # A single Embedding instance, so q and d share the same word vectors.
        emb = Embedding(nb_words, emb_dim, **emb_kwargs)

        q_embed = GlobalAveragePooling1D()(emb(q_input))
        d_embed = GlobalAveragePooling1D()(emb(d_input))

        concat = merge([q_embed, d_embed], mode="concat")

        pred = Dense(1, activation='sigmoid')(concat)

        self.model = Model(input=[q_input, d_input], output=pred)
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [86]:
# Build the model without a `weights` argument, so the Embedding layer is
# created with its default initialisation.
w2v = W2V(
    input_dim=MAX_SEQUENCE_LENGTH,
    emb_dim=EMBEDDING_DIM,
    nb_words=NB_WORDS
)

In [87]:
# Train on the labelled df_qq pairs, passing half of the rows as
# validation data via validation_split.
w2v.model.fit(
    [q_train_qq, d_train_qq],
    y_train_qq,
    batch_size=32,
    validation_split=0.5,
    verbose=2
)

Train on 213 samples, validate on 213 samples
Epoch 1/10
0s - loss: 0.6932 - acc: 0.5305 - val_loss: 0.6916 - val_acc: 0.5399
Epoch 2/10
0s - loss: 0.6845 - acc: 0.7089 - val_loss: 0.6904 - val_acc: 0.5493
Epoch 3/10
0s - loss: 0.6754 - acc: 0.7606 - val_loss: 0.6892 - val_acc: 0.5728
Epoch 4/10
0s - loss: 0.6658 - acc: 0.8028 - val_loss: 0.6879 - val_acc: 0.5775
Epoch 5/10
0s - loss: 0.6554 - acc: 0.8357 - val_loss: 0.6865 - val_acc: 0.5775
Epoch 6/10
0s - loss: 0.6433 - acc: 0.8498 - val_loss: 0.6849 - val_acc: 0.5775
Epoch 7/10
0s - loss: 0.6303 - acc: 0.8592 - val_loss: 0.6834 - val_acc: 0.5775
Epoch 8/10
0s - loss: 0.6153 - acc: 0.8779 - val_loss: 0.6817 - val_acc: 0.5681
Epoch 9/10
0s - loss: 0.5995 - acc: 0.8873 - val_loss: 0.6801 - val_acc: 0.5681
Epoch 10/10
0s - loss: 0.5816 - acc: 0.8967 - val_loss: 0.6784 - val_acc: 0.5822


<keras.callbacks.History at 0x7f0df72f25f8>

In [88]:
# Score the model on the held-out half of the labelled pairs.
# Keras takes the validation samples from the END of the arrays, so the
# held-out rows start at int(n * (1 - validation_split)). Deriving the index
# from the data (instead of hard-coding 213) keeps this cell correct if the
# number of labelled rows changes. VALIDATION_SPLIT must match the value
# passed to `fit` above (0.5).
VALIDATION_SPLIT = 0.5
val_start = int(len(y_train_qq) * (1. - VALIDATION_SPLIT))

pred = w2v.model.predict([q_train_qq[val_start:], d_train_qq[val_start:]])
# The model returns one column of scores per row; roc_curve expects a flat
# score vector, so pass a 1-D view while leaving `pred` itself untouched.
fpr, tpr, thresholds = metrics.roc_curve(y_train_qq[val_start:], pred.ravel(), pos_label=1)
auc = metrics.auc(fpr, tpr)

In [None]:
# Display the area under the ROC curve computed in the previous cell.
auc