In [72]:
from keras.layers import Bidirectional, Dense, Concatenate, Embedding, Merge, merge, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.layers.advanced_activations import ELU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import objectives, backend as K
from keras.models import Model
from keras.datasets import imdb
from scipy import spatial
import tensorflow as tf
import pandas as pd
import numpy as np
import codecs
import csv
import os
import pytrec_eval
from sklearn.utils import shuffle

In [2]:
TRAIN_DATA_FILE = '/data/t-mipha/data/agi_encoder/v4/universal/CLICKED_QQ_EN_universal_train_1M.txt'
df = pd.read_csv(TRAIN_DATA_FILE, usecols=[0,1], names=["q", "d"], sep="\t", header=None, error_bad_lines=False)
df = df.dropna()

q_train = df.q.tolist()
# use only first similar query
d_train = [i.split("<sep>")[0] for i in df.d.tolist()]
y_train = np.ones(len(df))

texts = q_train + d_train

In [3]:
file_dir = '/data/t-mipha/data/query_similarity_ndcg/MayFlowerIdeal.txt'
df_test = pd.read_csv(file_dir, names=["market", "qid", "q", "label", "d", "date"], sep="\t", header=0, error_bad_lines=False)
df_test = df_test.dropna()

In [29]:
print("Average query's length: {}".format(np.max(list(map(len, texts)))))

Average query's length: 561829


In [5]:
%%time
from l3wtransformer import L3wTransformer
tokeniser = L3wTransformer()
tokeniser.fit_on_texts(texts)

CPU times: user 1min 11s, sys: 212 ms, total: 1min 11s
Wall time: 1min 11s


In [77]:
tokenisertexts)

1993708

In [43]:
q_train = tokeniser.texts_to_sequences(q_train)
d_train = tokeniser.texts_to_sequences(d_train)

In [50]:
max_len = np.mean(list(map(len, q_train + d_train)), dtype='int32')
print("Average query's length: {}".format(max_len))

Average query's length: 23


In [63]:
q_train = pad_sequences(q_train, maxlen=max_len)
d_train = pad_sequences(d_train, maxlen=max_len)

In [11]:
q_test = pad_sequences(tokeniser.texts_to_sequences(df_test.q.tolist()), maxlen=max_len)
d_test = pad_sequences(tokeniser.texts_to_sequences(df_test.d.tolist()), maxlen=max_len)

In [12]:
y_test = np.array([0 if i == "Bad" else 1 if i == "Fare" else 2 for i in df_test.label.tolist()])

In [13]:
df_test['label'].value_counts()

Bad     9804
Good    2719
Fair    2609
Name: label, dtype: int64

In [14]:
def convert_2_trec(query, document, label, isQrel):
    trec = {}
    for i, j, k in zip(query, document, label):
        if i not in trec:
            trec[i] = {}
        if j not in trec[i]:
            trec[i][j] = {}
        trec[i][j] = int(k) if isQrel else float(k)
    return trec

def evaluate(qrel, pred):

    run = convert_2_trec(df_test.q.tolist(), df_test.d.tolist(), pred, False)
    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, {'map', 'ndcg'})

    results = evaluator.evaluate(run)
    print("NDCG: %f" % np.mean([i['ndcg'] for i in results.values()]))
    print("MAP: %f" % np.mean([i['map'] for i in results.values()]))
    
qrel = convert_2_trec(df_test.q.tolist(), df_test.d.tolist(), y_test, True)

In [17]:
MAX_NB_WORDS = tokeniser.max_ngrams + 5 + 1     # default l2wtransformer's max vocab = 50K + 5 for unknown word
MAX_SEQUENCE_LENGTH = max_len
EMBEDDING_DIM = 200 # similar to Bing pre-trained w2v




In [20]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, MAX_NB_WORDS))
encoder = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, MAX_NB_WORDS))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(MAX_NB_WORDS, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [24]:
x = np.random.randint(MAX_NB_WORDS, size=(10,MAX_SEQUENCE_LENGTH, MAX_NB_WORDS))
model.predict([x, x]).shape

(10, 23, 50006)

In [22]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [68]:
class LSTM_Model():
    def __init__(self, max_len=10, emb_dim=100, nb_words=50000, lstm_dim=256):

        q_input = Input(shape=(max_len,))
        d_input = Input(shape=(max_len,))
        
        emb = Embedding(nb_words, emb_dim, mask_zero=True)

        lstm = LSTM(lstm_dim)

        self.q_embed = lstm(emb(q_input))
        self.d_embed = lstm(emb(d_input))

        concat = merge([self.q_embed, self.d_embed], mode="cos")
        
        self.encoder = Model(q_input, self.q_embed)

        pred = Dense(1, activation='sigmoid')(concat)

        self.model = Model(inputs=[q_input, d_input], outputs=pred)
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


MAX_NB_WORDS = tokeniser.max_ngrams + 5 + 1     # default l2wtransformer's max vocab = 50K + 5 for unknown word
MAX_SEQUENCE_LENGTH = max_len
EMBEDDING_DIM = 200 # similar to Bing pre-trained w2v




In [76]:
# lstm = LSTM_Model(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, MAX_NB_WORDS)

for i in range(4):
    
    d_neg_train = np.copy(d_train)
    np.random.shuffle(d_neg_train)
    
    q_ = np.concatenate([q_train, q_train])
    d_ = np.concatenate([d_train, d_neg_train])
    y_ = np.concatenate([np.ones(len(q_train)), np.zeros(len(q_train))])
       
    q, y = shuffle(q_, y_, random_state=0)
    d, y = shuffle(d_, y_, random_state=0)
       
    
    lstm.model.fit([q, d], y , batch_size=128, epochs=1, verbose=2)

    pred = lstm.model.predict([q_test, d_test])
    evaluate(qrel, pred)
#     pred = get_cosine_sim(lstm.encoder.predict(q_test), lstm.encoder.predict(d_test))
#     evaluate(qrel, pred)

Epoch 1/1
 - 1982s - loss: 0.1809 - acc: 0.9344
NDCG: 0.529818
MAP: 0.516537
Epoch 1/1
 - 1978s - loss: 0.1523 - acc: 0.9442
NDCG: 0.531364
MAP: 0.518602
Epoch 1/1
 - 1984s - loss: 0.1331 - acc: 0.9507
NDCG: 0.531317
MAP: 0.518825
Epoch 1/1
 - 1997s - loss: 0.1181 - acc: 0.9558
NDCG: 0.530491
MAP: 0.517802


In [75]:
#first epoch
pred = get_cosine_sim(lstm.encoder.predict(q_test), lstm.encoder.predict(d_test))
evaluate(qrel, pred)

NDCG: 0.529803
MAP: 0.516425


NDCG: 0.478829
MAP: 0.448108


In [74]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn import metrics

def get_cosine_sim(x, y):
    tmp = []
    for i,j in zip(x,y):
        tmp.append(cosine_similarity(i.reshape(1, -1),j.reshape(1, -1)))
    return np.array(tmp).flatten()

def auc(y_test, pred):
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    return metrics.auc(fpr, tpr)    

## Trigram result

In [97]:
pred = get_cosine_sim(q_test, d_test)
run = convert_2_trec(df_test.q.tolist(), df_test.d.tolist(), pred, False)
evaluator = pytrec_eval.RelevanceEvaluator(
    qrel, {'map', 'ndcg'})

results = evaluator.evaluate(run)
print("NDCG: %f" % np.mean([i['ndcg'] for i in results.values()]))
print("MAP: %f" % np.mean([i['map'] for i in results.values()]))

NDCG: 0.506189
MAP: 0.483753


In [7]:
%%time
import pickle
with open(r"/data/t-mipha/data/agi_encoder/v4/universal/embedding_dict.pkl", "rb") as input_file:
    e = pickle.load(input_file)

CPU times: user 27.5 s, sys: 12.8 s, total: 40.3 s
Wall time: 40.2 s


In [98]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_NB_WORDS = 100000
MAX_SEQUENCE_LENGTH = 30

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 10923 unique tokens.


In [99]:
EMBEDDING_DIM = len(e['test'])

In [100]:
embeddings_index = e

In [101]:
# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [102]:
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential


# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)


w2v = Sequential()
w2v.add(embedding_layer)
w2v.add(GlobalAveragePooling1D())


In [103]:
q_test = pad_sequences(tokenizer.texts_to_sequences(df_test.q.tolist()), maxlen=MAX_SEQUENCE_LENGTH)
d_test = pad_sequences(tokenizer.texts_to_sequences(df_test.d.tolist()), maxlen=MAX_SEQUENCE_LENGTH)

In [104]:
w2v = Sequential()
w2v.add(embedding_layer)
w2v.add(GlobalAveragePooling1D())

pred = get_cosine_sim(w2v.predict(q_test), w2v.predict(d_test))
run = convert_2_trec(df_test.q.tolist(), df_test.d.tolist(), pred, False)

results = evaluator.evaluate(run)
print("NDCG: %f" % np.mean([i['ndcg'] for i in results.values()]))
print("MAP: %f" % np.mean([i['map'] for i in results.values()]))

NDCG: 0.522923
MAP: 0.507142


In [64]:
w2v = Sequential()
w2v.add(embedding_layer)
w2v.add(GlobalMaxPooling1D())

pred = get_cosine_sim(w2v.predict(q_test), w2v.predict(d_test))
run = convert_2_trec(df_test.q.tolist(), df_test.d.tolist(), pred, False)

results2 = evaluator.evaluate(run)
print(np.mean([i['ndcg'] for i in results2.values()]))
print(np.mean([i['map'] for i in results2.values()]))

0.827361183641
0.745250255879


In [67]:
import scipy.stats
def ttest(res1, res2, metric="ndcg"):
    res1 = [i[metric] for i in res1.values()]
    res2 = [i[metric] for i in res2.values()]
    print(scipy.stats.ttest_rel(res1, res2))