In [70]:
from keras.layers import Bidirectional, Dense, Concatenate,Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.layers.advanced_activations import ELU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import objectives, backend as K
from keras.models import Model
from keras.datasets import imdb
from scipy import spatial
import tensorflow as tf
import pandas as pd
from sklearn.utils import shuffle
import numpy as np
import codecs
import csv
import os
import pytrec_eval
import warnings
warnings.filterwarnings('ignore')

In [2]:
TRAIN_DATA_FILE = '/data/t-mipha/data/agi_encoder/v4/universal/CLICKED_QQ_EN_universal_train_1M.txt'
df = pd.read_csv(TRAIN_DATA_FILE, usecols=[0,1], names=["q", "d"], sep="\t", header=None, error_bad_lines=False)
df = df.dropna()

q_train = df.q.tolist()
# use only first similar query
d_train = [i.split("<sep>")[0] for i in df.d.tolist()]
y_train = np.ones(len(df))

texts = q_train + d_train

In [13]:
file_dir = '/data/t-mipha/data/query_similarity_ndcg/MayFlowerIdeal.txt'
df_test = pd.read_csv(file_dir, names=["market", "qid", "q", "label", "d", "date"], sep="\t", header=0, error_bad_lines=False)
df_test = df_test.dropna()

In [4]:
print("Average query's length: {}".format(np.mean(list(map(len, texts)), dtype=int)))

Average query's length: 23


In [5]:
%%time
from l3wtransformer import L3wTransformer
tokeniser = L3wTransformer()
tokeniser.fit_on_texts(texts)

CPU times: user 1min 10s, sys: 132 ms, total: 1min 10s
Wall time: 1min 10s


In [69]:
tokeniser.max_ngrams


50000

In [6]:
q_train = tokeniser.texts_to_sequences(q_train)
d_train = tokeniser.texts_to_sequences(d_train)

In [61]:
max_len = np.max(list(map(len, q_train + d_train)))
print("Max query's length: {}".format(max_len))

Max query's length: 23


In [62]:
q_train = pad_sequences(q_train, maxlen=max_len)
d_train = pad_sequences(d_train, maxlen=max_len)

In [64]:
q_test = pad_sequences(tokeniser.texts_to_sequences(df_test.q.tolist()), maxlen=max_len)
d_test = pad_sequences(tokeniser.texts_to_sequences(df_test.d.tolist()), maxlen=max_len)

In [65]:
y_test = np.array([0 if i == "Bad" else 1 if i == "Fare" else 2 for i in df_test.label.tolist()])

In [66]:
class LSTM_Model():
    def __init__(self, max_len=10, emb_dim=100, nb_words=50000, lstm_dim=256):

        q_input = Input(shape=(max_len,))
        d_input = Input(shape=(max_len,))
        
        emb = Embedding(nb_words, emb_dim, mask_zero=True)

        lstm = LSTM(lstm_dim)

        self.q_embed = lstm(emb(q_input))
        self.d_embed = lstm(emb(d_input))

        concat = Concatenate()([self.q_embed, self.d_embed])
        
        self.encoder = Model(q_input, self.q_embed)

        pred = Dense(1, activation='sigmoid')(concat)

        self.model = Model(inputs=[q_input, d_input], outputs=pred)
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


MAX_NB_WORDS = tokeniser.max_ngrams + 5 + 1     # default l2wtransformer's max vocab = 50K + 5 for unknown word
MAX_SEQUENCE_LENGTH = max_len
EMBEDDING_DIM = 200 # similar to Bing pre-trained w2v




In [67]:
lstm = LSTM_Model(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, MAX_NB_WORDS)

for i in range(5):
    
    d_neg_train = np.copy(d_train)
    np.random.shuffle(d_neg_train)
    
    q_ = np.concatenate([q_train, q_train])
    d_ = np.concatenate([d_train, d_neg_train])
    y_ = np.concatenate([np.ones(len(q_train)), np.zeros(len(q_train))])
    
#     nb = 100000
    
    q, y = shuffle(q_, y_, random_state=0)
    d, y = shuffle(d_, y_, random_state=0)
    
#     q = q[:nb]
#     d = d[:nb]
#     y = y[:nb]
    
    
    
    lstm.model.fit([q, d], y , batch_size=128, epochs=1, verbose=2)

    pred = lstm.model.predict([q_test, d_test])
    evaluate(qrel, pred)
    pred = get_cosine_sim(lstm.encoder.predict(q_test), lstm.encoder.predict(d_test))
    evaluate(qrel, pred)

Epoch 1/1
 - 1911s - loss: 0.6932 - acc: 0.5012
NDCG: 0.482409
MAP: 0.452641
NDCG: 0.484091
MAP: 0.455093
Epoch 1/1
 - 1921s - loss: 0.6932 - acc: 0.5001
NDCG: 0.484053
MAP: 0.454987
NDCG: 0.483480
MAP: 0.454120
Epoch 1/1
 - 1799s - loss: 0.6932 - acc: 0.4997
NDCG: 0.484978
MAP: 0.456440
NDCG: 0.486994
MAP: 0.458845
Epoch 1/1
 - 1882s - loss: 0.6932 - acc: 0.5002
NDCG: 0.479769
MAP: 0.449252
NDCG: 0.479720
MAP: 0.449187
Epoch 1/1
 - 1930s - loss: 0.6932 - acc: 0.5011
NDCG: 0.484907
MAP: 0.455871
NDCG: 0.485659
MAP: 0.456886


In [None]:
class LSTM_Model():
    def __init__(self, max_len=10, emb_dim=100, nb_words=50000, lstm_dim=256):

        q_input = Input(shape=(max_len,))
        d_input = Input(shape=(max_len,))
        
        emb = Embedding(nb_words, emb_dim, mask_zero=True)

        lstm = LSTM(lstm_dim)

        self.q_embed = lstm(emb(q_input))
        self.d_embed = lstm(emb(d_input))

        concat = Concatenate()([self.q_embed, self.d_embed])
        
        self.encoder = Model(q_input, self.q_embed)

        pred = Dense(1, activation='sigmoid')(concat)

        self.model = Model(inputs=[q_input, d_input], outputs=pred)
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


MAX_NB_WORDS = tokeniser.max_ngrams + 5 + 1     # default l2wtransformer's max vocab = 50K + 5 for unknown word
MAX_SEQUENCE_LENGTH = max_len
EMBEDDING_DIM = 200 # similar to Bing pre-trained w2v




In [57]:
# trigram, tokeniser trained on train data
NDCG: 0.501180
MAP: 0.477413

array([[    0,     0,     0, ..., 19164, 33897, 50003],
       [    0,     0,     0, ..., 19164, 33897, 50003],
       [30359, 50003,  1162, ..., 14134, 35818, 50003],
       ..., 
       [    0,     0,     0, ..., 23810, 35067, 50003],
       [    0,     0,     0, ..., 23810, 35067, 50003],
       [    0,     0,     0, ..., 23810, 35067, 50003]], dtype=int32)

In [17]:
df_test['label'].value_counts()

Bad     9804
Good    2719
Fair    2609
Name: label, dtype: int64

NDCG: 0.478829
MAP: 0.448108


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn import metrics

def get_cosine_sim(x, y):
    tmp = []
    for i,j in zip(x,y):
        tmp.append(cosine_similarity(i.reshape(1, -1),j.reshape(1, -1)))
    return np.array(tmp).flatten()

def auc(y_test, pred):
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    return metrics.auc(fpr, tpr)    

In [23]:
def convert_2_trec(query, document, label, isQrel):
    trec = {}
    for i, j, k in zip(query, document, label):
        if i not in trec:
            trec[i] = {}
        if j not in trec[i]:
            trec[i][j] = {}
        trec[i][j] = int(k) if isQrel else float(k)
    return trec

def evaluate(qrel, pred):

    run = convert_2_trec(df_test.q.tolist(), df_test.d.tolist(), pred, False)
    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, {'map', 'ndcg'})

    results = evaluator.evaluate(run)
    print("NDCG: %f" % np.mean([i['ndcg'] for i in results.values()]))
    print("MAP: %f" % np.mean([i['map'] for i in results.values()]))
    
qrel = convert_2_trec(df_test.q.tolist(), df_test.d.tolist(), y_test, True)

## Trigram result

In [97]:
pred = get_cosine_sim(q_test, d_test)
run = convert_2_trec(df_test.q.tolist(), df_test.d.tolist(), pred, False)
evaluator = pytrec_eval.RelevanceEvaluator(
    qrel, {'map', 'ndcg'})

results = evaluator.evaluate(run)
print("NDCG: %f" % np.mean([i['ndcg'] for i in results.values()]))
print("MAP: %f" % np.mean([i['map'] for i in results.values()]))

NDCG: 0.506189
MAP: 0.483753


In [7]:
%%time
import pickle
with open(r"/data/t-mipha/data/agi_encoder/v4/universal/embedding_dict.pkl", "rb") as input_file:
    e = pickle.load(input_file)

CPU times: user 27.5 s, sys: 12.8 s, total: 40.3 s
Wall time: 40.2 s


In [98]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_NB_WORDS = 100000
MAX_SEQUENCE_LENGTH = 30

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 10923 unique tokens.


In [99]:
EMBEDDING_DIM = len(e['test'])

In [100]:
embeddings_index = e

In [101]:
# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [102]:
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential


# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)


w2v = Sequential()
w2v.add(embedding_layer)
w2v.add(GlobalAveragePooling1D())


In [103]:
q_test = pad_sequences(tokenizer.texts_to_sequences(df_test.q.tolist()), maxlen=MAX_SEQUENCE_LENGTH)
d_test = pad_sequences(tokenizer.texts_to_sequences(df_test.d.tolist()), maxlen=MAX_SEQUENCE_LENGTH)

In [104]:
w2v = Sequential()
w2v.add(embedding_layer)
w2v.add(GlobalAveragePooling1D())

pred = get_cosine_sim(w2v.predict(q_test), w2v.predict(d_test))
run = convert_2_trec(df_test.q.tolist(), df_test.d.tolist(), pred, False)

results = evaluator.evaluate(run)
print("NDCG: %f" % np.mean([i['ndcg'] for i in results.values()]))
print("MAP: %f" % np.mean([i['map'] for i in results.values()]))

NDCG: 0.522923
MAP: 0.507142


In [64]:
w2v = Sequential()
w2v.add(embedding_layer)
w2v.add(GlobalMaxPooling1D())

pred = get_cosine_sim(w2v.predict(q_test), w2v.predict(d_test))
run = convert_2_trec(df_test.q.tolist(), df_test.d.tolist(), pred, False)

results2 = evaluator.evaluate(run)
print(np.mean([i['ndcg'] for i in results2.values()]))
print(np.mean([i['map'] for i in results2.values()]))

0.827361183641
0.745250255879


In [67]:
import scipy.stats
def ttest(res1, res2, metric="ndcg"):
    res1 = [i[metric] for i in res1.values()]
    res2 = [i[metric] for i in res2.values()]
    print(scipy.stats.ttest_rel(res1, res2))