In [5]:
from Models import *
from Utils import *

In [6]:
# Model hyper-parameters. The "section"/"equation" references in the inline
# comments point to the paper this model follows (not identified in this notebook).
LETTER_GRAM_SIZE = 3 # See section 3.2.
WINDOW_SIZE = 3 # See section 3.2.
# NOTE(review): 5005 here, but later cells use nb_words = 50005 and load a
# tokeniser from a path containing "50k" — confirm this is not a typo.
TOTAL_LETTER_GRAMS = 5005
WORD_DEPTH = WINDOW_SIZE * TOTAL_LETTER_GRAMS # See equation (1).
K = 300 # Dimensionality of the max-pooling layer. See section 3.4.
# L is passed to CosineSim(L) in the training/evaluation cell.
L = 128 # Dimensionality of latent semantic space. See section 3.5.
# J is read as a global by qq_batch_generator: it controls how many negative
# document arrays are yielded per batch and the width (J + 1) of the label matrix.
J = 1 # Number of random unclicked documents serving as negative examples for a query. See section 4.
FILTER_LENGTH = 1 # We only consider one time step for convolutions.

In [7]:
# Build the DSSM wrapper with its default arguments. Later cells use
# `dssm.model` for training and `dssm.encoder` for producing embeddings.
dssm = DSSM()

In [8]:
# load pre-trained trigram
# The instance built on the first line is only used to call `load`; `tokeniser`
# is then rebound to whatever `load` returns.
# NOTE(review): whether the TOTAL_LETTER_GRAMS argument still matters after
# loading depends on L3wTransformer.load (not shown) — confirm.
tokeniser = L3wTransformer(TOTAL_LETTER_GRAMS)
tokeniser = tokeniser.load("/work/data/trigram/2M_50k_trigram")

In [9]:
# Width of every one-hot/bag-of-trigram vector (second dimension of the arrays
# built in qq_batch_generator and the size passed to to_2D_one_hot).
nb_words = 50005
# Length that token-id sequences are padded/truncated to (pad_sequences maxlen).
max_len = 10
# Rows per training batch; also used as the read_csv chunksize.
batch_size = 128

In [10]:
def _encode_texts(texts):
    """Tokenise `texts` and turn the result into one-hot rows of width `nb_words`.

    Uses the notebook-level `tokeniser`, `max_len` and `nb_words`.
    """
    token_ids = parse_texts(texts, tokeniser, max_len)
    return to_2D_one_hot(token_ids, nb_words)


# Load the two evaluation sets (pairs frame + relevance judgements each).
df_may, qrel_may = get_test_data("MayFlower")
df_june, qrel_june = get_test_data("JuneFlower")

# Encode queries and documents for MayFlower ...
q_may = _encode_texts(df_may.q.tolist())
d_may = _encode_texts(df_may.d.tolist())

# ... and for JuneFlower.
q_june = _encode_texts(df_june.q.tolist())
d_june = _encode_texts(df_june.d.tolist())

b'Skipping line 19898: expected 6 fields, saw 8\nSkipping line 20620: expected 6 fields, saw 8\nSkipping line 38039: expected 6 fields, saw 8\n'


In [11]:
# Training file of clicked query/document pairs (tab-separated, no header row).
file_dir = '/data/t-mipha/data/agi_encoder/v4/universal/CLICKED_QQ_EN_universal_train_1M.txt'

# Stream the file in chunks of `batch_size` rows instead of loading it whole.
# Only the first two columns are kept and named "q" and "d"; malformed lines
# are skipped rather than raising.
# NOTE: `reader` is a one-shot iterator — once consumed (e.g. by a training
# run) this cell must be re-executed before it can be iterated again.
# NOTE(review): `error_bad_lines` is specific to older pandas releases —
# confirm against the pinned pandas version before upgrading.
reader = pd.read_csv(file_dir, chunksize=batch_size, iterator=True, usecols=[0,1], names=["q", "d"], sep="\t", header=None, error_bad_lines=False)

def qq_batch_generator(reader, tokeniser, batch_size, max_len, nb_words):
    """Yield ``(inputs, labels)`` training batches from chunks of query/document pairs.

    Parameters
    ----------
    reader : iterable of pandas.DataFrame
        Chunks with string columns ``q`` (query) and ``d`` (document). Only the
        part of ``d`` before the first ``"<sep>"`` is used.
    tokeniser : object
        Must provide ``texts_to_sequences(list_of_str)``.
    batch_size : int
        Kept for backward compatibility. The number of rows in each yielded
        batch is taken from the chunk itself, so a short final chunk (or a
        chunk that lost rows to missing values) no longer produces all-zero
        padding rows that are labelled as positives.
    max_len : int
        ``maxlen`` passed to ``pad_sequences``.
    nb_words : int
        Width of each one-hot row; every token id must be smaller than this.

    Yields
    ------
    (list of numpy.ndarray, numpy.ndarray)
        ``[q_one_hot, d_one_hot, neg_1, ..., neg_J]`` where every array has
        shape ``(n_rows, nb_words)``, and a label matrix of shape
        ``(n_rows, J + 1)`` whose first column is 1 (the clicked document).
        ``J`` is the notebook-level number of negatives per query.

    Notes
    -----
    Negatives for row ``i`` are ``J`` distinct documents drawn from the other
    rows of the same chunk. Chunks with ``J`` or fewer usable rows cannot
    supply that many negatives and are skipped.
    """
    for df in reader:
        # A missing query/document is read as NaN (a float), on which
        # ``.split`` would raise — drop such rows before tokenising.
        df = df.dropna(subset=["q", "d"])
        n_rows = len(df)
        if n_rows <= J:
            continue

        q_texts = df.q.tolist()
        d_texts = [text.split("<sep>")[0] for text in df.d.tolist()]

        # assumes pad_sequences returns an (n_rows, max_len) integer array — TODO confirm
        q_ids = np.asarray(pad_sequences(tokeniser.texts_to_sequences(q_texts), maxlen=max_len))
        d_ids = np.asarray(pad_sequences(tokeniser.texts_to_sequences(d_texts), maxlen=max_len))

        # Set, for every row, the columns named by that row's token ids
        # (the padding id is set as well, exactly as in the per-row loop form).
        row_index = np.arange(n_rows)[:, None]
        q_one_hot = np.zeros((n_rows, nb_words))
        q_one_hot[row_index, q_ids] = 1
        d_one_hot = np.zeros((n_rows, nb_words))
        d_one_hot[row_index, d_ids] = 1

        # negative sampling from positive pool
        negative_rows = np.empty((n_rows, J), dtype=int)
        for i in range(n_rows):
            # Draw J distinct positions among the n_rows - 1 "other" rows, then
            # shift positions at or after i by one so that row i is never chosen.
            picks = np.random.choice(n_rows - 1, J, replace=False)
            picks[picks >= i] += 1
            negative_rows[i] = picks
        neg_d_one_hot = [d_one_hot[negative_rows[:, j]] for j in range(J)]

        # Column 0 is the positive document; the remaining J columns are negatives.
        y = np.zeros((n_rows, J + 1))
        y[:, 0] = 1

        yield [q_one_hot, d_one_hot] + neg_d_one_hot, y

        
# Train for a single epoch of 1000 generator batches, streaming chunks from
# `reader` (which is consumed in the process).
dssm.model.fit_generator(qq_batch_generator(reader, tokeniser, batch_size, max_len, nb_words), steps_per_epoch=1000, epochs=1, verbose=2, callbacks=[TQDMNotebookCallback()])       
# Scorer built with the latent dimensionality L; presumably a cosine-similarity
# model over two embeddings (see CosineSim in Models).
cosine = CosineSim(L)


# Evaluate on both test sets (MayFlower first, then JuneFlower).
# NOTE: q, d, qrel and df are notebook-level names; after this loop they stay
# bound to the JuneFlower values.
for q, d, qrel, df in [[q_may, d_may, qrel_may, df_may], [q_june, d_june, qrel_june, df_june]]:
    # Embed queries and documents with the trained encoder, then score each pair.
    pred = cosine.model.predict([dssm.encoder.predict(q), dssm.encoder.predict(d)])
    # NOTE(review): the meaning of the trailing `False` flag is defined by
    # convert_2_trec (not shown) — confirm.
    pred = convert_2_trec(df.q.tolist(), df.d.tolist(), pred, False)
    evaluate(qrel, pred)       

Epoch 1/1
 - 627s - loss: 0.3930



  pred = merge([q_input, d_input], mode="cos")
  name=name)


NDCG: 0.525413
MAP: 0.510676
NDCG: 0.810011
MAP: 0.726695


In [None]:
# NOTE: data path kept for reference (a bare path is not valid Python): /work/data/train_data/30M_EN_pos_qd_log

In [14]:
# Load the whole query/document file at once (first two tab-separated columns,
# no header row, malformed lines skipped) and keep only rows without missing values.
df = pd.read_csv(
    file_dir,
    sep="\t",
    header=None,
    names=["q", "d"],
    usecols=[0, 1],
    error_bad_lines=False,
).dropna()

In [27]:
# Re-point `file_dir` at a different source file. This overwrites the
# training-file path assigned earlier in the notebook, so cells that read
# `file_dir` behave differently depending on execution order.
file_dir = '/data/chzho/deepqts/train_data/unifiedclick/join_oneyearsample_2B_training_all_top10'

In [56]:
%%time
# Read at most the first 100,000,000 rows of `file_dir`, keeping columns
# 0, 1, 3 and 5 under the names label, q, d and market (tab-separated, no
# header row, malformed lines skipped). The recorded run took ~37 minutes
# wall time, so avoid re-running this cell casually.
df = pd.read_csv(file_dir, nrows=100000000, usecols=[0,1,3,5], names=["label", "q", "d", "market"], sep="\t", header=None, error_bad_lines=False)
# Drop every row that has a missing value in any of the four columns.
df = df.dropna()

CPU times: user 17min 11s, sys: 4min 14s, total: 21min 26s
Wall time: 36min 47s


In [74]:
# Keep rows whose market contains "en-" and whose label equals 1, and write
# them tab-separated without the index column.
# NOTE(review): to_csv writes a header row by default, while the read_csv
# calls in this notebook use header=None — if this file is read back the same
# way, the header line will be treated as data. Confirm intended.
df[(df.market.str.contains("en-")) & (df.label == 1)].to_csv("/work/data/train_data/EN_QD_log", sep='\t', index=False)

In [75]:
# Rebind `df` to the first two columns (named q and d) of the 1M EN
# query/query log; tab-separated, no header row, malformed lines skipped.
# NOTE: this is a different file from the EN_QD_log written in the previous cell.
df = pd.read_csv("/work/data/train_data/1M_EN_qq_log", usecols=[0,1], names=["q", "d"], sep="\t", header=None, error_bad_lines=False)
