In [25]:
import pandas as pd
# Load the competition files with pandas' read_table (tab-separated by default).
# The English and Spanish training files list their sentence columns in a
# different language order, hence the two different `names` lists.
en_train = pd.read_table('../data/cikm_english_train_20180516.txt',
                         names=['en1', 'sp1', 'en2', 'sp2', 'label'])
sp_train = pd.read_table('../data/cikm_spanish_train_20180516.txt',
                         names=['sp1', 'en1', 'sp2', 'en2', 'label'])
unlabel_sp_train = pd.read_table('../data/cikm_unlabel_spanish_train_20180516.txt',
                                 names=['sp1', 'en1', 'sp2', 'en2', 'label'])
test = pd.read_table('../data/cikm_test_a_20180516.txt', names=['sp1', 'sp2'])

# concat aligns on column names, so the differing column order above is harmless.
df_all = pd.concat((en_train, sp_train))

# Fill missing Spanish sentences with '' so the tokenizer can call .lower() on them.
# The filled frame is assigned back: calling fillna(inplace=True) on a column
# selection (`df_all['sp1'].fillna(..., inplace=True)`) acts on a temporary object
# in recent pandas and silently leaves df_all untouched under Copy-on-Write.
df_all = df_all.fillna({'sp1': '', 'sp2': ''})

# Preview only the first rows rather than dumping the whole frame.
test.head(10)

Unnamed: 0,sp1,sp2
0,?Cómo puedo recibir un reembolso mediante tarj...,¿Cómo puedo recibir un reembolso?
1,?Cómo puedo recibir un reembolso mediante tarj...,¿Cómo puedo recibir un reembolso si mi banco h...
2,"?No he podido pagar con mi tarjeta, que debo h...","No puedo pagar con mi tarjeta de débito, ¿qué ..."
3,"?No he podido pagar con mi tarjeta, que debo h...",No puedo pagar mi pedido con tarjeta Visa de d...
4,"?No he podido pagar con mi tarjeta, que debo h...",No puedo pagar con tarjeta.
5,?Por qué no se ha podido procesar mi pago por ...,¿Por qué el pedido ha sido cerrado por razones...
6,?Cuándo recibiré mi reembolso si pago con tarj...,¿Me cobran tarifa de servicio si pago mis pedi...
7,?Qué pasará después de abrir una disputa?,¿Qué pasará después de escalar la disputa?
8,?Qué pasará después de abrir una disputa?,Quiero abrir una disputa.
9,?Por qué ha sido finalizado mi pedido por razo...,¿Por qué mi pedido está cerrado otra vez por r...


Create vocabulary

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
import itertools

# Build the vocabulary from every Spanish sentence in the labelled data and the
# test set, keeping at most 10000-1 terms.
spanish_corpus = itertools.chain(
    df_all['sp1'], df_all['sp2'], test['sp1'], test['sp2'])
counts_vectorizer = CountVectorizer(max_features=10000-1)
counts_vectorizer.fit(spanish_corpus)

# Index reserved for out-of-vocabulary words: one past the largest vocabulary index.
other_index = len(counts_vectorizer.vocabulary_)

Prepare data

In [27]:
import re
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# Reuse the vectorizer's own token regex so sentences are split the same way
# the vocabulary was built.
words_tokenizer = re.compile(counts_vectorizer.token_pattern)

def create_padded_seqs(texts, max_len=10):
    """Encode a Series of sentences as a fixed-width matrix of word indices.

    Each sentence is lower-cased and split with ``words_tokenizer``; every token
    is replaced by its index in ``counts_vectorizer.vocabulary_``, or by
    ``other_index`` when the token is not in the vocabulary. The resulting index
    lists are handed to ``pad_sequences`` with ``maxlen=max_len``.

    Args:
        texts: pandas Series of strings (must not contain NaN, since
            ``.lower()`` is called on every element).
        max_len: width of the returned sequences.

    Returns:
        The array produced by ``pad_sequences``, one row per sentence.
    """
    # NOTE(review): pad_sequences pads with 0 by default and 0 is also a valid
    # vocabulary_ index, so padding and one real word share the same id --
    # confirm this is acceptable for the downstream Embedding.
    vocab = counts_vectorizer.vocabulary_

    def encode(sentence):
        tokens = words_tokenizer.findall(sentence.lower())
        return [vocab.get(token, other_index) for token in tokens]

    return pad_sequences(texts.apply(encode), maxlen=max_len)

# df_all = df_all.sample(1000) # Just for debugging

# Encode each sentence column exactly once and reuse the result for both the
# sanity-check print and the split (previously 'sp1' was encoded twice, and
# `df_all[:]` made a needless copy of the frame on every call).
sp1_seqs = create_padded_seqs(df_all['sp1'])
sp2_seqs = create_padded_seqs(df_all['sp2'])
print(sp1_seqs)

# 70/30 train/validation split; the fixed random_state keeps it reproducible and
# splitting all three arrays in one call keeps the rows aligned.
X1_train, X1_val, X2_train, X2_val, y_train, y_val = \
    train_test_split(sp1_seqs,
                     sp2_seqs,
                     df_all['label'],
                     test_size=0.3, random_state=27)

[[   0    0    0 ... 2086 4297 4558]
 [4415 4983 4515 ... 4639 2086 1927]
 [1627 2360 2360 ... 3976 1627 1103]
 ...
 [   0    0    0 ... 3394 3540 1972]
 [   0    0    0 ... 3394 3540 1972]
 [   0    0    0 ...  890 2358 1532]]


training

In [28]:
import keras.layers as lyr
from keras.models import Model

# One input per sentence of the pair; both carry padded word-index sequences.
input1_tensor = lyr.Input(X1_train.shape[1:])
input2_tensor = lyr.Input(X2_train.shape[1:])

# Shared encoder layers. The embedding table has one extra row beyond the
# vocabulary so that `other_index` (== len(vocabulary_)) is a valid id.
vocab_size = len(counts_vectorizer.vocabulary_)
words_embedding_layer = lyr.Embedding(vocab_size + 1, 100)
seq_embedding_layer = lyr.LSTM(256, activation='tanh')


def seq_embedding(tensor):
    """Embed an index tensor and encode it with the shared LSTM."""
    embedded_words = words_embedding_layer(tensor)
    return seq_embedding_layer(embedded_words)


# Encode both sentences with the same weights, then combine them element-wise.
encoded1 = seq_embedding(input1_tensor)
encoded2 = seq_embedding(input2_tensor)
merge_layer = lyr.multiply([encoded1, encoded2])

# Small classification head on top of the merged representation.
dense1_layer = lyr.Dense(16, activation='sigmoid')(merge_layer)
ouput_layer = lyr.Dense(1, activation='sigmoid')(dense1_layer)

model = Model([input1_tensor, input2_tensor], ouput_layer)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

model.fit([X1_train, X2_train], y_train,
          batch_size=128,
          epochs=6,
          verbose=1,
          validation_data=([X1_val, X2_val], y_val))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 10, 100)      589500      input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, 256)          365568      embedding_3[0][0]                
          

<keras.callbacks.History at 0x2894ce89358>

Extract Features From Model

In [29]:
# Second model over the same inputs that stops at `merge_layer`, so predicting
# with it yields the merged sentence-pair representation instead of the score.
pair_inputs = [input1_tensor, input2_tensor]
features_model = Model(pair_inputs, merge_layer)
features_model.compile(optimizer='adam', loss='mse')

In [30]:
# Run the train and validation pairs through the feature extractor.
F_train, F_val = [
    features_model.predict(pair, batch_size=128)
    for pair in ([X1_train, X2_train], [X1_val, X2_val])
]

In [8]:
import xgboost as xgb

In [31]:
# Wrap the extracted features and their labels in XGBoost's DMatrix container.
dTrain, dVal = [
    xgb.DMatrix(features, label=labels)
    for features, labels in ((F_train, y_train), (F_val, y_val))
]

Train XGB

In [32]:
# Booster configuration. Each tree samples a 1/sqrt(n_features) fraction of the
# feature columns.
n_features = F_train.shape[1]
xgb_params = dict(
    objective='binary:logistic',
    booster='gbtree',
    eval_metric='logloss',
    eta=0.1,
    max_depth=9,
    subsample=0.9,
    colsample_bytree=1 / n_features**0.5,
    min_child_weight=5,
    silent=1,
)

# Train for up to 1000 rounds, logging every 10 and stopping once the metric on
# the last evaluation set ('val') has not improved for 10 rounds.
eval_sets = [(dTrain, 'train'), (dVal, 'val')]
bst = xgb.train(xgb_params, dTrain,
                num_boost_round=1000,
                evals=eval_sets,
                verbose_eval=10,
                early_stopping_rounds=10)

[0]	train-logloss:0.639288	val-logloss:0.651641
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 10 rounds.
[10]	train-logloss:0.393243	val-logloss:0.490058
[20]	train-logloss:0.319254	val-logloss:0.469478
[30]	train-logloss:0.288414	val-logloss:0.473671
Stopping. Best iteration:
[21]	train-logloss:0.314356	val-logloss:0.469072



Predict Test

In [33]:
# Encode both test sentence columns the same way as the training data, then
# extract their merged features.
X1_test, X2_test = (create_padded_seqs(test[column]) for column in ('sp1', 'sp2'))
test_pairs = [X1_test, X2_test]
F_test = features_model.predict(test_pairs, batch_size=128)

In [49]:
# Score the test features using only the trees up to the early-stopping optimum.
dTest = xgb.DMatrix(F_test)
test_scores = bst.predict(dTest, ntree_limit=bst.best_ntree_limit)
df_sub = pd.DataFrame({'score': test_scores})

In [56]:
import datetime
import os

# Day_hour_minute stamp used to tell submission files apart. It carries no
# month or year, so files written at the same day/time of different months collide.
unique_flag = datetime.datetime.now().strftime('%d_%H_%M')
print(unique_flag)

# Create the output directory first so to_csv does not fail on a fresh checkout.
os.makedirs('../result', exist_ok=True)

# Scores only: no header row and no index column.
df_sub.to_csv('../result/submit_{}.csv'.format(unique_flag),header=False,index=False)
# df_sub.to_csv('../result/submit.csv',header=False,index=False)

29_20_16


In [None]:
import numpy as np


def log_loss(true_y, pred_h, eps=1e-15):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Args:
        true_y: array-like of 0/1 labels.
        pred_h: array-like of predicted probabilities for the positive class,
            same length as ``true_y``.
        eps: predictions are clipped to ``[eps, 1 - eps]`` before taking logs,
            so a prediction of exactly 0 or 1 gives a large finite loss
            instead of ``inf``/``nan``.

    Returns:
        The average of ``-(y*log(p) + (1-y)*log(1-p))`` over all samples.
    """
    # Convert to plain float arrays so lists work and pandas index alignment
    # cannot silently pair up the wrong rows.
    true_y = np.asarray(true_y, dtype=float)
    pred_h = np.clip(np.asarray(pred_h, dtype=float), eps, 1 - eps)
    return -np.mean(true_y*np.log(pred_h)+(1-true_y)*np.log(1-pred_h))