In [22]:
import os.path

import numpy as np
import pandas as pd

In [23]:
# Directory holding the preprocessed .npy arrays. Keep the trailing slash:
# file paths in this notebook are built by plain string concatenation.
DATA_IN_PATH = './preprocessed_data/'

In [24]:
# File names (relative to DATA_IN_PATH) of the training arrays:
# first question, second question, and the per-pair label.
TRAIN_Q1_DATA_FILE = 'q1_train.npy'
TRAIN_Q2_DATA_FILE = 'q2_train.npy'
TRAIN_LABEL_DATA_FILE = 'label_train.npy'

In [25]:
# Load the training arrays. Passing the path (rather than an `open(...)` handle)
# lets np.load open AND close the file itself, so no file descriptors are leaked.
train_q1_data = np.load(DATA_IN_PATH + TRAIN_Q1_DATA_FILE)
train_q2_data = np.load(DATA_IN_PATH + TRAIN_Q2_DATA_FILE)
train_labels = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE)

In [26]:
# Pair the two question arrays along a new axis 1, so each row holds
# [q1_row, q2_row] for one training example.
question_pair = [train_q1_data, train_q2_data]
train_input = np.stack(question_pair, axis=1)

In [27]:
# Sanity check on the stacked array: expect (n_rows, 2, n_features),
# where the middle axis indexes question 1 / question 2.
print(train_input.shape)

(298526, 2, 31)


# Split into training and validation sets

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
# NOTE: this rebinds `train_input` to the 80% training portion, so re-running this
# cell without first re-running the stacking cell would split the already-reduced
# array against the full `train_labels`.
train_input, eval_input, train_label, eval_label = train_test_split(
    train_input,
    train_labels,
    test_size=0.2,
    random_state=4242,
)

In [30]:
import xgboost as xgb

In [31]:
# Collapse the question axis by summing q1 and q2 element-wise, which yields
# one 2-D feature matrix per split for XGBoost.
train_features = train_input.sum(axis=1)
eval_features = eval_input.sum(axis=1)

train_data = xgb.DMatrix(train_features, label=train_label)
eval_data = xgb.DMatrix(eval_features, label=eval_label)

# Watch-list passed to xgb.train: (matrix, display name) pairs.
data_list = [
    (train_data, 'train'),
    (eval_data, 'valid'),
]

In [32]:
# Start with an empty XGBoost parameter dict; entries are filled in afterwards.
params = dict()

In [33]:
# Binary classification with logistic output (predictions are probabilities);
# RMSE is the metric reported on the watch-list during training.
params.update({
    'objective': 'binary:logistic',
    'eval_metric': 'rmse',
})

In [34]:
# Boost for at most 1000 rounds, reporting the metric on every watch-list entry
# each round, and stop early after 10 rounds without improvement.
# NOTE(review): per the XGBoost docs, early stopping monitors the last pair in
# `evals` (here the 'valid' set) — confirm for the installed version.
bst = xgb.train(
    params,
    train_data,
    num_boost_round=1000,
    evals=data_list,
    early_stopping_rounds=10,
)

[0]	train-rmse:0.48396	valid-rmse:0.48468
[1]	train-rmse:0.47388	valid-rmse:0.47504
[2]	train-rmse:0.46729	valid-rmse:0.46870
[3]	train-rmse:0.46257	valid-rmse:0.46440
[4]	train-rmse:0.45909	valid-rmse:0.46118
[5]	train-rmse:0.45610	valid-rmse:0.45835
[6]	train-rmse:0.45430	valid-rmse:0.45678
[7]	train-rmse:0.45252	valid-rmse:0.45520
[8]	train-rmse:0.45025	valid-rmse:0.45312
[9]	train-rmse:0.44930	valid-rmse:0.45234
[10]	train-rmse:0.44850	valid-rmse:0.45177
[11]	train-rmse:0.44674	valid-rmse:0.45024
[12]	train-rmse:0.44581	valid-rmse:0.44951
[13]	train-rmse:0.44470	valid-rmse:0.44866
[14]	train-rmse:0.44428	valid-rmse:0.44842
[15]	train-rmse:0.44363	valid-rmse:0.44793
[16]	train-rmse:0.44271	valid-rmse:0.44709
[17]	train-rmse:0.44240	valid-rmse:0.44691
[18]	train-rmse:0.44200	valid-rmse:0.44664
[19]	train-rmse:0.44176	valid-rmse:0.44651
[20]	train-rmse:0.44116	valid-rmse:0.44611
[21]	train-rmse:0.43980	valid-rmse:0.44491
[22]	train-rmse:0.43902	valid-rmse:0.44426
[23]	train-rmse:0.438

# Predict on test data

In [35]:
# File names (relative to DATA_IN_PATH) of the test arrays:
# first question, second question, and the row ids used in the output file.
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

In [45]:
# Load the test arrays. Passing the path (rather than an `open(...)` handle)
# lets np.load open AND close the file itself, so no file descriptors are leaked.
test_q1_data = np.load(DATA_IN_PATH + TEST_Q1_DATA_FILE)
test_q2_data = np.load(DATA_IN_PATH + TEST_Q2_DATA_FILE)
# SECURITY: allow_pickle=True unpickles the file's contents, which can execute
# arbitrary code. Only load this file if it was produced by a trusted pipeline.
test_id_data = np.load(DATA_IN_PATH + TEST_ID_DATA_FILE, allow_pickle=True)

In [46]:
# Build the test features the same way as the training features:
# stack q1/q2 on axis 1, then sum that axis away before wrapping in a DMatrix.
test_input = np.stack([test_q1_data, test_q2_data], axis=1)
test_features = test_input.sum(axis=1)
test_data = xgb.DMatrix(test_features)

In [47]:
# Training used early stopping, so the booster also contains the rounds trained
# AFTER the best validation score. Predict with only the trees up to and including
# the best iteration instead of every tree in the booster.
# (`iteration_range` requires XGBoost >= 1.4.)
test_predict = bst.predict(test_data, iteration_range=(0, bst.best_iteration + 1))

# Write predictions to CSV

In [55]:
import pandas as pd
import os

In [56]:
# Directory the prediction CSV is written to. Keep the trailing slash:
# the output path is built by plain string concatenation.
DATA_OUT_PATH = './data_out/'

In [57]:
# Create the output directory if needed. `exist_ok=True` avoids the
# check-then-create race and makes the cell safe to re-run.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

In [58]:
# Assemble the output table: one row per test id with its predicted
# duplicate probability.
output = pd.DataFrame(
    {
        'test_id': test_id_data,
        'is_duplicate': test_predict,
    }
)

In [59]:
# Write the predictions without the DataFrame index column.
submission_path = DATA_OUT_PATH + 'simple_xgb.csv'
output.to_csv(submission_path, index=False)