In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Relative path to the directory that holds the input CSV files.
DATA_DIR = "../../data"

# Column names used when reading the CSVs.
ID_COLUMN = 'Id'  # row identifier, read from both the train and test files
TARGET_COLUMN = 'Response'  # label column, read from the train file only

In [3]:
SEED = 0  # seed value; NOTE(review): confirm every stochastic step actually uses this constant
CHUNKSIZE = 50000  # rows per chunk when the date CSVs are streamed
NROWS = 250000  # row cap for partial runs; NOTE(review): no active code in view reads it

In [4]:
# Full paths of the four input files, all located under DATA_DIR.
TRAIN_NUMERIC = "%s/train_numeric.csv" % DATA_DIR
TRAIN_DATE = "%s/train_date.csv" % DATA_DIR

TEST_NUMERIC = "%s/test_numeric.csv" % DATA_DIR
TEST_DATE = "%s/test_date.csv" % DATA_DIR

# NOTE(review): not referenced by any other cell in view; presumably an output-file stem.
FILENAME = "etimelhoods"

In [5]:
# Read only the Id and Response columns of the train file (all rows; no nrows limit is applied).
train = pd.read_csv(TRAIN_NUMERIC, usecols=[ID_COLUMN, TARGET_COLUMN])

# The test file is read for its Id column only.
test = pd.read_csv(TEST_NUMERIC, usecols=[ID_COLUMN])

In [6]:
# Add a StartTime column to both frames, pre-filled with the placeholder -1.
for frame in (train, test):
    frame["StartTime"] = -1

In [7]:
# Preview the first rows: Id, Response and the StartTime placeholder.
train.head()

Unnamed: 0,Id,Response,StartTime
0,4,0,-1
1,6,0,-1
2,7,0,-1
3,9,0,-1
4,11,0,-1


In [8]:
# (rows, columns) of the test frame.
test.shape

(1183748, 2)

In [9]:
# Date dataset: give every row the smallest value found across its date
# columns (NaN when the row has no values at all).
#
# Train and test are processed independently and matched by Id. Zipping the
# two chunk readers together would silently stop at the shorter file, and
# writing values by position would depend on both files sharing row order.
def _earliest_timestamps(path, chunksize=CHUNKSIZE):
    """Return a Series, indexed by Id, of the row-wise minimum over the date columns.

    The CSV at `path` is streamed in pieces of `chunksize` rows. Every column
    other than ID_COLUMN takes part in the minimum.
    """
    pieces = []
    for chunk in pd.read_csv(path, chunksize=chunksize):
        date_cols = chunk.columns.difference([ID_COLUMN])
        pieces.append(pd.Series(chunk[date_cols].min(axis=1).values,
                                index=chunk[ID_COLUMN].values))
    if not pieces:
        # no data rows: nothing to match against
        return pd.Series(dtype=float)
    return pd.concat(pieces)


for frame, date_path in ((train, TRAIN_DATE), (test, TEST_DATE)):
    start_by_id = _earliest_timestamps(date_path)
    known = frame[ID_COLUMN].isin(start_by_id.index)
    # The placeholder column holds integer -1; cast up front so the float
    # timestamps are written into a column of matching dtype.
    frame['StartTime'] = frame['StartTime'].astype(float)
    # Ids missing from the date file keep the -1 placeholder.
    # NOTE: Series.map requires the Ids in the date file to be unique.
    frame.loc[known, 'StartTime'] = frame.loc[known, ID_COLUMN].map(start_by_id).values

In [11]:
# Re-check the test frame's shape after StartTime has been filled in.
test.shape

(1183748, 2)

In [None]:
# NOTE(review): repeats the shape check from the cell directly above; unexecuted.
test.shape

In [None]:
# Stack train on top of test so Id differences can be taken across both sets.
# ntrain remembers where the train rows end. The 'index' column created by
# reset_index() records each row's position in the stacked frame.
ntrain = len(train)
train_test = pd.concat([train, test], ignore_index=True).reset_index()

In [None]:
# Last rows of the stacked frame (these come from the test set).
train_test.tail()

In [None]:
# (rows, columns) of the stacked train+test frame.
train_test.shape

In [None]:
# Id gap to the previous row (magic1) and to the next row (magic2), with rows
# in their current stacked order. The first/last row has no neighbour, so its
# missing gap is replaced by 9999999.
ids_stacked = train_test[ID_COLUMN]
train_test['magic1'] = ids_stacked.diff().fillna(9999999).astype(int)
gap_from_next = ids_stacked.iloc[::-1].diff()
train_test['magic2'] = gap_from_next.fillna(9999999).astype(int)

In [None]:
# Preview the frame with the magic1 / magic2 columns added.
train_test.head()

In [None]:
# Reorder rows by start time, breaking ties by Id (ascending is the default).
train_test = train_test.sort_values(['StartTime', 'Id'])

In [None]:
# Preview the frame after sorting by StartTime and Id.
train_test.head()

In [None]:
# Same Id-gap features as magic1/magic2, computed on the current row order
# (at this point the frame has been sorted by StartTime, then Id).
ids_sorted = train_test[ID_COLUMN]
train_test['magic3'] = ids_sorted.diff().fillna(9999999).astype(int)
gap_from_next_sorted = ids_sorted.iloc[::-1].diff()
train_test['magic4'] = gap_from_next_sorted.fillna(9999999).astype(int)


In [None]:
# Preview the frame with the magic3 / magic4 columns added.
train_test.head()

In [None]:
# Put rows back in their stacked order using the saved 'index' column, then
# discard that helper column.
train_test = train_test.sort_values(by='index').drop('index', axis=1)
# The first ntrain rows are the training rows.
train = train_test.iloc[:ntrain]

In [None]:
# Preview the training rows together with the engineered columns.
train.head()

In [None]:
# Model inputs: every column except the label and the identifier
# (np.setdiff1d returns the names sorted and de-duplicated).
features = np.setdiff1d(train.columns.tolist(), [TARGET_COLUMN, ID_COLUMN])

In [None]:
# Show the selected feature names.
features

In [None]:
# Length of the label column for the training rows.
train.Response.shape

In [None]:
# Label vector. `.values` returns the same 1-D array that Series.ravel() did;
# ravel() on a Series is deprecated in current pandas.
y = train.Response.values
# Feature matrix. NOTE: `train` is rebound from a DataFrame to an ndarray here,
# so the cells above must be re-run before this cell can be executed again.
train = np.array(train[features])

In [None]:
# Report the feature-matrix shape, then compute the mean of the labels.
print('train: {0}'.format(train.shape))
prior = np.sum(y) / float(len(y))

In [32]:
# Scratch check: `+` on two lists yields a new list with both contents in order.
a = [1, 2]
b = [3, 4]
a + b

[1, 2, 3, 4]

In [12]:
# Build the Id-gap features over train+test and assemble the training matrix.
ntrain = train.shape[0]
# The second reset_index keeps each row's stacked position in an 'index'
# column, so the original order can be restored after sorting.
train_test = pd.concat((train, test)).reset_index(drop=True).reset_index(drop=False)

# Replaces the undefined gap of the first/last row, which has no neighbour.
GAP_FILL = 9999999


def _id_gaps(ids):
    """Return (gap to previous row, gap to next row) for the Series `ids`.

    Both results are integer Series carrying the index of `ids`; the missing
    gap at either end is replaced by GAP_FILL.
    """
    to_previous = ids.diff().fillna(GAP_FILL).astype(int)
    to_next = ids.iloc[::-1].diff().fillna(GAP_FILL).astype(int)
    return to_previous, to_next


# Raw strings keep the backslash in the column names without relying on the
# invalid escape sequence '\_'.
# Gaps between neighbours in stacked (file) order.
train_test[r'0_¯\_(ツ)_/¯_1'], train_test[r'0_¯\_(ツ)_/¯_2'] = _id_gaps(train_test[ID_COLUMN])

train_test = train_test.sort_values(by=['StartTime', 'Id'], ascending=True)

# Gaps between neighbours once rows are ordered by StartTime, then Id.
train_test[r'0_¯\_(ツ)_/¯_3'], train_test[r'0_¯\_(ツ)_/¯_4'] = _id_gaps(train_test[ID_COLUMN])

# Back to stacked order; the first ntrain rows are the training rows.
train_test = train_test.sort_values(by=['index']).drop(['index'], axis=1)
train = train_test.iloc[:ntrain, :]

# Every column except the label and the identifier is a model input.
features = np.setdiff1d(list(train.columns), [TARGET_COLUMN, ID_COLUMN])

# `.values` replaces Series.ravel(), which current pandas deprecates.
y = train[TARGET_COLUMN].values
# NOTE: `train` becomes an ndarray here, so this cell is not re-runnable
# without first rebuilding the `train` DataFrame.
train = np.array(train[features])

print('train: {0}'.format(train.shape))
# Mean of the labels; used as the base score of the model.
prior = np.sum(y) / (1.*len(y))

train: (1183747, 5)


In [13]:
# Booster settings for the cross-validation run.
xgb_params = {
    'seed': SEED,  # same value as the previous literal 0, now taken from the config cell
    'colsample_bytree': 0.7,
    'silent': 1,  # NOTE(review): newer xgboost releases use 'verbosity' instead; confirm against the installed version
    'subsample': 0.7,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 2,
    'eval_metric': 'auc',
    'base_score': prior  # start from the mean of the labels
}


dtrain = xgb.DMatrix(train, label=y)
# 4-fold CV over 10 boosting rounds.
res = xgb.cv(xgb_params, dtrain, num_boost_round=10, nfold=4, seed=SEED, early_stopping_rounds=10, verbose_eval=True)

# Held-out AUC (mean and std across folds) at the last round. The columns are
# selected by name because the column order of the xgb.cv result is not the
# same in every xgboost release, so positions 0/1 may be the train metric.
cv_mean = res['test-auc-mean'].iloc[-1]
cv_std = res['test-auc-std'].iloc[-1]

[0]	train-auc:0.548162+0.0017663	test-auc:0.540955+0.00236347
[1]	train-auc:0.76466+0.109172	test-auc:0.758706+0.113094
[2]	train-auc:0.833123+0.0970387	test-auc:0.828825+0.0973951
[3]	train-auc:0.836255+0.0976415	test-auc:0.831369+0.097993
[4]	train-auc:0.836691+0.0975994	test-auc:0.831106+0.0974974
[5]	train-auc:0.892419+0.00272637	test-auc:0.88811+0.00124845
[6]	train-auc:0.892988+0.00245378	test-auc:0.887921+0.00123
[7]	train-auc:0.894391+0.00165457	test-auc:0.888711+0.00202522
[8]	train-auc:0.895345+0.00223148	test-auc:0.889483+0.00135385
[9]	train-auc:0.895829+0.00235693	test-auc:0.890209+0.00103926


In [30]:
# Display the DMatrix object (only its repr is shown).
dtrain

<xgboost.core.DMatrix at 0x113baf190>

In [None]:
# Show the full cross-validation result table.
res

In [None]:
# Value taken from the last row, first column of the CV table.
cv_mean

In [None]:
# Value taken from the last row, second column of the CV table.
cv_std

In [14]:
# Position of the boosting round with the highest (mean test AUC - std).
# `.ix` has been removed from pandas, and positional columns depend on the
# xgboost release, so the columns are selected by name instead.
np.argmax((res['test-auc-mean'] - res['test-auc-std']).values)

9

In [15]:
# Mean test AUC minus its std at round 9, the round reported by the argmax
# cell above. Uses named columns and .iloc because `.ix` has been removed
# from pandas.
(res['test-auc-mean'] - res['test-auc-std']).iloc[9]

0.88917023932528938