In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import skew
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
import xgboost as xgb
import gc

In [2]:
train = pd.read_csv('train.csv')

In [3]:
features = [c for c in train.columns if c not in['id','target']]

In [4]:
train[features] = train[features].fillna(-1000)

In [5]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
poly_matrix = poly.fit_transform(train[features])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(poly_matrix, train['target'], test_size=0.33, stratify=train['target'], random_state=42)

In [36]:
def gini(solution, submission):
    df = zip(solution, submission, range(len(solution)))
    df = sorted(df, key=lambda x: (x[1],-x[2]), reverse=True)
    rand = [float(i+1)/float(len(df)) for i in range(len(df))]
    totalPos = float(sum([x[0] for x in df]))
    cumPosFound = [df[0][0]]
    for i in range(1,len(df)):
        cumPosFound.append(cumPosFound[len(cumPosFound)-1] + df[i][0])
    Lorentz = [float(x)/totalPos for x in cumPosFound]
    Gini = [Lorentz[i]-rand[i] for i in range(len(df))]
    return sum(Gini)

def normalized_gini(solution, submission):
    normalized_gini = gini(solution, submission)/gini(solution, solution)
    return normalized_gini

In [8]:
%%time
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

CPU times: user 20.8 s, sys: 1min 2s, total: 1min 23s
Wall time: 3min 2s


In [10]:
del train; gc.collect()
del poly_matrix; gc.collect()

39

In [37]:
%%time
params = {}
params['eta'] = 0.1
params['objective'] = 'binary:logistic'
# params['max_depth'] = 2
# params['silent'] = 1
# params['learning_rate'] = 0.1
# params['subsample'] = 0.8
# params['colsample_bytree'] = 1
# params['colsample_bylevel'] = 1

watchlist = [(d_train, 'train'), (d_test, 'test')]
boost = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=100, verbose_eval=10)

[0]	train-error:0.036282	test-error:0.03658
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[10]	train-error:0.03639	test-error:0.036463
[20]	train-error:0.036397	test-error:0.036458
[30]	train-error:0.036395	test-error:0.036463
[40]	train-error:0.036395	test-error:0.036463
[50]	train-error:0.036392	test-error:0.036458
[60]	train-error:0.036387	test-error:0.036468
[70]	train-error:0.036375	test-error:0.036463
[80]	train-error:0.036367	test-error:0.036458
[90]	train-error:0.036352	test-error:0.036452
[100]	train-error:0.036337	test-error:0.036452
[110]	train-error:0.036302	test-error:0.036458
Stopping. Best iteration:
[15]	train-error:0.036387	test-error:0.036452

CPU times: user 1h 49min 12s, sys: 1h 45min 22s, total: 3h 34min 34s
Wall time: 6h 53min 22s


In [46]:
test_d = xgb.DMatrix(X_test)

In [48]:
preds = boost.predict(test_d, ntree_limit=boost.best_ntree_limit)

In [49]:
# print(normalized_gini(preds[:, 1], y_test))
print(normalized_gini(preds, y_test))

0.0359318337239


In [51]:
del X_train; del X_test; del y_train; del y_test; del d_train; del d_test; del test_d; gc.collect()

18

In [52]:
%%time
test = pd.read_csv('test.csv')
test[features] = test[features].fillna(-1000)
test_poly = poly.transform(test[features])
test_matrix = xgb.DMatrix(test_poly)

CPU times: user 1min 31s, sys: 2min 28s, total: 3min 59s
Wall time: 10min 27s


In [53]:
submission = pd.DataFrame()
submission['id'] = test.id

In [54]:
del test; del test_poly; gc.collect()

65

In [55]:
# preds = boost.predict_proba(test_poly)
preds = boost.predict(test_matrix, ntree_limit=boost.best_ntree_limit)

In [56]:
submission['target'] = preds

In [58]:
submission.to_csv('submission_xgb.csv', index=False)