In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import skew
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
import xgboost as xgb



In [2]:
train = pd.read_csv('train.csv')

In [3]:
features = [c for c in train.columns if c not in['id','target']]

In [4]:
train[features] = train[features].fillna(-1000)

In [5]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
poly_matrix = poly.fit_transform(train[features])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(poly_matrix, train['target'], test_size=0.33, stratify=train['target'], random_state=42)

In [7]:
%%time
xgb = xgb.XGBClassifier()
boost = xgb.fit(X_train, y_train)

CPU times: user 47min 7s, sys: 54min 43s, total: 1h 41min 51s
Wall time: 2h 58min 37s


In [13]:
preds = boost.predict_proba(X_test)

In [14]:
def gini(solution, submission):
    df = zip(solution, submission, range(len(solution)))
    df = sorted(df, key=lambda x: (x[1],-x[2]), reverse=True)
    rand = [float(i+1)/float(len(df)) for i in range(len(df))]
    totalPos = float(sum([x[0] for x in df]))
    cumPosFound = [df[0][0]]
    for i in range(1,len(df)):
        cumPosFound.append(cumPosFound[len(cumPosFound)-1] + df[i][0])
    Lorentz = [float(x)/totalPos for x in cumPosFound]
    Gini = [Lorentz[i]-rand[i] for i in range(len(df))]
    return sum(Gini)

def normalized_gini(solution, submission):
    normalized_gini = gini(solution, submission)/gini(solution, solution)
    return normalized_gini

In [20]:
print preds[:, 1]

[ 0.0462359   0.03787738  0.01775254 ...,  0.03519754  0.03465168
  0.02416977]


In [22]:
print(normalized_gini(preds[:, 1], y_test))

0.0400183905614


In [None]:
%%time
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

In [None]:
%%time
params = {}
# params['eta'] = 0.1
params['objective'] = 'binary:logistic'
# params['eval_metric'] = 'auc'
# params['max_depth'] = 2
# params['silent'] = 1
# params['learning_rate'] = 0.1
# params['subsample'] = 0.8
# params['colsample_bytree'] = 1
# params['colsample_bylevel'] = 1

watchlist = [(d_train, 'train'), (d_test, 'test')]
boost = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=100, verbose_eval=10)

In [23]:
%%time
test = pd.read_csv('test.csv')
test_poly = poly.transform(test[features])

CPU times: user 54.3 s, sys: 31.7 s, total: 1min 26s
Wall time: 1min 52s


In [24]:
preds = boost.predict_proba(test_poly)

In [26]:
submission = pd.DataFrame()
submission['id'] = test.id
submission['target'] = preds[:,1]

In [27]:
submission.to_csv('submission_xgb.csv', index=False)