In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [4]:
# From anokas' code: https://www.kaggle.com/anokas/simple-xgboost-btb-0-27

# Read in our input data
df_train = pd.read_csv('input/train.csv')
df_test = pd.read_csv('input/test.csv')

In [10]:
# This prints out (rows, columns) in each dataframe
col_to_drop = df_train.columns[df_train.columns.str.startswith('ps_calc_')]
df_train = df_train.drop(col_to_drop, axis=1)  
df_test = df_test.drop(col_to_drop, axis=1)  

# print('Columns:', df_train.columns)

y_train = df_train['target'].values
id_train = df_train['id'].values
id_test = df_test['id'].values

# We drop these variables as we don't want to train on them
# The other 57 columns are all numerical and can be trained on without preprocessing
x_train = df_train.drop(['target', 'id'], axis=1)
x_test = df_test.drop(['id'], axis=1)

In [11]:
# Take a random 20% of the dataset as validation data
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)
print('Train samples: {} Validation samples: {}'.format(len(x_train), len(x_valid)))

# Convert our data into XGBoost format
d_train = xgb.DMatrix(x_train, y_train)
d_valid = xgb.DMatrix(x_valid, y_valid)
d_test = xgb.DMatrix(x_test)

# Set xgboost parameters
params = {}
params['objective'] = 'binary:logistic'
params['eta'] = 0.02
params['silent'] = True
params['max_depth'] = 6
params['subsample'] = 0.9
params['colsample_bytree'] = 0.9

Train samples: 476169 Validation samples: 119043


In [12]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

# Create an XGBoost-compatible metric from Gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return [('gini', gini_score)]

In [13]:
# This is the data xgboost will test on after eachboosting round
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train the model! We pass in a max of 10,000 rounds (with early stopping after 100)
# and the custom metric (maximize=True tells xgb that higher metric is better)
mdl = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, feval=gini_xgb, maximize=True, verbose_eval=10)

[0]	train-gini:0.226478	valid-gini:0.220199
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[10]	train-gini:0.253798	valid-gini:0.242416
[20]	train-gini:0.259063	valid-gini:0.245434
[30]	train-gini:0.262097	valid-gini:0.245663
[40]	train-gini:0.268038	valid-gini:0.247867
[50]	train-gini:0.270511	valid-gini:0.248975
[60]	train-gini:0.272516	valid-gini:0.250108
[70]	train-gini:0.27573	valid-gini:0.251192
[80]	train-gini:0.278004	valid-gini:0.251764
[90]	train-gini:0.279618	valid-gini:0.252819
[100]	train-gini:0.281354	valid-gini:0.254055
[110]	train-gini:0.283488	valid-gini:0.254082
[120]	train-gini:0.287626	valid-gini:0.2558
[130]	train-gini:0.291665	valid-gini:0.257609
[140]	train-gini:0.295994	valid-gini:0.259481
[150]	train-gini:0.300195	valid-gini:0.261713
[160]	train-gini:0.304912	valid-gini:0.26381
[170]	train-gini:0.308921	valid-gini:0.265896
[180]	train-gini:0.31317	valid-gini:0.267

In [14]:
# Predict on our test data
p_test = mdl.predict(d_test)

# Create a submission file
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = p_test
sub.to_csv('xgb1.csv', index=False)

sub.head()

Unnamed: 0,id,target
0,0,0.027588
1,1,0.027
2,2,0.024359
3,3,0.015029
4,4,0.03669
