In [2]:
# import package
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics 
from sklearn.grid_search import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
%matplotlib inline
chunksize = 100000

In [3]:
# define Idcol and Response Column
IDcol = "Id"
target = "Response"
# Sample the data to decide some important features
start_time = datetime.datetime.now()
date_chunks = pd.read_csv("../data/train_date.csv", index_col=0, chunksize=chunksize, dtype=np.float32)
num_chunks = pd.read_csv("../data/train_numeric.csv", index_col=0, chunksize=chunksize, dtype=np.float32)
train_value = pd.concat([pd.concat([dchunk, nchunk], axis=1).sample(frac=0.001)
               for dchunk, nchunk in zip(date_chunks, num_chunks)])
end_time = datetime.datetime.now()
print (end_time - start_time)

0:04:58.420949


In [4]:
# self-defined xgb model wrapper including data processing, model fit, predict and result   
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=20):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

# Xgboost starter

Specification:
1. learning rate (eta) = 0.1
2. n_estimator(nround in xgboost R) = 1000
3. max_depth = 4
4. subsample = 0.8
5. colsample_bytree = 0.8
6. objective = 'binary:logistic'(probability)
7. basescore = 0.005 (for imbalanced dataset)

In [6]:
#Choose all predictors except target & IDcols
predictors = [x for x in train_value.columns if x not in [target, IDcol]]
xgb_starter = XGBClassifier(
 learning_rate =0.01,
 n_estimators=2000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27,
 base_score=0.005)
modelfit(xgb_starter, train_value, predictors)

XGBoostError: [23:26:35] src/metric/rank_metric.cc:140: Check failed: !auc_error AUC: the dataset only contains pos or neg samples

# Tune max_depth and min_child_weight
result:failed (never run out for 4 hours)

In [None]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(train_value[predictors],train_value[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_