In [None]:
# Notebook-wide configuration: everything a reader might want to tune lives here.

# Seed handed to the models below so that repeated runs give the same result.
SEED = 42

# Number of folds used wherever cross-validation is run.
CV_FOLDS = 5
# Number of CPUs given to parallel jobs (use one fewer than available to keep the machine responsive).
NUM_CPUS = 8

# Learning curves are built on at most this many leading samples of the training data.
LEARNING_CURVE_SIZE = 10000

# Whether to keep the categorical features when the data is prepared.
INCLUDE_CATEGORICAL = True

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import extraction
import util
from time import time


# Load the data through the project's extraction helper. The categorical
# features are dropped unless INCLUDE_CATEGORICAL is switched on.
X, y, X_holdout, ids = extraction.prepare_data(
    "./data/",
    drop_categorical=not INCLUDE_CATEGORICAL)

def create_submission(clf, submission_name):
    """Write a timestamped submission file for ``clf``.

    The output file name is ``submission_name`` followed by an underscore,
    the current ``time()`` value and the ``.csv`` extension. The model's
    string representation is recorded via ``util.note_submission_info`` and
    the submission itself is produced by ``util.build_submission`` from the
    notebook-level ``X_holdout`` and ``ids``.

    Parameters
    ----------
    clf : the fitted model to build the submission from
    submission_name : str, prefix of the generated file name
    """
    timestamp_suffix = "_{}.csv".format(time())
    out_name = submission_name + timestamp_suffix

    model_note = "Model: {}".format(clf)
    util.note_submission_info(model_note, out_name)
    util.build_submission(clf, X_holdout, ids, out_name)

    # Single-argument print(...) behaves the same as the print statement.
    print("Written {}".format(out_name))

In [None]:
# Quick sanity check of the training matrix: columns are features, rows are data points.
print("{} features, {} data points".format(X.shape[1], X.shape[0]))

In [None]:
# Quickfix: X_holdout is still broken (NaNs)!
import scipy.stats as stats
import numpy as np

# Replace every NaN in X_holdout with the mean of its column, computed while
# ignoring NaNs. np.nanmean is used instead of scipy.stats.nanmean, which was
# deprecated and later removed from SciPy.
# NOTE: a column consisting only of NaNs has a NaN mean and therefore stays NaN.
col_mean = np.nanmean(X_holdout, axis=0)
# (row_indices, column_indices) of all missing entries.
inds = np.where(np.isnan(X_holdout))
# Look up the column mean for each missing entry and write it back in place.
X_holdout[inds] = np.take(col_mean, inds[1])

In [None]:
from sklearn import metrics, cross_validation

def report_metrics(clf, X, y):
    """Print the cross-validated log loss of ``clf`` on ``(X, y)``.

    Runs ``CV_FOLDS``-fold cross-validation using ``NUM_CPUS`` parallel jobs
    and prints the mean and standard deviation of the per-fold log loss.

    Parameters
    ----------
    clf : estimator to evaluate
    X : training features
    y : training labels
    """
    # scikit-learn scorers follow a "greater is better" convention, so the
    # "log_loss" scorer yields the *negated* loss for each fold.
    neg_log_losses = cross_validation.cross_val_score(
        clf, X, y, scoring="log_loss", cv=CV_FOLDS, n_jobs=NUM_CPUS)

    # Flip the sign back so the number printed under "Log loss" really is the
    # (positive) log loss; the standard deviation is unaffected by the sign.
    log_losses = -neg_log_losses
    print("Log loss: %0.3f (+- %0.3f)" % (log_losses.mean(), log_losses.std()))

In [None]:
# X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=SEED)

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
# XGBoost model built through the scikit-learn style wrapper and fitted on
# the full training data.
xgb_clf = xgb.XGBClassifier(
    # ensemble size and step size
    n_estimators=50,
    learning_rate=0.01,
    # tree shape
    max_depth=7,
    # row / column sampling per tree
    subsample=0.8,
    colsample_bytree=0.68,
    # reg_lambda=1  (currently disabled)
    # runtime settings taken from the notebook configuration
    nthread=NUM_CPUS,
    seed=SEED)
xgb_clf.fit(X, y)

In [None]:
# report_metrics(xgb_clf, X, y) # does not work well with xgb
# create_submission(xgb_clf, "xgb")

In [None]:
# Learning curve of the XGBoost classifier, restricted to the first
# LEARNING_CURVE_SIZE samples and scored by cross-validated log loss.
# NOTE(review): `learning_curve` is not imported in any visible cell. It is
# presumably a project-local helper module like `extraction` / `util`;
# confirm it is imported before this cell runs.
fig = learning_curve.plot_learning_curve(
    title="Learning curve for XGBoost",
    # The original passed `gbt_clf`, which is never defined in this notebook;
    # the XGBoost classifier fitted above is the model this plot is about.
    estimator=xgb_clf,
    X=X[:LEARNING_CURVE_SIZE],
    y=y[:LEARNING_CURVE_SIZE],
    n_jobs=NUM_CPUS,
    cv=CV_FOLDS,
    scoring="log_loss")
fig.show()

In [None]:
# Same kind of model through xgboost's native API: wrap the training and
# hold-out data in DMatrix containers, then train a booster directly.
xgtrain = xgb.DMatrix(X, label=y)
xgtest = xgb.DMatrix(X_holdout)

# Booster settings for binary classification evaluated by log loss.
xgboost_params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "eval_metric": "logloss",
    "eta": 0.01,  # 0.06 was also tried
    # "min_child_weight": 240,
    "max_depth": 7,
    "subsample": 0.75,
    "colsample_bytree": 0.68
}

# Train for a fixed number of boosting rounds.
xgb_clf_2 = xgb.train(
    xgboost_params,
    xgtrain,
    num_boost_round=50,
    maximize=False,
    verbose_eval=True)

In [None]:
# Predict on the hold-out DMatrix with the booster trained via xgb.train.
# The original line called an undefined `clf`; `xgb_clf_2` is the booster
# that matches the `xgtest` DMatrix.
# `ntree_limit` is a number of trees, whereas `best_iteration` is a 0-based
# round index (and older xgboost releases only set it when early stopping was
# used). `best_ntree_limit` is the matching tree count where available;
# otherwise fall back to 0, which makes predict() use all trained trees.
test_preds = xgb_clf_2.predict(
    xgtest,
    ntree_limit=getattr(xgb_clf_2, "best_ntree_limit", 0))