In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import (cross_val_score, train_test_split,
                                    GridSearchCV, RandomizedSearchCV)
from sklearn.preprocessing import Imputer
from mlutils import *
%load_ext autoreload
%autoreload 2

# Random seed used by the models below.
# The first round of results was produced with SEED = 42; it was changed to 25 to vary things.
SEED = 25

In [2]:
# Read the train/test CSV files and drop the 'id' column from each.
train = pd.read_csv('data/train_final.csv').drop(['id'], axis=1)
test = pd.read_csv('data/test_final.csv').drop(['id'], axis=1)

# Separate the target column 'Y' from the remaining feature columns.
y = train['Y']
X = train.drop(['Y'], axis=1)
# The test file is used as-is for the test features.
X_test = test

In [3]:
# Impute missing features with the per-column mean.
#
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22, so prefer `SimpleImputer` and only fall back to the legacy
# class on installations that predate it. Both configurations replace NaN
# with the mean of each column.
# NOTE(review): the import cell at the top of the notebook still imports
# `Imputer` directly; it needs the same treatment before the notebook can run
# on scikit-learn >= 0.22.
try:
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(strategy='mean')  # missing_values defaults to NaN
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Alternative that was tried and left disabled (fill with the most frequent value):
# train = train.apply(lambda x: x.fillna(x.value_counts().index[0]))

# TODO: Impute dataframe so that F5 uses median
# and F19 uses mean. For now, we'll impute via mean for both.

# Learn the column means from the training features only, then apply those
# same means to the test features.
train_xform = imp.fit_transform(X)
test_xform = imp.transform(X_test)

# The imputer returns plain arrays; wrap them back into DataFrames, keeping
# the original column labels and row index so rows stay aligned with `y`.
X = pd.DataFrame(train_xform, columns=X.columns, index=X.index)
X_test = pd.DataFrame(test_xform, columns=X_test.columns, index=X_test.index)

# Split data.
# No validation split is held out; the full set is used for training.
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=SEED)
X_train = X
y_train = y

# Last expression of the cell so the preview is actually displayed.
X.head(5)

In [4]:
import xgboost as xgb
from sklearn.ensemble import BaggingClassifier

# For this example every labelled example is used for training, accepting
# the risk of overfitting to the public leaderboard.
SEED = 25

# Settings for the base XGBoost classifier.
# Options left unset (commented out in earlier revisions):
#   tree_method='exact', booster='gbtree', eval_metric='auc'
params = dict(
    max_depth=5,
    learning_rate=0.025,
    n_estimators=724,
    silent=False,
    objective='binary:logistic',
    nthread=4,
    gamma=0.6000000000000001,
    min_child_weight=2.0,
    subsample=0.9,
    colsample_bytree=0.5,
    seed=SEED,
)

# Base learner, then a bagging ensemble of 10 copies of it.
gbm_model = xgb.XGBClassifier(**params)
bag_clf = BaggingClassifier(gbm_model, n_estimators=10, random_state=SEED)

In [5]:
# Cross-validate the bagged ensemble with cv=5, scoring each fold by ROC AUC,
# using a single job.
scores = cross_val_score(estimator=bag_clf, X=X, y=y,
                         scoring='roc_auc', cv=5, n_jobs=1)

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


In [9]:
# Print the mean of the cross-validation scores, then their standard deviation.
for cv_stat in (scores.mean(), scores.std()):
    print(cv_stat)

0.858460936701
0.00849036237219


In [12]:
# Build a fresh 10-estimator bagging ensemble around the base model.
bag_clf = BaggingClassifier(gbm_model, n_estimators=10, random_state=SEED)

# Fit on all of X / y (fit returns the estimator itself), then keep column 1
# of the predicted class probabilities for the test features.
predictions = bag_clf.fit(X, y).predict_proba(X_test)[:, 1]

In [14]:
# Description of the model being saved: the base XGBoost settings plus the
# number of bagged estimators.
# Options left unset: tree_method='exact', booster='gbtree', eval_metric='auc'
params = {
    'max_depth': 5,
    'learning_rate': 0.025,
    'n_estimators': 724,
    'silent': False,
    'objective': 'binary:logistic',
    'nthread': 4,
    'gamma': 0.6000000000000001,
    'min_child_weight': 2.0,
    'subsample': 0.9,
    'colsample_bytree': 0.5,
    'seed': SEED,
    'n_bagged_estimators': 10,
}

# Cross-validation score obtained before picking this model and training it
# on all data. Read it from `scores` instead of hand-copying the printed
# values (0.858460936701 / 0.00849036237219), so the saved description cannot
# drift from the actual run; this matches how the 30-estimator model is saved.
# Assumes `scores` still holds the 10-estimator cross-validation result.
score = scores.mean()
std = scores.std()
model_name = 'xgb_bagged_10'
outfile = 'output/{}.csv'.format(model_name)

save_model_and_desc(bag_clf, model_name, params,
                    score, std, SEED)
write_results(outfile, predictions)

## Let's try bagging with more XGBoost estimators, and change the seed.
## Try 30 bagged estimators with SEED = 100.

In [5]:
# New seed for this experiment.
SEED = 100

# Base XGBoost settings; the bagging size is configured on the ensemble below
# rather than in this dictionary.
params = {
    # tree shape / boosting schedule
    'max_depth': 5,
    'learning_rate': 0.025,
    'n_estimators': 724,
    # runtime and objective
    'silent': False,
    'objective': 'binary:logistic',
    'nthread': 4,
    # regularisation and sampling
    'gamma': 0.6000000000000001,
    'min_child_weight': 2.0,
    'subsample': 0.9,
    'colsample_bytree': 0.5,
    'seed': SEED,
}

# Base learner wrapped in a bagging ensemble of 30 estimators.
gbm_model = xgb.XGBClassifier(**params)
bag_clf = BaggingClassifier(gbm_model, n_estimators=30, random_state=SEED)

In [6]:
# Cross-validate the current ensemble (cv=5, ROC AUC, single job) and print
# the mean score followed by its standard deviation.
scores = cross_val_score(estimator=bag_clf, X=X, y=y,
                         scoring='roc_auc', cv=5, n_jobs=1)
for cv_stat in (scores.mean(), scores.std()):
    print(cv_stat)

0.859178536779
0.00802806487255


In [7]:
# Fit the current bagged ensemble on all of X / y (fit returns the estimator),
# then keep column 1 of the predicted class probabilities for the test features.
predictions = bag_clf.fit(X, y).predict_proba(X_test)[:, 1]


In [8]:
# Description of the model being saved: base XGBoost settings plus the number
# of bagged estimators.
# Options left unset: tree_method='exact', booster='gbtree', eval_metric='auc'
params = dict(
    max_depth=5,
    learning_rate=0.025,
    n_estimators=724,
    silent=False,
    objective='binary:logistic',
    nthread=4,
    gamma=0.6000000000000001,
    min_child_weight=2.0,
    subsample=0.9,
    colsample_bytree=0.5,
    seed=SEED,
    n_bagged_estimators=30,
)

# Cross-validation statistics gathered before choosing this model and
# training it on all data.
score, std = scores.mean(), scores.std()

model_name = 'xgb_bagged_30'
outfile = 'output/{}.csv'.format(model_name)

# Persist the fitted ensemble with its description, then write the test predictions.
save_model_and_desc(bag_clf, model_name, params, score, std, SEED)
write_results(outfile, predictions)

In [10]:
import xgboost as xgb
from sklearn.ensemble import BaggingClassifier

# For this example every labelled example is used for training, accepting
# the risk of overfitting to the public leaderboard.

# Settings for the base XGBoost classifier; SEED is whatever value is
# currently defined in the notebook.
# Options left unset: tree_method='exact', booster='gbtree', eval_metric='auc'
params_xgb = dict(
    max_depth=5,
    learning_rate=0.025,
    n_estimators=724,
    silent=False,
    objective='binary:logistic',
    nthread=4,
    gamma=0.6000000000000001,
    min_child_weight=2.0,
    subsample=0.9,
    colsample_bytree=0.5,
    seed=SEED,
)

# Base learner wrapped in a bagging ensemble of 20 estimators.
gbm_model = xgb.XGBClassifier(**params_xgb)
bag_clf = BaggingClassifier(gbm_model, n_estimators=20, random_state=SEED)
# predictions = gbm_model.predict(xgb.DMatrix(X_test))

In [None]:
from sklearn.model_selection import StratifiedKFold

# Toy illustration of the index splits produced by StratifiedKFold.
# Demo-specific names are used on purpose: binding the toy data to X / y, or
# the loop variables to train / test, would overwrite the notebook's real
# datasets and break any earlier cell that is re-run afterwards.
demo_X = np.ones(10)
demo_y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
skf = StratifiedKFold(n_splits=3)
# Each iteration yields the train indices and the test indices of one fold.
for train_idx, test_idx in skf.split(demo_X, demo_y):
    print("%s %s" % (train_idx, test_idx))