## Day2 - Tuning XGBClassifier
Today, I will try to tune hyperparameters of XGBoots step by step

In [2]:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.model_selection import GridSearchCV
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb
from xgboost import XGBClassifier

In [3]:
data = pd.read_csv('train_final.csv')

In [4]:
X_pred_data = pd.read_csv('test_final.csv')
X_pred = np.asarray(X_pred_data.iloc[:, 1:25]).reshape(-1, 24)

In [5]:
X = np.asarray(data.iloc[:, 2:26]).reshape(-1, 24)
y = np.asarray(data.iloc[:, 1]).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

In [6]:
xgb1 = xgb.XGBClassifier()

In [7]:
xgb1.set_params(
    learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 scale_pos_weight=1
)
xgb1.fit(X_train, y_train.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.8, verbosity=1)

In [8]:
print('Accuracy: %f' % metrics.accuracy_score(y_test, xgb1.predict(X_test)))
print('AUC: %f' %metrics.roc_auc_score(y_test, xgb1.predict_proba(X_test)[:, 1]))

Accuracy: 0.963415
AUC: 0.852844


In [9]:
param_test1 = [
 {'max_depth':[4, 5, 6],
 'min_child_weight':[4, 5, 6],
 'learning_rate': [0.1],
 'n_estimators': [1000],
 'max_depth':[5],
 'min_child_weight':[1],
 'gamma':[0],
 'subsample':[0.8],
 'colsample_bytree':[0.8],
 'objective': ['binary:logistic'],
 'scale_pos_weight':[1]}
]

In [10]:
grid1 = GridSearchCV(xgb.XGBClassifier(), param_grid = param_test1, scoring='roc_auc', n_jobs=2, iid=False, cv=2)

In [11]:
grid1.fit(X_train, y_train.ravel())

GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=False, n_jobs=2,
       param_grid=[{'max_depth': [5], 'min_child_weight': [1], 'learning_rate': [0.1], 'n_estimators': [1000], 'gamma': [0], 'subsample': [0.8], 'colsample_bytree': [0.8], 'objective': ['binary:logistic'], 'scale_pos_weight': [1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [12]:
xgb2 = grid1.best_estimator_

In [13]:
xgb2.fit(X_train, y_train.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.8, verbosity=1)

In [14]:
print('Accuracy: %f' % metrics.accuracy_score(y_test, xgb2.predict(X_test)))
print('AUC: %f' %metrics.roc_auc_score(y_test, xgb2.predict_proba(X_test)[:, 1]))

Accuracy: 0.963415
AUC: 0.852844


In [15]:
# y_submission4 = pd.DataFrame(xgb2.predict_proba(X_pred)[:, 1], columns = ['Y']) 
# y_submission4['Id'] = X_pred_data['Id']
# y_submission4 = y_submission4.reindex(columns=["Id", "Y"])
# y_submission4.to_csv("submission4_uk734.csv", index=False)

In [16]:
# y_submission5 = pd.DataFrame(xgb2.predict_proba(X_pred)[:, 1], columns = ['Y']) 
# y_submission5['Id'] = X_pred_data['Id']
# y_submission5 = y_submission4.reindex(columns=["Id", "Y"])
# y_submission5.to_csv("submission5_uk734.csv", index=False)

In [17]:
param_test2 = [
 {'max_depth':[5],
 'min_child_weight':[1, 2, 3],
 'learning_rate': [0.1],
 'n_estimators': [1000],
 'max_depth':[5],
 'min_child_weight':[1],
 'gamma':[0],
 'subsample':[0.8],
 'colsample_bytree':[0.8],
 'objective': ['binary:logistic'],
 'scale_pos_weight':[1]}
]

In [18]:
grid2 = GridSearchCV(xgb.XGBClassifier(), param_grid = param_test2, scoring='roc_auc', n_jobs=2, iid=False, cv=2)

In [19]:
grid2.fit(X_train, y_train.ravel())

GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=False, n_jobs=2,
       param_grid=[{'max_depth': [5], 'min_child_weight': [1], 'learning_rate': [0.1], 'n_estimators': [1000], 'gamma': [0], 'subsample': [0.8], 'colsample_bytree': [0.8], 'objective': ['binary:logistic'], 'scale_pos_weight': [1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [20]:
xgb3 = grid2.best_estimator_
xgb3.fit(X_train, y_train.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.8, verbosity=1)

In [21]:
print('Accuracy: %f' % metrics.accuracy_score(y_test, xgb3.predict(X_test)))
print('AUC: %f' %metrics.roc_auc_score(y_test, xgb3.predict_proba(X_test)[:, 1]))

Accuracy: 0.963415
AUC: 0.852844


In [22]:
param_test3 = [
 {'max_depth':[5],
 'min_child_weight':[1],
 'learning_rate': [0.1],
 'n_estimators': [1000],
 'max_depth':[5],
 'min_child_weight':[1],
 'gamma':[0, 0.1, 0.2],
 'subsample':[0.8],
 'colsample_bytree':[0.8],
 'objective': ['binary:logistic'],
 'scale_pos_weight':[1]}
]

In [23]:
grid3 = GridSearchCV(xgb.XGBClassifier(), param_grid = param_test3, scoring='roc_auc', n_jobs=2, iid=False, cv=2)

In [24]:
grid3.fit(X_train, y_train.ravel())

GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=False, n_jobs=2,
       param_grid=[{'max_depth': [5], 'min_child_weight': [1], 'learning_rate': [0.1], 'n_estimators': [1000], 'gamma': [0, 0.1, 0.2], 'subsample': [0.8], 'colsample_bytree': [0.8], 'objective': ['binary:logistic'], 'scale_pos_weight': [1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [25]:
xgb4 = grid3.best_estimator_

In [26]:
xgb4.fit(X_train, y_train.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0.2,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.8, verbosity=1)

In [27]:
print('Accuracy: %f' % metrics.accuracy_score(y_test, xgb4.predict(X_test)))
print('AUC: %f' %metrics.roc_auc_score(y_test, xgb4.predict_proba(X_test)[:, 1]))

Accuracy: 0.964634
AUC: 0.867779


In [28]:
param_test4 = [
 {'max_depth':[5],
 'min_child_weight':[1],
 'learning_rate': [0.1],
 'n_estimators': [1000],
 'max_depth':[5],
 'min_child_weight':[1],
 'gamma':[0.1],
 'subsample':[0.7, 0.8, 0.9],
 'colsample_bytree':[0.7, 0.8, 0.9],
 'objective': ['binary:logistic'],
 'scale_pos_weight':[1]}
]

In [29]:
grid4 = GridSearchCV(xgb.XGBClassifier(), param_grid = param_test4, scoring='roc_auc', n_jobs=2, iid=False, cv=2, verbose = 1)

In [30]:
grid4.fit(X_train, y_train)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  18 out of  18 | elapsed:  3.6min finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=False, n_jobs=2,
       param_grid=[{'max_depth': [5], 'min_child_weight': [1], 'learning_rate': [0.1], 'n_estimators': [1000], 'gamma': [0.1], 'subsample': [0.7, 0.8, 0.9], 'colsample_bytree': [0.7, 0.8, 0.9], 'objective': ['binary:logistic'], 'scale_pos_weight': [1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [31]:
xgb5 = grid4.best_estimator_

In [32]:
xgb5.fit(X_train, y_train.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0.1,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.9, verbosity=1)

In [33]:
print('Accuracy: %f' % metrics.accuracy_score(y_test, xgb5.predict(X_test)))
print('AUC: %f' %metrics.roc_auc_score(y_test, xgb5.predict_proba(X_test)[:, 1]))

Accuracy: 0.960976
AUC: 0.865055


In [34]:
# y_submission6 = pd.DataFrame(xgb5.predict_proba(X_pred)[:, 1], columns = ['Y']) 
# y_submission6['Id'] = X_pred_data['Id']
# y_submission6 = y_submission6.reindex(columns=["Id", "Y"])
# y_submission6.to_csv("submission6_uk734.csv", index=False)

In [35]:
param_test5 = [
 {'max_depth':[5],
 'min_child_weight':[1],
 'learning_rate': [0.1],
 'n_estimators': [1000],
 'max_depth':[5],
 'min_child_weight':[1],
 'gamma':[0.1],
 'subsample':[0.9, 1.0],
 'colsample_bytree':[0.6, 0.7],
 'objective': ['binary:logistic'],
 'scale_pos_weight':[1]}
]

In [36]:
grid5 = GridSearchCV(xgb.XGBClassifier(), param_grid = param_test5, scoring='roc_auc', n_jobs=2, iid=False, cv=2, verbose = 2)

In [37]:
grid5.fit(X_train, y_train.ravel())

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:  1.8min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=False, n_jobs=2,
       param_grid=[{'max_depth': [5], 'min_child_weight': [1], 'learning_rate': [0.1], 'n_estimators': [1000], 'gamma': [0.1], 'subsample': [0.9, 1.0], 'colsample_bytree': [0.6, 0.7], 'objective': ['binary:logistic'], 'scale_pos_weight': [1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [38]:
xgb6 = grid5.best_estimator_

In [39]:
xgb6.fit(X_train, y_train.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.6, gamma=0.1,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1.0, verbosity=1)

In [40]:
print('Accuracy: %f' % metrics.accuracy_score(y_test, xgb6.predict(X_test)))
print('AUC: %f' %metrics.roc_auc_score(y_test, xgb6.predict_proba(X_test)[:, 1]))

Accuracy: 0.963415
AUC: 0.873227


In [41]:
param_test6 = [
 {'max_depth':[5],
 'min_child_weight':[1],
 'learning_rate': [0.1],
 'n_estimators': [1000],
 'max_depth':[5],
 'min_child_weight':[1],
 'gamma':[0.1],
 'subsample':[0.9],
 'colsample_bytree':[0.7],
 'objective': ['binary:logistic'],
 'scale_pos_weight':[1],
 'reg_alpha':[1e-2, 0.1, 1]}
]

In [42]:
grid6 = GridSearchCV(xgb.XGBClassifier(), param_grid = param_test6, scoring='roc_auc', n_jobs=2, iid=False, cv=2, verbose = 2)

In [43]:
grid6.fit(X_train, y_train.ravel())

Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:  1.3min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=False, n_jobs=2,
       param_grid=[{'max_depth': [5], 'min_child_weight': [1], 'learning_rate': [0.1], 'n_estimators': [1000], 'gamma': [0.1], 'subsample': [0.9], 'colsample_bytree': [0.7], 'objective': ['binary:logistic'], 'scale_pos_weight': [1], 'reg_alpha': [0.01, 0.1, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [44]:
xgb7 = grid6.best_estimator_

In [45]:
param_test7 = [
 {'max_depth':[5],
 'min_child_weight':[1],
 'learning_rate': [0.1],
 'n_estimators': [1000],
 'max_depth':[5],
 'min_child_weight':[1],
 'gamma':[0.1],
 'subsample':[0.9],
 'colsample_bytree':[0.7],
 'objective': ['binary:logistic'],
 'scale_pos_weight':[1],
 'reg_alpha':[1, 10]}
]

In [46]:
grid7 = GridSearchCV(xgb.XGBClassifier(), param_grid = param_test7, scoring='roc_auc', n_jobs=2, iid=False, cv=2, verbose = 2)

In [47]:
grid7.fit(X_train, y_train.ravel())

Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   4 out of   4 | elapsed:   45.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   4 out of   4 | elapsed:   45.0s finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=False, n_jobs=2,
       param_grid=[{'max_depth': [5], 'min_child_weight': [1], 'learning_rate': [0.1], 'n_estimators': [1000], 'gamma': [0.1], 'subsample': [0.9], 'colsample_bytree': [0.7], 'objective': ['binary:logistic'], 'scale_pos_weight': [1], 'reg_alpha': [1, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [48]:
xgb8 = grid7.best_estimator_
xgb8

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0.1,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=1, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.9, verbosity=1)

In [49]:
xgb8.fit(X_train, y_train.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0.1,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=1, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.9, verbosity=1)

In [50]:
print('Accuracy: %f' % metrics.accuracy_score(y_test, xgb8.predict(X_test)))
print('AUC: %f' %metrics.roc_auc_score(y_test, xgb8.predict_proba(X_test)[:, 1]))

Accuracy: 0.964634
AUC: 0.857572


In [51]:
# y_submission7 = pd.DataFrame(xgb8.predict_proba(X_pred)[:, 1], columns = ['Y']) 
# y_submission7['Id'] = X_pred_data['Id']
# y_submission7 = y_submission7.reindex(columns=["Id", "Y"])
# y_submission7.to_csv("submission7_uk734.csv", index=False)

In [52]:
param_test8 = [
 {'max_depth':[5],
 'min_child_weight':[1],
 'learning_rate': [0.01],
 'n_estimators': [1000],
 'max_depth':[5],
 'min_child_weight':[1],
 'gamma':[0.1],
 'subsample':[0.9],
 'colsample_bytree':[0.7],
 'objective': ['binary:logistic'],
 'scale_pos_weight':[1],
 'reg_alpha':[1]}
]

In [53]:
grid8 = GridSearchCV(xgb.XGBClassifier(), param_grid = param_test8, scoring='roc_auc', n_jobs=2, iid=False, cv=2, verbose = 2)

In [54]:
grid8.fit(X_train, y_train.ravel())

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   23.5s finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=False, n_jobs=2,
       param_grid=[{'max_depth': [5], 'min_child_weight': [1], 'learning_rate': [0.01], 'n_estimators': [1000], 'gamma': [0.1], 'subsample': [0.9], 'colsample_bytree': [0.7], 'objective': ['binary:logistic'], 'scale_pos_weight': [1], 'reg_alpha': [1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [55]:
xgb9 = grid8.best_estimator_

In [56]:
xgb9.fit(X_train, y_train.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0.1,
       learning_rate=0.01, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=1, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.9, verbosity=1)

In [57]:
# y_submission8 = pd.DataFrame(xgb9.predict_proba(X_pred)[:, 1], columns = ['Y']) 
# y_submission8['Id'] = X_pred_data['Id']
# y_submission8 = y_submission8.reindex(columns=["Id", "Y"])
# y_submission8.to_csv("submission8_uk734.csv", index=False)

In [58]:
print('Accuracy: %f' % metrics.accuracy_score(y_test, xgb9.predict(X_test)))
print('AUC: %f' %metrics.roc_auc_score(y_test, xgb9.predict_proba(X_test)[:, 1]))

Accuracy: 0.962195
AUC: 0.852187


In [59]:
param_test9 = [
 {'max_depth':[5],
 'min_child_weight':[1],
 'learning_rate': [0.01, 0.05, 0.1, 0.5],
 'n_estimators': [1000],
 'max_depth':[5],
 'min_child_weight':[1],
 'gamma':[0.1],
 'subsample':[0.9],
 'colsample_bytree':[0.7],
 'objective': ['binary:logistic'],
 'scale_pos_weight':[1],
 'reg_alpha':[1]}
]

In [60]:
grid9 = GridSearchCV(xgb.XGBClassifier(), param_grid = param_test9, scoring='roc_auc', n_jobs=2, iid=False, cv=2, verbose = 2)

In [61]:
grid9.fit(X_train, y_train.ravel())

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:  1.5min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=False, n_jobs=2,
       param_grid=[{'max_depth': [5], 'min_child_weight': [1], 'learning_rate': [0.01, 0.05, 0.1, 0.5], 'n_estimators': [1000], 'gamma': [0.1], 'subsample': [0.9], 'colsample_bytree': [0.7], 'objective': ['binary:logistic'], 'scale_pos_weight': [1], 'reg_alpha': [1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [62]:
xgb10 = grid9.best_estimator_
xgb10

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=1, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.9, verbosity=1)

In [63]:
xgb10.fit(X_train, y_train.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=1, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.9, verbosity=1)

In [64]:
print('Accuracy: %f' % metrics.accuracy_score(y_test, xgb10.predict(X_test)))
print('AUC: %f' %metrics.roc_auc_score(y_test, xgb10.predict_proba(X_test)[:, 1]))

Accuracy: 0.963415
AUC: 0.875826


In [65]:
# y_submission9 = pd.DataFrame(xgb10.predict_proba(X_pred)[:, 1], columns = ['Y']) 
# y_submission9['Id'] = X_pred_data['Id']
# y_submission9 = y_submission9.reindex(columns=["Id", "Y"])
# y_submission9.to_csv("submission9_uk734.csv", index=False)