In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
import sklearn
from IPython.core.display import display
%matplotlib inline

In [2]:
from sklearn import cross_validation as cv
from sklearn import metrics as mtr

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.grid_search import GridSearchCV

 
from graphviz import Digraph

# Data Load

In [3]:
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/'\
                   'pima-indians-diabetes/pima-indians-diabetes.data', header=None)

In [5]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
X = data.drop(8,axis=1)
Y = data[8]

In [12]:
#Cross Validation
X_train, X_test, Y_train, Y_test = cv.train_test_split(X, Y, random_state=0)
kf = cv.KFold(n=len(X_train), n_folds=5, shuffle=True)

#parameter select
param_grid = {
    'learning_rate':[0.1],
    'n_estimators':[1000],
    'max_depth':[3,5],
    'min_child_weight':[1,2,3],
    'max_delta_step':[5],
    'gamma':[0,3,10],
    'subsample':[0.8],
    'colsample_bytree':[0.8],
    'objective':['binary:logistic'],
    'nthread':[4],
    'scale_pos_weight':[1],
    'seed':[0]}

#model fit
clf = GridSearchCV(XGBClassifier(), param_grid=param_grid, cv=kf, scoring='roc_auc')
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

print("Best parameters: %s" % clf.best_params_)
print("Best auroc score: %s" % clf.best_score_)

print()

print("Grid scores on development set:")
print()
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, scores.std() * 2, params))
print()

# print("The scores are computed on the full evaluation set.")
# print()
# print(mtr.classification_report(Y_test, clf.predict(X_test)))

Best parameters: {'max_depth': 5, 'subsample': 0.8, 'seed': 0, 'colsample_bytree': 0.8, 'n_estimators': 1000, 'objective': 'binary:logistic', 'gamma': 10, 'nthread': 4, 'max_delta_step': 5, 'scale_pos_weight': 1, 'learning_rate': 0.1, 'min_child_weight': 1}
Best auroc score: 0.82347106922

Grid scores on development set:

0.759 (+/-0.056) for {'max_depth': 3, 'subsample': 0.8, 'seed': 0, 'colsample_bytree': 0.8, 'n_estimators': 1000, 'objective': 'binary:logistic', 'gamma': 0, 'nthread': 4, 'max_delta_step': 5, 'scale_pos_weight': 1, 'learning_rate': 0.1, 'min_child_weight': 1}
0.756 (+/-0.062) for {'max_depth': 3, 'subsample': 0.8, 'seed': 0, 'colsample_bytree': 0.8, 'n_estimators': 1000, 'objective': 'binary:logistic', 'gamma': 0, 'nthread': 4, 'max_delta_step': 5, 'scale_pos_weight': 1, 'learning_rate': 0.1, 'min_child_weight': 2}
0.748 (+/-0.054) for {'max_depth': 3, 'subsample': 0.8, 'seed': 0, 'colsample_bytree': 0.8, 'n_estimators': 1000, 'objective': 'binary:logistic', 'gamma':

