In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
% pylab inline

# Read the Pima Indians diabetes CSV (a binary classification task) into one 2-D array.
dataset = loadtxt("data.txt", delimiter=",")

# Columns 0-7 are the features (X); column 8 is the label (y).
X, y = dataset[:, :8], dataset[:, 8]

In [None]:
# Sanity check: show the dimensions of the feature matrix.
feature_shape = X.shape
print(feature_shape)

In [None]:
# Sanity check: show the dimensions of the label vector.
label_shape = y.shape
print(label_shape)

In [None]:
# Labels are binary: 0 = no diabetes, 1 = diabetes. Peek at the first five.
y[0:5]

In [None]:
# Hold out 33% of the rows for testing (train : test = 0.67 : 0.33).
# A fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=7,
)

In [None]:
# eval_metric can be rmse, logloss, error, auc, merror, mlogloss, or a custom-defined metric.
# early_stopping_rounds and eval_metric are configured on the estimator instead of being
# passed to fit(): the fit() keyword arguments were deprecated in XGBoost 1.6 and later removed.
# NOTE(review): the test split doubles as the early-stopping evaluation set, so the accuracy
# measured on X_test afterwards is optimistically biased; consider carving a separate
# validation split out of X_train.
eval_set = [(X_test, y_test)]
model = XGBClassifier(early_stopping_rounds=10, eval_metric="auc")
model.fit(X_train, y_train, eval_set=eval_set, verbose=True)

In [None]:
# model.predict_proba returns the class probabilities;
# model.predict returns the predicted label (using 0.5 as the threshold).
# The probabilities get their own name so that `y_pred` only ever holds hard labels,
# which keeps the accuracy cell correct even if cells are run out of order.
y_pred_proba = model.predict_proba(X_test)
y_pred_proba[:5]

In [None]:
# Hard class labels from model.predict, scored against the held-out labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# Importance scores the fitted model assigns to each input feature.
importances = model.feature_importances_
print(importances)

In [None]:
# plot_importance visualizes the fitted model's feature importances.
from xgboost import plot_importance

plot_importance(booster=model)
plt.show()

In [None]:
!pip install graphviz

In [None]:
from matplotlib.pylab import rcParams
from xgboost import plot_tree

# Draw a single tree from the boosted ensemble; num_trees selects which tree to plot.
plot_tree(model, num_trees=1)
# Optional: title the figure and save it to disk.
# plt.title("max_depth = 100, with gamma = 10")
# plt.savefig("tree_with_max_depth_gamma", dpi = 700)

## References for XGBoost

[A well-explained introduction to gradient boosting](http://blog.kaggle.com/2017/01/23/a-kaggle-master-explains-gradient-boosting/)

[Analytics Vidhya guide to XGBoost parameter tuning](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/)

[How parallel XGBoost works](http://zhanpengfang.github.io/418home.html)

[How to tune gamma](https://medium.com/data-design/xgboost-hi-im-gamma-what-can-i-do-for-you-and-the-tuning-of-regularization-a42ea17e6ab6)

[slides for xgboost by tqchen](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)

[slides for xgboost by kaggle winner](https://www.slideshare.net/ShangxuanZhang/kaggle-winning-solution-xgboost-algorithm-let-us-learn-from-its-author)

[xgboost 理論與參數介紹](http://odjt9j2ec.bkt.clouddn.com/xgboost-xgboost%E5%AF%BC%E8%AF%BB%E5%92%8C%E5%AE%9E%E6%88%98.pdf)
