<a href="https://colab.research.google.com/github/tuantran221/intro-Machine-Learning/blob/main/Extreme_Gradient_Boosting_with_XGBoost/CHAPTER_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extreme Gradient Boosting with XGBoost
**Chapter 1: Classification with XGBoost**


*  This chapter will introduce you to the fundamental idea behind XGBoost—boosted learners. Once you understand how XGBoost works, you’ll apply it to solve a common classification problem found in industry: predicting whether a customer will stop being a customer at some point in the future.


# Set up and read data

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [8]:
# Location of the churn dataset (a CSV file hosted on GitHub). The
# "?fbclid=..." click-tracking query string that had been pasted along with
# the link is dropped: it is not part of the file's path.
CHURN_DATA_URL = "https://raw.githubusercontent.com/thaile-isme/datasets-for-datacamp/main/churn_data.csv"

# Download the CSV into a DataFrame and preview the first five rows
churn_data = pd.read_csv(CHURN_DATA_URL)
churn_data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_inc_price,inc_pct,weekday_pct,fancy_car_user,city_Carthag,city_Harko,phone_iPhone,first_month_cat_more_1_trip,first_month_cat_no_trips,month_5_still_here
0,3.67,5.0,4.7,1.1,15.4,46.2,True,0,1,1,1,0,1
1,8.26,5.0,5.0,1.0,0.0,50.0,False,1,0,0,0,1,0
2,0.77,5.0,4.3,1.0,0.0,100.0,False,1,0,1,1,0,0
3,2.36,4.9,4.6,1.14,20.0,80.0,True,0,1,1,1,0,1
4,3.13,4.9,4.4,1.19,11.8,82.4,False,0,0,0,1,0,0


# Introducing XGBoost

In [11]:
# Import xgboost
import xgboost as xgb

# Create arrays for the features and the target: X, y
# Every column except the last is a feature; the last column is the label.
X, y = churn_data.iloc[:, :-1], churn_data.iloc[:, -1]

# Create the training and test sets (20% of the rows are held out for testing;
# random_state pins the shuffle so the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Instantiate the XGBClassifier: xg_cl
# The scikit-learn wrapper takes its seed through `random_state`; the older
# `seed` keyword is the legacy spelling of the same setting.
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, random_state=123)

# Fit the classifier to the training set
xg_cl.fit(X_train, y_train)

# Predict the labels of the test set: preds
preds = xg_cl.predict(X_test)

# Compute the accuracy: accuracy
# The mean of the element-wise "prediction == truth" flags is the fraction of
# correctly classified test rows.
accuracy = float(np.mean(preds == y_test))
print("accuracy: %f" % (accuracy))


accuracy: 0.743300


# Decision trees
**What is a decision tree?**

**Decision Trees**

*   Base learner - Individual learning algorithm in an ensemble algorithm
*   Composed of a series of binary questions
*   Predictions happen at the "leaves" of the tree
  *   leaf nodes always contain decision values

* Constructed iteratively (one decision at a time)
  * Until a stopping criterion is met
* Individual decision trees tend to overfit
  * low bias, high variance
  * tend to overfit training data, and generalize poorly to new data

**XGBoost**

* Uses classification and regression trees (CART)
* Each tree contains a real-valued score in every leaf
  * regardless of classification or regression problem
  * can be thresholded to convert into categories for classification problems 







In [13]:

# Decision trees

# Bring in the scikit-learn pieces used by the decision-tree example
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the breast-cancer dataset that ships with scikit-learn and pull out
# the feature matrix and the label vector
bc = datasets.load_breast_cancer()
X, y = bc.data, bc.target

# Display the dimensions of both arrays as the cell's output
(X.shape, y.shape)

((569, 30), (569,))

In [14]:
# Create the training and test sets (20% held out; seeded so the split is
# identical on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Instantiate the classifier: dt_clf_4
# max_depth=4 caps how deep the tree may grow. random_state is fixed as well:
# scikit-learn permutes the candidate features at each split, so without a
# seed two fits on the same data are not guaranteed to build the same tree.
dt_clf_4 = DecisionTreeClassifier(max_depth=4, random_state=123)

# Fit the classifier to the training set
dt_clf_4.fit(X_train, y_train)

# Predict the labels of the test set: y_pred_4
y_pred_4 = dt_clf_4.predict(X_test)

# Compute the accuracy of the predictions: accuracy
# The mean of the element-wise "prediction == truth" flags is the fraction of
# correctly classified test rows.
accuracy = float(np.mean(y_pred_4 == y_test))
print("accuracy:", accuracy)

accuracy: 0.9649122807017544


# Measuring accuracy

In [15]:
# Create arrays for the features and the target: X, y
# (X and y were last bound to the breast-cancer arrays, so rebuild them from
# the churn table: every column but the last is a feature, the last is the label.)
X, y = churn_data.iloc[:, :-1], churn_data.iloc[:, -1]

# Create the DMatrix: churn_dmatrix
# DMatrix is the data container that xgboost's native API (xgb.cv) consumes.
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
# "binary:logistic" is the objective for a two-class target and matches the
# objective given to the XGBClassifier earlier in this notebook.
params = {"objective": "binary:logistic", "max_depth": 3}

# Perform cross-validation: cv_results
# 3 folds, 5 boosting rounds, scored by classification error; seed fixes the
# fold assignment.
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=3, num_boost_round=5, metrics="error", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Print the accuracy
# Accuracy is 1 - error; take the value from the final boosting round.
print(((1 - cv_results["test-error-mean"]).iloc[-1]))

   train-error-mean  train-error-std  test-error-mean  test-error-std
0           0.28232         0.002366          0.28378        0.001932
1           0.26951         0.001855          0.27190        0.001932
2           0.25605         0.003213          0.25798        0.003963
3           0.25090         0.001845          0.25434        0.003827
4           0.24654         0.001981          0.24852        0.000934
0.75148


# Measuring AUC

In [16]:
# Repeat the 3-fold, 5-round cross-validation on the same DMatrix and
# parameter dictionary, this time scoring each round with AUC
cv_results = xgb.cv(
    params=params,
    dtrain=churn_dmatrix,
    num_boost_round=5,
    nfold=3,
    metrics="auc",
    as_pandas=True,
    seed=123,
)

# Show the per-round train/test table
print(cv_results)

# Report the mean test AUC reached after the final boosting round
final_test_auc = cv_results["test-auc-mean"].iloc[-1]
print(final_test_auc)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.768893       0.001544       0.767863      0.002820
1        0.790864       0.006758       0.789157      0.006846
2        0.815872       0.003900       0.814476      0.005997
3        0.822959       0.002018       0.821682      0.003912
4        0.827528       0.000769       0.826191      0.001937
0.826191
