# XGBoost: Fit/Predict

In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import numpy as np

# Load the churn dataset from 'churn_data.csv' (a relative path, so it is
# resolved against the current working directory) into a DataFrame.
churn_data = pd.read_csv('churn_data.csv')

In [2]:
# Import xgboost
import xgboost as xgb

# Create arrays for the features and the target: X, y
# Every column except the last is a feature; the last column is the target.
X, y = churn_data.iloc[:, :-1], churn_data.iloc[:, -1]

# Create the training and test sets (20% held out; fixed random_state so the
# split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Instantiate the XGBClassifier: xg_cl
# The seed is supplied as `random_state`, the parameter documented for the
# scikit-learn wrapper, rather than `seed`, so it matches the convention
# already used by train_test_split above and is visible to sklearn tooling.
xg_cl = xgb.XGBClassifier(objective='binary:logistic',
                          n_estimators=10,
                          random_state=123)

# Fit the classifier to the training set
xg_cl.fit(X_train, y_train)

# Predict the labels of the test set: preds
preds = xg_cl.predict(X_test)

# Compute the accuracy: fraction of test rows whose predicted label equals
# the true label
accuracy = float(np.sum(preds == y_test)) / y_test.shape[0]
print("accuracy: %f" % (accuracy))

accuracy: 0.758200


---

# Decision Trees

In [3]:
# Import the necessary modules
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset and pull out the feature matrix and target
b_cancer = load_breast_cancer()

X_bc = b_cancer.data
y_bc = b_cancer.target

# Create the training and test sets (20% held out; fixed random_state so the
# split is reproducible)
Xbc_train, Xbc_test, ybc_train, ybc_test = train_test_split(X_bc, y_bc, test_size=0.2, random_state=123)

# Instantiate the classifier: dt_clf_4
# random_state is fixed so that the fitted tree (and therefore the accuracy
# printed below) is the same on every run; without it, ties between equally
# good splits are broken randomly.
dt_clf_4 = DecisionTreeClassifier(max_depth=4, random_state=123)

# Fit the classifier to the training set
dt_clf_4.fit(Xbc_train, ybc_train)

# Predict the labels of the test set: ybc_pred_4
ybc_pred_4 = dt_clf_4.predict(Xbc_test)

# Compute the accuracy of the predictions: fraction of test samples whose
# predicted label equals the true label
accuracy = float(np.sum(ybc_pred_4 == ybc_test)) / ybc_test.shape[0]
print("accuracy:", accuracy)

accuracy: 0.9736842105263158


---

# Measuring accuracy

In [4]:
# Create arrays for the features and the target: X, y
# Every column except the last is a feature; the last column is the target.
X, y = churn_data.iloc[:, :-1], churn_data.iloc[:, -1]

# Create the DMatrix from X and y: churn_dmatrix
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
# The cross-validation cells score this model with classification metrics
# ("error", "auc"), so the binary classification objective is used here,
# matching the XGBClassifier configured earlier in the notebook.
params = {"objective": "binary:logistic", "max_depth": 3}

In [11]:
# NOTE(review): in the visible notebook, `cv_results` is first assigned by the
# xgb.cv cell that follows (In [10]); this cell was executed afterwards
# (In [11]) but sits before it, so a top-to-bottom run on a fresh kernel would
# hit an undefined name here unless it is defined elsewhere - confirm ordering.
cv_results

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.28232,0.002366,0.28378,0.001932
1,0.26951,0.001855,0.2719,0.001932
2,0.25605,0.003213,0.25798,0.003963
3,0.2509,0.001844,0.25434,0.003827
4,0.24654,0.001981,0.24852,0.000934


In [10]:
# Perform cross-validation: cv_results
# 3 folds, 5 boosting rounds, tracking the "error" metric; the result is
# requested as a pandas DataFrame and the fold assignment is seeded.
cv_results = xgb.cv(
    params=params,
    dtrain=churn_dmatrix,
    num_boost_round=5,
    nfold=3,
    metrics="error",
    as_pandas=True,
    seed=123,
)

# Print cv_results (one row per boosting round)
print(cv_results)

# Print the accuracy: one minus the mean test error of the final round
print(1 - cv_results["test-error-mean"].iloc[-1])

   train-error-mean  train-error-std  test-error-mean  test-error-std
0           0.28232         0.002366          0.28378        0.001932
1           0.26951         0.001855          0.27190        0.001932
2           0.25605         0.003213          0.25798        0.003963
3           0.25090         0.001844          0.25434        0.003827
4           0.24654         0.001981          0.24852        0.000934
0.751480015401492


In [7]:
# Show the mean test error for every boosting round
cv_results.loc[:, "test-error-mean"]

0    0.28378
1    0.27190
2    0.25798
3    0.25434
4    0.24852
Name: test-error-mean, dtype: float64

---

# Measuring AUC

In [8]:
# Perform cross-validation: cv_results
# Same setup as the error run (3 folds, 5 boosting rounds, seeded), but
# tracking the "auc" metric instead.
cv_results = xgb.cv(
    params=params,
    dtrain=churn_dmatrix,
    num_boost_round=5,
    nfold=3,
    metrics="auc",
    as_pandas=True,
    seed=123,
)

# Print cv_results (one row per boosting round)
print(cv_results)

# Print the AUC: mean test AUC of the final boosting round
print(cv_results["test-auc-mean"].iat[-1])

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.768893       0.001544       0.767863      0.002819
1        0.790864       0.006758       0.789156      0.006846
2        0.815872       0.003900       0.814476      0.005997
3        0.822959       0.002018       0.821682      0.003912
4        0.827528       0.000769       0.826191      0.001937
0.8261911413597645


---