# XGBoost (Extreme Gradient Boosting)

In [0]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [0]:
df = sns.load_dataset('titanic')

In [0]:
df.dropna(inplace=True)

## Data Pre-processing

In [0]:
X = df[['pclass', 'sex', 'age']].copy()

In [0]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()

In [0]:
X['sex'] = lb.fit_transform(X['sex'])

In [0]:
y = df['survived']

***

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [0]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [0]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    '''
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    if train:
        '''
        training performance
        '''
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {}\n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, clf.predict(X_train))))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        test performance
        '''
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {}\n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, clf.predict(X_test))))    
        

# XGBoost

In [0]:
import xgboost as xgb

In [0]:
xgb_clf = xgb.XGBClassifier(max_depth=5, n_estimators=10000, learning_rate=0.3,
                            n_jobs=-1)

In [0]:
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=10000, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.9449

Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.93      0.92        42
           1       0.96      0.95      0.96        85

    accuracy                           0.94       127
   macro avg       0.94      0.94      0.94       127
weighted avg       0.95      0.94      0.95       127


Confusion Matrix: 
 [[39  3]
 [ 4 81]]

Average Accuracy: 	 0.7723
Accuracy SD: 		 0.1033


In [0]:
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.8000

Classification Report: 
               precision    recall  f1-score   support

           0       0.69      0.65      0.67        17
           1       0.85      0.87      0.86        38

    accuracy                           0.80        55
   macro avg       0.77      0.76      0.76        55
weighted avg       0.80      0.80      0.80        55


Confusion Matrix: 
 [[11  6]
 [ 5 33]]

