# Bagging Machine Learning Algorithm

In [20]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn import preprocessing

## Data Pre-processing 

In [21]:
df = sns.load_dataset('titanic')
subset = df[['pclass', 'sex', 'age', 'survived']].copy()
subset.dropna(inplace=True)
X = subset[['pclass', 'sex', 'age']].copy()
le = preprocessing.LabelEncoder()
X['sex'] = le.fit_transform(subset['sex'])
y = subset['survived'].copy()

## Scoring function

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [24]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score
def print_score(clf, X_train, X_test, y_train, y_test, train=True):
    '''
    v0.1 Follow the scikit learn library format in terms of input
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        '''
        training performance
        '''
        res = clf.predict(X_train)
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, 
                                                                res)))
        print("Classification Report: \n {}\n".format(classification_report(y_train, 
                                                                            res)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, 
                                                                  res)))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_train), 
                                                      lb.transform(res))))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        test performance
        '''
        res_test = clf.predict(X_test)
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, 
                                                                res_test)))
        print("Classification Report: \n {}\n".format(classification_report(y_test, 
                                                                            res_test)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, 
                                                                  res_test)))   
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_test), 
                                                      lb.transform(res_test))))
        

## Decsion Tree 

In [25]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
print_score(clf, X_train, X_test, y_train, y_test, train=True)
print("\n********************************\n")
print_score(clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.9078

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.97      0.93       297
           1       0.95      0.81      0.88       202

    accuracy                           0.91       499
   macro avg       0.92      0.89      0.90       499
weighted avg       0.91      0.91      0.91       499


Confusion Matrix: 
 [[289   8]
 [ 38 164]]

ROC AUC: 0.8925

Average Accuracy: 	 0.7795
Accuracy SD: 		 0.0287

********************************

Test Result:

accuracy score: 0.8093

Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.92      0.85       127
           1       0.85      0.65      0.74        88

    accuracy                           0.81       215
   macro avg       0.82      0.78      0.79       215
weighted avg       0.82      0.81      0.80       215


Confusion Matrix: 
 [[117  10]
 [ 31  57]]

ROC AUC: 0.7845



## Bagging

In [26]:
bag_clf = BaggingClassifier(estimator=clf, n_estimators=1000,
                            bootstrap=True, oob_score=True,
                            n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_
print_score(clf, X_train, X_test, y_train, y_test, train=True)
print("\n********************************\n")
print_score(clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.9078

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.97      0.93       297
           1       0.95      0.81      0.88       202

    accuracy                           0.91       499
   macro avg       0.92      0.89      0.90       499
weighted avg       0.91      0.91      0.91       499


Confusion Matrix: 
 [[289   8]
 [ 38 164]]

ROC AUC: 0.8925

Average Accuracy: 	 0.7795
Accuracy SD: 		 0.0287

********************************

Test Result:

accuracy score: 0.8093

Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.92      0.85       127
           1       0.85      0.65      0.74        88

    accuracy                           0.81       215
   macro avg       0.82      0.78      0.79       215
weighted avg       0.82      0.81      0.80       215


Confusion Matrix: 
 [[117  10]
 [ 31  57]]

ROC AUC: 0.7845

