In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn import preprocessing

## Data Pre-processing 

In [2]:
df = sns.load_dataset('titanic')
subset = df[['pclass', 'sex', 'age', 'survived']].copy()
subset.dropna(inplace=True)
X = subset[['pclass', 'sex', 'age']].copy()
le = preprocessing.LabelEncoder()
X['sex'] = le.fit_transform(subset['sex'])
y = subset['survived'].copy()

## Scoring function

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [5]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score
def print_score(clf, X_train, X_test, y_train, y_test, train=True):
    '''
    v0.1 Follow the scikit learn library format in terms of input
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        '''
        training performance
        '''
        res = clf.predict(X_train)
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, 
                                                                res)))
        print("Classification Report: \n {}\n".format(classification_report(y_train, 
                                                                            res)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, 
                                                                  res)))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_train), 
                                                      lb.transform(res))))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        test performance
        '''
        res_test = clf.predict(X_test)
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, 
                                                                res_test)))
        print("Classification Report: \n {}\n".format(classification_report(y_test, 
                                                                            res_test)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, 
                                                                  res_test)))   
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_test), 
                                                      lb.transform(res_test))))
        

## Random Forest

In [7]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf.fit(X_train, y_train)
print_score(rf_clf, X_train, X_test, y_train, y_test, train=True)
print("\n******************************\n")
print_score(rf_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.8938

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.93      0.91       296
           1       0.89      0.84      0.87       203

    accuracy                           0.89       499
   macro avg       0.89      0.89      0.89       499
weighted avg       0.89      0.89      0.89       499


Confusion Matrix: 
 [[275  21]
 [ 32 171]]

ROC AUC: 0.8857

Average Accuracy: 	 0.7556
Accuracy SD: 		 0.0518

******************************

Test Result:

accuracy score: 0.7860

Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.78      0.81       128
           1       0.71      0.79      0.75        87

    accuracy                           0.79       215
   macro avg       0.78      0.79      0.78       215
weighted avg       0.79      0.79      0.79       215


Confusion Matrix: 
 [[100  28]
 [ 18  69]]

ROC AUC: 0.7872



## Grid Search

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [14]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
params_grid = {"max_depth": [3, None],
               "min_samples_split": [2, 3, 10],
               "min_samples_leaf": [1, 3, 10],
               "bootstrap": [True, False],
               "criterion": ['gini', 'entropy']}
grid_search = GridSearchCV(rf_clf, params_grid,
                           n_jobs=-1, cv=5,
                           verbose=1, scoring='accuracy')

In [15]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [16]:
grid_search.best_score_

0.8076969696969696

In [17]:
grid_search.best_estimator_.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [18]:
print_score(grid_search, X_train, X_test, y_train, y_test, train=True)
print("\n******************************\n")
print_score(grid_search, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.8176

Classification Report: 
               precision    recall  f1-score   support

           0       0.78      0.96      0.86       296
           1       0.92      0.61      0.73       203

    accuracy                           0.82       499
   macro avg       0.85      0.78      0.80       499
weighted avg       0.84      0.82      0.81       499


Confusion Matrix: 
 [[285  11]
 [ 80 123]]

ROC AUC: 0.7844

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totallin

# Extra-Trees (Extremely Randomized Trees) Ensemble

In [23]:
from sklearn.ensemble import ExtraTreesClassifier

In [24]:
xt_clf = ExtraTreesClassifier(random_state=42, n_estimators=100)

In [25]:
xt_clf.fit(X_train, y_train)

In [26]:
print_score(xt_clf, X_train, X_test, y_train, y_test, train=True)
print("\n******************************\n")
print_score(xt_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.8938

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.98      0.92       296
           1       0.96      0.77      0.86       203

    accuracy                           0.89       499
   macro avg       0.91      0.87      0.89       499
weighted avg       0.90      0.89      0.89       499


Confusion Matrix: 
 [[289   7]
 [ 46 157]]

ROC AUC: 0.8749

Average Accuracy: 	 0.7777
Accuracy SD: 		 0.0593

******************************

Test Result:

accuracy score: 0.7721

Classification Report: 
               precision    recall  f1-score   support

           0       0.81      0.81      0.81       128
           1       0.72      0.71      0.72        87

    accuracy                           0.77       215
   macro avg       0.76      0.76      0.76       215
weighted avg       0.77      0.77      0.77       215


Confusion Matrix: 
 [[104  24]
 [ 25  62]]

ROC AUC: 0.7626

