In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)
%matplotlib inline
from sklearn import tree
from sklearn.tree import export_graphviz
import graphviz

In [17]:
df = pd.read_csv('Employee-Attrition.csv')

In [31]:
df.head()
pd.set_option('display.max_columns',500)
pd.set_option('display.width',1000)

In [32]:
num_cols = list(df.describe().columns)
col_categorical = list(set(df.columns).difference(num_cols))
remove_list = ['EmployeeCount','EmployeeNumber','StandardHours']
col_numerical = [e for e in num_cols if e not in remove_list]

In [33]:
col_categorical.sort()
df['Attrition'].unique()
attrition_to_num = {
    'Yes':1,
    'No':0
}
df['Attrition_num']=df['Attrition'].map(attrition_to_num)
df_cat = pd.get_dummies(df[col_categorical])
df_cat.head()
X = pd.concat([df[col_numerical],df_cat],axis=1)
y = df['Attrition_num']

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4)
clf = DecisionTreeClassifier(random_state = 42)
clf.fit(X_train,y_train)

In [35]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score
def print_score(clf, X_train, X_test, y_train, y_test, train=True):
    '''
    v0.1 Follow the scikit learn library format in terms of input
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        '''
        training performance
        '''
        res = clf.predict(X_train)
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, 
                                                                res)))
        print("Classification Report: \n {}\n".format(classification_report(y_train, 
                                                                            res)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, 
                                                                  res)))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_train), 
                                                      lb.transform(res))))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        test performance
        '''
        res_test = clf.predict(X_test)
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, 
                                                                res_test)))
        print("Classification Report: \n {}\n".format(classification_report(y_test, 
                                                                            res_test)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, 
                                                                  res_test)))   
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_test), 
                                                      lb.transform(res_test))))
        

In [36]:
print_score(clf, X_train, X_test, y_train, y_test, train=True)

Train Result:

accuracy score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       734
           1       1.00      1.00      1.00       148

    accuracy                           1.00       882
   macro avg       1.00      1.00      1.00       882
weighted avg       1.00      1.00      1.00       882


Confusion Matrix: 
 [[734   0]
 [  0 148]]

ROC AUC: 1.0000

Average Accuracy: 	 1.0000
Accuracy SD: 		 0.0000


In [37]:
print_score(clf, X_train, X_test, y_train, y_test, train=False)

Test Result:

accuracy score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       499
           1       1.00      1.00      1.00        89

    accuracy                           1.00       588
   macro avg       1.00      1.00      1.00       588
weighted avg       1.00      1.00      1.00       588


Confusion Matrix: 
 [[499   0]
 [  0  89]]

ROC AUC: 1.0000



In [38]:
bag_clf = BaggingClassifier(estimator=clf, n_estimators=1000,
                            bootstrap=True, oob_score=True,
                            n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_
print_score(clf, X_train, X_test, y_train, y_test, train=True)
print("\n********************************\n")
print_score(clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       734
           1       1.00      1.00      1.00       148

    accuracy                           1.00       882
   macro avg       1.00      1.00      1.00       882
weighted avg       1.00      1.00      1.00       882


Confusion Matrix: 
 [[734   0]
 [  0 148]]

ROC AUC: 1.0000

Average Accuracy: 	 1.0000
Accuracy SD: 		 0.0000

********************************

Test Result:

accuracy score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       499
           1       1.00      1.00      1.00        89

    accuracy                           1.00       588
   macro avg       1.00      1.00      1.00       588
weighted avg       1.00      1.00      1.00       588


Confusion Matrix: 
 [[499   0]
 [  0  89]]

ROC AUC: 1.0000

