In [None]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix,auc,roc_auc_score
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score

%matplotlib inline

In [None]:
# Load the attrition dataset from the working directory and preview the first rows.
df = pd.read_csv('attrition.csv')
df.head()

In [None]:
# Count missing values in each column.
df.isnull().sum()

In [None]:
# List the column labels of the raw frame.
df.columns

In [None]:
# Print the dtype and non-null count of every column.
df.info()

In [None]:
# Show the distinct levels of every text-valued column before it is encoded.
text_columns = ['Gender', 'BusinessTravel', 'Department', 'EducationField',
                'JobRole', 'MaritalStatus', 'OverTime']
for column in text_columns:
    print(column + ': ', df[column].unique())

In [None]:
# Per-class mean of every numeric column. At this point the categorical columns
# (Gender, BusinessTravel, ...) still hold text; numeric_only=True skips them
# explicitly instead of relying on pandas dropping them silently (pandas >= 2.0 raises).
df.groupby('Attrition').mean(numeric_only=True)

# Replacement

In [None]:
# Integer code assigned to each level of the text-valued columns.
# NOTE: OverTime maps 'Yes' -> 0 and 'No' -> 1 (kept as in the original analysis).
category_codes = {
    'Gender': {'Female': 0, 'Male': 1},
    'BusinessTravel': {'Travel_Rarely': 0, 'Travel_Frequently': 1, 'Non-Travel': 2},
    'Department': {'Sales': 0, 'RandD': 1, 'HR': 2},
    'EducationField': {'Life_SC': 0, 'Other': 1, 'Medical': 2,
                       'Marketing': 3, 'TECH': 4, 'EDU_HR': 5},
    'JobRole': {'Sales_exec': 0, 'Scientist': 1, 'Lab_tech': 2, 'mfg_director': 3,
                'health_rep': 4, 'Manager': 5, 'Sales_rep': 6, 'Research_dir': 7,
                'Job_HR': 8},
    'MaritalStatus': {'Single': 0, 'Married': 1, 'Divorced': 2},
    'OverTime': {'Yes': 0, 'No': 1},
}

# Assign the result back to the column: calling replace(..., inplace=True) on
# df[col] mutates a temporary selection and is not guaranteed to update df
# (it does not under pandas Copy-on-Write).
for column, codes in category_codes.items():
    df[column] = df[column].replace(codes)
df.head()

# Redundant Column Removal

A plain correlation plot is too dense to read clearly here, so we take a different approach to inspecting the correlations: a styled table that magnifies cells on hover.

In [None]:
## code for getting larger figure of correlation
# Diverging seaborn palette, used below as the background gradient of the correlation table.
cmap=sns.diverging_palette(5, 250, as_cmap=True)

def magnify():
    """Return the CSS table styles used for the correlation table.

    Each entry is a dict with a CSS ``selector`` and a list of
    ``(property, value)`` pairs in ``props``: small header text, zero cell
    padding, and enlarged text for headers and cells under the mouse pointer.
    """
    hovered_cell_props = [('max-width', '200px'),
                          ('font-size', '12pt')]
    css_rules = (
        ("th", [("font-size", "7pt")]),
        ("td", [('padding', "0em 0em")]),
        ("th:hover", [("font-size", "12pt")]),
        ("tr:hover td:hover", hovered_cell_props),
    )
    return [{"selector": selector, "props": props} for selector, props in css_rules]
# Pairwise correlation of the numeric columns. numeric_only=True makes the
# exclusion of any non-numeric column explicit (pandas >= 2.0 raises otherwise).
corr=df.corr(numeric_only=True)
# Styler.set_precision was removed in pandas 2.0; format(precision=2) is the
# supported way to show two decimals.
corr.style.background_gradient(cmap, axis=1)\
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})\
    .set_caption("Hover to magify")\
    .format(precision=2)\
    .set_table_styles(magnify())

Comparing each column against the target, none of them is highly correlated with Attrition, so we keep all of them as candidate predictors for now.
Among the predictors themselves, however, we do find some notable correlations:

    Age                vs    TotalWorkingYears          As   68%
    JobLevel           vs    MonthlyIncome              As   95%
    JobLevel           vs    TotalWorkingYears          As   78%
    MaritalStatus      vs    StockObtainLevel           As   66%
    MonthlyIncome      vs    TotalWorkingYears          As   77%
    PercentSalaryHike  vs    PerformanceRating          As   77%
    YearsAtCompany     vs    YearsInCurrentRole         As   76%
    YearsAtCompany     vs    YearsSinceLastPromotion    As   62%
    YearsAtCompany     vs    YearsWithCurrManager       As   77%
   

We treat a correlation of roughly 80 percent or more as strong enough to make one of the two columns redundant.
By that criterion we are left with only:

    JobLevel           vs    MonthlyIncome              As   95%


So we drop JobLevel and keep MonthlyIncome, PercentSalaryHike and YearsAtCompany.

# Dropping columns

In [None]:
# Remove the JobLevel column from the working frame and preview the result.
df = df.drop(columns=['JobLevel'])
df.head()

# Conversion to categories

In [None]:
# Store the target column as a pandas categorical, then re-check the dtypes.
df = df.astype({'Attrition': 'category'})
df.info()

In [None]:
# Integer-coded nominal columns that should be treated as categories.
# OverTime is not converted here and keeps its integer codes.
nominal_columns = ['Gender', 'EducationField', 'JobRole', 'Department',
                   'MaritalStatus', 'BusinessTravel']
for column in nominal_columns:
    df[column] = df[column].astype('category')
df.info()

In [None]:
# List the columns that remain after dropping JobLevel.
df.columns

# EDA

In [None]:
# Distribution of Age within each Attrition class.
ax = sns.boxplot(data=df, x='Attrition', y='Age')
ax.set_title('Age Vs Attrition')
plt.show()

In [None]:
# Distribution of DailyRate within each Attrition class.
ax = sns.boxplot(data=df, x='Attrition', y='DailyRate')
ax.set_title('Attrition Vs DailyRate')
plt.show()

In [None]:
# Share of each Attrition class within every BusinessTravel level (columns sum to 1).
pd.crosstab(df.Attrition, df.BusinessTravel, normalize='columns').round(2)

                                             0 - Travel-Rarely
                                             1 - Travel-Frequently
                                             2 - Non-Travel

In [None]:
# Distribution of DistanceFromHome within each Attrition class.
ax = sns.boxplot(data=df, x='Attrition', y='DistanceFromHome')
ax.set_title('Attrition Vs DistanceFromHome')
plt.show()

In [None]:
# Share of each Attrition class within every JobRole code (columns sum to 1).
ACC_perct = pd.crosstab(df.Attrition, df.JobRole, normalize='columns').round(2)
print(ACC_perct)

    0 - 'Sales_exec'
    1 - 'Scientist'
    2 - 'Lab_tech'
    3 - 'mfg_director'
    4 - 'health_rep'
    5 - 'Manager'
    6 - 'Sales_rep'
    7 - 'Research_dir'
    8 - 'Job_HR'

In [None]:
# Share of each Attrition class within every MaritalStatus code (columns sum to 1).
ACC_perct = pd.crosstab(df.Attrition, df.MaritalStatus, normalize='columns').round(2)
print(ACC_perct)

    0- 'Single'
    1- 'Married'
    2- 'Divorced'

In [None]:
# Share of each Attrition class within every OverTime code (columns sum to 1).
ACC_perct = pd.crosstab(df.Attrition, df.OverTime, normalize='columns').round(2)
print(ACC_perct)

                                0 - 'Yes'
                                1 - 'No'

In [None]:
# Distribution of TotalWorkingYears within each Attrition class.
ax = sns.boxplot(data=df, x='Attrition', y='TotalWorkingYears')
ax.set_title('Attrition Vs TotalWorkingYears')
plt.show()

In [None]:
# Distribution of YearsInCurrentRole within each Attrition class.
ax = sns.boxplot(data=df, x='Attrition', y='YearsInCurrentRole')
ax.set_title('Attrition Vs YearsInCurrentRole')
plt.show()

In [None]:
# Distribution of YearsWithCurrManager within each Attrition class.
ax = sns.boxplot(data=df, x='Attrition', y='YearsWithCurrManager')
ax.set_title('Attrition Vs YearsWithCurrManager')
plt.show()

# Decision Tree

# Splitting of dataset

In [None]:
# Separate the predictors (every column except Attrition) from the target,
# keeping the target as a one-column DataFrame, and report both shapes.
x = df.drop(columns=["Attrition"])
y = df[['Attrition']]
for frame in (x, y):
    print(frame.shape)

In [None]:
## One-hot encode the categorical predictors; drop_first=True omits the first
## level of each encoded column.
x=pd.get_dummies(x , drop_first = True)

In [None]:
# Inspect the column labels produced by the one-hot encoding.
x.columns

In [None]:
## Train/test split: 70% train, 30% test
from sklearn.model_selection import train_test_split
# Seed NumPy's global random generator. The split below does not use it:
# it is fixed separately by random_state=123.
seed = 7
np.random.seed(seed)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state = 123)

In [None]:
# Shapes of the training and test feature matrices.
X_train.shape,X_test.shape

In [None]:
# Display the training target (a one-column DataFrame).
y_train

# Model1:- Criterion1='gini'

In [None]:
# Unpruned decision tree using the gini criterion, fitted on the training
# split and used to predict the test split.
model_gini = DecisionTreeClassifier(criterion='gini')
model_gini.fit(X_train, y_train)
preds_gini = model_gini.predict(X_test)

In [None]:
# Training-set predictions must come from the same fitted tree as the test
# predictions, otherwise the train/test accuracy comparison mixes two models.
# The original fitted a second, unseeded tree here and then ignored it;
# model_gini_train is kept as an alias of the already-fitted model instead.
model_gini_train = model_gini
preds_gini_train = model_gini_train.predict(X_train)

# Evaluation Metrics

In [None]:
# Confusion matrix of the gini tree on the test split
# (confusion_matrix is called as (true labels, predictions)).
from sklearn.metrics import classification_report,confusion_matrix
mat_gini = confusion_matrix(y_test,preds_gini)

print("confusion matrix = \n",mat_gini)

In [None]:
# Accuracy of the gini tree: first on the test split, then on the training split.
print(accuracy_score(y_test,preds_gini))
print(accuracy_score(y_train,preds_gini_train))

In [None]:
# Classification report of the gini tree on the test split.
print(metrics.classification_report(y_test,preds_gini))

# Model1:- Criterion2='entropy'

In [None]:
# Unpruned decision tree using the entropy criterion, fitted on the training split.
model_entropy=DecisionTreeClassifier(criterion='entropy')
model_entropy.fit(X_train, y_train)

In [None]:
# Predictions of the entropy tree on the test and training splits.
preds_entropy = model_entropy.predict(X_test)
preds_entropy_train = model_entropy.predict(X_train)
#Confusion matrix
from sklearn.metrics import classification_report,confusion_matrix
# Stored under its own name: the original assigned this to mat_gini, which
# silently overwrote the gini model's confusion matrix.
mat_entropy = confusion_matrix(y_test,preds_entropy)

print("confusion matrix = \n",mat_entropy)

# Evaluation Metrics

In [None]:
# Accuracy of the entropy tree: first on the test split, then on the training split.
print(accuracy_score(y_test,preds_entropy))
print(accuracy_score(y_train,preds_entropy_train))

#### The model is overfitting: its accuracy on the training set is higher than its accuracy on the test set

In [None]:
# Classification report of the entropy tree on the test split.
print(metrics.classification_report(y_test,preds_entropy))

In [None]:
# Confusion matrix of the entropy tree on the test split.
from sklearn.metrics import classification_report,confusion_matrix
mat_entropy = confusion_matrix(y_test,preds_entropy)

print("confusion matrix = \n",mat_entropy)

## Visualizing the tree

In [None]:
from sklearn.tree import export_graphviz
#from sklearn.externals.six import StringIO  
from six import StringIO
from IPython.display import Image  
import pydotplus
import graphviz
from graphviz import Digraph
from sklearn.tree import export_graphviz

# Column labels of the encoded feature matrix; passed as feature_names when exporting trees.
feature_cols = x.columns

### Tree from gini

In [None]:
# Export the unpruned gini tree to DOT text in memory, render it with
# pydotplus, save the PNG to disk and display it inline.
dot_data = StringIO()
export_graphviz(
    model_gini,
    out_file=dot_data,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('delay_gini.png')
Image(graph.create_png())

#### The tree is overgrown, which causes overfitting - we will now prune the tree and re-evaluate the model

## Pruning

In [None]:
# Pruned gini tree: depth capped at 5 and at least 5 samples per leaf,
# with a fixed random_state.
clf_pruned = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    min_samples_leaf=5,
    random_state=100,
)
clf_pruned.fit(X_train, y_train)

In [None]:
# Export the pruned tree to DOT text in memory, render it with pydotplus,
# save the PNG to disk and display it inline.
dot_data = StringIO()
export_graphviz(
    clf_pruned,
    out_file=dot_data,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('delay_pruned.png')
Image(graph.create_png())

In [None]:
# Predictions of the pruned tree on the test and training splits.
preds_pruned = clf_pruned.predict(X_test)
preds_pruned_train = clf_pruned.predict(X_train)

In [None]:
# Confusion matrix of the pruned tree on the test split.
from sklearn.metrics import classification_report,confusion_matrix
mat_pruned = confusion_matrix(y_test,preds_pruned)

print("confusion matrix = \n",mat_pruned)

In [None]:
# Accuracy of the pruned tree: first on the test split, then on the training split.
print(accuracy_score(y_test,preds_pruned))
print(accuracy_score(y_train,preds_pruned_train))

In [None]:
# Classification report of the pruned tree on the test split.
print(metrics.classification_report(y_test,preds_pruned))

## Calculating feature importance

In [None]:
# Un-normalised feature importances taken from the pruned tree's internal tree_ object.
feat_importance = clf_pruned.tree_.compute_feature_importances(normalize=False)
print(f"feat importance = {feat_importance}")

In [None]:
# Pair every feature name with its importance in the pruned tree and show
# the twelve largest.
feat_imp_dict = {
    name: score
    for name, score in zip(feature_cols, clf_pruned.feature_importances_)
}
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False).head(12)

In [None]:
# show the first 10 rows of predicted class-membership probabilities
# (one column per class) from the pruned tree
clf_pruned.predict_proba(X_test)[0:10]


In [None]:
# store the predicted probabilities for class 1 (second column of predict_proba)
y_pred_prob = clf_pruned.predict_proba(X_test)[:, 1]
# NOTE(review): the slice starts at index 1, so the first prediction is not shown — confirm intended
y_pred_prob[1:20]

In [None]:
# histogram of predicted probabilities

# 20 bins
plt.hist(y_pred_prob, bins=20)

# x-axis limit from 0 to 1
plt.xlim(0,1)
plt.title('Histogram of predicted probabilities')
# The label previously read "diabetes" (left over from another analysis);
# the modelled outcome here is attrition.
plt.xlabel('Predicted probability of attrition')
plt.ylabel('Frequency')

In [None]:
## Changing the cut off value for prediction
# For each candidate threshold, label a row positive when its class-1
# probability exceeds the threshold, then print accuracy and confusion matrix.
pred_proba_df = pd.DataFrame(clf_pruned.predict_proba(X_test))
threshold_list = [0.01,0.02,0.03,0.04,0.05,0.1,0.15,0.17,0.19,0.2,0.25,0.3,0.35,0.38,0.4,0.41,0.42,0.43,0.45,.5,0.6,.7,.8,.9,.99]
# Flatten the one-column target once instead of reshaping it on every iteration.
y_test_true = y_test.values.ravel()
for i in threshold_list:
    print ('\n******** For i = {} ******'.format(i))
    # Vectorised comparison replaces the element-wise applymap (deprecated in
    # recent pandas); the result is the same 0/1 frame.
    y_test_pred = (pred_proba_df > i).astype(int)
    y_test_pred_pos = y_test_pred.iloc[:,1].values
    test_accuracy = metrics.accuracy_score(y_test_true, y_test_pred_pos)
    print('Our testing accuracy is {:.2f}'.format(test_accuracy))

    print(confusion_matrix(y_test_true, y_test_pred_pos))

In [None]:
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt

# adjust the font size 
plt.rcParams['font.size'] = 12

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

In [None]:
# Base estimator, hyper-parameter grid and scoring functions shared by the grid search.
clf = DecisionTreeClassifier()

# Candidate tree depths 3 through 8.
param_grid = {'max_depth': list(range(3, 9))}

# One scorer per metric, keyed by the metric function's own name.
scorers = {
    metric.__name__: make_scorer(metric)
    for metric in (precision_score, recall_score, accuracy_score)
}

In [None]:
def grid_search_wrapper(refit_score='precision_score'):
    """Run a 10-fold stratified grid search and report it on the test split.

    Uses the notebook-level ``clf``, ``param_grid`` and ``scorers`` and fits
    on ``X_train`` / ``y_train``. The best estimator is chosen (refit) by
    ``refit_score``, which must be one of the keys of ``scorers``.

    Prints the best parameters and the confusion matrix of the refitted
    estimator on ``X_test`` / ``y_test``, and returns the fitted
    ``GridSearchCV`` object.
    """
    folds = StratifiedKFold(n_splits=10)
    search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                          cv=folds, return_train_score=True, n_jobs=-1)
    search.fit(X_train.values, y_train.values)

    # Predict the held-out test split with the refitted best estimator.
    test_predictions = search.predict(X_test.values)

    print('Best params for {}'.format(refit_score))
    print(search.best_params_)

    # Confusion matrix on the test data, labelled for readability.
    print('\nConfusion matrix of optimized for {} on the test data:'.format(refit_score))
    confusion_table = pd.DataFrame(confusion_matrix(y_test, test_predictions),
                                   columns=['pred_neg', 'pred_pos'],
                                   index=['neg', 'pos'])
    print(confusion_table)
    return search

In [None]:
# Run the grid search, refitting the best estimator according to recall.
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [None]:
# Cross-validation results, best mean precision first, reduced to the
# columns of interest and rounded to three decimals.
results = (
    pd.DataFrame(grid_search_clf.cv_results_)
    .sort_values(by='mean_test_precision_score', ascending=False)
)
summary_columns = ['mean_test_precision_score', 'mean_test_recall_score',
                   'mean_test_accuracy_score', 'param_max_depth']
results[summary_columns].round(3)

In [None]:
# Class-1 probabilities from the refitted grid-search estimator.
# The search was fitted on plain arrays (X_train.values), so score plain
# arrays too; passing the DataFrame would mismatch on feature names.
y_scores = grid_search_clf.predict_proba(X_test.values)[:, 1]
# for classifiers with decision_function, this achieves similar results
# y_scores = classifier.decision_function(X_test)

In [None]:
# Precision, recall and the matching decision thresholds for the test-split scores.
p, r, thresholds = precision_recall_curve(y_test, y_scores)

In [None]:
def adjusted_classes(y_scores, t):
    """Convert scores into 0/1 class labels using threshold ``t``.

    A score greater than or equal to ``t`` becomes 1, anything else becomes 0.
    Intended for binary classification only. Returns a list of ints.
    """
    labels = []
    for score in y_scores:
        labels.append(1 if score >= t else 0)
    return labels

def precision_recall_threshold(p, r, thresholds, t=0.5):
    """Print the test-split confusion matrix obtained at decision threshold ``t``.

    The notebook-level ``y_scores`` are converted to class labels with
    ``adjusted_classes`` and compared against the notebook-level ``y_test``.
    The ``p``, ``r`` and ``thresholds`` arguments are accepted but not used
    by this implementation, and nothing is plotted or returned.
    """
    # Re-label the scores at the requested threshold.
    adjusted_predictions = adjusted_classes(y_scores, t)
    confusion_table = pd.DataFrame(
        confusion_matrix(y_test, adjusted_predictions),
        columns=['pred_neg', 'pred_pos'],
        index=['neg', 'pos'],
    )
    print(confusion_table)
    
    
   

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    The last element of ``precisions`` and ``recalls`` is left out so that
    both series are sliced as ``[:-1]`` before being plotted against
    ``thresholds``.

    Modified from: Hands-On Machine learning with Scikit-Learn
    and TensorFlow; p.89
    """
    _, ax = plt.subplots(figsize=(8, 8))
    ax.plot(thresholds, precisions[:-1], "b--", label="Precision")
    ax.plot(thresholds, recalls[:-1], "g-", label="Recall")
    ax.set_title("Precision and Recall Scores as a function of the decision threshold")
    ax.set_xlabel("Decision Threshold")
    ax.set_ylabel("Score")
    ax.legend(loc='best')

In [None]:
# Plot precision and recall against the threshold, reusing the p, r and
# thresholds computed from the grid-search scores.
plot_precision_recall_vs_threshold(p, r, thresholds)

## Model building with threshold value 0.19

In [None]:
# Final pruned gini tree: depth capped at 8 and at least 5 samples per leaf,
# with a fixed random_state.
clf_pruned_final = DecisionTreeClassifier(
    criterion="gini",
    max_depth=8,
    min_samples_leaf=5,
    random_state=100,
)
clf_pruned_final.fit(X_train, y_train)

In [None]:
# Export the final pruned tree to DOT text in memory, render it with
# pydotplus, save the PNG to disk and display it inline.
dot_data = StringIO()
export_graphviz(clf_pruned_final, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Written to its own file: the original reused 'delay_pruned.png', which
# overwrote the image of the depth-5 pruned tree saved earlier.
graph.write_png('delay_pruned_final.png')
Image(graph.create_png())

In [None]:
# Predictions of the final pruned tree on the test and training splits.
preds_pruned_final = clf_pruned_final.predict(X_test)
preds_pruned_train1 = clf_pruned_final.predict(X_train)

In [None]:
# Confusion matrix of the final pruned tree on the test split.
from sklearn.metrics import classification_report,confusion_matrix
mat_pruned_final = confusion_matrix(y_test,preds_pruned_final)

print("confusion matrix = \n",mat_pruned_final)

In [None]:
## Changing the cut off value for prediction
# Evaluate the final pruned tree at the chosen probability cut-off: a row is
# labelled positive when its class-1 probability exceeds the threshold.
pred_proba = pd.DataFrame(clf_pruned_final.predict_proba(X_test))
threshold_list = [0.19]
# .values replaces DataFrame.as_matrix(), which no longer exists in current
# pandas; the one-column target is flattened once up front.
y_test_true = y_test.values.ravel()
for i in threshold_list:
    print ('\n******** For i = {} ******'.format(i))
    y_test_pred = (pred_proba > i).astype(int)
    y_test_pred_pos = y_test_pred.iloc[:,1].values
    test_accurac = metrics.accuracy_score(y_test_true, y_test_pred_pos)
    # The message literal was split across two lines, which was a syntax error.
    print('Our testing accuracy is {:.2f}'.format(test_accurac))

    print(confusion_matrix(y_test_true, y_test_pred_pos))

In [None]:
# Accuracy of the final pruned tree: first on the test split, then on the training split.
print(accuracy_score(y_test,preds_pruned_final))
print(accuracy_score(y_train,preds_pruned_train1))

In [None]:
# Classification report of the final pruned tree on the test split.
# The original passed preds_pruned, i.e. the predictions of the earlier
# depth-5 tree, so the report did not describe the final model.
print(metrics.classification_report(y_test,preds_pruned_final))

## Calculating feature importance

In [None]:
# Un-normalised feature importances of the final pruned tree. The original
# read them from clf_pruned (the earlier depth-5 tree), which only repeated
# the earlier output.
feat_importance1 = clf_pruned_final.tree_.compute_feature_importances(normalize=False)
print("feat importance = " + str(feat_importance1))

#### Features affecting employees staying.

In [None]:
# Pair every feature name with its importance in the final pruned tree and
# show the ten largest. The original used clf_pruned (the earlier depth-5 tree).
feat_imp_dict = dict(zip(feature_cols, clf_pruned_final.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False).head(10)

In [None]:
# Class-1 probability from the final pruned tree for every row of the full
# encoded feature matrix x (training and test rows alike); preview the first 20.
y_pred_proba = clf_pruned_final.predict_proba(x)[:, 1]
y_pred_proba[0:20]

#### Adding probabilities of Attrition to the dataframe

In [None]:
# Attach the predicted probability to each row of df. The Series is given
# x's index (x was derived from df) so that the column assignment aligns row
# for row even if df does not carry a default 0..n-1 index.
w=pd.Series(y_pred_proba, index=x.index)
df['y_pred_proba']=w
df.head()