In [1]:
from __future__ import print_function
%matplotlib inline
import os
import warnings
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as image
import pandas as pd
from pandas_profiling import ProfileReport
import seaborn as sns
plt.style.use("ggplot")
warnings.simplefilter("ignore")

In [2]:
# Make (12, 8) the default figsize for every figure created after this cell.
plt.rcParams.update({'figure.figsize': (12, 8)})

In [3]:
# Read the employee records; the path is relative to the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="employee_data.csv")

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,quit,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [5]:
# Count rows per (salary, quit) pair and draw the counts as a bar chart.
salary_vs_quit = pd.crosstab(index=df["salary"], columns=df["quit"])
ax = salary_vs_quit.plot(kind="bar")
ax.set_xlabel("salary")
ax.set_ylabel("Frequency of turnover")
ax.set_title("Employee turn over based on the salary")
plt.show()

In [6]:
# Count rows per (department, quit) pair and draw the counts as a bar chart.
# figsize is passed explicitly because this chart overrides the notebook default.
dept_vs_quit = pd.crosstab(index=df["department"], columns=df["quit"])
ax = dept_vs_quit.plot(kind="bar", figsize=(15, 10))
ax.set_xlabel("department")
ax.set_ylabel("Frequency of turn over")
ax.set_title("Employee turn over based on the Department ")
plt.show()

In [7]:
# One-hot encode each categorical column and append the indicator columns to df.
# The source 'department' and 'salary' columns are still present after this cell.
cat_vars = ['department','salary']
for var in cat_vars:
    # dtype=int pins the indicators to 0/1 integers. Without it, pandas >= 2.0
    # returns booleans, which no longer match the 0/1 values this notebook displays.
    cat_list = pd.get_dummies(df[var], prefix=var, dtype=int)
    df = df.join(cat_list)

In [8]:
# Preview the first five rows after the indicator columns were joined on.
df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,quit,promotion_last_5years,department,salary,...,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,sales,low,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,sales,medium,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,sales,medium,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,sales,low,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,sales,low,...,0,0,0,0,1,0,0,0,1,0


In [9]:
# Remove the raw categorical columns; the one-hot encoding loop earlier in the
# notebook already joined their indicator columns onto df.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution is a no-op rather than a KeyError.
# columns= already selects the axis, so the redundant axis=1 is omitted.
df = df.drop(columns=['department', 'salary'], errors='ignore')

In [10]:
# Preview the first five rows once the raw categorical columns are gone.
df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,quit,promotion_last_5years,department_IT,department_RandD,...,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [11]:
# Open a (10, 5) figure for the class-balance chart. plt.figure is the call that
# honors figsize; plt.plot with only figsize= draws nothing and leaves the
# figure at the default size.
plt.figure(figsize=(10,5))
# One bar per value of the 'quit' column, with bar height equal to the row count.
sns.countplot(x="quit",data = df)
plt.title("frequency of categories in quit label")
plt.show()

In [12]:
# Features are every column whose name is not 'quit'; the target is the 'quit' column.
is_feature = df.columns != 'quit'
X = df.loc[:, is_feature]
Y = df['quit']

In [13]:
# Display the feature column names; 'quit' was masked out when X was built.
X.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'department_IT', 'department_RandD',
       'department_accounting', 'department_hr', 'department_management',
       'department_marketing', 'department_product_mng', 'department_sales',
       'department_support', 'department_technical', 'salary_high',
       'salary_low', 'salary_medium'],
      dtype='object')

In [14]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [15]:
# Preview the first five target labels.
Y.head(n=5)

0    1
1    1
2    1
3    1
4    1
Name: quit, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on Y and uses a
# fixed random_state so the same partition is produced on every run.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=10, stratify=Y)
x_train, x_test, y_train, y_test = split_parts

In [17]:
# Print the shape of each split: train features, train labels, test features, test labels.
for split_part in (x_train, y_train, x_test, y_test):
    print(split_part.shape)

(11999, 20)
(11999,)
(3000, 20)
(3000,)


In [18]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import export_graphviz # display the tree within a Jupyter notebook
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from ipywidgets import interactive, IntSlider, FloatSlider, interact
import ipywidgets
from IPython.display import Image
from subprocess import call
import matplotlib.image as mpimg

In [26]:
@interact
def plot_tree_rf(crit=['gini', 'entropy'],
                 split=['best', 'random'],
                 depth=IntSlider(min=1, max=30, value=2, continuous_update=False),
                 min_split=IntSlider(min=2, max=5, value=2, continuous_update=False),
                 min_leaf=IntSlider(min=1, max=5, value=1, continuous_update=False)):
    """Fit a single decision tree with the selected settings and render it.

    The function is wrapped by ``@interact``; each argument's default supplies
    the widget (option list or slider) that provides its value.

    Parameters
    ----------
    crit : passed to ``DecisionTreeClassifier`` as ``criterion``.
    split : passed as ``splitter``.
    depth : passed as ``max_depth``.
    min_split : passed as ``min_samples_split``.
    min_leaf : passed as ``min_samples_leaf``.

    Prints training and testing accuracy (multiplied by 100, three decimals)
    computed on the notebook-level ``x_train``/``y_train`` and
    ``x_test``/``y_test``, then displays the fitted tree as a PNG.

    NOTE(review): despite the ``_rf`` suffix this fits a DecisionTreeClassifier,
    and a later cell defines another function with this same name.
    """
    tree_params = dict(
        random_state=0,
        criterion=crit,
        splitter=split,
        max_depth=depth,
        min_samples_split=min_split,
        min_samples_leaf=min_leaf,
    )
    clf = DecisionTreeClassifier(**tree_params)
    clf.fit(x_train, y_train)

    train_pct = accuracy_score(y_train, clf.predict(x_train)) * 100
    print("training accuracy {:.3f}".format(train_pct))
    test_pct = accuracy_score(y_test, clf.predict(x_test)) * 100
    print("testing accuracy {:.3f}".format(test_pct))

    # Export the fitted tree as DOT text, then let graphviz render it to PNG bytes.
    dot_text = tree.export_graphviz(
        clf,
        out_file=None,
        feature_names=x_train.columns,
        class_names=['stayed', 'quit'],
        filled=True,
    )
    png_bytes = Source(dot_text).pipe(format='png')
    display(Image(data=png_bytes))
    
    

interactive(children=(Dropdown(description='crit', options=('gini', 'entropy'), value='gini'), Dropdown(descri…

In [27]:
@interact
def plot_tree_rf(crit=['gini', 'entropy'],
                 bootstrap=[True, False],
                 depth=IntSlider(min=1, max=30, value=3, continuous_update=False),
                 forests=IntSlider(min=1, max=200, value=100, continuous_update=False),
                 min_split=IntSlider(min=2, max=5, value=2, continuous_update=False),
                 min_leaf=IntSlider(min=1, max=5, value=1, continuous_update=False)):
    """Fit a random forest with the selected settings and render its first tree.

    The function is wrapped by ``@interact``; each argument's default supplies
    the widget (option list or slider) that provides its value.

    Parameters
    ----------
    crit : passed to ``RandomForestClassifier`` as ``criterion``.
    bootstrap : passed as ``bootstrap``.
    depth : passed as ``max_depth``.
    forests : passed as ``n_estimators``.
    min_split : passed as ``min_samples_split``.
    min_leaf : passed as ``min_samples_leaf``.

    Prints training and testing accuracy (multiplied by 100, three decimals)
    computed on the notebook-level ``x_train``/``y_train`` and
    ``x_test``/``y_test``, then displays ``estimators_[0]`` as a PNG.

    NOTE(review): an earlier cell defines a function with this same name;
    running this cell rebinds it.
    """
    forest_params = dict(
        random_state=1,
        criterion=crit,
        bootstrap=bootstrap,
        n_estimators=forests,
        max_depth=depth,
        min_samples_split=min_split,
        min_samples_leaf=min_leaf,
        n_jobs=-1,
        verbose=False,
    )
    forest = RandomForestClassifier(**forest_params)
    forest.fit(x_train, y_train)

    train_pct = accuracy_score(y_train, forest.predict(x_train)) * 100
    print("training accuracy {:.3f}".format(train_pct))
    test_pct = accuracy_score(y_test, forest.predict(x_test)) * 100
    print("testing accuracy {:.3f}".format(test_pct))

    # Only the first fitted tree of the ensemble is drawn.
    tree_index = 0
    first_tree = forest.estimators_[tree_index]
    print("visualizing tree", tree_index)

    # Export that tree as DOT text, then let graphviz render it to PNG bytes.
    dot_text = tree.export_graphviz(
        first_tree,
        out_file=None,
        feature_names=x_train.columns,
        class_names=['stayed', 'quit'],
        filled=True,
    )
    png_bytes = Source(dot_text).pipe(format='png')
    display(Image(data=png_bytes))
    
    
    

interactive(children=(Dropdown(description='crit', options=('gini', 'entropy'), value='gini'), Dropdown(descri…