In [28]:
# Initial imports
from __future__ import print_function
%matplotlib inline
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as image
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
plt.style.use("ggplot")
warnings.simplefilter("ignore")

## Loading and Preprocessing Employee HR Encoded Data

Load the `employee_data_5.csv` in a pandas DataFrame called `employee_attrition_df`.

In [29]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
employee_attrition_df = pd.read_csv("Resources/employees_numeric.csv")

# Review the DataFrame
display(employee_attrition_df.head())
display(employee_attrition_df.tail())

Unnamed: 0.1,Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,left,promotion_last_5years,department_IT,...,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
0,0,0.38,0.53,2,157,3,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
1,1,0.8,0.86,5,262,6,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
2,2,0.11,0.88,7,272,4,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
3,3,0.72,0.87,5,223,5,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
4,4,0.37,0.52,2,159,3,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0


Unnamed: 0.1,Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,left,promotion_last_5years,department_IT,...,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
14994,14994,0.4,0.57,2,151,3,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
14995,14995,0.37,0.48,2,160,3,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
14996,14996,0.37,0.53,2,143,3,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
14997,14997,0.11,0.96,6,280,4,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
14998,14998,0.37,0.52,2,158,3,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0


In [30]:
# Drop the non-beneficial ID columns, 'employee_id'.
employee_attrition_df = employee_attrition_df.drop(columns=['Unnamed: 0'], axis=1)
employee_attrition_df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,left,promotion_last_5years,department_IT,department_R&D,...,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0


Create the target vector by assigning the values of the `left` column from the `employee_attrition_df` DataFrame.  Define the features set, by copying the `employee_attrition_df` DataFrame and dropping the `left` column.

In [31]:
# Separate the data into labels and features
# Separate the y variable, the target
y = employee_attrition_df['left']

# Separate the X variable, the features
X = employee_attrition_df.drop(['left'], axis=1)

In [32]:
# Review the y variable Series
y.head()

0    1
1    1
2    1
3    1
4    1
Name: left, dtype: int64

In [33]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,promotion_last_5years,department_IT,department_R&D,department_accounting,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0


In [34]:
# Check the balance of our target value
y.value_counts()

0    11428
1     3571
Name: left, dtype: int64

Split the data into training and testing sets.

In [35]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

Use the `StandardScaler` to scale the features data, remember that only `X_train` and `X_test` DataFrames should be scaled.

In [41]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Decision Tree Model

Once data is scaled, create a decision tree instance and train it with the training data (`X_train_scaled` and `y_train`).

In [42]:
# Create the decision tree classifier instance
decision_tree_model = DecisionTreeClassifier(max_depth=6, random_state=1)

In [43]:
# Fit the model
decision_tree_model.fit(X_train, y_train)

## Making Predictions Using the Tree Model

Validate the trained model, by predicting fraudulent loan applications using the testing data (`X_test_scaled`).

In [44]:
# Predict test set labels
y_pred = decision_tree_model.predict(X_test)
print(y_pred[0:5])

# Making predictions using the testing data
predictions = decision_tree_model.predict(X_test_scaled)

[0 0 1 0 0]


## Model Evaluation

Evaluate model's results, by using `sklearn` to calculate the confusion matrix, the accuracy score and to generate the classification report.

In [45]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)



In [46]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2864,0
Actual 1,821,65


Accuracy Score : 0.7810666666666667
Classification Report
              precision    recall  f1-score   support

           0       0.78      1.00      0.87      2864
           1       1.00      0.07      0.14       886

    accuracy                           0.78      3750
   macro avg       0.89      0.54      0.51      3750
weighted avg       0.83      0.78      0.70      3750

