## Preprocessing

In [1]:
# Import our dependencies
from __future__ import print_function
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
import tensorflow as tf
warnings.simplefilter("ignore")

# Load the employee attrition dataset from the Resources folder and
# preview the first five rows of the resulting dataframe.
employee_attrition_df = pd.read_csv(filepath_or_buffer="Resources/employees_numeric.csv")
employee_attrition_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,left,promotion_last_5years,department_IT,...,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
0,0,0.38,0.53,2,157,3,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
1,1,0.8,0.86,5,262,6,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
2,2,0.11,0.88,7,272,4,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
3,3,0.72,0.87,5,223,5,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
4,4,0.37,0.52,2,159,3,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0


In [2]:
# Drop the non-beneficial ID column 'Unnamed: 0' (in the preview its values
# simply repeat the row number).
# `errors="ignore"` keeps this cell safe to re-run: the result is assigned back
# to the same name, so without it a second run would raise KeyError because the
# column has already been removed.
employee_attrition_df = employee_attrition_df.drop(columns=["Unnamed: 0"], errors="ignore")
employee_attrition_df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,left,promotion_last_5years,department_IT,department_R&D,...,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [3]:
# Count the distinct (non-null) values found in every column.
employee_attrition_df.nunique(axis=0, dropna=True)

satisfaction_level         92
last_evaluation            65
number_project              6
avg_monthly_hours         215
company_tenure              8
work_accident               2
left                        2
promotion_last_5years       2
department_IT               2
department_R&D              2
department_accounting       2
department_engineering      2
department_hr               2
department_management       2
department_marketing        2
department_product_mng      2
department_sales            2
department_support          2
salary_high                 2
salary_low                  2
salary_medium               2
dtype: int64

In [10]:
# One-hot encode any categorical columns with `pd.get_dummies`.
# NOTE(review): the preview of this frame shows only numeric values, so this
# call presumably leaves the columns unchanged — confirm against the dtypes.
employee_attrition_df_numeric = pd.get_dummies(data=employee_attrition_df)

In [11]:
# Target vector is the 'left' column; the feature matrix is every other column
y = employee_attrition_df_numeric['left']
X = employee_attrition_df_numeric.drop(columns=['left'])

# Partition features and target into training and testing subsets
# (fixed random_state so the same split is produced on every run)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=58,
)

In [12]:
# Instantiate the scaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [13]:
# Fix the Python, NumPy and TensorFlow random seeds before any layer is created
# so that weight initialisation and training are repeatable across
# "Restart & Run All" executions (same value as the train/test split seed).
tf.keras.utils.set_random_seed(58)

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# The feature count is read from the column dimension of the scaled training array.
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 10
hidden_nodes_layer2 = 8
hidden_nodes_layer3 = 6

nn2 = tf.keras.models.Sequential()

# First hidden layer (also declares the input width of the network)
nn2.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,
             input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn2.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn2.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Output layer: a single sigmoid unit, i.e. one value in (0, 1) per sample
nn2.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn2.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                210       
                                                                 
 dense_1 (Dense)             (None, 8)                 88        
                                                                 
 dense_2 (Dense)             (None, 6)                 54        
                                                                 
 dense_3 (Dense)             (None, 1)                 7         
                                                                 
Total params: 359 (1.40 KB)
Trainable params: 359 (1.40 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
nn2.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [15]:
# Fit the network on the scaled training features for 25 epochs,
# keeping the object returned by `fit` for later inspection
fit_model = nn2.fit(x=X_train_scaled, y=y_train, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [16]:
# Score the trained network on the scaled test features, then report
# the resulting loss and accuracy
model_loss, model_accuracy = nn2.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

118/118 - 0s - loss: 0.1312 - accuracy: 0.9608 - 290ms/epoch - 2ms/step
Loss: 0.13120850920677185, Accuracy: 0.9607999920845032


In [17]:
# Make a prediction using the testing data.
# The network was trained and evaluated on standardized features, so it must be
# given the scaled test set (X_test_scaled). Passing the raw X_test would feed
# the model inputs on a different scale than it was fit on and produce
# meaningless predictions.
testing_employee_attrition_predictions = nn2.predict(X_test_scaled)
print(len(testing_employee_attrition_predictions))

3750


In [18]:
# Persist the trained network to disk as an HDF5 (.h5) file
nn2.save(filepath="employee_attrition_nn.h5")