In [27]:
# Import the modules
from __future__ import print_function
%matplotlib inline
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as image
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
plt.style.use("ggplot")
warnings.simplefilter("ignore")

In [28]:
plt.rcParams['figure.figsize'] = (12,8)

In [29]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
employee_attrition_df = pd.read_csv("Resources/employees_numeric.csv")

# Review the DataFrame
display(employee_attrition_df.head())
display(employee_attrition_df.tail())

Unnamed: 0.1,Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,left,promotion_last_5years,department_IT,...,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
0,0,0.38,0.53,2,157,3,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
1,1,0.8,0.86,5,262,6,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
2,2,0.11,0.88,7,272,4,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
3,3,0.72,0.87,5,223,5,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
4,4,0.37,0.52,2,159,3,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0


Unnamed: 0.1,Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,left,promotion_last_5years,department_IT,...,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
14994,14994,0.4,0.57,2,151,3,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
14995,14995,0.37,0.48,2,160,3,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
14996,14996,0.37,0.53,2,143,3,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
14997,14997,0.11,0.96,6,280,4,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
14998,14998,0.37,0.52,2,158,3,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0


In [31]:
# Drop the non-beneficial ID columns, 'employee_id'.
employee_attrition_df = employee_attrition_df.drop(columns=['Unnamed: 0'], axis=1)
employee_attrition_df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,left,promotion_last_5years,department_IT,department_R&D,...,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [32]:
# Determine the number of unique values in each column.
employee_attrition_df.nunique()

satisfaction_level         92
last_evaluation            65
number_project              6
avg_monthly_hours         215
company_tenure              8
work_accident               2
left                        2
promotion_last_5years       2
department_IT               2
department_R&D              2
department_accounting       2
department_engineering      2
department_hr               2
department_management       2
department_marketing        2
department_product_mng      2
department_sales            2
department_support          2
salary_high                 2
salary_low                  2
salary_medium               2
dtype: int64

In [33]:
# Convert categorical data to numeric with `pd.get_dummies`
employee_attrition_df_numeric = pd.get_dummies(employee_attrition_df)
employee_attrition_df_numeric.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,left,promotion_last_5years,department_IT,department_R&D,...,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0


### Create the labels set (`y`)  from the “left” column, and then create the features (`X`) DataFrame from the remaining columns.

In [34]:
# Separate the data into labels and features
# Separate the y variable, the target
y = employee_attrition_df_numeric['left']

# Separate the X variable, the features
x = employee_attrition_df_numeric.drop(['left'], axis=1)

In [35]:
# Review the y variable Series
y.head()

0    1
1    1
2    1
3    1
4    1
Name: left, dtype: int64

In [36]:
# Review the X variable DataFrame
x.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,promotion_last_5years,department_IT,department_R&D,department_accounting,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0


### Check the balance of the target variable (`y`) by using the `value_counts` function.

In [37]:
# Check the balance of our target value
y.value_counts()

0    11428
1     3571
Name: left, dtype: int64

### Split the data into training and testing datasets by using `train_test_split`.

In [38]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1, stratify=y)

In [39]:
# Create a StandardScaler instances
#scaler = StandardScaler()

# Fit the StandardScaler
#X_scaler = scaler.fit(X_train)

# Scale the data
#X_train_scaled = X_scaler.transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)

## Create a Logistic Regression Model with the Original Data

###  Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [40]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model_default_risk = LogisticRegression(random_state=1)

# Fit the model using training data
employee_attrition_logistic_model = logistic_regression_model_default_risk.fit(X_train, y_train)

### Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [41]:
# Make a prediction using the testing data
employee_attrition_logistic_predictions = employee_attrition_logistic_model.predict(X_test)
print(len(employee_attrition_logistic_predictions))

3750


### Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [42]:
# Print the balanced_accuracy score of the model
logistic_accuracy_score = balanced_accuracy_score(y_test, employee_attrition_logistic_predictions)
print(logistic_accuracy_score)

0.6367860946238801


In [43]:
# Generate a confusion matrix for the model
logistic_confusion_matrix = confusion_matrix(y_test, employee_attrition_logistic_predictions)
print(logistic_confusion_matrix)

[[2666  191]
 [ 589  304]]


In [44]:
# Print the classification report for the model
logistic_classification_report = classification_report(y_test, employee_attrition_logistic_predictions)
print(logistic_classification_report)

              precision    recall  f1-score   support

           0       0.82      0.93      0.87      2857
           1       0.61      0.34      0.44       893

    accuracy                           0.79      3750
   macro avg       0.72      0.64      0.66      3750
weighted avg       0.77      0.79      0.77      3750



## Predict a Logistic Regression Model with Resampled Training Data

### Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [45]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
random_over_sample = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
x_resample, y_resample = random_over_sample.fit_resample(X_train, y_train)

In [46]:
# Count the distinct values of the resampled Target data - left
y_resample.value_counts()

0    8571
1    8571
Name: left, dtype: int64

In [47]:
# View the distinct values of the resampled X feature data
x_resample.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,promotion_last_5years,department_IT,department_R&D,department_accounting,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
count,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0,17142.0
mean,0.553211,0.71773,3.825808,202.955198,3.635982,0.111539,0.016042,0.080387,0.044161,0.055944,0.186326,0.051453,0.035235,0.053961,0.058395,0.281064,0.153074,0.059795,0.52736,0.412846
std,0.266158,0.18078,1.453354,53.883056,1.33004,0.314808,0.125642,0.2719,0.205458,0.229821,0.389381,0.220925,0.184379,0.225947,0.234495,0.449532,0.36007,0.237113,0.499265,0.49236
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.38,0.54,2.0,152.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.56,0.73,4.0,203.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.79,0.88,5.0,252.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [48]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using the resampled training data
logistic_classifier.fit(x_resample, y_resample)

# Make a prediction using the testing data
test_predictions = logistic_classifier.predict(X_test)
pd.DataFrame({"Prediction": test_predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
9382,0,0
11276,1,0
11161,0,0
1681,1,1
14506,1,1
...,...,...
9411,0,0
10186,0,0
11449,0,0
3640,0,0


### Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [49]:
# Print the balanced_accuracy score of the model 
predicted_logistic_accuracy_score2 = balanced_accuracy_score(y_test, test_predictions)
print(predicted_logistic_accuracy_score2)

0.7719865276578499


In [50]:
# Generate a confusion matrix for the model
predicted_logistic_confusion_matrix2 = confusion_matrix(y_test, test_predictions)
print(predicted_logistic_confusion_matrix2)

[[2178  679]
 [ 195  698]]


In [51]:
# Print the classification report for the model
predicted_logistic_classification_report2 = classification_report(y_test, test_predictions)
print(predicted_logistic_classification_report2)

              precision    recall  f1-score   support

           0       0.92      0.76      0.83      2857
           1       0.51      0.78      0.61       893

    accuracy                           0.77      3750
   macro avg       0.71      0.77      0.72      3750
weighted avg       0.82      0.77      0.78      3750



In [52]:
from sklearn.inspection import permutation_importance

# compute importances
model_fi = permutation_importance(logistic_classifier, X_test, y_test)
model_fi['importances_mean']    

array([ 1.32213333e-01,  4.42666667e-03,  2.42666667e-02,  1.39733333e-02,
        4.54933333e-02,  1.87200000e-02,  6.40000000e-04, -2.13333333e-04,
        7.46666667e-04,  1.60000000e-04,  1.49333333e-03,  5.33333333e-05,
        1.17333333e-03,  5.33333333e-04,  0.00000000e+00, -8.53333333e-04,
       -2.02666667e-03,  5.49333333e-03,  2.09600000e-02,  9.92000000e-03])

In [53]:
X_test.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,avg_monthly_hours,company_tenure,work_accident,promotion_last_5years,department_IT,department_R&D,department_accounting,department_engineering,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low,salary_medium
9382,0.73,0.74,3,221,3,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
11276,0.81,0.62,3,240,6,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
11161,0.69,0.82,4,252,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1681,0.83,0.94,4,264,5,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
14506,0.38,0.51,2,146,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
