In [1]:
# Import Dependencies
# Dependencies
from sklearn import datasets
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

## HR Employee Attrition
Keywords: multivariate, classification.

## Description
This dataset is used to evaluae what features lead to employee attrition. This is a fictional data set that was created by IBM data scientists to reflect real world data.

## Source
https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset

In [2]:
# Read the csv file into a pandas DataFrame
hr_data = pd.read_csv('../Resources/HREmployeeAttrition.csv')
hr_data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,2,1102,2,1,2,1,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,1,279,1,8,1,1,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,2,1373,1,2,2,4,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,1,1392,1,3,4,1,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,2,591,1,2,1,3,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Select our independent X variables, and our dependent y variable. 
X = hr_data.drop(columns = ['Attrition'])
y = hr_data['Attrition']


In [4]:
# Confirm imblance of target variable 'Attrition'
# 1 = Attrition; 0 = No Attrition
print("Total Counts:")
print(y.value_counts())
print("")
print("Percentage:")
print(y.value_counts(normalize=True))

Total Counts:
0    1233
1     237
Name: Attrition, dtype: int64

Percentage:
0    0.838776
1    0.161224
Name: Attrition, dtype: float64


### Imbalanced

In [5]:
# Create Validation training and testing datasets without balancing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)


In [6]:
# Confirm imblance of target variable 'Attrition' in the test dataset
# 1 = Attrition; 0 = No Attrition


In [7]:
# Create a Linear Regression model object
model = LogisticRegression(solver='liblinear')


In [8]:
# Train the model using the training datasets
model.fit(X_train, y_train) 
LogisticRegression(solver='liblinear')

LogisticRegression(solver='liblinear')

In [9]:
# Make predictions using the testing dataset
y_pred = model.predict(X_test) 

In [10]:
# Plot the Confusion Matrix values
# Plot the Confusion Matrix prediction percentages
# Print classification report

print(f'Accuracy: {model.score(X_test, y_test)}')

# Create DataFrame of results
df = pd.DataFrame({"Predicted": y_pred, "Actual": y_test})

# Calculate the True Positive & False Negative values
true_positive = len(df[(df['Actual'] == 1) & (df['Predicted'] == 1)])
false_negative = len(df[(df['Actual'] == 1) & (df['Predicted'] == 0)])
print(f'True Positives: {true_positive}')
print(f'False Negatives: {false_negative}')
print(f'Actual Positive Correct: {true_positive/(true_positive + false_negative)}')
print(f'Test Fraud Transactions: {true_positive + false_negative}')

Accuracy: 0.8163265306122449
True Positives: 9
False Negatives: 49
Actual Positive Correct: 0.15517241379310345
Test Fraud Transactions: 58


### Balanced

In [11]:
# Create our Validation training and testing datasets with balancing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1, stratify=y)


In [12]:
# Confirm imblance of target variable 'Attrition' in our stratified dataset
# 1 = Attrition; 0 = No Attrition
print("Total Counts:")
print(y.value_counts())
print("")
print("Percentage:")
print(y.value_counts(normalize=True))

Total Counts:
0    1233
1     237
Name: Attrition, dtype: int64

Percentage:
0    0.838776
1    0.161224
Name: Attrition, dtype: float64


In [13]:
# Create the Linear Regression model object
model = LogisticRegression(solver='liblinear')


In [14]:
# Train the model using the training sets
model.fit(X_train, y_train)


LogisticRegression(solver='liblinear')

In [15]:
# Make predictions using the testing dataset
y_pred = model.predict(X_test) 

In [16]:
# Plot the Confusion Matrix values
# Plot the Confusion Matrix prediction percentages
# Print classification report
print(f'Accuracy: {model.score(X_test, y_test)}')

# Create DataFrame of results
df = pd.DataFrame({"Predicted": y_pred, "Actual": y_test})

# Calculate the True Positive & False Negative values
true_positive = len(df[(df['Actual'] == 1) & (df['Predicted'] == 1)])
false_negative = len(df[(df['Actual'] == 1) & (df['Predicted'] == 0)])
print(f'True Positives: {true_positive}')
print(f'False Negatives: {false_negative}')
print(f'Actual Positive Correct: {true_positive/(true_positive + false_negative)}')
print(f'Test Fraud Transactions: {true_positive + false_negative}')

Accuracy: 0.8503401360544217
True Positives: 10
False Negatives: 37
Actual Positive Correct: 0.2127659574468085
Test Fraud Transactions: 47
