# IBM Employee Attrition

Data source: https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset

### Dependencies and data

In [1]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import tensorflow.keras as keras

%matplotlib inline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the employee records from the local CSV into a DataFrame
emp_df = pd.read_csv(filepath_or_buffer='data/ibm-employees.csv')

# Print the column listing with non-null counts and dtypes
emp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

### Data preprocessing

In [3]:
# Names of every column stored with the generic `object` dtype;
# these are treated as the categorical features from here on
cat_feats = [col for col, dtype in emp_df.dtypes.items() if dtype == object]

# Number of distinct values per categorical column
emp_df[cat_feats].nunique()

Attrition         2
BusinessTravel    3
Department        3
EducationField    6
Gender            2
JobRole           9
MaritalStatus     3
Over18            1
OverTime          2
dtype: int64

In [4]:
# `Over18` has a single unique value, so it cannot help distinguish employees.
# Both steps build new objects instead of mutating in place, which keeps this
# cell safe to re-run: a second execution is a no-op rather than raising
# ValueError (list.remove) or KeyError (DataFrame.drop).

# Remove `Over18` from list
cat_feats = [col for col in cat_feats if col != 'Over18']

# Drop `Over18` (errors='ignore' tolerates the column already being gone)
emp_df = emp_df.drop(columns='Over18', errors='ignore')
emp_df.shape

(1470, 34)

In [5]:
# One-hot encoding
# drop='if_binary' collapses two-level features (e.g. Attrition, OverTime)
# into a single 0/1 column; features with more levels keep one column each.
enc = OneHotEncoder(drop='if_binary')

# Densify explicitly with .toarray() instead of the constructor flag: the
# `sparse` keyword was renamed to `sparse_output` in newer scikit-learn, while
# the default (sparse) output plus .toarray() works on every version.
emp_ohe = enc.fit_transform(emp_df[cat_feats]).toarray()

# `get_feature_names` was superseded by `get_feature_names_out`; use the new
# method when it exists and fall back to the old one otherwise.
get_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names

# Convert to df, reusing emp_df's index so the later index-based merge lines
# up row-for-row even if emp_df does not have a default RangeIndex.
emp_ohe = pd.DataFrame(emp_ohe, columns=get_names(cat_feats), index=emp_df.index)
emp_ohe.head()

Unnamed: 0,Attrition_Yes,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,...,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# Attach the one-hot columns to the original frame by row index, then
# discard the raw categorical columns that the encoded ones replace
df = (
    emp_df
    .merge(emp_ohe, left_index=True, right_index=True)
    .drop(columns=cat_feats)
)
df.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1102,1,2,1,1,2,94,3,2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
1,49,279,8,1,1,2,3,61,2,2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,37,1373,2,2,1,4,4,92,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,33,1392,3,4,1,5,4,56,3,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,27,591,2,1,1,7,1,40,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [9]:
# Target is the encoded attrition flag; every other column is a feature
y = df['Attrition_Yes']
X = df.drop(columns=['Attrition_Yes'])

# Hold out a test set, stratified on the target so both partitions keep the
# same class balance; the seed is fixed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=24, stratify=y
)

# Shapes of the four partitions
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1102, 52), (368, 52), (1102,), (368,))

In [10]:
# Standardise the features. The scaler learns its statistics from the
# training partition only and is then applied unchanged to both partitions,
# so nothing from the test set leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train)

# NOTE: wrapping the arrays without an explicit index gives both frames a
# fresh 0..n-1 index, so their row labels no longer match y_train / y_test.
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
X_train_scaled.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,-1.757276,-1.342093,0.063851,-0.928254,0.0,0.399736,-1.570753,-1.025049,-1.000146,-0.972112,...,-0.279782,-0.342997,-0.242069,-0.503684,-0.531871,-0.235702,-0.544491,-0.93496,1.509888,-0.627035
1,-0.431348,-0.707979,0.063851,0.050632,0.0,-0.991895,-0.652953,-0.975917,-1.000146,-0.078645,...,-0.279782,-0.342997,-0.242069,-0.503684,1.880154,-0.235702,-0.544491,-0.93496,1.509888,-0.627035
2,0.121122,-1.361832,-0.297609,1.029518,0.0,-0.552521,1.182645,-0.975917,-1.000146,-0.078645,...,-0.279782,-0.342997,-0.242069,-0.503684,1.880154,-0.235702,-0.544491,-0.93496,1.509888,-0.627035
3,-0.541842,0.866204,0.666286,1.029518,0.0,1.529079,0.264846,-1.565506,0.387966,-0.078645,...,-0.279782,-0.342997,-0.242069,-0.503684,-0.531871,-0.235702,1.836577,-0.93496,-0.662301,-0.627035
4,-0.873324,-1.097823,0.425312,0.050632,0.0,1.343639,-1.570753,-0.484592,-1.000146,-0.972112,...,-0.279782,-0.342997,-0.242069,-0.503684,-0.531871,-0.235702,1.836577,-0.93496,-0.662301,1.594807
