In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [2]:
# Load the HR dataset from the notebook's working directory, then preview
# the first five rows (five is the default for head()).
df = pd.read_csv('./Human_Resources.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# dataframe size as (number of rows, number of columns)
df.shape

(1470, 35)

In [4]:
# column data types: one dtype per column; in this frame the 'object'
# columns are the ones holding text labels (e.g. Attrition, Department)
df.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

In [5]:
# numerical and categorical columns
# select_dtypes(include='number') matches every numeric width (int32, int64,
# float64, ...), so the split no longer depends on all numeric columns being
# stored as exactly int64.
num_cols = df.select_dtypes(include='number').columns
cat_cols = df.select_dtypes(include='object').columns

# checking if every column is included: prints True only when each column
# of df landed in one of the two groups
print(len(num_cols) + len(cat_cols) == len(df.columns))

True


Encoding:
- One Hot: Department, MaritalStatus, JobRole
- Ordinal: BusinessTravel, Attrition
- Label: Gender, OverTime

Because of this, the encoding step will be done in two ways:
1. For general (non-tree-based) classification models
2. For tree-based models

In [6]:
# 1. Check columns that can be dropped
# A categorical column with a single distinct value is constant across all
# rows. Only the columns in cat_cols are scanned here; numeric columns are
# not covered by this check.
cols_drop = [name for name in cat_cols if len(df[name].unique()) == 1]

cols_drop

['Over18']

In [7]:
# StandardHours is an int64 column, so the categorical-only scan did not
# look at it; list its distinct values by hand.
df['StandardHours'].unique()

array([80], dtype=int64)

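Columns to drop: Over18 and StandardHours (each holds a single value, so they carry no information) and EmployeeNumber (an employee identifier rather than a predictive feature).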

In [8]:
# Reassign instead of dropping in place: with errors='ignore' the cell can be
# re-run without raising KeyError for columns that were already removed.
df = df.drop(columns=['Over18', 'StandardHours', 'EmployeeNumber'], errors='ignore')
df.shape

(1470, 32)

In [9]:
# 2. Checking Missing Values
# isnull() flags each missing cell, the first sum() counts them per column,
# and the second sum() adds those counts into one total for the whole frame.
df.isnull().sum().sum()

0

So there are no missing values. <br>
We will use non-tree-based models such as Logistic Regression and SVC, followed by tree-based models such as Random Forest, XGBoost and CatBoost; the encoding and scaling will therefore differ between the two groups. <br>
We start by splitting the data into train and test sets with an 80:20 ratio. The train set will also be used for cross-validation to tune hyperparameters and select the best model,
and the final result will then be reported on the test set.

In [10]:
# Making the input data matrix and target vector
# Target: the Attrition column. Features: every remaining column of df.
y = df['Attrition']
X = df.drop(columns=['Attrition'])
(X.shape, y.shape)

((1470, 31), (1470,))

In [12]:
# Creating the train and test dataset
from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the rows; stratify=y splits in a stratified
# fashion using the Attrition labels; random_state=42 makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [13]:
# Shapes of the four pieces produced by the split, in the order
# X_train, X_test, y_train, y_test
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1176, 31), (294, 31), (1176,), (294,))

In [None]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,LabelEncoder,StandardScaler

In [None]:
# Columns to be one-hot encoded.
ohe_cols = ['Department','EducationField','JobRole','MaritalStatus']
# Columns to be mapped to integer codes. The order of this list must match
# the order of the `categories` lists passed to OrdinalEncoder below.
ord_cols = ['BusinessTravel','Gender','OverTime']

ohe = OneHotEncoder()
# Spell out the category order explicitly: with the default setting the
# encoder sorts the labels, which would rank BusinessTravel as
# Non-Travel < Travel_Frequently < Travel_Rarely instead of by how often
# the employee travels.
# NOTE(review): the name `ord` shadows Python's builtin ord(); it is kept so
# that any cell already referring to `ord` keeps working.
ord = OrdinalEncoder(
    categories=[
        ['Non-Travel', 'Travel_Rarely', 'Travel_Frequently'],  # BusinessTravel
        ['Female', 'Male'],                                    # Gender
        ['No', 'Yes'],                                         # OverTime
    ]
)
# NOTE(review): scikit-learn documents LabelEncoder for target labels (y),
# not for feature columns — confirm how `le` is meant to be used.
le = LabelEncoder()

In [16]:
# Distinct BusinessTravel labels in the training split (column is listed in ord_cols)
X_train['BusinessTravel'].unique()

array(['Travel_Rarely', 'Travel_Frequently', 'Non-Travel'], dtype=object)

In [17]:
# Distinct Department labels in the training split (column is listed in ohe_cols)
X_train['Department'].unique()

array(['Sales', 'Research & Development', 'Human Resources'], dtype=object)

In [18]:
# Distinct EducationField labels in the training split (column is listed in ohe_cols)
X_train['EducationField'].unique()

array(['Life Sciences', 'Technical Degree', 'Marketing', 'Medical',
       'Other', 'Human Resources'], dtype=object)

In [19]:
# Distinct JobRole labels in the training split (column is listed in ohe_cols)
X_train['JobRole'].unique()

array(['Manager', 'Laboratory Technician', 'Sales Representative',
       'Research Scientist', 'Manufacturing Director', 'Sales Executive',
       'Research Director', 'Healthcare Representative',
       'Human Resources'], dtype=object)

In [21]:
# Distinct Gender labels in the training split (column is listed in ord_cols)
X_train['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [22]:
# Distinct MaritalStatus labels in the training split (column is listed in ohe_cols)
X_train['MaritalStatus'].unique()

array(['Divorced', 'Married', 'Single'], dtype=object)

In [23]:
# Distinct OverTime labels in the training split (column is listed in ord_cols)
X_train['OverTime'].unique()

array(['No', 'Yes'], dtype=object)

In [20]:
# cat_cols was built before 'Over18' was dropped from df and before
# 'Attrition' was split off as the target, so it still lists both of them.
cat_cols

Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
       'JobRole', 'MaritalStatus', 'Over18', 'OverTime'],
      dtype='object')

In [None]:
# TODO: leftover scratch text, not valid Python (it raised SyntaxError when
# run) — remove this cell or replace it with real code.
# j = lovely wamr up