In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
# Install a global "ignore" filter so no warning of any category is shown in later cell outputs.
# NOTE(review): this also hides pandas deprecation / dtype warnings raised by later cells.
warnings.filterwarnings("ignore")

In [3]:
# Read the raw employee-performance workbook into a DataFrame and display it.
raw_workbook_path = '../../data/raw/INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls'
data = pd.read_excel(raw_workbook_path)
data

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,...,4,10,1,3,2,2,2,2,No,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,E100992,27,Female,Medical,Divorced,Sales,Sales Executive,Travel_Frequently,3,1,...,2,6,3,3,6,5,0,4,No,4
1196,E100993,37,Male,Life Sciences,Single,Development,Senior Developer,Travel_Rarely,10,2,...,1,4,2,3,1,0,0,0,No,3
1197,E100994,50,Male,Medical,Married,Development,Senior Developer,Travel_Rarely,28,1,...,3,20,3,3,20,8,3,8,No,3
1198,E100995,34,Female,Medical,Single,Data Science,Data Scientist,Travel_Rarely,9,3,...,2,9,3,4,8,7,7,7,No,3


<h4 style="font-size: 20px;" align="center"><b>DOMAIN ANALYSIS</b></h4>

The dataset from INX Future Inc provides a comprehensive overview of employee performance metrics across various departments. Here's a domain analysis based on the available data columns:

1. **Employee Demographics**: Includes `Age`, `Gender`, `EducationBackground`, and `MaritalStatus`, together with `EmpNumber`, which is a unique employee identifier rather than a demographic attribute. These fields help in understanding the diversity and background of the workforce.

2. **Departmental Data**: `EmpDepartment` and `EmpJobRole` indicate the department and specific roles of the employees, crucial for analyzing department-specific performance.

3. **Work-related Details**: 
   - `BusinessTravelFrequency` and `DistanceFromHome` could affect employee satisfaction and performance.
   - `EmpEducationLevel`, `EmpEnvironmentSatisfaction`, `EmpHourlyRate`, `EmpJobInvolvement`, `EmpJobLevel`, `EmpJobSatisfaction` describe each employee's education level, pay, seniority, job involvement, and satisfaction, all of which may be linked to performance.

4. **Performance Metrics**:
   - `PerformanceRating` is the key outcome variable. Other related metrics include `YearsSinceLastPromotion`, `YearsWithCurrManager`, and `ExperienceYearsAtThisCompany`, which help in understanding career progression and its impact on performance.

5. **Additional Attributes**:
   - `OverTime` and `Attrition` indicate work-life balance and employee retention, respectively, which are critical for organizational health.

This analysis helps in identifying key areas for improving employee performance and satisfaction.

<h4 style="font-size: 16px"><b>DATA PREPROCESSING</b></h4>

In [4]:
# Count missing values in each column (summing the boolean mask down the rows).
data.isnull().sum(axis=0)

EmpNumber                       0
Age                             0
Gender                          0
EducationBackground             0
MaritalStatus                   0
EmpDepartment                   0
EmpJobRole                      0
BusinessTravelFrequency         0
DistanceFromHome                0
EmpEducationLevel               0
EmpEnvironmentSatisfaction      0
EmpHourlyRate                   0
EmpJobInvolvement               0
EmpJobLevel                     0
EmpJobSatisfaction              0
NumCompaniesWorked              0
OverTime                        0
EmpLastSalaryHikePercent        0
EmpRelationshipSatisfaction     0
TotalWorkExperienceInYears      0
TrainingTimesLastYear           0
EmpWorkLifeBalance              0
ExperienceYearsAtThisCompany    0
ExperienceYearsInCurrentRole    0
YearsSinceLastPromotion         0
YearsWithCurrManager            0
Attrition                       0
PerformanceRating               0
dtype: int64

In [5]:
# Number of rows that repeat an earlier row exactly (the first occurrence is not counted).
data.duplicated(keep='first').sum()

0

In [6]:
# From the basic checks, the categorical columns fall into two groups:
#   binary  -> two-level Yes/No columns
#   nominal -> unordered multi-level columns
binary_features = ['OverTime', 'Attrition']
nominal_features = [
    'Gender',
    'EducationBackground',
    'MaritalStatus',
    'EmpDepartment',
    'EmpJobRole',
    'BusinessTravelFrequency',
]

# Encode on an independent copy of the original frame.
# Changes made to `data` after this point are not reflected in `encoded_data`
# unless they are applied to it explicitly.
encoded_data = data.copy()
encoded_data.shape

(1200, 28)

In [7]:
# Columns treated as continuous for the outlier check.
continuous_feature_names = [
    'Age',
    'DistanceFromHome',
    'EmpHourlyRate',
    'TotalWorkExperienceInYears',
    'ExperienceYearsAtThisCompany',
]
continuous_col = encoded_data[continuous_feature_names]
continuous_col

Unnamed: 0,Age,DistanceFromHome,EmpHourlyRate,TotalWorkExperienceInYears,ExperienceYearsAtThisCompany
0,32,10,55,10,10
1,47,14,42,20,7
2,40,5,48,20,18
3,41,10,73,23,21
4,60,16,84,10,2
...,...,...,...,...,...
1195,27,3,71,6,6
1196,37,10,80,4,1
1197,50,28,74,20,20
1198,34,9,46,9,8


In [8]:
# Per-column quartiles and interquartile range of the continuous features.
Q1, Q3 = continuous_col.quantile(0.25), continuous_col.quantile(0.75)
IQR = Q3 - Q1

# A value is flagged when it lies more than 1.5 * IQR outside the quartiles.
fence_low = Q1 - 1.5 * IQR
fence_high = Q3 + 1.5 * IQR
outliers = (continuous_col < fence_low) | (continuous_col > fence_high)

number_of_outliers = outliers.sum()

# One-row frame: flagged-value count per continuous column.
number_of_outliers.to_frame().T

Unnamed: 0,Age,DistanceFromHome,EmpHourlyRate,TotalWorkExperienceInYears,ExperienceYearsAtThisCompany
0,0,0,0,51,56


In [9]:
# Handling the outliers for the TotalWorkExperienceInYears column:
# derive the 1.5 * IQR fences from the column's quartiles.
total_exp_col = 'TotalWorkExperienceInYears'
q1, q3 = (data[total_exp_col].quantile(q) for q in (0.25, 0.75))
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# '\033[1m' is the ANSI escape sequence that switches the output to bold.
print('Upper bound', '\033[1m', upper_bound, 'and lower bound', '\033[1m', lower_bound)

Upper bound [1m 28.5 and lower bound [1m -7.5


In [10]:
# Rows whose total work experience falls below the lower fence.
data[data['TotalWorkExperienceInYears'] < lower_bound]

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating


In [11]:
# Rows whose total work experience exceeds the upper fence.
data[data['TotalWorkExperienceInYears'] > upper_bound]

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
17,E1001040,56,Male,Medical,Married,Development,Developer,Travel_Rarely,9,3,...,3,30,1,2,10,7,1,1,No,3
25,E1001054,52,Male,Marketing,Married,Sales,Manager,Travel_Rarely,3,4,...,1,34,3,4,34,6,1,16,No,4
48,E1001093,50,Male,Medical,Married,Development,Developer,Travel_Rarely,2,3,...,4,30,3,3,4,3,0,3,No,3
72,E1001129,51,Female,Marketing,Married,Sales,Manager,Travel_Rarely,26,4,...,3,29,2,2,20,6,4,17,No,3
77,E1001140,53,Female,Marketing,Married,Sales,Sales Executive,Travel_Rarely,7,2,...,4,35,3,3,5,2,0,4,No,3
96,E1001179,53,Female,Life Sciences,Single,Development,Developer,Travel_Rarely,23,4,...,3,33,0,3,12,9,3,8,No,3
100,E1001183,50,Male,Life Sciences,Divorced,Development,Developer,Non-Travel,2,4,...,4,31,3,3,31,6,14,7,No,3
149,E1001275,50,Male,Medical,Divorced,Development,Senior Developer,Travel_Rarely,1,2,...,3,32,1,2,5,4,1,3,No,4
173,E1001310,59,Female,Life Sciences,Single,Research & Development,Manufacturing Director,Travel_Rarely,2,3,...,1,30,4,3,5,3,4,3,No,3
178,E1001316,52,Female,Marketing,Married,Sales,Manager,Travel_Rarely,2,1,...,4,33,3,3,32,14,6,9,Yes,2


In [12]:
# Replace values above the upper fence in TotalWorkExperienceInYears with the column median.
# The mask and the median are both computed before any value is overwritten.
total_exp_outliers = data['TotalWorkExperienceInYears'] > upper_bound
total_exp_median = np.median(data['TotalWorkExperienceInYears'])

data.loc[total_exp_outliers, 'TotalWorkExperienceInYears'] = total_exp_median
# `encoded_data` is an independent copy of `data` taken earlier, so the same rows must be
# patched there too; otherwise the outlier treatment never reaches the encoded/exported frame.
encoded_data.loc[total_exp_outliers, 'TotalWorkExperienceInYears'] = total_exp_median

In [13]:
# Sanity check: list any rows still above the upper fence after the replacement.
data[data['TotalWorkExperienceInYears'] > upper_bound]

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating


In [14]:
# ExperienceYearsAtThisCompany: 1.5 * IQR fences and the share of rows lying outside them.
tenure_col = 'ExperienceYearsAtThisCompany'
q1, q3 = (data[tenure_col].quantile(q) for q in (0.25, 0.75))
iqr = q3 - q1

lower_limit, upper_limit = q1 - 1.5 * iqr, q3 + 1.5 * iqr

outliers = (data[tenure_col] < lower_limit) | (data[tenure_col] > upper_limit)
# Percentage of all rows flagged as outliers.
outliers_percent = (outliers.sum() / (len(data))) * 100
outliers_percent

4.666666666666667

In [15]:
# Rows whose tenure at this company falls below the lower fence.
data[data['ExperienceYearsAtThisCompany'] < lower_limit]

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating


In [16]:
# Rows whose tenure at this company exceeds the upper fence.
data[data['ExperienceYearsAtThisCompany'] > upper_limit]

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
11,E1001024,47,Female,Medical,Divorced,Sales,Sales Executive,Travel_Frequently,3,3,...,4,28,2,2,22,2,11,13,No,3
25,E1001054,52,Male,Marketing,Married,Sales,Manager,Travel_Rarely,3,4,...,1,10,3,4,34,6,1,16,No,4
52,E1001098,47,Female,Other,Married,Development,Developer,Travel_Rarely,2,2,...,3,26,3,2,26,14,3,0,No,3
100,E1001183,50,Male,Life Sciences,Divorced,Development,Developer,Non-Travel,2,4,...,4,10,3,3,31,6,14,7,No,3
168,E1001304,39,Female,Life Sciences,Married,Research & Development,Manufacturing Director,Travel_Rarely,1,1,...,3,21,3,3,21,6,11,8,No,3
175,E1001313,41,Female,Life Sciences,Divorced,Research & Development,Research Director,Non-Travel,7,1,...,2,21,3,3,21,16,5,10,No,2
178,E1001316,52,Female,Marketing,Married,Sales,Manager,Travel_Rarely,2,1,...,4,10,3,3,32,14,6,9,Yes,2
181,E1001321,39,Female,Medical,Single,Research & Development,Manufacturing Director,Travel_Frequently,22,3,...,3,21,2,3,21,6,2,8,No,3
216,E1001387,42,Male,Medical,Married,Research & Development,Manager,Travel_Rarely,2,2,...,4,23,3,3,22,6,13,7,No,3


In [17]:
# Replace values above the upper fence in ExperienceYearsAtThisCompany with the column median.
# The mask and the median are both computed before any value is overwritten.
tenure_outliers = data['ExperienceYearsAtThisCompany'] > upper_limit
tenure_median = np.median(data['ExperienceYearsAtThisCompany'])

data.loc[tenure_outliers, 'ExperienceYearsAtThisCompany'] = tenure_median
# `encoded_data` is an independent copy of `data` taken earlier, so the same rows must be
# patched there too; otherwise the outlier treatment never reaches the encoded/exported frame.
encoded_data.loc[tenure_outliers, 'ExperienceYearsAtThisCompany'] = tenure_median

In [18]:
# Sanity check: list any rows still above the upper fence after the replacement.
data[data['ExperienceYearsAtThisCompany'] > upper_limit]

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating


In [20]:
# Print the name of every non-target column stored with object dtype,
# each followed by a separator line.
predictors = data.drop('PerformanceRating', axis=1)
object_columns = [name for name in predictors.columns if data[name].dtype == 'object']
for column in object_columns:
    print(column, "--------------------------------", sep="\n")

EmpNumber
--------------------------------
Gender
--------------------------------
EducationBackground
--------------------------------
MaritalStatus
--------------------------------
EmpDepartment
--------------------------------
EmpJobRole
--------------------------------
BusinessTravelFrequency
--------------------------------
OverTime
--------------------------------
Attrition
--------------------------------


In [21]:
# Binary features
# OverTime: class counts before encoding.
encoded_data['OverTime'].value_counts()

OverTime
No     847
Yes    353
Name: count, dtype: int64

In [22]:
# Encode OverTime as an integer flag: "No" -> 0, "Yes" -> 1.
# Note: values missing from the mapping become NaN, so this cell should run only once.
overtime_codes = {"No": 0, "Yes": 1}
encoded_data['OverTime'] = encoded_data['OverTime'].map(overtime_codes)

In [23]:
# OverTime class counts after encoding.
encoded_data['OverTime'].value_counts()

OverTime
0    847
1    353
Name: count, dtype: int64

In [24]:
# Encode Attrition as an integer flag: "No" -> 0, "Yes" -> 1.
# Note: values missing from the mapping become NaN, so this cell should run only once.
attrition_codes = {"No": 0, "Yes": 1}
encoded_data['Attrition'] = encoded_data['Attrition'].map(attrition_codes)

In [25]:
# Attrition class counts after encoding.
encoded_data['Attrition'].value_counts()

Attrition
0    1022
1     178
Name: count, dtype: int64

In [26]:
# Nominal features --> one-hot encoding
# Gender: class counts before encoding.
encoded_data['Gender'].value_counts()

Gender
Male      725
Female    475
Name: count, dtype: int64

In [27]:
# Gender has two levels; drop_first removes the first one, leaving a single indicator
# column that overwrites the original column (True = Male for this data).
gender_indicator = pd.get_dummies(encoded_data['Gender'], drop_first=True)
encoded_data['Gender'] = gender_indicator

In [28]:
# Gender class counts after encoding.
encoded_data['Gender'].value_counts()

Gender
True     725
False    475
Name: count, dtype: int64

In [29]:
# EducationBackground: class counts before encoding.
encoded_data['EducationBackground'].value_counts()

EducationBackground
Life Sciences       492
Medical             384
Marketing           137
Technical Degree    100
Other                66
Human Resources      21
Name: count, dtype: int64

In [30]:
# One-hot encode EducationBackground; the first level is dropped and serves as the baseline.
EducationBackground = pd.get_dummies(
    encoded_data['EducationBackground'],
    prefix='EducationBackground',
    drop_first=True,
)

In [31]:
# Append the EducationBackground indicator columns to the right of the frame.
encoded_data = pd.concat([encoded_data, EducationBackground], axis='columns')

In [32]:
# The original text column is no longer needed once its indicator columns exist.
encoded_data = encoded_data.drop(columns=['EducationBackground'])
encoded_data.head()

Unnamed: 0,EmpNumber,Age,Gender,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,...,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating,EducationBackground_Life Sciences,EducationBackground_Marketing,EducationBackground_Medical,EducationBackground_Other,EducationBackground_Technical Degree
0,E1001000,32,True,Single,Sales,Sales Executive,Travel_Rarely,10,3,4,...,7,0,8,0,3,False,True,False,False,False
1,E1001006,47,True,Single,Sales,Sales Executive,Travel_Rarely,14,4,4,...,7,1,7,0,3,False,True,False,False,False
2,E1001007,40,True,Married,Sales,Sales Executive,Travel_Frequently,5,4,4,...,13,1,12,0,4,True,False,False,False,False
3,E1001009,41,True,Divorced,Human Resources,Manager,Travel_Rarely,10,4,2,...,6,12,6,0,3,False,False,False,False,False
4,E1001010,60,True,Single,Sales,Sales Executive,Travel_Rarely,16,4,1,...,2,2,2,0,3,False,True,False,False,False


In [33]:
# MaritalStatus: class counts before encoding.
encoded_data['MaritalStatus'].value_counts()

MaritalStatus
Married     548
Single      384
Divorced    268
Name: count, dtype: int64

In [34]:
# One-hot encode MaritalStatus (first level dropped as the baseline)
# and append the indicator columns to the frame.
MaritalStatus = pd.get_dummies(
    encoded_data['MaritalStatus'],
    prefix='MaritalStatus',
    drop_first=True,
)
encoded_data = pd.concat([encoded_data, MaritalStatus], axis='columns')

In [35]:
# Remove the original MaritalStatus text column now that it is encoded.
encoded_data = encoded_data.drop(columns=['MaritalStatus'])
encoded_data.head()

Unnamed: 0,EmpNumber,Age,Gender,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,...,YearsWithCurrManager,Attrition,PerformanceRating,EducationBackground_Life Sciences,EducationBackground_Marketing,EducationBackground_Medical,EducationBackground_Other,EducationBackground_Technical Degree,MaritalStatus_Married,MaritalStatus_Single
0,E1001000,32,True,Sales,Sales Executive,Travel_Rarely,10,3,4,55,...,8,0,3,False,True,False,False,False,False,True
1,E1001006,47,True,Sales,Sales Executive,Travel_Rarely,14,4,4,42,...,7,0,3,False,True,False,False,False,False,True
2,E1001007,40,True,Sales,Sales Executive,Travel_Frequently,5,4,4,48,...,12,0,4,True,False,False,False,False,True,False
3,E1001009,41,True,Human Resources,Manager,Travel_Rarely,10,4,2,73,...,6,0,3,False,False,False,False,False,False,False
4,E1001010,60,True,Sales,Sales Executive,Travel_Rarely,16,4,1,84,...,2,0,3,False,True,False,False,False,False,True


In [36]:
# EmpDepartment: class counts before encoding.
encoded_data['EmpDepartment'].value_counts()

EmpDepartment
Sales                     373
Development               361
Research & Development    343
Human Resources            54
Finance                    49
Data Science               20
Name: count, dtype: int64

In [37]:
# One-hot encode EmpDepartment (first level dropped as the baseline)
# and append the indicator columns to the frame.
EmpDepartment = pd.get_dummies(
    encoded_data['EmpDepartment'],
    prefix='EmpDepartment',
    drop_first=True,
)
encoded_data = pd.concat([encoded_data, EmpDepartment], axis='columns')

In [38]:
# Remove the original EmpDepartment text column now that it is encoded.
encoded_data = encoded_data.drop(columns=['EmpDepartment'])
encoded_data.head()

Unnamed: 0,EmpNumber,Age,Gender,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,EmpJobInvolvement,...,EducationBackground_Medical,EducationBackground_Other,EducationBackground_Technical Degree,MaritalStatus_Married,MaritalStatus_Single,EmpDepartment_Development,EmpDepartment_Finance,EmpDepartment_Human Resources,EmpDepartment_Research & Development,EmpDepartment_Sales
0,E1001000,32,True,Sales Executive,Travel_Rarely,10,3,4,55,3,...,False,False,False,False,True,False,False,False,False,True
1,E1001006,47,True,Sales Executive,Travel_Rarely,14,4,4,42,3,...,False,False,False,False,True,False,False,False,False,True
2,E1001007,40,True,Sales Executive,Travel_Frequently,5,4,4,48,2,...,False,False,False,True,False,False,False,False,False,True
3,E1001009,41,True,Manager,Travel_Rarely,10,4,2,73,2,...,False,False,False,False,False,False,False,True,False,False
4,E1001010,60,True,Sales Executive,Travel_Rarely,16,4,1,84,3,...,False,False,False,False,True,False,False,False,False,True


In [39]:
# EmpJobRole: class counts before encoding.
encoded_data['EmpJobRole'].value_counts()

EmpJobRole
Sales Executive              270
Developer                    236
Manager R&D                   94
Research Scientist            77
Sales Representative          69
Laboratory Technician         64
Senior Developer              52
Manager                       51
Finance Manager               49
Human Resources               45
Technical Lead                38
Manufacturing Director        33
Healthcare Representative     33
Data Scientist                20
Research Director             19
Business Analyst              16
Senior Manager R&D            15
Delivery Manager              12
Technical Architect            7
Name: count, dtype: int64

In [40]:
# One-hot encode EmpJobRole (first level dropped as the baseline)
# and append the indicator columns to the frame.
EmpJobRole = pd.get_dummies(
    encoded_data['EmpJobRole'],
    prefix='EmpJobRole',
    drop_first=True,
)
encoded_data = pd.concat([encoded_data, EmpJobRole], axis='columns')

In [41]:
# Remove the original EmpJobRole text column now that it is encoded.
encoded_data = encoded_data.drop(columns=['EmpJobRole'])
encoded_data.head()

Unnamed: 0,EmpNumber,Age,Gender,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,EmpJobInvolvement,EmpJobLevel,...,EmpJobRole_Manager R&D,EmpJobRole_Manufacturing Director,EmpJobRole_Research Director,EmpJobRole_Research Scientist,EmpJobRole_Sales Executive,EmpJobRole_Sales Representative,EmpJobRole_Senior Developer,EmpJobRole_Senior Manager R&D,EmpJobRole_Technical Architect,EmpJobRole_Technical Lead
0,E1001000,32,True,Travel_Rarely,10,3,4,55,3,2,...,False,False,False,False,True,False,False,False,False,False
1,E1001006,47,True,Travel_Rarely,14,4,4,42,3,2,...,False,False,False,False,True,False,False,False,False,False
2,E1001007,40,True,Travel_Frequently,5,4,4,48,2,3,...,False,False,False,False,True,False,False,False,False,False
3,E1001009,41,True,Travel_Rarely,10,4,2,73,2,5,...,False,False,False,False,False,False,False,False,False,False
4,E1001010,60,True,Travel_Rarely,16,4,1,84,3,2,...,False,False,False,False,True,False,False,False,False,False


In [42]:
# BusinessTravelFrequency: class counts before encoding.
encoded_data['BusinessTravelFrequency'].value_counts()

BusinessTravelFrequency
Travel_Rarely        846
Travel_Frequently    222
Non-Travel           132
Name: count, dtype: int64

In [43]:
# One-hot encode BusinessTravelFrequency (first level dropped as the baseline)
# and append the indicator columns to the frame.
BusinessTravelFrequency = pd.get_dummies(
    encoded_data['BusinessTravelFrequency'],
    prefix='BusinessTravelFrequency',
    drop_first=True,
)
encoded_data = pd.concat([encoded_data, BusinessTravelFrequency], axis='columns')

In [44]:
# Remove the original BusinessTravelFrequency text column now that it is encoded.
encoded_data = encoded_data.drop(columns=['BusinessTravelFrequency'])
encoded_data.head()

Unnamed: 0,EmpNumber,Age,Gender,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,EmpJobInvolvement,EmpJobLevel,EmpJobSatisfaction,...,EmpJobRole_Research Director,EmpJobRole_Research Scientist,EmpJobRole_Sales Executive,EmpJobRole_Sales Representative,EmpJobRole_Senior Developer,EmpJobRole_Senior Manager R&D,EmpJobRole_Technical Architect,EmpJobRole_Technical Lead,BusinessTravelFrequency_Travel_Frequently,BusinessTravelFrequency_Travel_Rarely
0,E1001000,32,True,10,3,4,55,3,2,4,...,False,False,True,False,False,False,False,False,False,True
1,E1001006,47,True,14,4,4,42,3,2,1,...,False,False,True,False,False,False,False,False,False,True
2,E1001007,40,True,5,4,4,48,2,3,1,...,False,False,True,False,False,False,False,False,True,False
3,E1001009,41,True,10,4,2,73,2,5,4,...,False,False,False,False,False,False,False,False,False,True
4,E1001010,60,True,16,4,1,84,3,2,1,...,False,False,True,False,False,False,False,False,False,True


In [45]:
# Check the data types after encoding: every column other than EmpNumber
# (which has not been dropped yet at this point) should now be numeric or boolean.

encoded_data.dtypes

EmpNumber                                    object
Age                                           int64
Gender                                         bool
DistanceFromHome                              int64
EmpEducationLevel                             int64
EmpEnvironmentSatisfaction                    int64
EmpHourlyRate                                 int64
EmpJobInvolvement                             int64
EmpJobLevel                                   int64
EmpJobSatisfaction                            int64
NumCompaniesWorked                            int64
OverTime                                      int64
EmpLastSalaryHikePercent                      int64
EmpRelationshipSatisfaction                   int64
TotalWorkExperienceInYears                    int64
TrainingTimesLastYear                         int64
EmpWorkLifeBalance                            int64
ExperienceYearsAtThisCompany                  int64
ExperienceYearsInCurrentRole                  int64
YearsSinceLa

In [46]:
# EmpNumber is a unique identifier for each employee, so it is left out of the feature set.
encoded_data = encoded_data.drop(columns=['EmpNumber'])
encoded_data.columns

Index(['Age', 'Gender', 'DistanceFromHome', 'EmpEducationLevel',
       'EmpEnvironmentSatisfaction', 'EmpHourlyRate', 'EmpJobInvolvement',
       'EmpJobLevel', 'EmpJobSatisfaction', 'NumCompaniesWorked', 'OverTime',
       'EmpLastSalaryHikePercent', 'EmpRelationshipSatisfaction',
       'TotalWorkExperienceInYears', 'TrainingTimesLastYear',
       'EmpWorkLifeBalance', 'ExperienceYearsAtThisCompany',
       'ExperienceYearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Attrition', 'PerformanceRating',
       'EducationBackground_Life Sciences', 'EducationBackground_Marketing',
       'EducationBackground_Medical', 'EducationBackground_Other',
       'EducationBackground_Technical Degree', 'MaritalStatus_Married',
       'MaritalStatus_Single', 'EmpDepartment_Development',
       'EmpDepartment_Finance', 'EmpDepartment_Human Resources',
       'EmpDepartment_Research & Development', 'EmpDepartment_Sales',
       'EmpJobRole_Data Scientist', 'EmpJobRole_Del

In [47]:
# Display the fully encoded frame (pandas truncates the middle rows and columns in the rendered view).
encoded_data

Unnamed: 0,Age,Gender,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,EmpJobInvolvement,EmpJobLevel,EmpJobSatisfaction,NumCompaniesWorked,...,EmpJobRole_Research Director,EmpJobRole_Research Scientist,EmpJobRole_Sales Executive,EmpJobRole_Sales Representative,EmpJobRole_Senior Developer,EmpJobRole_Senior Manager R&D,EmpJobRole_Technical Architect,EmpJobRole_Technical Lead,BusinessTravelFrequency_Travel_Frequently,BusinessTravelFrequency_Travel_Rarely
0,32,True,10,3,4,55,3,2,4,1,...,False,False,True,False,False,False,False,False,False,True
1,47,True,14,4,4,42,3,2,1,2,...,False,False,True,False,False,False,False,False,False,True
2,40,True,5,4,4,48,2,3,1,5,...,False,False,True,False,False,False,False,False,True,False
3,41,True,10,4,2,73,2,5,4,3,...,False,False,False,False,False,False,False,False,False,True
4,60,True,16,4,1,84,3,2,1,8,...,False,False,True,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,27,False,3,1,4,71,4,2,4,1,...,False,False,True,False,False,False,False,False,True,False
1196,37,True,10,2,4,80,4,1,4,3,...,False,False,False,False,True,False,False,False,False,True
1197,50,True,28,1,4,74,4,1,3,1,...,False,False,False,False,True,False,False,False,False,True
1198,34,False,9,3,4,46,2,3,2,1,...,False,False,False,False,False,False,False,False,False,True


In [49]:
# Write the encoded frame to both locations: next to this notebook and in the models folder.
# The row index is written as the first (unnamed) column of each file.
for export_path in ('./encoded_data.csv', '../models/encoded_data.csv'):
    encoded_data.to_csv(export_path)