# HR Prediction

## Feature engineering

In [163]:
import pandas as pd
import numpy as np

In [164]:
import warnings

# Let wide frames display up to 100 columns instead of being truncated.
pd.options.display.max_columns = 100
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [165]:
# Load the HR dataset from 'hr_data.csv' (a relative path, so it is resolved against the working directory).
data_frame=pd.read_csv('hr_data.csv')

In [166]:
# Preview the first rows of the freshly loaded data.
data_frame.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [167]:
# Count the missing entries in each column.
data_frame.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [168]:
# Empty containers for the column names of each feature kind.
categorical_features, numerical_features = [], []

In [169]:
# Split the column names into numerical and categorical features.
# Both lists are reset here so that re-running this cell does not append
# every column a second time.
categorical_features = []
numerical_features = []
for cols in data_frame.columns:
    # Anything that is not a numeric dtype is treated as categorical; this
    # also covers string columns that are not stored with the object dtype.
    if pd.api.types.is_numeric_dtype(data_frame[cols]):
        numerical_features.append(cols)
    else:
        categorical_features.append(cols)

In [170]:
# Transforming categorical features into numerical features.

In [171]:
# Report the cardinality (count of distinct values) of every categorical feature.
for feature in categorical_features:
    distinct_count = data_frame[feature].unique().size
    print(feature,' has ',distinct_count,' unique values')

Attrition  has  2  unique values
BusinessTravel  has  3  unique values
Department  has  3  unique values
EducationField  has  6  unique values
Gender  has  2  unique values
JobRole  has  9  unique values
MaritalStatus  has  3  unique values
Over18  has  1  unique values
OverTime  has  2  unique values


In [172]:
# EducationField and JobRole are replaced by integer codes ranked by how often
# each category occurs: the rarest category becomes 0 and the most frequent
# one gets the highest code.
label_en=['EducationField','JobRole']
for feature in label_en:
    categories_by_frequency = data_frame[feature].value_counts(ascending=True).index
    code_for_category = dict(
        zip(categories_by_frequency, range(len(categories_by_frequency)))
    )
    data_frame[feature] = data_frame[feature].map(code_for_category)

In [173]:
# The rest of the categorical features are converted to numerical columns
# using one-hot encoding. The rank-encoded features in label_en are skipped,
# and so are Attrition and OverTime, which are not one-hot encoded here.
for cols in categorical_features:
    if cols in label_en or cols in ('Attrition', 'OverTime'):
        continue
    if cols not in data_frame.columns:
        # Already encoded and dropped by an earlier run of this cell.
        continue
    # drop_first=True leaves out the first category's dummy column, so a
    # feature with a single category contributes no columns at all.
    # dtype='uint8' keeps the dummies as 0/1 integers on every pandas
    # version (newer releases default to booleans).
    new_data = pd.get_dummies(data_frame[cols], drop_first=True, dtype='uint8')
    data_frame = pd.concat([data_frame, new_data], axis=1).drop(columns=[cols])

In [174]:
# Encode the Yes/No columns as 1/0.
# The result is assigned back to the frame rather than calling
# replace(..., inplace=True) on a column selection: that chained in-place
# form does not update data_frame when pandas Copy-on-Write is active.
yes_no_codes = {'Yes': 1, 'No': 0}
for binary_col in ('Attrition', 'OverTime'):
    # Values outside the mapping (including already-encoded 0/1 on a re-run)
    # are passed through unchanged.
    data_frame[binary_col] = data_frame[binary_col].map(
        lambda value: yes_no_codes.get(value, value)
    )

In [175]:
# Preview the frame after the categorical columns have been encoded.
data_frame.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Travel_Frequently,Travel_Rarely,Research & Development,Sales,Male,Married,Single
0,41,1,1102,1,2,5,1,1,2,94,3,2,8,4,5993,19479,8,1,11,3,1,80,0,8,0,1,6,4,0,5,0,1,0,1,0,0,1
1,49,0,279,8,1,5,1,2,3,61,2,2,7,2,5130,24907,1,0,23,4,4,80,1,10,3,3,10,7,1,7,1,0,1,0,1,1,0
2,37,1,1373,2,2,1,1,4,4,92,2,1,6,3,2090,2396,6,1,15,3,2,80,0,7,3,3,0,0,0,0,0,1,1,0,1,0,1
3,33,0,1392,3,4,5,1,5,4,56,3,1,7,3,2909,23159,1,1,11,3,3,80,0,8,3,3,8,7,3,0,1,0,1,0,0,1,0
4,27,0,591,2,1,4,1,7,1,40,3,1,6,2,3468,16632,9,0,12,3,4,80,1,6,3,3,2,2,2,2,0,1,1,0,1,1,0


In [176]:
# Transformation of continuous features into log values.

In [177]:
# Log-transform the numerical features that look continuous (more than 25
# distinct values). Only strictly positive columns are transformed: np.log
# yields -inf for 0 and NaN for negative values.
# NOTE(review): this overwrites the columns in place, so running the cell
# twice applies the log twice; re-run the notebook from the load cell instead.
# NOTE(review): EmployeeNumber passes both checks and is transformed as well;
# it looks like an identifier rather than a measurement - confirm it belongs here.
for cols in numerical_features:
    column = data_frame[cols]
    looks_continuous = column.unique().size > 25
    if looks_continuous and column.min() > 0:
        data_frame[cols] = np.log(column)

In [178]:
# Preview the frame after the log transform.
data_frame.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Travel_Frequently,Travel_Rarely,Research & Development,Sales,Male,Married,Single
0,3.713572,1,7.004882,0.0,2,5,1,0.0,2,4.543295,3,2,8,4,8.698347,9.877092,8,1,11,3,1,80,0,8,0,1,6,4,0,5,0,1,0,1,0,0,1
1,3.89182,0,5.631212,2.079442,1,5,1,0.693147,3,4.110874,2,2,7,2,8.542861,10.122904,1,0,23,4,4,80,1,10,3,3,10,7,1,7,1,0,1,0,1,1,0
2,3.610918,1,7.224753,0.693147,2,1,1,1.386294,4,4.521789,2,1,6,3,7.644919,7.781556,6,1,15,3,2,80,0,7,3,3,0,0,0,0,0,1,1,0,1,0,1
3,3.496508,0,7.238497,1.098612,4,5,1,1.609438,4,4.025352,3,1,7,3,7.975565,10.050139,1,1,11,3,3,80,0,8,3,3,8,7,3,0,1,0,1,0,0,1,0
4,3.295837,0,6.381816,0.693147,1,4,1,1.94591,1,3.688879,3,1,6,2,8.151333,9.719084,9,0,12,3,4,80,1,6,3,3,2,2,2,2,0,1,1,0,1,1,0


In [179]:
#transforming all features using minmaxscaler.

In [180]:
from sklearn.preprocessing import MinMaxScaler

In [181]:
# Create a min-max scaler with its default settings.
scaler=MinMaxScaler()

In [182]:
# Fit the scaler on the entire frame. data_frame still contains the Attrition
# column at this point, so it is scaled together with the features.
# NOTE(review): if a train/test split happens later, fitting on all rows here
# lets test-set statistics influence the scaling - confirm this is intended.
scaler.fit(data_frame)

MinMaxScaler()

In [183]:
# Apply the fitted scaler and wrap the resulting array back into a DataFrame.
# Column labels and the row index are both carried over from data_frame so
# the scaled rows stay aligned with the originals.
final_data=pd.DataFrame(
    scaler.transform(data_frame),
    columns=data_frame.columns,
    index=data_frame.index,
)

In [184]:
# Preview the scaled frame.
final_data.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Travel_Frequently,Travel_Rarely,Research & Development,Sales,Male,Married,Single
0,0.683737,1.0,0.885521,0.0,0.25,1.0,0.0,0.0,0.333333,0.948607,0.666667,0.25,1.0,1.0,0.596518,0.872312,0.888889,1.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.15,0.222222,0.0,0.294118,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,0.831787,0.0,0.374403,0.61754,0.0,1.0,0.0,0.090793,0.666667,0.589446,0.333333,0.25,0.875,0.333333,0.544458,0.968455,0.111111,0.0,0.857143,1.0,1.0,0.0,0.333333,0.25,0.5,0.666667,0.25,0.388889,0.066667,0.411765,1.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0.598474,1.0,0.967331,0.205847,0.25,0.2,0.0,0.181587,1.0,0.930745,0.333333,0.0,0.75,0.666667,0.243814,0.052694,0.666667,1.0,0.285714,0.0,0.333333,0.0,0.0,0.175,0.5,0.666667,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
3,0.503446,0.0,0.972445,0.32626,0.75,1.0,0.0,0.210816,1.0,0.518412,0.666667,0.0,0.875,0.666667,0.354519,0.939995,0.111111,1.0,0.0,0.0,0.666667,0.0,0.0,0.2,0.5,0.666667,0.2,0.388889,0.2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.336773,0.0,0.653689,0.205847,0.0,0.8,0.0,0.254889,0.0,0.238944,0.666667,0.0,0.75,0.333333,0.413369,0.810511,1.0,0.0,0.071429,0.0,1.0,0.0,0.333333,0.15,0.5,0.666667,0.05,0.111111,0.133333,0.117647,0.0,1.0,1.0,0.0,1.0,1.0,0.0


In [185]:
# Write the scaled frame to 'modified_data.csv'; index=False leaves the row index out of the file.
final_data.to_csv('modified_data.csv',index=False)