In [1]:
# Importing all the necessary libraries
import pandas as pd
import numpy as np
import sklearn
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tpot import TPOTRegressor
import warnings
warnings.filterwarnings('ignore')

### Data reading part 

In [2]:
train = pd.read_csv("HR_Analytics_Train_Data.csv")

### Preprocessing

In [3]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [4]:
# Renaming 
train["KPIs_met"]=train["KPIs_met >80%"]
train = train.drop("KPIs_met >80%", axis=1)
train["awards_won"]=train["awards_won?"]
train = train.drop("awards_won?", axis=1)

In [5]:
# Keeping only important columns
train = train[["department", "education", "no_of_trainings", "previous_year_rating", "length_of_service", "KPIs_met", "awards_won", "avg_training_score", "is_promoted"]]

In [6]:
train.shape

(54808, 9)

In [7]:
train.isnull().sum()

department                 0
education               2409
no_of_trainings            0
previous_year_rating    4124
length_of_service          0
KPIs_met                   0
awards_won                 0
avg_training_score         0
is_promoted                0
dtype: int64

In [8]:
# I assume that they are Freshers i.e length_of_service = 1
# Therefore I'll impute the null of previous_year_rating with "0"
train["previous_year_rating"]=train["previous_year_rating"].fillna(0)
train["previous_year_rating"]=train["previous_year_rating"].astype(int)

In [9]:
# Also using ffill and bfill for education
# Bfill(backward-fill) propagates the first observed non-null value backward until another non-null value is met.
# While ffill(forward-fill) propagates the last observed non-null value forward until another non-null value is encountered.
train["education"]=train["education"].ffill(axis=0)
train["education"]=train["education"].bfill(axis=0)

### Feature Engineering

In [10]:
train['department'].value_counts()

Sales & Marketing    16840
Operations           11348
Technology            7138
Procurement           7138
Analytics             5352
Finance               2536
HR                    2418
Legal                 1039
R&D                    999
Name: department, dtype: int64

In [11]:
#Label Encoding department Column
le_department = LabelEncoder()
train['department'] = le_department.fit_transform(train['department'])

In [12]:
train['education'].value_counts()

Bachelor's          38390
Master's & above    15579
Below Secondary       839
Name: education, dtype: int64

In [13]:
def clean_education(x):
    if "Bachelor's" in x:
        return 'Bachelors degree'
    if "Master's & above" in x:
        return 'Masters degree'
    return 'Less than a Bachelors'

train['education'] = train['education'].apply(clean_education)

In [14]:
#Label Encoding education Column
le_education = LabelEncoder()
train['education'] = le_education.fit_transform(train['education'])

### Modeling

In [15]:
# Creating X and y df for training the model
X = train.drop("is_promoted", axis=1)
y = train["is_promoted"]

In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   department            54808 non-null  int32
 1   education             54808 non-null  int32
 2   no_of_trainings       54808 non-null  int64
 3   previous_year_rating  54808 non-null  int32
 4   length_of_service     54808 non-null  int64
 5   KPIs_met              54808 non-null  int64
 6   awards_won            54808 non-null  int64
 7   avg_training_score    54808 non-null  int64
 8   is_promoted           54808 non-null  int64
dtypes: int32(3), int64(6)
memory usage: 3.1 MB


In [17]:
# Using TPOT
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X, y)

DecisionTreeRegressor(random_state=0)

In [18]:
y_pred = dec_tree_reg.predict(X)
y_pred=y_pred.astype(int)

In [19]:
error = np.sqrt(mean_squared_error(y, y_pred))

In [20]:
error

0.19221644310098548

In [21]:
X

Unnamed: 0,department,education,no_of_trainings,previous_year_rating,length_of_service,KPIs_met,awards_won,avg_training_score
0,7,2,1,5,8,1,0,49
1,4,0,1,5,4,0,0,60
2,7,0,1,3,7,0,0,50
3,7,0,2,1,10,0,0,50
4,8,0,1,3,2,0,0,73
...,...,...,...,...,...,...,...,...
54803,8,0,1,3,17,0,0,78
54804,4,2,1,2,6,0,0,56
54805,0,0,1,5,3,1,0,79
54806,7,0,1,1,2,0,0,45


In [22]:
# Checking with manual inputs
X = np.array([['Technology','Bachelors degree',1,5,1,0,0,50]])
X

array([['Technology', 'Bachelors degree', '1', '5', '1', '0', '0', '50']],
      dtype='<U16')

In [23]:
X[:, 0] = le_department.transform(X[:,0])
X[:, 1] = le_education.transform(X[:,1])
X = X.astype(int)
X

array([[ 8,  0,  1,  5,  1,  0,  0, 50]])

In [24]:
y_pred = dec_tree_reg.predict(X)
y_pred=y_pred.astype(int)
y_pred

array([0])

In [25]:
data = {"model": dec_tree_reg, "le_department": le_department, "le_education": le_education}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [26]:
with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

regressor_loaded = data["model"]
le_department = data["le_department"]
le_education = data["le_education"]

In [27]:
y_pred = regressor_loaded.predict(X)
y_pred=y_pred.astype(int)
y_pred

array([0])