## IMPORTING NECESSARY LIBRARIES

In [39]:
import numpy as np
import pandas as  pd
from sklearn.model_selection import train_test_split
from dateutil.relativedelta import relativedelta

## LOADING DATASET

In [2]:
data = pd.read_excel('employee_burnout_analysis.xlsx')

## DATA OVERVIEW

In [3]:
data.head()


Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1,1.0,2.6,0.2
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3,7.0,6.9,0.52


In [4]:
data.tail()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
22745,fffe31003500370039003100,2008-12-30,Female,Service,No,1,3.0,,0.41
22746,fffe33003000350031003800,2008-01-19,Female,Product,Yes,3,6.0,6.7,0.59
22747,fffe390032003000,2008-11-05,Male,Service,Yes,3,7.0,,0.72
22748,fffe33003300320036003900,2008-01-10,Female,Service,No,2,5.0,5.9,0.52
22749,fffe3400350031003800,2008-01-06,Male,Product,No,3,6.0,7.8,0.61


In [5]:
data.describe()

Unnamed: 0,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
count,22750.0,21369.0,20633.0,21626.0
mean,2.178725,4.481398,5.728188,0.452005
std,1.135145,2.047211,1.920839,0.198226
min,0.0,1.0,0.0,0.0
25%,1.0,3.0,4.6,0.31
50%,2.0,4.0,5.9,0.45
75%,3.0,6.0,7.1,0.59
max,5.0,10.0,10.0,1.0


In [6]:
data.shape

(22750, 9)

In [7]:
data.size

204750

In [8]:
data.columns.tolist()

['Employee ID',
 'Date of Joining',
 'Gender',
 'Company Type',
 'WFH Setup Available',
 'Designation',
 'Resource Allocation',
 'Mental Fatigue Score',
 'Burn Rate']

In [9]:
data.nunique()

Employee ID             22750
Date of Joining           366
Gender                      2
Company Type                2
WFH Setup Available         2
Designation                 6
Resource Allocation        10
Mental Fatigue Score      101
Burn Rate                 101
dtype: int64

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22750 entries, 0 to 22749
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Employee ID           22750 non-null  object        
 1   Date of Joining       22750 non-null  datetime64[ns]
 2   Gender                22750 non-null  object        
 3   Company Type          22750 non-null  object        
 4   WFH Setup Available   22750 non-null  object        
 5   Designation           22750 non-null  int64         
 6   Resource Allocation   21369 non-null  float64       
 7   Mental Fatigue Score  20633 non-null  float64       
 8   Burn Rate             21626 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 1.6+ MB


In [11]:
data.isnull().sum()

Employee ID                0
Date of Joining            0
Gender                     0
Company Type               0
WFH Setup Available        0
Designation                0
Resource Allocation     1381
Mental Fatigue Score    2117
Burn Rate               1124
dtype: int64

In [12]:
data.isnull().sum().values.sum()

4622

## DATA PREPROCESSING

In [13]:
#Check for NULL values in dataset
null_rows = data.isnull().any(axis = 1)

In [15]:
#Deleting NULL values
data_notnull = data[~null_rows]

In [16]:
#Saving the cleaned dataset to csv file  
data_notnull.to_csv("Burnoutanalysis.csv", index = False)

In [17]:
#Loading the recently saved dataset
new_data = pd.read_csv("Burnoutanalysis.csv")

In [18]:
#check the dataset whether the NULL values are present or not
new_data.isnull().sum()

Employee ID             0
Date of Joining         0
Gender                  0
Company Type            0
WFH Setup Available     0
Designation             0
Resource Allocation     0
Mental Fatigue Score    0
Burn Rate               0
dtype: int64

In [19]:
new_data.isnull().sum().values.sum()

0

Here we done with the data preprocessing step. Now the data is ready to build a model

In [22]:
#dropping the column employee id because it doesn't have any importance in prediction
data.drop('Employee ID', axis = 1)

Unnamed: 0,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,2008-09-30,Female,Service,No,2,3.0,3.8,0.16
1,2008-11-30,Male,Service,Yes,1,2.0,5.0,0.36
2,2008-03-10,Female,Product,Yes,2,,5.8,0.49
3,2008-11-03,Male,Service,Yes,1,1.0,2.6,0.20
4,2008-07-24,Female,Service,No,3,7.0,6.9,0.52
...,...,...,...,...,...,...,...,...
22745,2008-12-30,Female,Service,No,1,3.0,,0.41
22746,2008-01-19,Female,Product,Yes,3,6.0,6.7,0.59
22747,2008-11-05,Male,Service,Yes,3,7.0,,0.72
22748,2008-01-10,Female,Service,No,2,5.0,5.9,0.52
