In [1]:
# Dataset url - https://www.kaggle.com/datasets/ravindrasinghrana/employeedataset
import pandas as pd
import numpy as np

In [2]:
employeeData = pd.read_csv('employee_data.csv')
engagementData = pd.read_csv('employee_engagement_survey_data.csv')
recruitmentData = pd.read_csv('recruitment_data.csv')
trainingData = pd.read_csv('training_and_development_data.csv')

In [3]:
pd.set_option('display.max_rows', 10)

In [4]:
engagementData.dtypes

Employee ID                 int64
Survey Date                object
Engagement Score            int64
Satisfaction Score          int64
Work-Life Balance Score     int64
dtype: object

In [5]:
# convert Survey Date to date
engagementData['Survey Date'] = pd.to_datetime(engagementData['Survey Date'], format='%d-%m-%Y')

In [6]:
# count employee surveyed by year
engagementData.groupby(engagementData['Survey Date'].dt.year).size().reset_index(name='Total Participants')

Unnamed: 0,Survey Date,Total Participants
0,2022,1187
1,2023,1813


In [7]:
# find the date when the last survey was done
engagementData['Survey Date'].max()

Timestamp('2023-08-05 00:00:00')

In [8]:
# find the date when first survey was done
engagementData['Survey Date'].min()

Timestamp('2022-08-05 00:00:00')

In [9]:
# find gender wise engagement score of employees
engagementData.merge(
    employeeData, how='inner', left_on='Employee ID', right_on='EmpID'
).groupby('GenderCode').size().reset_index(name = 'Total Participants')

Unnamed: 0,GenderCode,Total Participants
0,Female,1682
1,Male,1318


In [10]:
# group work life balance on division
engagementData.merge(
    employeeData, how='inner', left_on='Employee ID', right_on='EmpID'
).groupby(['Division', 'Work-Life Balance Score']).size().reset_index(name = 'Total Participants')

Unnamed: 0,Division,Work-Life Balance Score,Total Participants
0,Aerial,1,37
1,Aerial,2,36
2,Aerial,3,50
3,Aerial,4,45
4,Aerial,5,28
...,...,...,...
114,Yard (Material Handling),1,16
115,Yard (Material Handling),2,10
116,Yard (Material Handling),3,12
117,Yard (Material Handling),4,11


In [11]:
# find out which department employees have the best work life balance
engagementData.merge(
    employeeData, how='inner', left_on='Employee ID', right_on='EmpID'
).groupby(
    ['DepartmentType', 'Work-Life Balance Score']
).size().reset_index(name = 'Total Participants').sort_values(by=['Work-Life Balance Score', 'Total Participants'], ascending=[False, False]).head(1)

Unnamed: 0,DepartmentType,Work-Life Balance Score,Total Participants
19,Production,5,379


In [12]:
# grouping on satisfaction score and calculating percentage
groupedDF = engagementData.groupby('Satisfaction Score').size().reset_index(name='No of Participants')
groupedDF['Percentage(%)'] = round((groupedDF['No of Participants']*100)/(groupedDF['No of Participants'].sum()), 2)
groupedDF

Unnamed: 0,Satisfaction Score,No of Participants,Percentage(%)
0,1,592,19.73
1,2,574,19.13
2,3,604,20.13
3,4,636,21.2
4,5,594,19.8


In [13]:
# what is the average engagement score in finance and Accounting division
groupedDF = engagementData.merge(
    employeeData, how='inner', left_on='Employee ID', right_on='EmpID'
)
round(groupedDF.loc[groupedDF['Division']=='Finance & Accounting']['Engagement Score'].mean(), 2)

2.84

In [14]:
employeeData.columns

Index(['EmpID', 'FirstName', 'LastName', 'StartDate', 'ExitDate', 'Title',
       'Supervisor', 'ADEmail', 'BusinessUnit', 'EmployeeStatus',
       'EmployeeType', 'PayZone', 'EmployeeClassificationType',
       'TerminationType', 'TerminationDescription', 'DepartmentType',
       'Division', 'DOB', 'State', 'JobFunctionDescription', 'GenderCode',
       'LocationCode', 'RaceDesc', 'MaritalDesc', 'Performance Score',
       'Current Employee Rating'],
      dtype='object')

In [15]:
employeeData.head(2)

Unnamed: 0,EmpID,FirstName,LastName,StartDate,ExitDate,Title,Supervisor,ADEmail,BusinessUnit,EmployeeStatus,...,Division,DOB,State,JobFunctionDescription,GenderCode,LocationCode,RaceDesc,MaritalDesc,Performance Score,Current Employee Rating
0,3427,Uriah,Bridges,20-Sep-19,,Production Technician I,Peter Oneill,uriah.bridges@bilearner.com,CCDR,Active,...,Finance & Accounting,07-10-1969,MA,Accounting,Female,34904,White,Widowed,Fully Meets,4
1,3428,Paula,Small,11-Feb-23,,Production Technician I,Renee Mccormick,paula.small@bilearner.com,EW,Active,...,Aerial,30-08-1965,MA,Labor,Male,6593,Hispanic,Widowed,Fully Meets,3


In [16]:
employeeData.dtypes

EmpID                       int64
FirstName                  object
LastName                   object
StartDate                  object
ExitDate                   object
                            ...  
LocationCode                int64
RaceDesc                   object
MaritalDesc                object
Performance Score          object
Current Employee Rating     int64
Length: 26, dtype: object

In [17]:
employeeData['StartDate'] = pd.to_datetime(employeeData['StartDate'], format='%d-%b-%y')

In [18]:
# which employees have a performance score of fully meets and and engagement score higher than 4
groupedDF = engagementData.merge(
    employeeData, how='inner', left_on='Employee ID', right_on='EmpID'
)
groupedDF.loc[(groupedDF['Performance Score'] == 'Fully Meets') & (groupedDF['Engagement Score'] > 4)][
['Employee ID', 'FirstName', 'LastName', 'Performance Score', 'Engagement Score']
]

Unnamed: 0,Employee ID,FirstName,LastName,Performance Score,Engagement Score
5,1006,Colby,Andreola,Fully Meets,5
7,1008,Judith,Carabbio,Fully Meets,5
12,1013,Jasmin,Shah,Fully Meets,5
20,1021,Joe,Fletcher,Fully Meets,5
23,1024,Kasey,Boyer,Fully Meets,5
...,...,...,...,...,...
2971,3972,Eleanor,Patton,Fully Meets,5
2983,3984,Christine,Skinner,Fully Meets,5
2984,3985,Zayne,Mccullough,Fully Meets,5
2987,3988,Kenzie,Mullins,Fully Meets,5


In [19]:
# what is the average work life balance score of employees who started in 2023
groupedDF = engagementData.merge(
    employeeData, how='inner', left_on='Employee ID', right_on='EmpID'
)
groupedDF.loc[groupedDF['StartDate'].dt.year == 2023]['Work-Life Balance Score'].mean()

2.8597014925373134

In [20]:
# What is the trend of engagement score over time for each business unit
engagementData.merge(
    employeeData, 
    how='inner', 
    left_on='Employee ID', 
    right_on='EmpID'
).groupby(['BusinessUnit', 'Engagement Score']).size().reset_index(name='No of Employee')

Unnamed: 0,BusinessUnit,Engagement Score,No of Employee
0,BPC,1,69
1,BPC,2,62
2,BPC,3,51
3,BPC,4,70
4,BPC,5,51
...,...,...,...
45,WBL,1,67
46,WBL,2,61
47,WBL,3,44
48,WBL,4,58


In [21]:
employeeData.head(2)

Unnamed: 0,EmpID,FirstName,LastName,StartDate,ExitDate,Title,Supervisor,ADEmail,BusinessUnit,EmployeeStatus,...,Division,DOB,State,JobFunctionDescription,GenderCode,LocationCode,RaceDesc,MaritalDesc,Performance Score,Current Employee Rating
0,3427,Uriah,Bridges,2019-09-20,,Production Technician I,Peter Oneill,uriah.bridges@bilearner.com,CCDR,Active,...,Finance & Accounting,07-10-1969,MA,Accounting,Female,34904,White,Widowed,Fully Meets,4
1,3428,Paula,Small,2023-02-11,,Production Technician I,Renee Mccormick,paula.small@bilearner.com,EW,Active,...,Aerial,30-08-1965,MA,Labor,Male,6593,Hispanic,Widowed,Fully Meets,3


In [22]:
# what is the average performance score by race and gender
engagementData.merge(
    employeeData, 
    how='inner', 
    left_on='Employee ID', 
    right_on='EmpID'
).groupby(['RaceDesc', 'GenderCode'])[['Satisfaction Score', 'Current Employee Rating']].mean().reset_index()

Unnamed: 0,RaceDesc,GenderCode,Satisfaction Score,Current Employee Rating
0,Asian,Female,2.988439,2.976879
1,Asian,Male,3.236749,2.918728
2,Black,Female,2.927746,2.953757
3,Black,Male,2.944853,2.966912
4,Hispanic,Female,3.095385,2.944615
5,Hispanic,Male,2.951417,2.991903
6,Other,Female,3.097484,2.993711
7,Other,Male,2.950758,3.0
8,White,Female,2.933718,3.054755
9,White,Male,3.115079,2.865079


In [23]:
engagementData.head()

Unnamed: 0,Employee ID,Survey Date,Engagement Score,Satisfaction Score,Work-Life Balance Score
0,1001,2022-10-10,2,5,5
1,1002,2023-08-03,4,5,3
2,1003,2023-01-03,2,5,2
3,1004,2023-07-30,3,5,3
4,1005,2023-06-19,2,4,5
