In [1]:
# Dataset url - https://www.kaggle.com/datasets/ravindrasinghrana/employeedataset
import pandas as pd
import numpy as np

In [2]:
employeeData = pd.read_csv('employee_data.csv')
engagementData = pd.read_csv('employee_engagement_survey_data.csv')
recruitmentData = pd.read_csv('recruitment_data.csv')
trainingData = pd.read_csv('training_and_development_data.csv')

In [3]:
# show all the columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
trainingData.shape

(3000, 9)

In [5]:
trainingData.dtypes

Employee ID                  int64
Training Date               object
Training Program Name       object
Training Type               object
Training Outcome            object
Location                    object
Trainer                     object
Training Duration(Days)      int64
Training Cost              float64
dtype: object

In [6]:
# convert training date to date type
trainingData['Training Date'] = pd.to_datetime(trainingData['Training Date'], format='%d-%b-%y')

In [7]:
# categories on Training type
trainingData['Training Type'].value_counts(dropna=False)

Training Type
Internal    1509
External    1491
Name: count, dtype: int64

In [8]:
# find distinct years training was conducted
trainingData['Training Date'].dt.year.unique()

array([2022, 2023])

In [9]:
# categories on training outcome
trainingData.groupby('Training Outcome').size().reset_index(name='No of Employees')

Unnamed: 0,Training Outcome,No of Employees
0,Completed,770
1,Failed,716
2,Incomplete,775
3,Passed,739


In [10]:
# how many current employees Passed vs Failed their training
joinedDF = trainingData.merge(employeeData, how='inner', left_on='Employee ID', right_on='EmpID')
joinedDF.loc[
(joinedDF.EmployeeStatus=='Active') & ((joinedDF['Training Outcome']=='Failed') | (joinedDF['Training Outcome']=='Passed'))
].groupby('Training Outcome').size().reset_index(name='No of Employees')

Unnamed: 0,Training Outcome,No of Employees
0,Failed,582
1,Passed,594


In [11]:
# how many current employee attended the training in the most recent year
trainingData.loc[trainingData['Training Date'].dt.year == trainingData['Training Date'].dt.year.max()]['Employee ID'].count()

1784

In [12]:
# which year was the first training organized, who was the trainer, location and the outcome
trainingData.loc[
    trainingData['Training Date'].dt.year == trainingData['Training Date'].dt.year.min()
][['Training Date', 'Training Program Name', 'Trainer', 'Location', 'Training Outcome']].sort_values(by='Training Date')

Unnamed: 0,Training Date,Training Program Name,Trainer,Location,Training Outcome
1585,2022-08-05,Customer Service,Diana Walsh,Bradborough,Completed
824,2022-08-05,Technical Skills,Robert Weaver,North Robertside,Completed
863,2022-08-06,Project Management,Laurie Miller,South Waltershire,Completed
399,2022-08-06,Communication Skills,David Dennis,New Melissachester,Failed
431,2022-08-06,Communication Skills,Crystal Baldwin,Patelstad,Completed
447,2022-08-06,Leadership Development,Vicki Cook,Billymouth,Failed
230,2022-08-06,Project Management,Douglas Dean,Port Aliciaview,Passed
2432,2022-08-06,Customer Service,David Stewart,Lancehaven,Passed
1470,2022-08-06,Leadership Development,Judith Ramsey,Hubbardtown,Passed
2798,2022-08-07,Communication Skills,Travis Moore,Sheenaville,Incomplete


In [13]:
# find out all the locations where trainings were held
trainingData.Location.unique()
# or
trainingData.Location.value_counts(dropna=False)

Location
Smithchester              6
South Jennifer            5
Lake Michael              5
East Michael              5
Joneston                  4
South William             4
Port Joshua               4
South Michael             4
New Michael               4
New Jennifer              4
West David                4
New David                 4
North Rebecca             4
Davismouth                3
South Robin               3
Cynthiaview               3
New Richard               3
South David               3
West Christopher          3
Smithborough              3
New Eric                  3
Port John                 3
East John                 3
Jenniferfurt              3
East Christopher          3
New William               3
East Melissa              3
North William             3
West Robert               3
Port Cynthia              3
Port Robert               3
Port Gary                 3
West Michael              3
West Christina            3
New Jordan                3
Jonesberg  

In [14]:
# find out the number of training in each location and which location has had the most trainings
trainingData.Location.value_counts(dropna=False).reset_index(name='No of Employees').sort_values(by='No of Employees', ascending=False).head(1)

Unnamed: 0,Location,No of Employees
0,Smithchester,6


In [54]:
# how many distinct trainers have trained the employees
trainingData.Trainer.unique().size

2942

In [72]:
# find trainers who trained the employees more than once
groupedDF = trainingData.groupby('Trainer').size().reset_index(name='No of Trainings')
groupedDF.loc[groupedDF['No of Trainings'] > 1].count()

Trainer            51
No of Trainings    51
dtype: int64

In [74]:
# which trainer took the most number of trainings and how many
trainingData.groupby('Trainer').size().reset_index(name='No of Trainings').sort_values(by='No of Trainings', ascending=False).head(1)

Unnamed: 0,Trainer,No of Trainings
2004,Michael Smith,4


In [78]:
# what is the average training duration 
round(trainingData['Training Duration(Days)'].mean(), 2)

2.98

In [118]:
# what is the average training duration for each year
trainingData.groupby(trainingData['Training Date'].dt.year)['Training Duration(Days)'].mean().reset_index(name='average').rename(
    columns = {
        'Training Date': 'Training Year',
        'average': 'Average Duration(Days)'
    }
)

Unnamed: 0,Training Year,Average Duration(Days)
0,2022,3.029605
1,2023,2.938901


In [120]:
# find the total days of training conducted by the firm
trainingData['Training Duration(Days)'].sum()

8927

In [130]:
# find the total training cost incurred to the firm
round(trainingData['Training Cost'].sum(), 2)

1675886.09

In [15]:
trainingData.head()

Unnamed: 0,Employee ID,Training Date,Training Program Name,Training Type,Training Outcome,Location,Trainer,Training Duration(Days),Training Cost
0,1001,2022-09-21,Customer Service,Internal,Failed,Port Greg,Amanda Daniels,4,510.83
1,1002,2023-07-19,Leadership Development,Internal,Failed,Brandonview,Brittany Chambers,2,582.37
2,1003,2023-02-24,Technical Skills,Internal,Incomplete,Port Briannahaven,Mark Roberson,4,777.06
3,1004,2023-01-12,Customer Service,Internal,Completed,Knightborough,Richard Fisher,2,824.3
4,1005,2023-05-12,Communication Skills,External,Passed,Bruceshire,Heather Shaffer,4,145.99


In [138]:
# find total earnings for each trainer
trainingData.groupby('Trainer')['Training Cost'].sum().reset_index(name='Total Earning').sort_values(by='Total Earning', ascending=False)

Unnamed: 0,Trainer,Total Earning
2004,Michael Smith,2403.64
1959,Michael Brown,2222.59
1955,Michael Bailey,1927.07
1095,James Perez,1874.32
2199,Patricia Hernandez,1849.91
1254,Jessica Jones,1697.84
2376,Robert Stevens,1695.11
2793,Timothy Taylor,1687.37
1023,Heather Tucker,1635.74
129,Andrea Lopez,1548.62


In [24]:
# describe training cost
trainingData['Training Cost'].describe()

count    3000.000000
mean      558.628697
std       263.217698
min       100.040000
25%       327.587500
50%       572.125000
75%       786.987500
max       999.970000
Name: Training Cost, dtype: float64