# Employee Compensation
Dataset about employees in a company

In [1]:
import warnings

import pandas as pd

# Read the compensation records from the CSV file into a DataFrame.
raw_emp_comp = pd.read_csv('Datasets/Compensation.csv')

# Suppress all warnings for the cells that follow (applied after the load).
warnings.filterwarnings('ignore')

In [2]:
# Preview the first five rows of the raw data.
raw_emp_comp.head(5)

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.0,56120.71,198306.9,,332343.61,332343.61,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.6,9737.0,182234.59,,326373.19,326373.19,2011,,San Francisco,


In [3]:
# Print each column's dtype and non-null count to spot missing data.
raw_emp_comp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148654 entries, 0 to 148653
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Id                148654 non-null  int64  
 1   EmployeeName      148654 non-null  object 
 2   JobTitle          148654 non-null  object 
 3   BasePay           148045 non-null  float64
 4   OvertimePay       148650 non-null  float64
 5   OtherPay          148650 non-null  float64
 6   Benefits          112491 non-null  float64
 7   TotalPay          148654 non-null  float64
 8   TotalPayBenefits  148654 non-null  float64
 9   Year              148654 non-null  int64  
 10  Notes             0 non-null       float64
 11  Agency            148654 non-null  object 
 12  Status            0 non-null       float64
dtypes: float64(8), int64(2), object(3)
memory usage: 14.7+ MB


## Data Cleaning
Removing rows whose Base Pay is negative or null.

In [4]:
# Keep only rows with a non-negative BasePay. NaN compares False against >= 0,
# so rows with a missing BasePay are dropped by the same mask.
# .copy() makes emp_comp an independent frame, so adding a column to it later
# does not raise SettingWithCopyWarning for writing into a selection of
# raw_emp_comp.
emp_comp = raw_emp_comp[raw_emp_comp['BasePay'] >= 0].copy()
emp_comp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148034 entries, 0 to 148653
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Id                148034 non-null  int64  
 1   EmployeeName      148034 non-null  object 
 2   JobTitle          148034 non-null  object 
 3   BasePay           148034 non-null  float64
 4   OvertimePay       148034 non-null  float64
 5   OtherPay          148034 non-null  float64
 6   Benefits          111875 non-null  float64
 7   TotalPay          148034 non-null  float64
 8   TotalPayBenefits  148034 non-null  float64
 9   Year              148034 non-null  int64  
 10  Notes             0 non-null       float64
 11  Agency            148034 non-null  object 
 12  Status            0 non-null       float64
dtypes: float64(8), int64(2), object(3)
memory usage: 15.8+ MB


## Average Base Pay

In [5]:
# Mean base pay across all cleaned rows.
emp_comp.loc[:, 'BasePay'].mean()

66330.38379554695

## Average Base Pay By Year

In [6]:
# Group the base pay values by year, then average each group.
base_pay_by_year = emp_comp.groupby('Year')['BasePay']
base_pay_by_year.mean()

Year
2011    63595.956517
2012    65456.016752
2013    69630.030216
2014    66564.421924
Name: BasePay, dtype: float64

## Lowest Base Pay (Greater Than 0)

In [7]:
# Restrict to rows with a strictly positive base pay, then show every row
# whose base pay equals the minimum of that subset.
only_w_basepay = emp_comp.loc[emp_comp['BasePay'] > 0]
lowest_base_pay = only_w_basepay['BasePay'].min()
only_w_basepay.loc[only_w_basepay['BasePay'] == lowest_base_pay]

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
148619,148620,Ian V Cameron,IS Program Analyst-Assistant,6.04,0.0,10.05,2.3,16.09,18.39,2014,,San Francisco,


## Most Overtime Pay

In [8]:
# Show the row(s) whose overtime pay equals the maximum overtime pay.
top_overtime_pay = emp_comp['OvertimePay'].max()
emp_comp.loc[emp_comp['OvertimePay'] == top_overtime_pay]

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,


## Highest Paid Employee

In [9]:
# Show the row(s) with the largest total pay including benefits.
highest_total = emp_comp['TotalPayBenefits'].max()
emp_comp.loc[emp_comp['TotalPayBenefits'] == highest_total]

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,


## Lowest Paid Employee

In [10]:
# Show the row(s) with the smallest total pay including benefits.
lowest_total = emp_comp['TotalPayBenefits'].min()
emp_comp.loc[emp_comp['TotalPayBenefits'] == lowest_total]

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
148653,148654,Joe Lopez,"Counselor, Log Cabin Ranch",0.0,0.0,-618.13,0.0,-618.13,-618.13,2014,,San Francisco,


## Employee Overview

Show detailed information about a particular employee by retrieving their row.

In [11]:
# Select every row whose EmployeeName matches the employee of interest exactly.
is_target_employee = emp_comp['EmployeeName'] == 'THERESE VALENA'
emp_comp.loc[is_target_employee]

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
195,196,THERESE VALENA,DATA ANALYST,105934.66,84172.44,16662.29,,206769.39,206769.39,2011,,Metro Manila,


### Employee Role
Finding specific information (job title) about a particular employee.

In [12]:
# Row mask and column label in a single .loc lookup: the job title(s) of the
# matching employee.
emp_comp.loc[emp_comp['EmployeeName'] == 'THERESE VALENA', 'JobTitle']

195    DATA ANALYST
Name: JobTitle, dtype: object

### Employee Total Compensation
Finding specific information (total pay with benefits) about a particular employee.

In [13]:
# Row mask and column label in a single .loc lookup: total pay with benefits
# for the matching employee.
emp_comp.loc[emp_comp['EmployeeName'] == 'THERESE VALENA', 'TotalPayBenefits']

195    206769.39
Name: TotalPayBenefits, dtype: float64

## Number of Unique Job Titles

In [14]:
# Count the distinct JobTitle values (exact, case-sensitive string matches).
emp_comp.JobTitle.nunique()

2155

## 10 Most Common Jobs

In [15]:
# Tally rows per job title (sorted by descending count) and keep the top ten.
# NOTE: titles are compared case-sensitively, so 'Transit Operator' and
# 'TRANSIT OPERATOR' are tallied as separate entries.
job_title_counts = emp_comp['JobTitle'].value_counts()
job_title_counts.head(10)

JobTitle
Transit Operator                6975
Special Nurse                   4382
Registered Nurse                3725
Public Svc Aide-Public Works    2514
Police Officer 3                2411
Custodian                       2409
TRANSIT OPERATOR                2388
Firefighter                     2348
Recreation Leader               1968
Patient Care Assistant          1940
Name: count, dtype: int64

## Number of roles that were occupied by only one person in 2011

In [16]:
# Method 1: flag the 2011 job titles that occur exactly once, then sum the flags.
emp_year_2011 = emp_comp.loc[emp_comp['Year'] == 2011]
position_one_person = emp_year_2011['JobTitle'].value_counts().eq(1)
position_one_person.sum()

201

In [17]:
# Method 2: keep only the 2011 title counts equal to one, then count the
# entries that remain.
year11 = emp_comp.loc[emp_comp['Year'].eq(2011)]
position_counts = year11['JobTitle'].value_counts()
single_person_titles = position_counts.loc[position_counts.eq(1)]

single_person_titles.count()

201

In [18]:
# Method 3: the same computation as one expression, summed with the built-in sum().
sum(emp_comp.loc[emp_comp['Year'] == 2011, 'JobTitle'].value_counts() == 1)

201

## Number of Chiefs
The number of employee records whose job title contains the word 'chief', regardless of capitalization.

In [19]:
# Case-insensitive substring match on every job title; each matching row
# counts once. The vectorized Series.sum() replaces the built-in sum(), which
# iterates over the Series one element at a time in Python.
has_chief = emp_comp['JobTitle'].str.contains('chief', case=False)
has_chief.sum()

624

## Correlation between job title length and compensation
Checking correlation between the number of characters in a job title and compensation (total pay with benefits).

A reason to check this is that more senior positions often have extra words in their titles, such as 'General Manager', 'Chief', or 'Director'. However, the reverse can also be true: a higher position may have a shorter title; for example, 'President' is shorter than 'Vice President'.

In [20]:
# Add the character count of each job title as a 'title_len' column.
# .assign() builds a new frame and rebinds emp_comp rather than writing into a
# frame that was produced by filtering another one (the pattern that raises
# SettingWithCopyWarning), and the cell stays safe to re-run.
# .str.len() is the vectorized equivalent of .apply(len).
emp_comp = emp_comp.assign(title_len=emp_comp['JobTitle'].str.len())

In [21]:
# Pairwise correlation matrix between title length and total pay with benefits.
corr_columns = ['title_len', 'TotalPayBenefits']
emp_comp.loc[:, corr_columns].corr()

Unnamed: 0,title_len,TotalPayBenefits
title_len,1.0,-0.036792
TotalPayBenefits,-0.036792,1.0


No meaningful correlation: the coefficient is approximately -0.04.