In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from itertools import product
import scipy.stats as ss

In [9]:
# Widen pandas' display limits so wide/long frames are shown without truncation.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 100)

In [10]:
# Load the monthly employee-history table and preview its first rows.
hist_csv_path = "datasets/soft/history.csv"
hist = pd.read_csv(hist_csv_path)

hist.head()

Unnamed: 0,EmployeeID,Date,DevCenterID,SBUID,PositionID,PositionLevel,IsTrainee,LanguageLevelID,CustomerID,ProjectID,IsInternalProject,Utilization,HourVacation,HourMobileReserve,HourLockedReserve,OnSite,CompetenceGroupID,FunctionalOfficeID,PaymentTypeId,BonusOneTime,APM,WageGross,MonthOnPosition,MonthOnSalary
0,00116D71-E87D-4B64-A566-1F29B2A798A8,7/1/17,3,292,70,2,0,11,893EA22F-08BE-4F11-AD93-C50746E4565F,7F97465B-ED3C-45DB-BE96-86C7E493F0CD,0,0.7619,40,0,0,0,15,1,9,0,39,0.887446,1,1
1,00116D71-E87D-4B64-A566-1F29B2A798A8,8/1/17,3,332,70,2,0,11,893EA22F-08BE-4F11-AD93-C50746E4565F,7F97465B-ED3C-45DB-BE96-86C7E493F0CD,0,1.0,0,0,0,0,15,1,9,200,28,0.887446,2,2
2,00116D71-E87D-4B64-A566-1F29B2A798A8,9/1/17,3,332,70,2,0,11,893EA22F-08BE-4F11-AD93-C50746E4565F,7F97465B-ED3C-45DB-BE96-86C7E493F0CD,0,0.5,72,0,0,0,15,1,9,0,43,0.887446,3,3
3,00116D71-E87D-4B64-A566-1F29B2A798A8,10/1/17,3,332,70,2,0,11,893EA22F-08BE-4F11-AD93-C50746E4565F,7F97465B-ED3C-45DB-BE96-86C7E493F0CD,0,0.8636,16,0,0,0,15,1,9,0,49,0.887446,4,4
4,00116D71-E87D-4B64-A566-1F29B2A798A8,11/1/17,3,332,70,2,0,11,893EA22F-08BE-4F11-AD93-C50746E4565F,7F97465B-ED3C-45DB-BE96-86C7E493F0CD,0,0.8,8,0,0,0,15,1,9,0,43,0.887446,5,5


Employee History

Employee History data for 1.5 years, gathered on a regular basis (once per month).

* EmployeeID - Employee identifier
* Date - Month of Employee Statistics gathering
* DevCenterID - Employee Location in terms of Company Geography. DevCenters are located in different cities such as Kyiv, Lviv, Kharkiv, Dnipro.
* SBUID - Employee Location in terms of Company Structure. In other words, it's the department in the company. Many unique values => these are not the typical IT, HR, R&D, Finance departments. Same PositionID in different SBUID => internal separation across different centers.
1. What does Company Structure / Company Geography mean? Geographical location - coordinates, structural - company structure element
* PositionID - Identifier of Employee Position (like QC Engineer, Development Consultant, etc)
1. What positions do codes mean? Info isn't provided.
* IsTrainee - Trainee flag of Employee
* LanguageLevelID - English Level Identifier (like Intermediate low, Upper-intermediate, etc). ordered level of employee 1-Elementary and the highest is Native level;
* CustomerID - Client Identifier (one client may be related to the several projects). 
1. Is it the main client who paid for project where employee worked? Yes, it's Id of client where Employee is still working or worked.
* ProjectID - Employee Main Project Identifier
* IsInternalProject - Internal / External project flag
* Utilization - percent of Employee load on Non-Internal Projects during last month
1. Does this feature matter for employees who worked on internal project? Yes because only main project can be internal, whereas there are also other projects that are external.
2. What is the difference between an internal and an external project? Internal project - company creates own product, external project - company creates product on customer's plan.
* HourVacation - vacation hours spent as of the last month
* HourMobileReserve - total hours in Mobile reserve as on the last month. It's hours which Employee spends in his/her Technology group without project.
* HourLockedReserve - total hours in Locked reserve as on the last month. Each manager has Team with different job roles. It's hours which Employee spends in team of his/her Manager without project.
* OnSite - whether the Employee was involved in an on-site visit last month
1. What is on-site visit? Maybe, client's visit on place.
* MonthOnPosition - months without a position change as of the last month
* MonthOnSalary - months without a salary increase as of the last month
* CompetenceGroupID - Employee Competency Group (like QC, Big Data, Data Science, etc)
* FunctionalOfficeID - Functional Office Identifier (like SDO (Software Development), QMO (Quality Management Office), etc). Maybe, competency groups are parts of functional offices
* PaymentTypeId - Payment with respect to the country-specifics employment
1. What is it? Type of Employee's identity: FOP, contract, usual employee
* WageGross - Compensation GROSS
1. Why is a wage gross given in percents? It is anonymized gross compensation. Treat as salary.
* BonusOneTime - One Time Bonus
1. Month salary bonus? Maybe.
* APM - Employee APM.
1. What is it?  ACHIEVABLE PROFIT MARGIN. Approximate explanation looks like https://www.performlaw.com/hubfs/Pdf/Achievable_Profit_Margin_Analysis.pdf?t=1501943820897
* PositionLevel - Employee Seniority Level (Junior, Middle, Senior, etc). PositionLevel is also ordered: 1 - Junior, 9-11 - Manager levels.

In [11]:
# Show the dtype pandas assigned to each column of the history frame.
hist.dtypes

EmployeeID             object
Date                   object
DevCenterID             int64
SBUID                   int64
PositionID              int64
PositionLevel           int64
IsTrainee               int64
LanguageLevelID         int64
CustomerID             object
ProjectID              object
IsInternalProject       int64
Utilization           float64
HourVacation            int64
HourMobileReserve       int64
HourLockedReserve       int64
OnSite                  int64
CompetenceGroupID       int64
FunctionalOfficeID      int64
PaymentTypeId           int64
BonusOneTime            int64
APM                     int64
WageGross             float64
MonthOnPosition         int64
MonthOnSalary           int64
dtype: object

In [12]:
# Convert the textual month/day/two-digit-year values in 'Date' to datetimes,
# then preview the converted column.
hist_dates_parsed = pd.to_datetime(hist['Date'], format='%m/%d/%y')
hist['Date'] = hist_dates_parsed
hist['Date'].head()

0   2017-07-01
1   2017-08-01
2   2017-09-01
3   2017-10-01
4   2017-11-01
Name: Date, dtype: datetime64[ns]

In [13]:
# Locate cells holding an empty string; the result is a (row positions,
# column positions) pair of index arrays, both empty when no cell matches.
# Uses the vectorised DataFrame.eq instead of DataFrame.applymap, which is
# deprecated in recent pandas releases and invokes a Python lambda per cell.
# NOTE(review): read_csv's default NA handling normally turns empty fields
# into NaN, so this is expected to come back empty — confirm.
np.where(hist.eq('').to_numpy())

(array([], dtype=int64), array([], dtype=int64))

In [14]:
# Count missing values in each column of the history frame.
hist.isna().sum()

EmployeeID               0
Date                     0
DevCenterID              0
SBUID                    0
PositionID               0
PositionLevel            0
IsTrainee                0
LanguageLevelID          0
CustomerID               0
ProjectID             6017
IsInternalProject        0
Utilization              0
HourVacation             0
HourMobileReserve        0
HourLockedReserve        0
OnSite                   0
CompetenceGroupID        0
FunctionalOfficeID       0
PaymentTypeId            0
BonusOneTime             0
APM                      0
WageGross                0
MonthOnPosition          0
MonthOnSalary            0
dtype: int64

In [15]:
# Persist the history frame (with parsed dates) as a pickle file.
hist_pickle_path = './datasets/soft/clean/hist.pkl'
hist.to_pickle(hist_pickle_path)

In [16]:
# Load the employee hiring/dismissal table and preview its first rows.
emp_csv_path = 'datasets/soft/employees.csv'
emp = pd.read_csv(emp_csv_path)

emp.head()

Unnamed: 0,EmployeeID,HiringDate,DismissalDate
0,01EDC1A7-5DA7-4939-854B-0002E0DF548D,9/1/18,
1,91C351D9-38A1-40CB-9B69-000774211859,1/1/18,2/1/19
2,D699327D-8A78-4FBE-9CEA-001355717D3F,2/1/18,
3,22102E27-5EEA-44F9-8632-0017B04B1236,10/1/14,5/1/18
4,08C0AFA9-ABC3-4FAF-8DE2-0018AE9A69F4,9/1/10,


Employee

Information about Employee employment

EmployeeID - Employee identifier
HiringDate - Date of Hiring
DismissalDate - Date of Dismissal

In [17]:
# Shape of the subset of rows whose DismissalDate is missing.
no_dismissal_date = emp['DismissalDate'].isna()
emp.loc[no_dismissal_date].shape

(4356, 3)

In [18]:
# Number of rows whose EmployeeID repeats an earlier row's value.
# int(...) keeps the displayed result a plain Python integer.
int(emp['EmployeeID'].duplicated().sum())

0

In [19]:
# Show the dtype pandas assigned to each column of the employee frame.
emp.dtypes

EmployeeID       object
HiringDate       object
DismissalDate    object
dtype: object

In [20]:
# Convert both date columns from month/day/two-digit-year text to datetimes.
# Missing DismissalDate values stay missing (see the null counts that follow).
for date_col in ('HiringDate', 'DismissalDate'):
    emp[date_col] = pd.to_datetime(emp[date_col], format='%m/%d/%y')

In [21]:
# Count missing values in each column of the employee frame.
emp.isna().sum()

EmployeeID          0
HiringDate          0
DismissalDate    4356
dtype: int64

In [22]:
# Persist the employee frame (with parsed dates) as a pickle file.
emp_pickle_path = './datasets/soft/clean/emp.pkl'
emp.to_pickle(emp_pickle_path)