In [1]:
import os
import pandas as pd
import numpy as np
import random as rd
import datetime as dt
import re
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')
import time
import csv
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn import metrics
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, explained_variance_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.compose import  ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelBinarizer, RobustScaler
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import MultinomialNB, ComplementNB, CategoricalNB
from sklearn.svm import SVC  
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE, SMOTENC
from itertools import cycle

import joblib

# Raise pandas' display limits so wide and long frames are shown in full
# instead of being truncated in cell output.
for display_option in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(display_option, 1000)

# Step 1: Load libraries and dataset

## Preparing data

In [2]:
# Load the dataset (merged4.csv in the Headcount data folder) into `df`.
df = pd.read_csv(filepath_or_buffer='./1. Data/Headcount/merged4.csv', encoding='utf-8')

In [5]:
# Show the frame's dimensions as (rows, columns).
df.shape

(9064, 149)

In [10]:
# Shorten the gender labels: 'Male' -> 'M', 'Female' -> 'F'.
# regex=True means each key is applied as a pattern substitution inside the
# string values rather than as a whole-value match.
d = dict(Male='M', Female='F')
cols = ['GENDER']
df[cols] = df.loc[:, cols].replace(to_replace=d, regex=True)

In [11]:
# Create a numeric outcome column: ACTIVE -> 0, TERMINATED -> 1.
# An explicit .map() is used instead of .replace() with positional lists:
# turning a string column into integers through replace() depends on pandas'
# implicit downcasting, which is deprecated (the warning is silenced at the top
# of this notebook). With .map(), any label other than the two listed becomes
# NaN instead of being left in the column as a string.
df['TERMINATED_NUM'] = df['TERMINATED'].map({'ACTIVE': 0, 'TERMINATED': 1})

## Subsetting data

In [13]:
# Check termination types: list the distinct values (NaN included) of the
# termination sub-group description.
df['TERM_SUB_GROUP_TYPE_DESCRIPTION'].unique()

array(['Involuntary Termination', 'Voluntary Termination', nan],
      dtype=object)

In [14]:
# Keep employees who are still active or who left voluntarily; rows with any
# other termination type are left out.
is_active = df['TERMINATED'].eq('ACTIVE')
left_voluntarily = df['TERM_SUB_GROUP_TYPE_DESCRIPTION'].eq('Voluntary Termination')
df2 = df.loc[is_active | left_voluntarily]

In [15]:
# Check: dimensions (rows, columns) of the subset.
df2.shape

(8565, 150)

# Step 2: Exploratory Data Analysis

In [19]:
# Check value counts: number of rows per TERMINATED label in df3.
# NOTE(review): df3 is not defined in the cells visible here — confirm where it is built.
df3['TERMINATED'].value_counts()

ACTIVE        1503
TERMINATED     737
Name: TERMINATED, dtype: int64

In [20]:
# Check value counts: number of rows per numeric outcome code in df3.
# NOTE(review): df3 is not defined in the cells visible here — confirm where it is built.
df3['TERMINATED_NUM'].value_counts()

0    1503
1     737
Name: TERMINATED_NUM, dtype: int64

# Step 3: Feature Engineering

## 3.1 Creating additional features


In [25]:
# Create a new df: rebind `df` to a copy of df3 (the previous `df` is no longer
# reachable under that name).
# NOTE(review): df3 is not defined in the cells visible here — confirm where it is built.
df = df3.copy()

In [27]:
# Load the headcount file that the manager attributes are looked up from.
hc_unique = pd.read_csv(filepath_or_buffer='./1. Data/Headcount/hc.csv', encoding='utf-8')

In [29]:
# Build the manager lookup table from the headcount frame: EMAIL_ADDRESS is kept
# as the join key and every other selected attribute is renamed with a MANAGER_
# prefix (AGE_YEARS becomes MANAGER_AGE) so it can sit next to the employee's
# own columns after the lookup.
manager_column_names = {
    'COUNTRY': 'MANAGER_COUNTRY',
    'AGE_YEARS': 'MANAGER_AGE',
    'AGE_BAND': 'MANAGER_AGE_BAND',
    'GENDER': 'MANAGER_GENDER',
    'TENURE_CONTINUOUS_SERVICE_DATE_IN_YEARS': 'MANAGER_TENURE_CONTINUOUS_SERVICE_DATE_IN_YEARS',
    'TENURE_LATEST_HIRE_DATE_IN_YEARS': 'MANAGER_TENURE_LATEST_HIRE_DATE_IN_YEARS',
    'JOB_TENURE_IN_YEARS': 'MANAGER_JOB_TENURE_IN_YEARS',
    'GENERATIONS': 'MANAGER_GENERATIONS',
    'CAREER_LEVEL_TENURE_IN_YEARS': 'MANAGER_CAREER_LEVEL_TENURE_IN_YEARS',
    'JOB_FUNCTION_TENURE_IN_YEARS': 'MANAGER_JOB_FUNCTION_TENURE_IN_YEARS',
    'JOB_LEVEL_TENURE_IN_YEARS': 'MANAGER_JOB_LEVEL_TENURE_IN_YEARS',
}
df_mgr_info = hc_unique[['EMAIL_ADDRESS', *manager_column_names]].rename(columns=manager_column_names)

In [31]:
# "Vlookup": attach the manager attributes to each employee row by matching the
# employee's MANAGER_EMAIL_ADDRESS against the lookup table's EMAIL_ADDRESS.
# A left join keeps every employee row even when no manager record matches.
df2 = df2.merge(
    df_mgr_info,
    how='left',
    left_on='MANAGER_EMAIL_ADDRESS',
    right_on='EMAIL_ADDRESS',
)

In [33]:
# EMAIL_ADDRESS_y is the lookup table's key and duplicates
# MANAGER_EMAIL_ADDRESS, so it is removed.
df2 = df2.drop('EMAIL_ADDRESS_y', axis=1)

In [34]:
# Restore the employee's own e-mail column to its original name
# (the merge suffixed it as EMAIL_ADDRESS_x).
df2 = df2.rename(columns={"EMAIL_ADDRESS_x": "EMAIL_ADDRESS"})

In [36]:
# Check for rows with a missing manager age, collapsed to one row per manager e-mail.
# Note: groupby drops rows whose MANAGER_EMAIL_ADDRESS is NaN, so employees
# without a manager e-mail are not covered by this check.
rows_missing_mgr_age = df2[df2['MANAGER_AGE'].isna()]
rows_missing_mgr_age.groupby('MANAGER_EMAIL_ADDRESS').agg(set).shape
# there are no managers with missing age (the first element of the shape is 0)

(0, 160)

In [37]:
# Check for rows with a missing manager gender, collapsed to one row per manager e-mail.
# Note: groupby drops rows whose MANAGER_EMAIL_ADDRESS is NaN, so employees
# without a manager e-mail are not covered by this check.
rows_missing_mgr_gender = df2[df2['MANAGER_GENDER'].isna()]
rows_missing_mgr_gender.groupby('MANAGER_EMAIL_ADDRESS').agg(set).shape
# there are no managers with missing gender (the first element of the shape is 0)

(0, 160)

### 3.1.2 Features - Age difference between Manager and Employee

In [41]:
# Age gap between manager and employee: difference rounded to whole years,
# then made non-negative so only the size of the gap is kept.
age_gap = df2['MANAGER_AGE'] - df2['AGE_YEARS']
df2['AGE_DIFF_BETW_MGR_EMP'] = age_gap.round(0).abs()

In [44]:
# Flag whether each employee is in the same country as their manager.
# A vectorised column comparison replaces the previous row-by-row
# iterrows()/.at loop: it produces the same labels without a Python-level loop
# and does not depend on the index labels being unique.
# A missing value on either side compares as unequal, so those rows are
# labelled 'DIFFERENT_COUNTRY' (as they were with the loop).
same_country = df2['COUNTRY'].eq(df2['MANAGER_COUNTRY'])
df2['LOCATION_MGR_EMPLOYEE'] = np.where(same_country, 'SAME_COUNTRY', 'DIFFERENT_COUNTRY')

In [46]:
# Flag whether each employee belongs to the same generation as their manager.
# A vectorised column comparison replaces the previous row-by-row
# iterrows()/.at loop: it produces the same labels without a Python-level loop
# and does not depend on the index labels being unique.
# A missing value on either side compares as unequal, so those rows are
# labelled 'DIFFERENT_GENERATION' (as they were with the loop).
same_generation = df2['GENERATIONS'].eq(df2['MANAGER_GENERATIONS'])
df2['GENERATIONS_MGR_EMPLOYEE'] = np.where(same_generation, 'SAME_GENERATION', 'DIFFERENT_GENERATION')

### 3.1.3 Features - Survey1

In [58]:
# Band the survey-1 score into four labelled ranges. Intervals are
# right-closed: (-1, 25] -> 'Low', (25, 50] -> 'Avg', (50, 75] -> 'High',
# above 75 -> 'Very High'. Values outside every interval become NaN.
bins = [-1, 25, 50, 75, np.inf]
bands = ['Low', 'Avg', 'High', 'Very High']
df2['PFY_survey1'] = pd.cut(x=df2['PFY_survey1_employee'], bins=bins, labels=bands)


### 3.1.4 Features - Survey2

In [69]:
# Band the survey-2 score into four labelled ranges. Intervals are
# right-closed: (-1, 25] -> 'Low', (25, 50] -> 'Avg', (50, 75] -> 'High',
# above 75 -> 'Very High'. Values outside every interval become NaN.
bins = [-1, 25, 50, 75, np.inf]
bands = ['Low', 'Avg', 'High', 'Very High']
df2['PFY_survey2'] = pd.cut(x=df2['PFY_survey2_employee'], bins=bins, labels=bands)

### 3.1.5 Features - Attainment Values

In [77]:
# Band the attainment value into four labelled ranges. Intervals are
# right-closed: (-1, 25] -> 'Low', (25, 50] -> 'Avg', (50, 75] -> 'High',
# above 75 -> 'Very High'. Values outside every interval become NaN.
bins = [-1, 25, 50, 75, np.inf]
bands = ['Low', 'Avg', 'High', 'Very High']
df2['attainment'] = pd.cut(x=df2['attainment_employee'], bins=bins, labels=bands)

### 3.1.6 Features - Time Bands

In [84]:
# Bin the tenure / time-since values (in years) into labelled bands.
# Intervals are right-closed: (-1, 0.999] -> '0 to 1 yr',
# (0.999, 1.999] -> '1 to 2 yrs', and so on up to 'above 20 yrs'.
# NOTE(review): a value strictly between 0.999 and 1 lands in '1 to 2 yrs'
# (likewise just below the other whole-year edges) — confirm this is acceptable.
bins = [-1, 0.999, 1.999, 2.999, 3.999, 4.999, 9.999, 19.999, np.inf]
bands = [
    '0 to 1 yr',
    '1 to 2 yrs',
    '2 to 3 yrs',
    '3 to 4 yrs',
    '4 to 5 yrs',
    '5 to 10 yrs',
    '10 to 20 yrs',
    'above 20 yrs',
]

# Source column (years) -> banded column to create, in creation order.
band_column_for = {
    'TENURE_CONTINUOUS_SERVICE_DATE_IN_YEARS': 'TENURE_CONTINUOUS_SERVICE_DATE_BAND_LIN',
    'TENURE_LATEST_HIRE_DATE_IN_YEARS': 'TENURE_LATEST_HIRE_DATE_BAND_LIN',
    'JOB_TENURE_IN_YEARS': 'JOB_TENURE_BAND_LIN',
    'TIME_SINCE_LAST_SALARY_INCR_IN_YEARS': 'TIME_SINCE_LAST_SALARY_INCR_BAND_LIN',
    'CAREER_LEVEL_TENURE_IN_YEARS': 'CAREER_LEVEL_TENURE_IN_YEARS_LIN',
    'JOB_LEVEL_TENURE_IN_YEARS': 'JOB_LEVEL_TENURE_IN_YEARS_LIN',
    'JOB_FUNCTION_TENURE_IN_YEARS': 'JOB_FUNCTION_TENURE_IN_YEARS_LIN',
    'PREVIOUS_CAREER_LEVEL_TENURE_IN_YEARS': 'PREVIOUS_CAREER_LEVEL_TENURE_IN_YEARS_LIN',
}
for years_column, band_column in band_column_for.items():
    df2[band_column] = pd.cut(df2[years_column], bins, labels=bands)

In [85]:
# Bands for Managers
# Bin each manager tenure column (in years) into the same labelled bands used
# for the employee columns. `bins` and `bands` must already be defined; this
# cell reuses them rather than redefining them.
# Each banded column is named after its source column with a '_LIN' suffix.
# Changes from the earlier version of this cell: the stray bare `df2`
# statements (no-ops in the middle of a cell) are gone, and so is the repeated
# computation of TIME_SINCE_LAST_SALARY_INCR_BAND_LIN, which is an employee
# column already produced together with the other employee bands.
manager_year_columns = [
    'MANAGER_TENURE_CONTINUOUS_SERVICE_DATE_IN_YEARS',
    'MANAGER_TENURE_LATEST_HIRE_DATE_IN_YEARS',
    'MANAGER_JOB_TENURE_IN_YEARS',
    'MANAGER_CAREER_LEVEL_TENURE_IN_YEARS',
    'MANAGER_JOB_LEVEL_TENURE_IN_YEARS',
    'MANAGER_JOB_FUNCTION_TENURE_IN_YEARS',
]
for years_column in manager_year_columns:
    df2[f'{years_column}_LIN'] = pd.cut(df2[years_column], bins, labels=bands)



### 3.1.8 Dealing with NaNs

In [116]:
# Replace NaN in these categorical columns with the explicit label 'Missing'
# so that absence is kept as its own category.
columns_to_fill = [
    'PRODUCT_LINE',
    'PRODUCT_ASSOCIATION',
    'MANAGER_JOB_LEVEL_CATEGORY_MGR',
    'PILLAR',
    'MANAGER_JOB_LEVEL_CATEGORY_SELF',
]
df2[columns_to_fill] = df2[columns_to_fill].fillna(value='Missing')

In [119]:
# Write the engineered frame to disk for use in visualisation.
# The row index is written as well (to_csv's default).
df2.to_csv(path_or_buf='for_viz.csv', encoding='utf-8')