In [1]:
# HR Analytics: Job Change of Data Scientists
# https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists?select=aug_train.csv

In [2]:
import os
import pandas as pd

def load_data(csv_path="aug_train.csv"):
    """Read the HR-analytics training data into a DataFrame.

    Parameters
    ----------
    csv_path : str or path-like, default "aug_train.csv"
        Location of the CSV file. The default keeps the original behaviour of
        reading ``aug_train.csv`` relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # The path used to be wrapped in a single-argument os.path.join, which
    # returns its argument unchanged, so it is passed straight through here.
    return pd.read_csv(csv_path)

In [3]:
# Load the training set once; later cells read from `project_data`.
project_data = load_data()
# Preview the first rows to check that the file parsed as expected.
project_data.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [28]:
# Names of the categorical columns.
# NOTE(review): this list is not used by the loop below, which walks every
# column of project_data - confirm which of the two was intended.
fields = [
    "relevent_experience",
    "gender",
    "enrolled_university",
    "education_level",
    "major_discipline",
    "experience",
    "company_size",
    "company_type",
    "last_new_job",
    "city",
]

# For each column: the number of missing entries, then how often each value occurs.
for field in project_data.columns:
    column = project_data[field]
    print(f"NaN: {column.isna().sum()}")
    print(column.value_counts())
    print()

NaN: 0
2047     1
2692     1
4743     1
27272    1
25225    1
        ..
17682    1
19731    1
29972    1
25878    1
6147     1
Name: enrollee_id, Length: 19158, dtype: int64

NaN: 0
city_103    4355
city_21     2702
city_16     1533
city_114    1336
city_160     845
            ... 
city_129       3
city_111       3
city_121       3
city_140       1
city_171       1
Name: city, Length: 123, dtype: int64

NaN: 0
0.920    5200
0.624    2702
0.910    1533
0.926    1336
0.698     683
         ... 
0.649       4
0.807       4
0.781       3
0.625       3
0.664       1
Name: city_development_index, Length: 93, dtype: int64

NaN: 4508
Male      13221
Female     1238
Other       191
Name: gender, dtype: int64

NaN: 0
Has relevent experience    13792
No relevent experience      5366
Name: relevent_experience, dtype: int64

NaN: 386
no_enrollment       13817
Full time course     3757
Part time course     1198
Name: enrolled_university, dtype: int64

NaN: 460
Graduate          11598
Masters      

In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
project_data.describe()

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
count,19158.0,19158.0,19158.0,19158.0
mean,16875.358179,0.828848,65.366896,0.249348
std,9616.292592,0.123362,60.058462,0.432647
min,1.0,0.448,1.0,0.0
25%,8554.25,0.74,23.0,0.0
50%,16982.5,0.903,47.0,0.0
75%,25169.75,0.92,88.0,0.0
max,33380.0,0.949,336.0,1.0


In [6]:
import numpy as np

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)
# (rows, columns) of the loaded data.
project_data.shape

(19158, 14)

In [7]:
# Pairwise correlation between the numeric columns only. Selecting them
# explicitly keeps this working on pandas >= 2.0, where DataFrame.corr() no
# longer drops string columns silently and raises on them instead; on older
# versions the result is the same as the plain .corr() call.
corr_matrix = project_data.select_dtypes(include="number").corr()

In [8]:
# Show the correlation matrix; the `target` row/column is the one of interest.
corr_matrix

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
enrollee_id,1.0,-0.040455,0.000998,0.049475
city_development_index,-0.040455,1.0,0.00192,-0.341665
training_hours,0.000998,0.00192,1.0,-0.021577
target,0.049475,-0.341665,-0.021577,1.0


In [9]:
'''
Modify the data in the following way:
    - drop the enrollee_id column
    - replace NaN gender values with "Unlisted" (consider dropping these rows instead)
'''

# Build the cleaned frame in one chained expression: remove the identifier
# column, then label missing genders as 'Unlisted'. project_data is untouched.
project_data_clean = (
    project_data
    .drop(columns=["enrollee_id"])
    .assign(gender=lambda frame: frame["gender"].fillna("Unlisted"))
)

# Inspect the last rows to confirm both changes took effect.
project_data_clean.tail(10)

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
19148,city_21,0.624,Unlisted,Has relevent experience,no_enrollment,Masters,STEM,3,100-500,Pvt Ltd,3,40,1.0
19149,city_103,0.92,Male,Has relevent experience,no_enrollment,Masters,STEM,9,50-99,Pvt Ltd,1,36,1.0
19150,city_160,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,10,100-500,Public Sector,3,23,0.0
19151,city_149,0.689,Male,No relevent experience,Full time course,Graduate,,2,,,1,60,0.0
19152,city_103,0.92,Female,Has relevent experience,no_enrollment,Graduate,Humanities,7,10/49,Funded Startup,1,25,0.0
19153,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0
19157,city_67,0.855,Unlisted,No relevent experience,no_enrollment,Primary School,,2,,,1,127,0.0


In [10]:
from sklearn.preprocessing import OneHotEncoder

# Double brackets keep `gender` as a one-column DataFrame (2-D input for the encoder).
gender_category = project_data_clean[["gender"]]
gender_encoder = OneHotEncoder()
# Learn the gender categories and encode every row as an indicator vector.
ohe_gender = gender_encoder.fit_transform(gender_category)

# Densify the encoded result and show the last ten rows (compare with tail(10) above).
ohe_gender.toarray()[-10:]

array([[0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]])

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Columns that are already numeric in the raw data.
numeric_attributes = ["city_development_index", "training_hours"]
# Columns stored as text that carry an ordering; they are converted by the
# dedicated transformers in later cells. This list is not referenced in this cell.
almost_numeric_attributes = ["last_new_job", "company_size", "experience", "education_level"]

# numeric_data = project_data.select_dtypes(include=np.number)
numeric_data = project_data[numeric_attributes]

# Fill gaps with the column median, then standardise each column.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Trial run on the numeric columns to inspect the transformed values.
num_pipeline.fit_transform(numeric_data)

array([[ 0.73891926, -0.4889846 ],
       [-0.42840976, -0.30582494],
       [-1.66059039,  0.29360665],
       ...,
       [ 0.73891926, -0.35577758],
       [-0.21764202,  0.52671894],
       [ 0.21199991,  1.02624527]])

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

def replace_end_value(x):
    """Convert one raw ``experience`` value to a whole number of years.

    Parameters
    ----------
    x : object
        A single cell value, e.g. ``'>20'``, ``'<1'``, ``'15'`` or a missing
        value (NaN / None).

    Returns
    -------
    int
        21 for ``'>20'``, 0 for ``'<1'``, ``int(x)`` for anything ``int`` can
        convert, and 0 for values it cannot convert (missing values,
        non-numeric text). Missing values therefore share the code 0 with
        ``'<1'``.
    """
    if x == '>20':
        return 21
    if x == '<1':
        return 0
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall back to 0. Unlike a bare `except`,
        # this lets KeyboardInterrupt / SystemExit propagate.
        return 0

class ExperienceTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that makes the ``experience`` column numeric.

    Every cell of the incoming DataFrame is passed through
    ``replace_end_value``.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the object can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a DataFrame shaped like ``X`` with every cell converted.

        ``X`` must be a pandas DataFrame.
        """
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
        # old name is deprecated. Check the class rather than the instance so a
        # column that happens to be called "map" cannot be picked up.
        elementwise = X.map if hasattr(type(X), "map") else X.applymap
        return elementwise(replace_end_value)
    
# Double brackets keep `experience` as a one-column DataFrame.
experience_data = project_data[["experience"]]
# Step 1 turns the text values into numbers, step 2 standardises them.
experience_pipeline = Pipeline([
    ('extreme_value_removal', ExperienceTransformer()),
    ('std_scaler', StandardScaler()),
])

# Trial run to inspect the transformed column.
experience_pipeline.fit_transform(experience_data)

array([[ 1.61014154],
       [ 0.7265914 ],
       [-0.74599217],
       ...,
       [ 1.61014154],
       [-1.48228396],
       [-1.18776725]])

In [13]:
def binary_hre(x):
    """Return 1 when ``x`` is exactly "Has relevent experience", otherwise 0."""
    return 1 if x == "Has relevent experience" else 0
    
class RelevantExperienceTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns ``relevent_experience`` into a 0/1 flag.

    Every cell of the incoming DataFrame is passed through ``binary_hre``.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the object can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a DataFrame shaped like ``X`` holding 1/0 in every cell.

        ``X`` must be a pandas DataFrame.
        """
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
        # old name is deprecated. Check the class rather than the instance so a
        # column that happens to be called "map" cannot be picked up.
        elementwise = X.map if hasattr(type(X), "map") else X.applymap
        return elementwise(binary_hre)

# Double brackets keep `relevent_experience` as a one-column DataFrame.
re_data = project_data[["relevent_experience"]]
# Single-step pipeline: map the two text labels to 1 / 0.
relevant_experience_pipeline = Pipeline([
    ('set_to_binary', RelevantExperienceTransformer()),
])
# Trial run to inspect the encoded column.
relevant_experience_pipeline.fit_transform(re_data)

Unnamed: 0,relevent_experience
0,1
1,0
2,0
3,0
4,1
...,...
19153,0
19154,1
19155,1
19156,1


In [14]:
class PandarizerTransformer(BaseEstimator, TransformerMixin):
    """Wrap the incoming data in a DataFrame with a fixed set of column labels.

    Used between pipeline steps so that a following step can rely on
    DataFrame methods.
    """

    def __init__(self, columns):
        # Column labels handed to the DataFrame constructor in transform().
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a DataFrame labelled with ``self.columns``."""
        labelled = pd.DataFrame(X, columns=self.columns)
        return labelled

project_data[["enrolled_university"]]

def to_enroll_transform(x):
    """Map an ``enrolled_university`` label to an ordinal code.

    "no_enrollment" and the placeholder "0" become 0, "Part time course"
    becomes 1 and "Full time course" becomes 2. Any other value is returned
    unchanged.
    """
    label_codes = (
        ("no_enrollment", 0),
        ("0", 0),
        ("Part time course", 1),
        ("Full time course", 2),
    )
    for label, code in label_codes:
        if x == label:
            return code
    return x

class EnrollmentTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that encodes ``enrolled_university`` as ordinals.

    Every cell of the incoming DataFrame is passed through
    ``to_enroll_transform``.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the object can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a DataFrame shaped like ``X`` with every cell encoded.

        ``X`` must be a pandas DataFrame.
        """
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
        # old name is deprecated. Check the class rather than the instance so a
        # column that happens to be called "map" cannot be picked up.
        elementwise = X.map if hasattr(type(X), "map") else X.applymap
        return elementwise(to_enroll_transform)


# 1) Replace missing enrollment values with the placeholder string "0"
#    (to_enroll_transform maps it to 0, the same code as "no_enrollment").
# 2) Re-wrap the imputer output as a DataFrame so the next step can use
#    DataFrame methods.
# 3) Convert the labels to ordinal codes.
enrollment_pipeline = Pipeline([
    ('fill_imputer', SimpleImputer(strategy='constant', fill_value="0")),
    ("pandarizer", PandarizerTransformer(["enrolled_university"])),
    ('enroll_numeric_transformer', EnrollmentTransformer()),
])
# Trial run to inspect the encoded column.
enrollment_pipeline.fit_transform(project_data[["enrolled_university"]])

Unnamed: 0,enrolled_university
0,0
1,0
2,2
3,0
4,0
...,...
19153,0
19154,0
19155,0
19156,0


In [15]:
def to_education_level_transform(x):
    """Map an ``education_level`` label to a score between 0.0 and 1.0.

    The five known levels are spaced 0.25 apart, from "Primary School" (0.0)
    to "Phd" (1.0). Any other value, including a missing one, scores 0.0.
    """
    ranked_levels = ("Primary School", "High School", "Graduate", "Masters", "Phd")
    for rank, level in enumerate(ranked_levels):
        if x == level:
            return rank * 0.25
    return 0.00

class EducationLevelTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that scores ``education_level`` on a 0-1 scale.

    Every cell of the incoming DataFrame is passed through
    ``to_education_level_transform``. The class derives from the scikit-learn
    base classes, like the other transformers in this notebook, so that it
    exposes ``get_params`` / ``fit_transform`` to pipelines.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the object can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a DataFrame shaped like ``X`` with every cell scored.

        ``X`` must be a pandas DataFrame.
        """
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
        # old name is deprecated. Check the class rather than the instance so a
        # column that happens to be called "map" cannot be picked up.
        elementwise = X.map if hasattr(type(X), "map") else X.applymap
        return elementwise(to_education_level_transform)

# Single-step pipeline: score each education level on a 0-1 scale.
# No imputer is needed here because the transform maps unknown values to 0.0.
education_level_pipeline = Pipeline([
    ('education_level_numeric_transformer', EducationLevelTransformer()),
])

# Trial run to inspect the encoded column.
education_level_pipeline.fit_transform(project_data[["education_level"]])

Unnamed: 0,education_level
0,0.50
1,0.50
2,0.50
3,0.50
4,0.75
...,...
19153,0.50
19154,0.50
19155,0.50
19156,0.25


In [26]:
'''
NaN: 5938
50-99        3083
100-500      2571
10000+       2019
10/49        1471
1000-4999    1328
<10          1308
500-999       877
5000-9999     563
'''

def to_company_size_transform(x):
    """Map a ``company_size`` label to one representative head count.

    The three irregular labels have fixed values ("<10" -> 5,
    "10000+" -> 10000, "10/49" -> 30). A "low-high" range becomes the floored
    midpoint of its bounds, and any other string is parsed as an integer.
    ``x`` must be a string.
    """
    irregular_sizes = (("<10", 5), ("10000+", 10000), ("10/49", 30))
    for label, size in irregular_sizes:
        if x == label:
            return size
    if '-' not in x:
        return int(x)
    low, high = [int(bound) for bound in x.split('-')]
    return (low + high) // 2

class CompanySizeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns ``company_size`` labels into numbers.

    Every cell of the incoming DataFrame is passed through
    ``to_company_size_transform``. The class derives from the scikit-learn
    base classes, like the other transformers in this notebook, so that it
    exposes ``get_params`` / ``fit_transform`` to pipelines.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the object can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a DataFrame shaped like ``X`` with every cell converted.

        ``X`` must be a pandas DataFrame whose cells are all strings.
        """
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
        # old name is deprecated. Check the class rather than the instance so a
        # column that happens to be called "map" cannot be picked up.
        elementwise = X.map if hasattr(type(X), "map") else X.applymap
        return elementwise(to_company_size_transform)

# 1) Replace missing sizes with the placeholder string "15", which
#    to_company_size_transform parses to the integer 15.
# 2) Re-wrap the imputer output as a DataFrame so the next step can use
#    DataFrame methods.
# 3) Convert every size label to a representative head count.
company_size_pipeline = Pipeline([
    ('fill_imputer', SimpleImputer(strategy="constant", fill_value="15")),
    ('pandarizer', PandarizerTransformer(["company_size"])),
    ('company_size_numeric_transformer', CompanySizeTransformer()),
])

# Trial run to inspect the encoded column.
company_size_pipeline.fit_transform(project_data[["company_size"]])

Unnamed: 0,company_size
0,15
1,74
2,15
3,15
4,74
...,...
19153,15
19154,15
19155,74
19156,749


In [59]:
# NaN: 423
# 1        8040
# >4       3290
# 2        2900
# never    2452
# 4        1029
# 3        1024
# Name: last_new_job, dtype: int64

def lnj_to_numeric(x):
    """Map a ``last_new_job`` label to a number.

    ">4" becomes 5, "never" becomes 10 and the strings "1" to "4" become the
    matching integers. Any other value (for example a missing one) is
    returned unchanged.
    """
    if x == ">4":
        return 5
    if x == "never":
        return 10
    if x in ("1", "2", "3", "4"):
        return int(x)
    return x

class LNJNumericTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns ``last_new_job`` labels into numbers.

    Every cell of the incoming DataFrame is passed through ``lnj_to_numeric``.
    The class derives from the scikit-learn base classes, like the other
    transformers in this notebook, so that it exposes ``get_params`` /
    ``fit_transform`` to pipelines.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the object can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a DataFrame shaped like ``X`` with every cell converted.

        ``X`` must be a pandas DataFrame. Missing values are passed through.
        """
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
        # old name is deprecated. Check the class rather than the instance so a
        # column that happens to be called "map" cannot be picked up.
        elementwise = X.map if hasattr(type(X), "map") else X.applymap
        return elementwise(lnj_to_numeric)

# 1) Convert the labels to numbers; missing values pass through unchanged.
# 2) Fill the remaining gaps with SimpleImputer's default strategy
#    (the column mean, per the scikit-learn documentation).
# 3) Re-wrap the result as a DataFrame labelled "last_new_job".
last_new_job_pipeline = Pipeline([
    ('numeric_transformer', LNJNumericTransformer()),
    ('fill_nan_imputer', SimpleImputer()),
    ('pandarizer', PandarizerTransformer(["last_new_job"])),
])

# Trial run to inspect the encoded column.
last_new_job_pipeline.fit_transform(project_data[["last_new_job"]])

Unnamed: 0,last_new_job
0,1.0
1,5.0
2,10.0
3,10.0
4,4.0
...,...
19153,1.0
19154,4.0
19155,4.0
19156,2.0


In [61]:
from sklearn.compose import ColumnTransformer

# Route each column (or column group) through its own pipeline and place the
# results side by side. Columns not listed here (e.g. gender, city,
# company_type, major_discipline, target) are left out of the result by
# ColumnTransformer's default `remainder` setting.
# NOTE(review): only the "num" and "experience" branches end in a
# StandardScaler; the other branches keep their raw codes (see company_size
# in the printed output) - confirm this is intended.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, ["city_development_index", "training_hours"]),
    ("experience", experience_pipeline, ["experience"]),
    ("relevant_experience", relevant_experience_pipeline, ["relevent_experience"]),
    ("enrolled_university", enrollment_pipeline, ["enrolled_university"]),
    ("education_level", education_level_pipeline, ["education_level"]),
    ("company_size", company_size_pipeline, ["company_size"]),
    ("last_new_job", last_new_job_pipeline, ["last_new_job"]),
])

# Fit every branch on the raw training frame and print the combined matrix.
data_prepared = full_pipeline.fit_transform(project_data)
print(data_prepared)

[[ 7.38919261e-01 -4.88984597e-01  1.61014154e+00 ...  5.00000000e-01
   1.50000000e+01  1.00000000e+00]
 [-4.28409756e-01 -3.05824943e-01  7.26591397e-01 ...  5.00000000e-01
   7.40000000e+01  5.00000000e+00]
 [-1.66059039e+00  2.93606651e-01 -7.45992174e-01 ...  5.00000000e-01
   1.50000000e+01  1.00000000e+01]
 ...
 [ 7.38919261e-01 -3.55777576e-01  1.61014154e+00 ...  5.00000000e-01
   7.40000000e+01  4.00000000e+00]
 [-2.17642017e-01  5.26718938e-01 -1.48228396e+00 ...  2.50000000e-01
   7.49000000e+02  2.00000000e+00]
 [ 2.11999913e-01  1.02624527e+00 -1.18776725e+00 ...  0.00000000e+00
   1.50000000e+01  1.00000000e+00]]
