In [1]:
# HR Analytics: Job Change of Data Scientists
# https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists?select=aug_train.csv

In [2]:
import os
import pandas as pd

def load_data():
    csv_path = os.path.join("aug_train.csv")
    return pd.read_csv(csv_path)

In [3]:
project_data = load_data()
project_data.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [4]:
fields = ["relevent_experience", "gender", "enrolled_university", "education_level", "major_discipline",
          "experience", "company_size", "company_type", "last_new_job", "city"]

for field in fields:
    print(project_data[field].value_counts())
    print()

Has relevent experience    13792
No relevent experience      5366
Name: relevent_experience, dtype: int64

Male      13221
Female     1238
Other       191
Name: gender, dtype: int64

no_enrollment       13817
Full time course     3757
Part time course     1198
Name: enrolled_university, dtype: int64

Graduate          11598
Masters            4361
High School        2017
Phd                 414
Primary School      308
Name: education_level, dtype: int64

STEM               14492
Humanities           669
Other                381
Business Degree      327
Arts                 253
No Major             223
Name: major_discipline, dtype: int64

>20    3286
5      1430
4      1403
3      1354
6      1216
2      1127
7      1028
10      985
9       980
8       802
15      686
11      664
14      586
1       549
<1      522
16      508
12      494
13      399
17      342
19      304
18      280
20      148
Name: experience, dtype: int64

50-99        3083
100-500      2571
10000+       2019
10/

In [5]:
project_data.describe()

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
count,19158.0,19158.0,19158.0,19158.0
mean,16875.358179,0.828848,65.366896,0.249348
std,9616.292592,0.123362,60.058462,0.432647
min,1.0,0.448,1.0,0.0
25%,8554.25,0.74,23.0,0.0
50%,16982.5,0.903,47.0,0.0
75%,25169.75,0.92,88.0,0.0
max,33380.0,0.949,336.0,1.0


In [6]:
import numpy as np

np.random.seed(42)
project_data.shape

(19158, 14)

In [7]:
corr_matrix = project_data.corr()

In [8]:
corr_matrix

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
enrollee_id,1.0,-0.040455,0.000998,0.049475
city_development_index,-0.040455,1.0,0.00192,-0.341665
training_hours,0.000998,0.00192,1.0,-0.021577
target,0.049475,-0.341665,-0.021577,1.0


In [9]:
'''
Modify data in the following way:
    - drop enrollee_id column
    - replace NaN gender values with unlisted - consider dropping these rows instead
    - 
'''

project_data_clean = project_data.drop("enrollee_id", axis=1)
project_data_clean['gender'] = project_data_clean['gender'].fillna('Unlisted')

project_data_clean.tail(10)

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
19148,city_21,0.624,Unlisted,Has relevent experience,no_enrollment,Masters,STEM,3,100-500,Pvt Ltd,3,40,1.0
19149,city_103,0.92,Male,Has relevent experience,no_enrollment,Masters,STEM,9,50-99,Pvt Ltd,1,36,1.0
19150,city_160,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,10,100-500,Public Sector,3,23,0.0
19151,city_149,0.689,Male,No relevent experience,Full time course,Graduate,,2,,,1,60,0.0
19152,city_103,0.92,Female,Has relevent experience,no_enrollment,Graduate,Humanities,7,10/49,Funded Startup,1,25,0.0
19153,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0
19157,city_67,0.855,Unlisted,No relevent experience,no_enrollment,Primary School,,2,,,1,127,0.0


In [10]:
from sklearn.preprocessing import OneHotEncoder

gender_category = project_data_clean[["gender"]]
gender_encoder = OneHotEncoder()
ohe_gender = gender_encoder.fit_transform(gender_category)

ohe_gender.toarray()[-10:]

array([[0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]])

In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numeric_attributes = ["city_development_index", "training_hours"]
almost_numeric_attributes = ["last_new_job", "company_size", "experience", "education_level"]

# numeric_data = project_data.select_dtypes(include=np.number)
numeric_data = project_data[numeric_attributes]

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

numeric_project_data_tr = num_pipeline.fit_transform(numeric_data)

In [32]:
from sklearn.base import BaseEstimator, TransformerMixin

def replace_end_value(x):
    if x == '>20':
        return 20
    elif x == '<1':
        return 0
    elif x == NaN:
        return NaN
    else:
        return int(x)

class ExperienceTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.apply(replace_end_value)
    
experience_data = project_data[["experience"]]
experience_pipeline = Pipeline([
    ('extreme_value_removal', ExperienceTransformer())
])

experience_pipeline.fit_transform(experience_data)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [21]:
from sklearn.compose import ColumnTransformer


full_pipeline = ColumnTransformer([
    ("num", num_pipeline, numeric_attributes)
])

data_prepared = full_pipeline.fit_transform(project_data)
data_prepared

array([[ 0.73891926, -0.4889846 ],
       [-0.42840976, -0.30582494],
       [-1.66059039,  0.29360665],
       ...,
       [ 0.73891926, -0.35577758],
       [-0.21764202,  0.52671894],
       [ 0.21199991,  1.02624527]])