## HR Analytics dataset

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Reading the data

In [4]:
# Load the training data from the working directory and preview the first five rows.
df = pd.read_csv('aug_train.csv')
df.head(n=5)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [5]:
# Number of missing values in each column.
df.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64

In [6]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(19158, 14)

In [9]:
# Keep an independent snapshot of the frame as it is at this point.
# Plain assignment (`df_copy = df`) would only create a second name for the
# same object, so any later in-place edit of `df` would also change `df_copy`.
df_copy = df.copy()

In [7]:
# Class balance of the target label (missing values, if any, are not counted).
df['target'].value_counts(dropna=True)

0.0    14381
1.0     4777
Name: target, dtype: int64

### Initial observations:
#### 1. The `target` column is our target feature
#### 2. The data is imbalanced between the classes
#### 3. There are many missing values in the dataset
#### 4. The dataset is moderately sized (19,158 rows, 14 columns)
#### 5. Gender, relevant experience, enrolled university, education level, major discipline, and company type are the categorical features

### Starting feature engineering (FE) and EDA

In [11]:
# Remove the enrollee_id column (presumably a per-row identifier with no
# predictive value -- confirm) before feature engineering.
# Rebind `df` instead of dropping in place: an in-place drop would also alter
# any other name bound to the same object, and would raise KeyError if this
# cell were executed a second time. errors='ignore' keeps the cell re-runnable.
df = df.drop(columns='enrollee_id', errors='ignore')
df.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


### handling the gender column

In [12]:
# Frequency of each gender label; missing values are excluded from the counts.
df['gender'].value_counts(dropna=True)

Male      13221
Female     1238
Other       191
Name: gender, dtype: int64

In [13]:
# Distinct values in the gender column, including NaN for missing entries.
df['gender'].unique()

array(['Male', nan, 'Female', 'Other'], dtype=object)

In [15]:
# Number of missing gender entries.
df['gender'].isna().sum()

4508

In [18]:
# The gender classes are heavily imbalanced, so collapse the column to two
# levels before one-hot encoding: 'Male' is kept as-is and every other value
# ('Female', 'Other' and missing entries) is labelled 'Female'.
# NOTE(review): this also assigns the 4508 missing values to 'Female' --
# confirm that this imputation is intended.
df['gender'] = df['gender'].where(df['gender'].eq('Male'), 'Female')

In [19]:
# Re-check the label counts now that gender has only two levels.
df['gender'].value_counts(dropna=True)

Male      13221
Female     5937
Name: gender, dtype: int64

In [21]:
# Confirm that no missing gender values remain.
df['gender'].isna().sum()

0

In [33]:
# One-hot encode the two-level gender column into integer indicator columns
# (gender_Female, gender_Male) and append them to the frame.
# NOTE(review): with only two levels the indicators are exact complements of
# each other; consider keeping just one of them for linear models.
gender = pd.get_dummies(df['gender'], prefix='gender', dtype=int)
# Remove indicator columns left over from an earlier run of this cell so that
# re-executing it does not append duplicate column names.
df = pd.concat([df.drop(columns=gender.columns, errors='ignore'), gender], axis=1)
df.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,gender_Female,gender_Male
0,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36.0,1.0,0,1
1,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47.0,0.0,0,1
2,city_21,0.624,Female,No relevent experience,Full time course,Graduate,STEM,5,,,never,83.0,0.0,1,0
3,city_115,0.789,Female,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52.0,1.0,1,0
4,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8.0,0.0,0,1


In [34]:
# The raw gender column is now redundant with its indicator columns, so remove it.
# Rebinding (rather than inplace=True) with errors='ignore' lets this cell be
# re-run without raising KeyError once the column is already gone.
df = df.drop(columns='gender', errors='ignore')

In [35]:
# Preview the frame after the raw gender column has been removed.
df.head(n=5)

Unnamed: 0,city,city_development_index,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,gender_Female,gender_Male
0,city_103,0.92,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36.0,1.0,0,1
1,city_40,0.776,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47.0,0.0,0,1
2,city_21,0.624,No relevent experience,Full time course,Graduate,STEM,5,,,never,83.0,0.0,1,0
3,city_115,0.789,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52.0,1.0,1,0
4,city_162,0.767,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8.0,0.0,0,1


### handling the relevant experience column