In [1]:
## Problem statement
# Based on the data given for month predict the employee 
# who can leave for new job.
# 0 not looking for new job.
# 1 looking for new job.

In [1]:
## Libraries
#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [112]:
## Load training data and testing dataset
#
dataset_train = pd.read_csv('aug_train.csv')
column_name = list(dataset_train.columns)

print("Training size : {}".format(len(dataset_train)))
dataset_test = pd.read_csv('aug_test.csv')
dataset_train = dataset_train.append(dataset_test, ignore_index = True)

# See 5 rows
dataset_train.head()

Training size : 19158


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [113]:
## Check missing value of each column
# count
dataset_train.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                    5016
relevent_experience          0
enrolled_university        417
education_level            512
major_discipline          3125
experience                  70
company_size              6560
company_type              6774
last_new_job               463
training_hours               0
target                    2129
dtype: int64

In [114]:
# percentage
missing_percentage = [(clm_name, dataset_train[clm_name].isna().mean() * 100) for clm_name in dataset_train]
missing_percentage = pd.DataFrame(missing_percentage, columns = ['column_names', 'percentage'])
missing_percentage

Unnamed: 0,column_names,percentage
0,enrollee_id,0.0
1,city,0.0
2,city_development_index,0.0
3,gender,23.563677
4,relevent_experience,0.0
5,enrolled_university,1.958942
6,education_level,2.405224
7,major_discipline,14.680321
8,experience,0.328839
9,company_size,30.816931


In [115]:
## Handle missing values
# Using frequency distribution of each category
# and select the top category which is mode.
dataset_train = dataset_train.fillna(dataset_train['gender'].value_counts().index[0])


In [116]:
# Check results
dataset_train.isna().sum()

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

In [117]:
dataset_train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,Male,Male,1,36,1
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0
2,11561,city_21,0.624,Male,No relevent experience,Full time course,Graduate,STEM,5,Male,Male,never,83,0
3,33241,city_115,0.789,Male,No relevent experience,Male,Graduate,Business Degree,<1,Male,Pvt Ltd,never,52,1
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0


In [118]:
# Load training data
#
dataset_train = pd.read_csv('aug_train.csv')
column_name = list(dataset_train.columns)

print("Training size : {}".format(len(dataset_train)))
dataset_test = pd.read_csv('aug_test.csv')
dataset_train = dataset_train.append(dataset_test, ignore_index = True)

Training size : 19158


In [119]:
# Using SimpleImputer
# After using simple imputer the column names 
# change into numeric index
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy="most_frequent")
dataset_train = imputer.fit_transform(dataset_train)

dataset_train = pd.DataFrame(dataset_train)
dataset_train.columns = column_name
dataset_train.isna().sum()

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

In [120]:
## Check data type of each column
#
dataset_train.dtypes

enrollee_id               object
city                      object
city_development_index    object
gender                    object
relevent_experience       object
enrolled_university       object
education_level           object
major_discipline          object
experience                object
company_size              object
company_type              object
last_new_job              object
training_hours            object
target                    object
dtype: object

In [121]:
## Change data type of column into respective
#
dataset_train.enrollee_id = dataset_train.enrollee_id.astype('int')

dataset_train.city_development_index = dataset_train.city_development_index.astype('int')

dataset_train.training_hours = dataset_train.training_hours.astype('float')

dataset_train.target = dataset_train.target.astype('int32')

dataset_train.city = dataset_train.city.astype('category')

# using dictionary to convert specific columns 
convert_dict = {'city': 'category',
                'gender': 'category',
                'relevent_experience':'category',
                'enrolled_university':'category',
                'education_level':'category',
                'major_discipline':'category',
                'experience':'category',
                'company_size':'category',
                'company_type':'category',
                'last_new_job': 'category'
               } 
  
dataset_train = dataset_train.astype(convert_dict) 
dataset_train.dtypes


enrollee_id                  int32
city                      category
city_development_index       int32
gender                    category
relevent_experience       category
enrolled_university       category
education_level           category
major_discipline          category
experience                category
company_size              category
company_type              category
last_new_job              category
training_hours             float64
target                       int32
dtype: object

In [122]:
## Handle categorical datatype columns
# Using LabelEncoder for multiple columns
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

cl_names = ['city','gender','relevent_experience','enrolled_university',
 'education_level',
 'major_discipline',
 'experience',
 'company_size',
 'company_type',
 'last_new_job']

dataset_train[cl_names] = dataset_train[cl_names].apply(encoder.fit_transform)
dataset_train.head()


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,5,0,1,0,2,0,5,21,4,5,0,36.0,1
1,29725,77,0,1,1,2,0,5,6,4,5,4,47.0,0
2,11561,64,0,1,1,0,0,5,15,4,5,5,83.0,0
3,33241,14,0,1,1,2,0,1,20,4,5,5,52.0,1
4,666,50,0,1,0,2,2,5,21,4,1,3,8.0,0


In [123]:
## Back to test data
#
dataset_test =dataset_train[19158:]
dataset_test.head()


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
19158,32403,78,0,1,0,0,0,5,19,7,5,0,21.0,0
19159,9858,5,0,0,0,2,0,5,15,4,5,0,98.0,0
19160,31806,64,0,1,1,2,1,5,20,4,5,5,15.0,0
19161,27385,26,0,1,0,2,2,5,2,0,5,0,39.0,0
19162,27724,5,0,1,0,2,0,5,21,3,5,4,72.0,0


In [126]:
## Check missing value of each column
# count
dataset_test.isna().sum()

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

In [130]:
## Build model
#
independent_variables =["enrollee_id","city","city_development_index","gender","relevent_experience","enrolled_university","education_level","major_discipline","experience","company_size","company_type","last_new_job","training_hours"]

dependent_variables = 'target'

## Installation
# pip install catboost
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import r2_score, mean_squared_error

model = CatBoostRegressor(objective='RMSE')
model.fit(dataset_train[independent_variables], dataset_train[dependent_variables])


earn: 0.3529082	total: 4.02s	remaining: 2.51s
615:	learn: 0.3528920	total: 4.02s	remaining: 2.51s
616:	learn: 0.3528617	total: 4.03s	remaining: 2.5s
617:	learn: 0.3528376	total: 4.04s	remaining: 2.5s
618:	learn: 0.3527931	total: 4.04s	remaining: 2.49s
619:	learn: 0.3527472	total: 4.05s	remaining: 2.48s
620:	learn: 0.3526988	total: 4.06s	remaining: 2.48s
621:	learn: 0.3526660	total: 4.07s	remaining: 2.47s
622:	learn: 0.3526299	total: 4.07s	remaining: 2.46s
623:	learn: 0.3525868	total: 4.08s	remaining: 2.46s
624:	learn: 0.3525445	total: 4.08s	remaining: 2.45s
625:	learn: 0.3525166	total: 4.09s	remaining: 2.44s
626:	learn: 0.3524822	total: 4.09s	remaining: 2.44s
627:	learn: 0.3524590	total: 4.1s	remaining: 2.43s
628:	learn: 0.3524117	total: 4.11s	remaining: 2.42s
629:	learn: 0.3523848	total: 4.11s	remaining: 2.42s
630:	learn: 0.3523438	total: 4.12s	remaining: 2.41s
631:	learn: 0.3523082	total: 4.12s	remaining: 2.4s
632:	learn: 0.3522934	total: 4.13s	remaining: 2.39s
633:	learn: 0.3522440	

<catboost.core.CatBoostRegressor at 0x2cc45392248>

In [131]:
## Predictions
#
predictions = model.predict(dataset_test[independent_variables])
predictions

array([ 0.14209087,  0.26018582,  0.3081245 , ..., -0.03572781,
        0.03118641,  0.07421645])

In [134]:
## Accuracy
#
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(dataset_train[dependent_variables],  model.predict(dataset_train[independent_variables]))
metrics.auc(fpr, tpr)

0.8791558759942371

In [135]:
## Generate report
#
results = pd.DataFrame({'enroll_id':dataset_test.enrollee_id, 'result':predictions})
results.head()

Unnamed: 0,enroll_id,result
19158,32403,0.142091
19159,9858,0.260186
19160,31806,0.308125
19161,27385,0.091266
19162,27724,0.109962


In [145]:
# segregate result
def segregator(data):
    if(data.result > 0.5):
        return 1
    else:
        return 0
    
results['result'] = results.apply(func = segregator, axis = 'columns')

results = results.sort_values(by = 'result', ascending = False)
print("Following are enroll_id are about to get new job as per our preduction")
top10 = results[:10]
print(top10)

Following are enroll_id are about to get new job as per our preduction
       enroll_id  result
21010      22468       1
20460       7645       1
19418      26325       1
20036      27338       1
21044      22430       1
21043      19648       1
21042      25614       1
20031      22520       1
20474      28252       1
20808      19550       1


In [146]:
# Sheet
file_name = "final_results.xlsx"
results.to_excel(file_name, index = False)
print("prediction list generated and saved.")


prediction list generated and saved.


In [2]:
! jupyter nbconvert --to script new_job.ipynb

[NbConvertApp] Converting notebook new_job.ipynb to script
[NbConvertApp] Writing 5314 bytes to new_job.py


In [None]:
## End
#