In [118]:
import pandas as pd
import numpy as np
import math
import gc
import os

import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from xgboost                          import XGBClassifier
from catboost                         import CatBoostClassifier
from lightgbm                         import LGBMClassifier

from sklearn.preprocessing            import LabelEncoder, OneHotEncoder 
from sklearn.preprocessing            import StandardScaler, MinMaxScaler, Normalizer, RobustScaler, MaxAbsScaler
from sklearn.model_selection          import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.tree                     import DecisionTreeClassifier
from sklearn.ensemble                 import VotingClassifier, RandomForestClassifier
from sklearn.metrics                  import f1_score, confusion_matrix, classification_report

In [71]:
# Load the three competition files: the training rows (with is_promoted),
# the test rows, and the sample submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './input/train.csv',
        './input/test.csv',
        './input/sample_submission.csv',
    )
)

In [72]:
# Peek at the sample submission to see the expected output columns.
submission.head(n=5)

Unnamed: 0,employee_id,is_promoted
0,8724,0
1,74430,0
2,72255,0
3,38562,0
4,64486,0


In [73]:
# Summary statistics (count, mean, std, quartiles) of the training columns.
train.describe()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,50684.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,39195.830627,1.253011,34.803915,3.329256,5.865512,0.351974,0.023172,63.38675,0.08517
std,22586.581449,0.609264,7.660169,1.259993,4.265094,0.47759,0.15045,13.371559,0.279137
min,1.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0


In [74]:
# First rows of the raw training frame.
train.head(n=5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [75]:
# First rows of the raw test frame (same columns as train minus is_promoted).
test.head(n=5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [76]:
# Count missing values per training column.
display(train.isna().sum())

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [77]:
# Missing previous_year_rating values become 0 in both splits.
for frame in (train, test):
    frame["previous_year_rating"] = frame["previous_year_rating"].fillna(0)

In [78]:
# Replace missing values with '' in the non-numeric columns only, so that a
# missing category becomes its own (empty-string) level when the frame is
# one-hot encoded later. Numeric columns are deliberately left untouched:
# writing '' into one would turn the whole column into object dtype.
train = train.fillna(
    {col: '' for col in train.select_dtypes(exclude='number').columns}
)
test = test.fillna(
    {col: '' for col in test.select_dtypes(exclude='number').columns}
)

In [79]:
# Bucket age by decade (floor(age / 10), stored as a float) in both splits.
for frame in (train, test):
    frame['Age_bin_round'] = np.floor(frame['age'].to_numpy() / 10.)

In [80]:
# Remove the raw age column from both splits (Age_bin_round is kept),
# then preview the training frame.
train = train.drop(columns=['age'])
test = test.drop(columns=['age'])
train.head(n=5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,Age_bin_round
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,5.0,8,1,0,49,0,3.0
1,65141,Operations,region_22,Bachelor's,m,other,1,5.0,4,0,0,60,0,3.0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,3.0,7,0,0,50,0,3.0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,1.0,10,0,0,50,0,3.0
4,48945,Technology,region_26,Bachelor's,m,other,1,3.0,2,0,0,73,0,4.0


In [81]:
# First rows of the test frame after the preprocessing above.
test.head(n=5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,Age_bin_round
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,0.0,1,1,0,77,2.0
1,74430,HR,region_4,Bachelor's,f,other,1,3.0,5,0,0,51,3.0
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,1.0,4,0,0,47,3.0
3,38562,Procurement,region_2,Bachelor's,f,other,3,2.0,9,0,0,65,3.0
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,4.0,7,0,0,61,3.0


In [82]:
# Separate the target (kept as a one-column DataFrame) from the training
# features; the test frame is used unchanged as the test feature set.
# NOTE(review): employee_id remains among the features — confirm this is intended.
y_train = train[['is_promoted']]
X_train = train.drop(columns='is_promoted')
X_test = test
y_train.head(n=5)

Unnamed: 0,is_promoted
0,0
1,0
2,0
3,0
4,0


In [83]:
#OHE
def data_encoding(df, cols=None):
    """One-hot encode categorical columns of ``df`` and return a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.
    cols : list-like of str, optional
        Names of the columns to one-hot encode. When omitted or empty,
        ``pd.get_dummies`` picks the columns itself (its default behaviour),
        which is what this function did before ``cols`` was honoured.

    Returns
    -------
    pandas.DataFrame
        ``df`` with each encoded column replaced by ``<column>_<value>``
        indicator columns; all other columns are passed through unchanged.

    Raises
    ------
    KeyError
        If ``cols`` names a column that is not present in ``df``.

    Notes
    -----
    The indicator columns are derived from the values present in ``df``, so
    two frames encoded separately can end up with different columns.
    """
    if cols is None or len(cols) == 0:
        return pd.get_dummies(df)

    # Restrict the encoding to the requested columns instead of ignoring them.
    return pd.get_dummies(df, columns=list(cols))

In [85]:
encoding_columns = ["department", "region", "education", "gender", "recruitment_channel"]
train_encode = data_encoding(X_train, encoding_columns)

# Test is encoded on its own, so its indicator columns depend on the values
# that happen to occur in test. Force it onto the training layout: indicator
# columns missing from test are added as all-zero, columns unknown to train
# are dropped, and the column order matches train exactly.
test_encode = data_encoding(X_test, encoding_columns).reindex(
    columns=train_encode.columns, fill_value=0
)

In [86]:
# First rows of the one-hot encoded training features.
train_encode.head(n=5)

Unnamed: 0,employee_id,no_of_trainings,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,Age_bin_round,department_Analytics,department_Finance,...,region_region_9,education_,education_Bachelor's,education_Below Secondary,education_Master's & above,gender_f,gender_m,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing
0,65438,1,5.0,8,1,0,49,3.0,0,0,...,0,0,0,0,1,1,0,0,0,1
1,65141,1,5.0,4,0,0,60,3.0,0,0,...,0,0,1,0,0,0,1,1,0,0
2,7513,1,3.0,7,0,0,50,3.0,0,0,...,0,0,1,0,0,0,1,0,0,1
3,2542,2,1.0,10,0,0,50,3.0,0,0,...,0,0,1,0,0,0,1,1,0,0
4,48945,1,3.0,2,0,0,73,4.0,0,0,...,0,0,1,0,0,0,1,1,0,0


In [87]:
def data_scaling(df, cols, scaler=None):
    """Scale the columns ``cols`` of ``df`` and return the result as a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scale. It is not modified; a copy is
        scaled and returned.
    cols : list-like of column labels
        Columns to scale.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When given, it is applied as-is, which lets
        a scaler fitted on one frame (e.g. training data) be reused on
        another. When omitted, a new ``RobustScaler`` is fitted on
        ``df[cols]`` itself.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``cols`` replaced by their scaled values.
    """
    # Work on a copy so the caller's frame keeps its unscaled values and
    # re-running the calling cell starts from the same input.
    scaled = df.copy()

    if scaler is None:
        values = RobustScaler().fit_transform(scaled[cols])
    else:
        values = scaler.transform(scaled[cols])

    scaled[cols] = values
    return scaled

In [88]:

# Fit the scaler on the training features only and reuse it for test, so a
# given raw value is mapped to the same scaled value in both splits. Fitting a
# second scaler on test would centre/scale test by its own statistics and
# shift it relative to what the models were trained on.
feature_scaler = RobustScaler().fit(train_encode)

train_scale = pd.DataFrame(
    feature_scaler.transform(train_encode),
    columns=train_encode.columns,
    index=train_encode.index,
)
# Select the training columns explicitly: this fixes the column order and
# raises a KeyError if test is missing a feature the scaler was fitted on.
test_scale = pd.DataFrame(
    feature_scaler.transform(test_encode[train_encode.columns]),
    columns=train_encode.columns,
    index=test_encode.index,
)

In [129]:
# Hyper-parameter sets for the candidate boosters, followed by the estimators
# built from them. Each dict is unpacked with ** so that its entries become
# constructor keyword arguments; passing the dict itself (as `params=` or
# positionally) does not apply the settings.
# Note: xgb1, lgb1 and cb1 are rebuilt with inline arguments in the next cell
# before the voting ensemble is assembled.
params_xgb_1 = {
    'learning_rate': 0.1,
    'n_estimators': 494,
    'max_depth': 5,
    'subsample': 0.7968351296815959,
    'verbosity': 0,
    'scale_pos_weight': 2.5,
    'updater': 'grow_histmaker',
    'base_score': 0.2,
}
params_xgb_2 = {
    # XGBoost's scikit-learn wrapper names the thread count `n_jobs`.
    'n_jobs': 20,
    'random_state': 2000,
    'learning_rate': 0.15,
    'n_estimators': 497,
}
params_lgb_1 = {
    'subsample_freq': 2,
    'importance_type': 'gain',
    'objective': 'binary',
    'verbosity': -1,
    'boosting_type': 'dart',
    'max_bin': 60,
    'num_leaves': 300,
    'num_threads': 20,
    'learning_rate': 0.15,
    'num_iterations': 494,
    'max_depth': 5,
    'scale_pos_weight': 2.5,
}
params_lgb_2 = {
    'objective': 'binary',
    'boosting_type': 'dart',
    'boost_from_average': True,
    'num_threads': 20,
    'random_state': 1000,
    'learning_rate': 0.1,
    'num_iterations': 494,
    'importance_type': 'gain',
    'subsample_freq': 2,
    'verbosity': -1,
    'max_bin': 60,
    'num_leaves': 300,
}
params_cb = {
    'learning_rate': 0.15,
    'n_estimators': 494,
    'subsample': 0.085,
    'max_depth': 5,
    'scale_pos_weight': 2.5,
}
# GradientBoostingClassifier takes `n_estimators` for the number of boosting
# rounds and has no thread-count argument, so only keys it accepts are listed.
params_gbc_1 = {
    'random_state': 1000,
    'learning_rate': 0.1,
    'n_estimators': 494,
}
params_gbc_2 = {
    'random_state': 2000,
    'learning_rate': 0.1,
    'n_estimators': 494,
}

xgb1 = xgb.XGBClassifier(**params_xgb_1)
xgb2 = xgb.XGBClassifier(**params_xgb_2)
lgb1 = lgb.LGBMClassifier(**params_lgb_1)
lgb2 = lgb.LGBMClassifier(**params_lgb_2)
gbc1 = GradientBoostingClassifier(**params_gbc_1)
gbc2 = GradientBoostingClassifier(**params_gbc_2)
cb1 = CatBoostClassifier(**params_cb)

# voting = VotingClassifier(estimators=[('xgb1', xgb1),('xgb2',xgb2),('lgb1',lgb1),('lgb2',lgb2)], voting='soft')

# voting = VotingClassifier(estimators=[('lgb1',lgb1),('lgb2',lgb2)], voting='soft')

# voting = VotingClassifier(estimators=[('lgb1',lgb1),('xgb1',xgb1),('cb1',cb1)], voting='soft',weights=[5.2,5,5])

In [131]:
# Base learners of the final ensemble, each configured inline.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=494,
    max_depth=5,
    subsample=0.70,
    verbosity=0,
    scale_pos_weight=2.5,
    updater="grow_histmaker",
    base_score=0.2,
)

cb1 = CatBoostClassifier(
    learning_rate=0.15,
    n_estimators=494,
    subsample=0.085,
    max_depth=5,
    scale_pos_weight=2.5,
)

lgb1 = LGBMClassifier(
    objective="binary",
    boosting_type='dart',
    importance_type="gain",
    learning_rate=0.15,
    n_estimators=494,
    max_depth=5,
    num_leaves=300,
    max_bin=60,
    subsample_freq=2,
    scale_pos_weight=2.5,
    verbosity=-1,
)

# Soft voting averages the members' predicted probabilities using the weights
# below (LightGBM is weighted slightly higher than the other two).
ensemble_members = [('lgb1', lgb1), ('xgb1', xgb1), ('cb1', cb1)]
voting = VotingClassifier(
    estimators=ensemble_members,
    voting='soft',
    weights=[5.2, 5, 5],
)

In [132]:
# y_train is a one-column DataFrame; flatten it to the 1-D target that
# fit() expects instead of passing a column vector.
model = voting.fit(train_scale, np.ravel(y_train))


0:	learn: 0.5934158	total: 76.2ms	remaining: 37.5s
1:	learn: 0.5183410	total: 98.3ms	remaining: 24.2s
2:	learn: 0.4486699	total: 121ms	remaining: 19.8s
3:	learn: 0.4243598	total: 143ms	remaining: 17.5s
4:	learn: 0.3919670	total: 166ms	remaining: 16.2s
5:	learn: 0.3824993	total: 189ms	remaining: 15.4s
6:	learn: 0.3624580	total: 212ms	remaining: 14.7s
7:	learn: 0.3420449	total: 240ms	remaining: 14.6s
8:	learn: 0.3370621	total: 262ms	remaining: 14.1s
9:	learn: 0.3337417	total: 297ms	remaining: 14.4s
10:	learn: 0.3238366	total: 320ms	remaining: 14.1s
11:	learn: 0.3225940	total: 357ms	remaining: 14.3s
12:	learn: 0.3177527	total: 382ms	remaining: 14.1s
13:	learn: 0.3100267	total: 405ms	remaining: 13.9s
14:	learn: 0.3062602	total: 428ms	remaining: 13.7s
15:	learn: 0.3033961	total: 453ms	remaining: 13.5s
16:	learn: 0.2969590	total: 492ms	remaining: 13.8s
17:	learn: 0.2963793	total: 516ms	remaining: 13.7s
18:	learn: 0.2927184	total: 539ms	remaining: 13.5s
19:	learn: 0.2916135	total: 562ms	remai

164:	learn: 0.2471430	total: 4.21s	remaining: 8.4s
165:	learn: 0.2470614	total: 4.24s	remaining: 8.38s
166:	learn: 0.2460927	total: 4.26s	remaining: 8.35s
167:	learn: 0.2460218	total: 4.29s	remaining: 8.32s
168:	learn: 0.2459553	total: 4.31s	remaining: 8.29s
169:	learn: 0.2458871	total: 4.33s	remaining: 8.26s
170:	learn: 0.2458306	total: 4.36s	remaining: 8.24s
171:	learn: 0.2456979	total: 4.39s	remaining: 8.22s
172:	learn: 0.2455366	total: 4.42s	remaining: 8.19s
173:	learn: 0.2454316	total: 4.44s	remaining: 8.16s
174:	learn: 0.2453469	total: 4.46s	remaining: 8.13s
175:	learn: 0.2451592	total: 4.48s	remaining: 8.1s
176:	learn: 0.2450049	total: 4.51s	remaining: 8.07s
177:	learn: 0.2449163	total: 4.53s	remaining: 8.04s
178:	learn: 0.2448309	total: 4.55s	remaining: 8.01s
179:	learn: 0.2447129	total: 4.58s	remaining: 7.99s
180:	learn: 0.2446145	total: 4.61s	remaining: 7.96s
181:	learn: 0.2445155	total: 4.63s	remaining: 7.94s
182:	learn: 0.2443702	total: 4.65s	remaining: 7.91s
183:	learn: 0.

323:	learn: 0.2331504	total: 8.83s	remaining: 4.63s
324:	learn: 0.2331167	total: 8.86s	remaining: 4.6s
325:	learn: 0.2330801	total: 8.88s	remaining: 4.58s
326:	learn: 0.2330335	total: 8.9s	remaining: 4.54s
327:	learn: 0.2329777	total: 8.92s	remaining: 4.51s
328:	learn: 0.2329077	total: 8.94s	remaining: 4.49s
329:	learn: 0.2328217	total: 8.97s	remaining: 4.46s
330:	learn: 0.2327963	total: 8.99s	remaining: 4.43s
331:	learn: 0.2327525	total: 9.02s	remaining: 4.4s
332:	learn: 0.2327015	total: 9.04s	remaining: 4.37s
333:	learn: 0.2326094	total: 9.06s	remaining: 4.34s
334:	learn: 0.2325300	total: 9.08s	remaining: 4.31s
335:	learn: 0.2324857	total: 9.11s	remaining: 4.28s
336:	learn: 0.2324550	total: 9.13s	remaining: 4.25s
337:	learn: 0.2323531	total: 9.15s	remaining: 4.22s
338:	learn: 0.2322942	total: 9.17s	remaining: 4.19s
339:	learn: 0.2322335	total: 9.2s	remaining: 4.17s
340:	learn: 0.2322055	total: 9.23s	remaining: 4.14s
341:	learn: 0.2321513	total: 9.26s	remaining: 4.12s
342:	learn: 0.23

485:	learn: 0.2224632	total: 12.9s	remaining: 212ms
486:	learn: 0.2224035	total: 12.9s	remaining: 185ms
487:	learn: 0.2223806	total: 12.9s	remaining: 159ms
488:	learn: 0.2222744	total: 12.9s	remaining: 132ms
489:	learn: 0.2222206	total: 13s	remaining: 106ms
490:	learn: 0.2221829	total: 13s	remaining: 79.3ms
491:	learn: 0.2221011	total: 13s	remaining: 52.8ms
492:	learn: 0.2220368	total: 13s	remaining: 26.4ms
493:	learn: 0.2219750	total: 13.1s	remaining: 0us


In [133]:
# Keep only the second probability column, i.e. the class at index 1
# (presumably is_promoted == 1, given the 0/1 target — confirm via model.classes_).
class_probabilities = model.predict_proba(test_scale)
predict = class_probabilities[:, 1]

In [134]:
# Round each probability to the nearest integer to obtain hard 0/1 labels.
predictions_2 = list(map(int, map(round, predict)))

# Pair the labels with the employee ids from the test frame and write the
# submission file without the index column.
Result_Promoted = pd.DataFrame(
    {'employee_id': test["employee_id"], 'is_promoted': predictions_2}
)
Result_Promoted.to_csv("ss" + ".csv", index=False)