In [16]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

import catboost as cb
from sklearn import model_selection, metrics, linear_model, ensemble, naive_bayes, neighbors, svm, tree
from xgboost import XGBClassifier
import lightgbm

import os
import gc
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv
/kaggle/input/playground-series-s3e3/sample_submission.csv
/kaggle/input/playground-series-s3e3/train.csv
/kaggle/input/playground-series-s3e3/test.csv


In [3]:
train = pd.read_csv(r'/kaggle/input/playground-series-s3e3/train.csv')
train_o = pd.read_csv(r'/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')
test = pd.read_csv(r'/kaggle/input/playground-series-s3e3/test.csv')
ss = pd.read_csv(r'/kaggle/input/playground-series-s3e3/sample_submission.csv')

In [4]:
print(train.shape, train_o.shape, test.shape)

(1677, 35) (1470, 35) (1119, 34)


In [5]:
train.isnull().sum() + train_o.isnull().sum()

Age                         0.0
Attrition                   0.0
BusinessTravel              0.0
DailyRate                   0.0
Department                  0.0
DistanceFromHome            0.0
Education                   0.0
EducationField              0.0
EmployeeCount               0.0
EmployeeNumber              NaN
EnvironmentSatisfaction     0.0
Gender                      0.0
HourlyRate                  0.0
JobInvolvement              0.0
JobLevel                    0.0
JobRole                     0.0
JobSatisfaction             0.0
MaritalStatus               0.0
MonthlyIncome               0.0
MonthlyRate                 0.0
NumCompaniesWorked          0.0
Over18                      0.0
OverTime                    0.0
PercentSalaryHike           0.0
PerformanceRating           0.0
RelationshipSatisfaction    0.0
StandardHours               0.0
StockOptionLevel            0.0
TotalWorkingYears           0.0
TrainingTimesLastYear       0.0
WorkLifeBalance             0.0
YearsAtC

In [6]:
display(train.head())
display(train_o.head())

Unnamed: 0,id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0,36,Travel_Frequently,599,Research & Development,24,3,Medical,1,4,...,80,1,10,2,3,10,0,7,8,0
1,1,35,Travel_Rarely,921,Sales,8,3,Other,1,1,...,80,1,4,3,3,4,2,0,3,0
2,2,32,Travel_Rarely,718,Sales,26,3,Marketing,1,3,...,80,2,4,3,3,3,2,1,2,0
3,3,38,Travel_Rarely,1488,Research & Development,2,3,Medical,1,3,...,80,0,15,1,1,6,0,0,2,0
4,4,50,Travel_Rarely,1017,Research & Development,5,4,Medical,1,2,...,80,0,31,0,3,31,14,4,10,1


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [7]:
display(train['Attrition'].value_counts())
display(train_o['Attrition'].value_counts())

0    1477
1     200
Name: Attrition, dtype: int64

No     1233
Yes     237
Name: Attrition, dtype: int64

In [8]:
train_o['Attrition'] = train_o['Attrition'].map(lambda x: 1 if x == "Yes" else 0)
train_o['id'] = train_o['EmployeeNumber']

In [9]:
train_all = pd.concat([train, train_o.drop('EmployeeNumber', axis = 1)])

In [10]:
display(train_all.shape)
display(train_all['Attrition'].value_counts())

(3147, 35)

0    2710
1     437
Name: Attrition, dtype: int64

In [11]:
target = train_all['Attrition']

data = pd.concat([train_all.drop('Attrition', axis = 1), test]).reset_index(drop = True)

print(train_all.shape, test.shape, data.shape)

(3147, 35) (1119, 34) (4266, 34)


In [12]:
text_features = []

for column in data.columns:
    if data[column].dtype == 'object':
        text_features.append(column)

for text_feature in text_features:
    data = data.join(pd.get_dummies(data[text_feature], prefix=text_feature))
    data = data.drop(text_feature, axis = 1)

In [13]:
data.head()

Unnamed: 0,id,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,0,36,599,24,3,1,4,42,3,1,...,0,0,0,0,0,1,0,1,0,1
1,1,35,921,8,3,1,1,46,3,1,...,0,0,0,1,0,1,0,1,1,0
2,2,32,718,26,3,1,3,80,3,2,...,0,0,1,0,1,0,0,1,1,0
3,3,38,1488,2,3,1,3,40,3,2,...,0,0,0,0,0,1,0,1,1,0
4,4,50,1017,5,4,1,2,37,3,5,...,0,0,0,0,0,0,1,1,0,1


In [74]:
df_train = data.iloc[:len(target), :]

df_test = data.iloc[len(target):, :]

df_train.shape, target.shape, df_test.shape

((3147, 55), (3147,), (1119, 55))

# "MANUAL" ML

In [15]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(df_train.drop('id', axis = 1), target, random_state = 42)

### CATBOOST

In [39]:
model_cb = cb.CatBoostClassifier(depth = 3,
                                 learning_rate = 0.01,
                                 rsm = 0.5,
                                 random_seed = 42,
                                 verbose = False).fit(X_train, y_train)
print(metrics.roc_auc_score(model_cb.predict(X_test), y_test))
preds_cb = model_cb.predict(df_test.drop('id', axis = 1))
proba_cb = model_cb.predict_proba(df_test.drop('id', axis = 1))

0.8764324324324325


### XGBOOST

In [40]:
model_xgb = XGBClassifier().fit(X_train, y_train)
print(metrics.roc_auc_score(model_xgb.predict(X_test), y_test))
preds_xgb = model_xgb.predict(df_test.drop('id', axis = 1))
proba_xgb = model_cb.predict_proba(df_test.drop('id', axis = 1))

0.7526585094549498


### LIGHTGBM

In [41]:
model_lgbm = lightgbm.LGBMClassifier().fit(X_train, y_train)
print(metrics.roc_auc_score(model_lgbm.predict(X_test), y_test))
preds_lgbm = model_lgbm.predict(df_test.drop('id', axis = 1))
proba_lgbm = model_lgbm.predict_proba(df_test.drop('id', axis = 1))

0.7554751430868927


### BLENDING THEM ALL TOGETHER

In [43]:
proba_global = (proba_cb+proba_xgb+proba_lgbm)/3
proba_global

array([[0.73188835, 0.26811165],
       [0.87269235, 0.12730765],
       [0.92160381, 0.07839619],
       ...,
       [0.96059179, 0.03940821],
       [0.96705772, 0.03294228],
       [0.98617029, 0.01382971]])

# AUTO ML

## Init cluster and data loading

In [53]:
import h2o        
from h2o.automl import H2OAutoML

In [54]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.17" 2022-10-18; OpenJDK Runtime Environment (build 11.0.17+8-post-Ubuntu-1ubuntu220.04); OpenJDK 64-Bit Server VM (build 11.0.17+8-post-Ubuntu-1ubuntu220.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp03igzlhg
  JVM stdout: /tmp/tmp03igzlhg/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp03igzlhg/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.3
H2O_cluster_version_age:,1 month and 25 days
H2O_cluster_name:,H2O_from_python_unknownUser_7hi6h6
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.500 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [76]:
df_train

Unnamed: 0,id,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,0,36,599,24,3,1,4,42,3,1,...,0,0,0,0,0,1,0,1,0,1
1,1,35,921,8,3,1,1,46,3,1,...,0,0,0,1,0,1,0,1,1,0
2,2,32,718,26,3,1,3,80,3,2,...,0,0,1,0,1,0,0,1,1,0
3,3,38,1488,2,3,1,3,40,3,2,...,0,0,0,0,0,1,0,1,1,0
4,4,50,1017,5,4,1,2,37,3,5,...,0,0,0,0,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3142,2061,36,884,23,2,1,3,41,4,2,...,0,0,0,0,0,1,0,1,1,0
3143,2062,39,613,6,1,1,4,42,2,3,...,0,0,0,0,0,1,0,1,1,0
3144,2064,27,155,4,3,1,2,87,4,2,...,0,0,0,0,0,1,0,1,0,1
3145,2065,49,1023,2,3,1,4,63,2,2,...,0,0,1,0,0,1,0,1,1,0


In [83]:
target.to_frame().reset_index(drop = True)

Unnamed: 0,Attrition
0,0
1,0
2,0
3,0
4,1
...,...
3142,0
3143,0
3144,0
3145,0


In [84]:
df_train_h2o = df_train.copy()
df_train_h2o['Attrition'] = target.to_frame().reset_index(drop = True)
train_h2o = h2o.H2OFrame(df_train_h2o)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [85]:
test_h2o = h2o.H2OFrame(df_test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


## Training

In [86]:
train, test = train_h2o.split_frame(ratios = [0.75])

x = train.columns
y = 'Attrition'
x.remove(y)

In [91]:
# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=20,
                max_runtime_secs=600,
                seed=42)

aml.train(x=x, y=y, training_frame=train)

AutoML progress: |
16:41:32.957: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18_Y]

█
16:41:36.55: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18_Y]

█
16:41:37.862: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18_Y]

██
16:41:41.407: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18_Y]

█
16:41:43.765: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18_Y]

██
16:41:48.301: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18_Y]

█
16:41:51.801: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18_Y]

█
16:41:54.853: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18_Y]

█
16:41:58.407: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18_Y]

█
16:42:00.757: _tr

H2OJobCancelled: Job<$03017f00000132d4ffffffff$_89b083f0ffcc5ca1af48869e51814dcb> was cancelled by the user.

In [None]:
lb = aml.leaderboard
lb.head()

In [61]:
preds = aml.leader.predict(test_h2o)
final = preds.cbind(test_h2o)
final

id,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
1,35,921,8,3,1,1,46,3,1,1,2899,10778,1,17,3,4,80,1,4,3,3,4,2,0,3,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0
2,32,718,26,3,1,3,80,3,2,4,4627,16495,0,17,3,4,80,2,4,3,3,3,2,1,2,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0
3,38,1488,2,3,1,3,40,3,2,1,5347,13384,3,14,3,3,80,0,15,1,1,6,0,0,2,0,0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0
4,50,1017,5,4,1,2,37,3,5,1,19033,19805,1,13,3,3,80,0,31,0,3,31,14,4,10,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1
5,27,566,2,3,1,3,56,3,2,2,4197,7103,5,11,3,4,80,0,6,0,3,1,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0
6,34,944,10,4,1,2,36,3,1,1,1281,16900,1,13,3,1,80,0,1,2,3,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0
7,40,1009,2,3,1,4,74,3,1,4,3067,12916,2,12,3,4,80,1,6,3,2,3,2,1,2,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0
9,25,806,9,1,1,3,82,3,1,3,2741,7950,1,15,3,3,80,1,9,2,2,9,7,7,8,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0
12,38,138,2,2,1,4,95,2,1,2,3280,7288,8,12,3,3,80,0,4,2,2,2,2,2,2,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1
14,26,884,1,2,1,4,62,3,1,3,3102,4284,6,16,3,4,80,0,6,2,3,5,4,1,4,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1


# MAKING SUBMISSION

In [50]:
ss_test = ss.copy()

ss_test[['Stay','Attrition']] = proba_global

In [51]:
ss_test

Unnamed: 0,id,Attrition,Stay
0,1677,0.268112,0.731888
1,1678,0.127308,0.872692
2,1679,0.078396,0.921604
3,1680,0.061298,0.938702
4,1681,0.523901,0.476099
...,...,...,...
1114,2791,0.075159,0.924841
1115,2792,0.012388,0.987612
1116,2793,0.039408,0.960592
1117,2794,0.032942,0.967058


In [52]:
ss_test[['id','Attrition']].to_csv('submission.csv', index = False)