<a href="https://www.kaggle.com/code/valentindefour/tps-s03e03-eda-automl-fun-explainable-ai?scriptVersionId=117168272" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# IMPORTING LIBRARIES

In [None]:
import numpy as np
import pandas as pd

import optuna

import catboost as cb
from sklearn import model_selection, metrics, linear_model, ensemble, naive_bayes, neighbors, svm, tree
from xgboost import XGBClassifier
import lightgbm

import os
import gc
# Print the full path of every file found under the Kaggle input directory,
# so the dataset locations used below can be checked at a glance.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# LOADING COMPETITION DATA + HISTORIC DATA FROM ORIGINAL DATASET

In [None]:
# Competition files: labelled training rows, unlabelled test rows and the submission template.
train = pd.read_csv('/kaggle/input/playground-series-s3e3/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e3/test.csv')
ss = pd.read_csv('/kaggle/input/playground-series-s3e3/sample_submission.csv')

# Original IBM HR attrition dataset, loaded as additional training data.
train_o = pd.read_csv('/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')

# EDA

In [None]:
# (rows, columns) of the competition train set, the original dataset and the test set.
print(*(frame.shape for frame in (train, train_o, test)))

In [None]:
# Missing values per column in the competition training data.
train.isna().sum()

In [None]:
# Missing values per column in the original dataset.
train_o.isna().sum()

In [None]:
# First rows of both training sources, rendered one after the other.
display(train.head(), train_o.head())

In [None]:
# Class balance of the target in each training source.
display(
    train['Attrition'].value_counts(),
    train_o['Attrition'].value_counts(),
)

In [None]:
# Encode the original dataset's target as integers: "Yes" -> 1, anything else -> 0.
# Accepting 1 as well as "Yes" keeps this cell idempotent: re-running it on
# already-encoded labels leaves them unchanged instead of turning every row into 0.
train_o['Attrition'] = train_o['Attrition'].isin(['Yes', 1]).astype('int64')

# Copy EmployeeNumber into an 'id' column so the original rows carry the same
# identifier column name as the competition rows they are concatenated with.
train_o['id'] = train_o['EmployeeNumber']

In [None]:
# Stack the competition rows and the original-dataset rows into one training frame.
# EmployeeNumber is dropped because it has already been copied into 'id'.
# ignore_index=True gives every row a unique label; without it both sources keep
# their own 0..n labels, so the frame (and the target taken from it) would carry
# duplicate index values that break any later index-aligned operation.
train_all = pd.concat(
    [train, train_o.drop(columns='EmployeeNumber')],
    ignore_index=True,
)

In [None]:
# Size and class balance of the combined training data.
display(
    train_all.shape,
    train_all['Attrition'].value_counts(),
)

In [None]:
# Labels for every training row (competition + original).
target = train_all['Attrition']

# Training features followed by the test rows in a single frame with a fresh
# 0..n-1 index, so the encoding below is applied to both consistently.
data = pd.concat(
    [train_all.drop(columns='Attrition'), test],
    ignore_index=True,
)

print(train_all.shape, test.shape, data.shape)

In [None]:
# Columns with object dtype are the ones that get one-hot encoded.
text_features = [column for column in data.columns if data[column].dtype == 'object']

# Replace each such column with its indicator columns, named "<column>_<value>"
# and appended at the end of the frame.
for text_feature in text_features:
    indicator_columns = pd.get_dummies(data[text_feature], prefix=text_feature)
    data = data.drop(columns=text_feature).join(indicator_columns)

In [None]:
# Preview the first rows of the combined feature table after one-hot encoding.
data.head()

In [None]:
# `data` holds the training rows first and the test rows after them, so the
# number of labels marks the boundary between the two.
n_labelled = len(target)
df_train = data.iloc[:n_labelled]
df_test = data.iloc[n_labelled:]

# Shapes of the training features, the labels and the test features.
df_train.shape, target.shape, df_test.shape

# "MANUAL" ML

In [None]:
# Hold out part of the labelled data for validation; 'id' is an identifier,
# not a feature, so it is excluded from the model inputs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df_train.drop(columns='id'),
    target,
    random_state=42,
)

### CATBOOST

In [None]:
# Candidate CatBoost hyperparameters for the grid search.
parameters = {
    'depth': list(range(4, 11)),                    # 4 .. 10
    'learning_rate': [0.01, 0.02, 0.03, 0.04],
    'iterations': list(range(10, 101, 10)),         # 10, 20, .. 100
}

In [None]:
# gs_cb = model_selection.GridSearchCV(estimator=cb.CatBoostClassifier(), param_grid=parameters, verbose=0, cv = 3, n_jobs = -1)

# gs_cb.fit(X_train, y_train)

In [None]:
# Fit CatBoost with a fixed configuration and a fixed seed.
model_cb = cb.CatBoostClassifier(depth = 7,
                                 learning_rate = 0.04,
                                 rsm = 0.5,
                                 iterations = 100,
                                 random_seed = 42,
                                 verbose = False).fit(X_train, y_train)

# roc_auc_score expects (y_true, y_score). Score the positive-class probability
# rather than hard 0/1 labels: AUC is a ranking metric, and passing predictions
# as the "truth" produced a meaningless number.
print(metrics.roc_auc_score(y_test, model_cb.predict_proba(X_test)[:, 1]))

# Predictions for the competition test rows ('id' is not a model feature).
preds_cb = model_cb.predict(df_test.drop('id', axis = 1))
proba_cb = model_cb.predict_proba(df_test.drop('id', axis = 1))

### XGBOOST

In [None]:
clf = XGBClassifier(eval_metric='auc')

# Only the number of trees is searched. Further candidates that are kept
# switched off (add them to the dict to include them in the search):
#   'learning_rate': [0.1, 0.05, 0.03]
#   'max_depth': [3,4,5]
#   'lambda': [1,2,3,4,5]
#   'colsample_bytree': [0.3,0.4,0.5,0.6,0.8]
#   'subsample': [0.3,0.4,0.5,0.6,0.8]
#   'scale_pos_weight': [4,5,6.2,7]
params_xgb = {'n_estimators': np.arange(100, 500, 50)}

# 3-fold cross-validated search scored by ROC AUC, using all available cores.
gs_xgb = model_selection.GridSearchCV(
    estimator=clf,
    param_grid=params_xgb,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
gs_xgb.fit(X_train, y_train)

print(gs_xgb.best_score_)
print(gs_xgb.best_params_)

In [None]:
# Fit XGBoost with a fixed set of hyperparameters.
model_xgb = XGBClassifier(subsample = 0.8, scale_pos_weight = 5 , max_depth = 3, learning_rate = 0.1, colsample_bytree = 0.8, n_estimators = 100, eval_metric = "auc").fit(X_train, y_train)

# roc_auc_score expects (y_true, y_score); score the positive-class probability
# instead of hard labels passed in the wrong position.
print(metrics.roc_auc_score(y_test, model_xgb.predict_proba(X_test)[:, 1]))

# Test-set probabilities must come from the XGBoost model itself. Using model_cb
# here made the blend count CatBoost twice and ignore XGBoost entirely.
proba_xgb = model_xgb.predict_proba(df_test.drop('id', axis = 1))

### LIGHTGBM

In [None]:
# Fit LightGBM with its default hyperparameters.
model_lgbm = lightgbm.LGBMClassifier().fit(X_train, y_train)

# roc_auc_score expects (y_true, y_score); score the positive-class probability
# instead of hard labels passed in the wrong position.
print(metrics.roc_auc_score(y_test, model_lgbm.predict_proba(X_test)[:, 1]))

# Predictions for the competition test rows ('id' is not a model feature).
preds_lgbm = model_lgbm.predict(df_test.drop('id', axis = 1))
proba_lgbm = model_lgbm.predict_proba(df_test.drop('id', axis = 1))

### BLENDING THEM ALL TOGETHER

In [None]:
# Unweighted average of the three models' class-probability arrays.
model_probas = [proba_cb, proba_xgb, proba_lgbm]
proba_global = sum(model_probas) / len(model_probas)
proba_global

### TRYING A SUBMISSION

In [None]:
# Column 0 of the blended probabilities is the "stay" class and column 1 the
# attrition class; only the latter is submitted. Writing it directly avoids adding
# a stray 'Stay' column to `ss`, which would otherwise end up in any later
# `ss.to_csv(...)` call.
ss['Attrition'] = proba_global[:, 1]
ss[['id', 'Attrition']].to_csv('submission.csv', index = False)

This submission got a public score of
## 0.92514.

---

### Not bad, but let's now try H2O AutoML:

# AUTO ML

## Init cluster and data loading

In [None]:
import h2o        
from h2o.automl import H2OAutoML

In [None]:
# Initialise H2O before any H2OFrame is created or AutoML is run.
h2o.init()

In [None]:
# The labels as a one-column frame with a fresh 0..n-1 index.
target.reset_index(drop=True).to_frame()

In [None]:
# H2O takes the label as a column of the training frame, so attach it to a copy
# of the features. The values are attached positionally: df_train and target
# hold the same rows in the same order.
df_train_h2o = df_train.assign(Attrition=target.to_numpy())
train_h2o = h2o.H2OFrame(df_train_h2o)

In [None]:
# Convert the competition test features to an H2OFrame for prediction with the AutoML leader.
test_h2o = h2o.H2OFrame(df_test)

## Training

In [None]:
# 75 / 25 split of the labelled H2O frame. The seed makes the split reproducible
# across kernel restarts.
# NOTE: this rebinds `train` and `test`, which held the pandas frames loaded at
# the top of the notebook; re-running earlier cells after this one will not work.
train, test = train_h2o.split_frame(ratios = [0.75], seed = 42)

y = 'Attrition'
# Predictors are every column except the response and the row identifier.
# 'id' is excluded for consistency with the manual models above, which all
# drop it before fitting.
x = [column for column in train.columns if column not in (y, 'id')]

In [None]:
# Convert the response column to a factor (categorical) in both splits.
for h2o_split in (train, test):
    h2o_split[y] = h2o_split[y].asfactor()

# AutoML budget: at most 20 models or 600 seconds (10 minutes).
aml = H2OAutoML(
    seed=42,
    max_models=20,
    max_runtime_secs=600,
)

aml.train(training_frame=train, x=x, y=y)

In [None]:
# Leaderboard of the models produced by the AutoML run; show its first rows.
lb = aml.leaderboard
lb.head(rows=10)

In [None]:
# Score the competition test rows with the top model on the AutoML leaderboard.
preds = aml.leader.predict(test_h2o)

In [None]:
# 'p1' is the leader's predicted probability for class 1 (attrition).
ss['Attrition'] = preds.as_data_frame()['p1']
# Write only the two submission columns: `ss` may carry extra columns added
# earlier in the notebook (e.g. 'Stay'), which must not end up in the file.
ss[['id', 'Attrition']].to_csv('submission.csv', index = False)