In [None]:
import pandas as pd
import numpy as np

import gc

from imblearn.over_sampling import SMOTE

import catboost as cb
from sklearn import model_selection, metrics, linear_model, ensemble, naive_bayes, neighbors, svm, tree
from xgboost import XGBClassifier

In [None]:
# Read the four CSV inputs in one pass: the training set, a second training
# file named train_original, the test set and the sample-submission template.
train, train_o, test, ss = map(
    pd.read_csv,
    (
        r'./data/train.csv',
        r'./data/train_original.csv',
        r'./data/test.csv',
        r'./data/sample_submission.csv',
    ),
)

In [None]:
# (rows, columns) of the three raw frames, printed on a single line.
print(*(frame.shape for frame in (train, train_o, test)))

In [None]:
# Missing-value count per column of the training frame.
train.isna().sum()

In [None]:
# Missing-value count per column of the second training frame.
train_o.isna().sum()

In [None]:
# Preview the first rows of both training frames, one after the other.
display(train.head(), train_o.head())

In [None]:
# Distribution of the Attrition label in each training frame.
display(
    train['Attrition'].value_counts(),
    train_o['Attrition'].value_counts(),
)

In [None]:
# Encode the Attrition label of the second training frame as 1 ("Yes") / 0 (anything else).
# An already-encoded 1 is kept as 1 so that re-running this cell is harmless;
# comparing only against "Yes" would turn every label into 0 on a second run.
train_o['Attrition'] = train_o['Attrition'].map(
    lambda label: 1 if label == "Yes" or label == 1 else 0
)

# Expose EmployeeNumber under the name 'id' so this frame has an 'id' column too.
train_o['id'] = train_o['EmployeeNumber']

In [None]:
# Stack both training sources row-wise. EmployeeNumber is removed from the
# second frame first; its values were copied into 'id' beforehand.
# Row labels of the two inputs are kept as they are (the index is not rebuilt).
train_all = pd.concat([train, train_o.drop(columns='EmployeeNumber')])

In [None]:
# Size and label distribution of the combined training data.
display(train_all.shape, train_all['Attrition'].value_counts())

In [None]:
# Labels of the combined training data.
target = train_all['Attrition']

# Training features followed by the test rows in one frame with a fresh
# 0..n-1 index, so that later preprocessing is applied to both at once.
data = pd.concat(
    [train_all.drop(columns='Attrition'), test],
    ignore_index=True,
)

print(*(frame.shape for frame in (train_all, test, data)))

In [None]:
# Summary of the combined frame (pandas' DataFrame.info report).
data.info()

In [None]:
# Columns stored with the object dtype are treated as categorical text.
text_features = [column for column in data.columns if data[column].dtype == 'object']

# One-hot encode them in a single call: each text column is removed and its
# indicator columns, named "<column>_<value>", are appended after the
# remaining columns in the same order as `text_features`.
data = pd.get_dummies(data, columns=text_features)

In [None]:
# Preview the frame after one-hot encoding.
data.head()

In [None]:
# `data` holds the training rows first and the test rows after them, so the
# first len(target) rows are the training part and the remainder is the test part.
df_train, df_test = data.iloc[:len(target)], data.iloc[len(target):]

df_train.shape, target.shape, df_test.shape

In [None]:
# Hold-out split of the training rows with a fixed seed; the 'id' column is
# excluded from the feature matrix. The split size is left at the library default.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df_train.drop(columns='id'),
    target,
    random_state=42,
)

# SMOTE oversampling of the training part is currently disabled:
# sm = SMOTE(random_state = 42)
# X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
# Cross-validation settings: number of folds and the random seed shared by
# the fold splitter and the CatBoost parameters.
FOLDS, seed = 10, 69

def norm_0to1(preds):
    """Min-max scale ``preds`` so its smallest value maps to 0 and its largest to 1.

    Args:
        preds: Numeric NumPy array (a pandas Series also works) of scores.

    Returns:
        ``(preds - min) / (max - min)``. Empty input is returned as an empty
        float array, and input whose values are all equal yields zeros
        instead of the NaNs a 0/0 division would produce.
    """
    if np.size(preds) == 0:
        # np.min / np.max raise on empty input; there is nothing to scale.
        return np.asarray(preds, dtype=float)

    lowest, highest = np.min(preds), np.max(preds)
    spread = highest - lowest
    if spread == 0:
        # Every score is identical: dividing by the zero range would give NaN.
        return np.zeros_like(preds, dtype=float)
    return (preds - lowest) / spread

# Parameter set handed to CatBoost for every cross-validation fold.
cb_params = dict(
    # Model capacity, learning rate and regularisation/sampling values.
    depth=3,
    learning_rate=0.01,
    rsm=0.5,
    subsample=0.931,
    l2_leaf_reg=69,
    min_data_in_leaf=20,
    random_strength=0.175,

    # Run configuration: seed, best-iteration selection, device, sampling
    # scheme, tree growth policy, training loss and evaluation metric.
    random_seed=seed,
    use_best_model=True,
    task_type='CPU',
    bootstrap_type='Bernoulli',
    grow_policy='SymmetricTree',
    loss_function='Logloss',
    eval_metric='AUC',
)

# Train on every column except the row identifier. 'id' is not a predictor,
# and for the rows that came from train_o it holds EmployeeNumber values, so
# leaving it in would let the model key on which source a row came from.
feature_cols = [col for col in df_train.columns if col != 'id']
train_features = df_train[feature_cols]
test_features = df_test[feature_cols]

# One row per feature; a 'fold_<n>' importance column is added for each fold.
f_imp = pd.DataFrame({'feature': feature_cols})
# Running average of the test predictions, and the per-fold validation AUCs.
predictions, scores = np.zeros(len(df_test)), []

k = model_selection.StratifiedKFold(n_splits=FOLDS, random_state=seed, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(k.split(train_features, target)):
    # Positional indexing throughout: the splitter yields row positions.
    cb_train = cb.Pool(data=train_features.iloc[train_idx],
                       label=target.iloc[train_idx])
    cb_valid = cb.Pool(data=train_features.iloc[val_idx],
                       label=target.iloc[val_idx])

    # Up to 10000 boosting rounds, evaluated on the validation fold, with
    # early stopping after 500 rounds.
    model = cb.train(params=cb_params,
                     dtrain=cb_train,
                     num_boost_round=10000,
                     evals=cb_valid,
                     early_stopping_rounds=500,
                     verbose=False)

    f_imp['fold_'+str(fold+1)] = model.get_feature_importance()
    # predict() is left at its default output type; AUC only needs the ranking.
    val_preds = model.predict(cb_valid)
    val_score = metrics.roc_auc_score(target.iloc[val_idx], val_preds)
    scores.append(val_score)

    # Each fold contributes an equal share to the averaged test prediction.
    predictions += model.predict(test_features) / FOLDS
    print(f'- FOLD {fold+1} AUC: {round(val_score, 4)} -')

    # Release the per-fold objects before the next fold is trained.
    del cb_train, cb_valid, val_preds, val_score, model
    gc.collect()

print('*'*45)
print(f'Mean AUC: {round(np.mean(scores), 4)}')

# Rescale the averaged test predictions to the [0, 1] range.
predictions = norm_0to1(predictions)

In [None]:
# Keep the CatBoost predictions available under their own name.
cb_preds = predictions

# Fill the sample-submission template with them and write it to disk.
ss['Attrition'] = cb_preds
ss.to_csv('cb_submission.csv', index=False)