In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
import zipfile

# Archive to unpack and the directory that receives its members.
zip_file_path = "dataset/playground-series-s3e3.zip"
extracted_path = "dataset/"

# The context manager closes the archive handle once extraction finishes.
with zipfile.ZipFile(zip_file_path, mode="r") as archive:
    archive.extractall(path=extracted_path)

print("Zip file Extracted")

In [2]:
# Read the training and test tables in one pass; `df` is the training frame.
df, test = (pd.read_csv(csv_path) for csv_path in ("dataset/train.csv", "dataset/test.csv"))
df.head()

Unnamed: 0,id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0,36,Travel_Frequently,599,Research & Development,24,3,Medical,1,4,...,80,1,10,2,3,10,0,7,8,0
1,1,35,Travel_Rarely,921,Sales,8,3,Other,1,1,...,80,1,4,3,3,4,2,0,3,0
2,2,32,Travel_Rarely,718,Sales,26,3,Marketing,1,3,...,80,2,4,3,3,3,2,1,2,0
3,3,38,Travel_Rarely,1488,Research & Development,2,3,Medical,1,3,...,80,0,15,1,1,6,0,0,2,0
4,4,50,Travel_Rarely,1017,Research & Development,5,4,Medical,1,2,...,80,0,31,0,3,31,14,4,10,1


In [3]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column of the training frame and apply the
# same mapping to the test frame.
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype != "O":
        continue

    if col in test.columns:
        # Fit on the union of train and test values so a category that only
        # occurs in the test set does not make `transform` raise. When every
        # test value also occurs in train, the learned classes (and therefore
        # the integer codes) are the same as when fitting on train alone.
        le.fit(pd.concat([df[col], test[col]], ignore_index=True))
        df[col] = le.transform(df[col])
        test[col] = le.transform(test[col])
    else:
        # Column exists only in the training frame: encode it there and skip
        # the test frame instead of failing with a KeyError.
        le.fit(df[col])
        df[col] = le.transform(df[col])

In [4]:
# Target column(s), kept as a list so that selecting with it yields a DataFrame.
y_cols = ["Attrition"]

# Every remaining column except the row identifier is used as a feature.
non_feature_cols = ("id", "Attrition")
x_cols = [name for name in df.columns if name not in non_feature_cols]

In [5]:
# Oversample the positive class: each Attrition == 1 row ends up in `dff`
# four times (the original row plus three appended copies).
att = df[df["Attrition"] == 1]
dff = pd.concat([df] + [att] * 3, axis=0)

# Shuffle all rows with a fixed seed, then renumber the index from zero.
shuffled_df = dff.sample(frac=1.0, random_state=42).reset_index(drop=True)


In [6]:
# Features / target taken from the original (non-oversampled) training frame.
# `y` is a single-column DataFrame because `y_cols` is a list.
X, y = df[x_cols], df[y_cols]

In [7]:
# Features / target taken from the oversampled, shuffled training frame.
Xd, yd = shuffled_df[x_cols], shuffled_df[y_cols]

# Test-set features, restricted to the same columns as the training features.
xtest = test[x_cols]

In [8]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [9]:
import warnings
warnings.filterwarnings("ignore")

In [10]:
import xgboost as xgb
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
import optuna

In [89]:
def cross_validation(X,y,params,n_splits, xtest=None):
    """Evaluate a LightGBM parameter set with stratified K-fold cross-validation.

    For every fold a booster is trained with early stopping (patience of 100
    rounds) and scored on the held-out part with accuracy, F1 and ROC AUC.
    The per-fold averages are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame or pandas.Series
        Binary target aligned with ``X``; also selected with ``.iloc``.
    params : dict
        Parameters forwarded unchanged to ``lgb.train``.
    n_splits : int
        Number of stratified folds.
    xtest : pandas.DataFrame, optional
        Extra feature table to predict. Each fold model contributes
        ``1 / n_splits`` of its prediction, so the result is the fold average.

    Returns
    -------
    tuple
        ``(avg_roc, preds)`` where ``avg_roc`` is the mean validation ROC AUC
        rounded to 4 decimals and ``preds`` is the averaged prediction for
        ``xtest`` (``None`` when ``xtest`` is not given).
    """
    skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

    acc_scr = []
    f1_scr = []
    auc_scr = []

    # Accumulator for the fold-averaged test predictions (only when requested).
    preds = np.zeros(len(xtest)) if xtest is not None else None

    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        dtrain = lgb.Dataset(X_train, label=y_train)
        dvalid = lgb.Dataset(X_valid, label=y_valid)

        # Logging and early stopping are driven by the callbacks alone. The
        # legacy `verbose_eval` keyword is intentionally not passed: it is
        # deprecated in LightGBM 3.3 and no longer accepted by LightGBM 4.x.
        model = lgb.train(params=params,
                          train_set=dtrain,
                          valid_sets=[dtrain, dvalid],
                          callbacks=[early_stopping(100), log_evaluation(100)])

        # Predict the held-out fold once; hard labels are the scores rounded
        # to the nearest integer.
        valid_scores = model.predict(X_valid, num_iteration=model.best_iteration)
        valid_labels = np.rint(valid_scores)

        if preds is not None:
            preds += model.predict(xtest) / n_splits

        acc_scr.append(accuracy_score(y_valid, valid_labels))
        f1_scr.append(f1_score(y_valid, valid_labels))
        auc_scr.append(roc_auc_score(y_valid, valid_scores))

    avg_acc = round(np.mean(acc_scr), 4)
    avg_f1 = round(np.mean(f1_scr), 4)
    avg_roc = round(np.mean(auc_scr), 4)

    print(f"Average of {n_splits} splits accuracy score : {avg_acc:.4f} | f1 score : {avg_f1:.4f} | roc score : {avg_roc:.4f}")

    return avg_roc, preds

        "verbose" :-1,
        "force_col_wise":True,
        "boosting_type": trial.suggest_categorical('boosting_type',['gbdt','dart','goss']),
        'num_iterations' : trial.suggest_int('num_iterations',200,500,step=50),
        "num_leaves": trial.suggest_int("num_leaves", 3, 10),
        "max_depth": trial.suggest_int("max_depth", 3, 5),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5,log=True),
        'scale_pos_weight': trial.suggest_int('scale_pos_weight',2,7),
        'colsample_bytree' :trial.suggest_float('colsample_bytree',0.5,0.8,step=0.1),
        'reg_alpha' : trial.suggest_int('reg_alpha',2,10,step=1),
        'reg_lambda' : trial.suggest_int('reg_lambda',2,10,step=1),
        #"bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        #"bagging_freq": trial.suggest_int("bagging_freq", 1, 8),
        'n_jobs' : -1,
        'metric': 'AUC',
        'verbose': 0  

In [99]:
%%time

# Only show Optuna messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Fixed LightGBM settings shared by every trial and by the final model.
base_params = dict(
    objective="binary",
    metric="auc",
    verbose=-1,
    random_state=42,
    force_col_wise=True,
)

def objective_pr(trial):
    """Optuna objective: 5-fold cross-validated ROC AUC for one sampled config.

    Samples a LightGBM configuration from ``trial``, overlays the fixed
    ``base_params`` and scores it with ``cross_validation`` on ``X`` / ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean ROC AUC over the 5 folds, as returned by ``cross_validation``.
    """
    params = {
                "boosting_type": trial.suggest_categorical('boosting_type',['gbdt','dart','rf']),
                # L1 regularisation (LightGBM alias of lambda_l1)
                "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 10.0),
                # L2 regularisation (LightGBM alias of lambda_l2)
                "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 10.0),
                "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.9),

                # Number of boosting rounds (alias of num_iterations)
                "n_estimators": trial.suggest_int("n_estimators", 200, 5000, step = 100),
                # Larger values fit more closely and overfit more easily; LightGBM default is 31
                "num_leaves": trial.suggest_int("num_leaves", 3, 10, step = 1),
                # Tree depth limit; -1 would mean no limit
                "max_depth": trial.suggest_int("max_depth", 3, 5, step = 1),
                "min_child_samples": trial.suggest_int("min_child_samples", 20, 200, step = 5),
                'scale_pos_weight': trial.suggest_int('scale_pos_weight',2,7, step = 1),

                # Fraction of columns sampled per tree (alias colsample_bytree).
                # Upper bound is 0.9 so the range is an exact multiple of the step;
                # Optuna would otherwise truncate 0.95 to 0.9 itself.
                "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 0.9, step = 0.1),
                # Fraction of rows sampled for bagging (alias subsample); same
                # explicit 0.9 upper bound for the same reason.
                "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 0.9, step = 0.1),
                # Re-sample the bagged rows every this many iterations (alias subsample_freq)
                "bagging_freq": trial.suggest_int("bagging_freq", 3, 10, step = 1)
    }

    # Fixed settings take precedence over anything sampled above.
    params.update(base_params)

    # cross_validation already returns the fold-averaged ROC AUC as its first item.
    avg_roc = cross_validation(X, y, params, 5, xtest=None)[0]
    return avg_roc

# TPE is Optuna's default sampler; seeding it makes the sequence of sampled
# configurations repeatable, in line with the fixed seeds used elsewhere.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective_pr, n_trials=5)

# Sampled hyper-parameters of the best trial (the fixed base_params are not included).
best_params = study.best_params

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.796081	valid_1's auc: 0.720904
[200]	training's auc: 0.801818	valid_1's auc: 0.733024
[300]	training's auc: 0.803408	valid_1's auc: 0.740752
[400]	training's auc: 0.802535	valid_1's auc: 0.739696
Early stopping, best iteration is:
[328]	training's auc: 0.804112	valid_1's auc: 0.742145
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.79511	valid_1's auc: 0.843877
Early stopping, best iteration is:
[34]	training's auc: 0.793885	valid_1's auc: 0.854223
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.810559	valid_1's auc: 0.727076
Early stopping, best iteration is:
[97]	training's auc: 0.810612	valid_1's auc: 0.727246
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.799268	valid_1's auc: 0.746441
[200]	training's auc: 0.801198	valid_1's auc: 0.748475
[300]	training's auc: 0.802401	valid_1's auc:

{'boosting_type': 'goss',
 'num_iterations': 350,
 'num_leaves': 4,
 'max_depth': 5,
 'learning_rate': 0.138101890373062,
 'scale_pos_weight': 2,
 'colsample_bytree': 0.5,
 'reg_alpha': 7,
 'reg_lambda': 2}

In [100]:
# Display the hyper-parameters of the best Optuna trial.
best_params

{'boosting_type': 'gbdt',
 'reg_alpha': 5.9843742075490685,
 'reg_lambda': 9.70367924752612,
 'learning_rate': 0.21054713557022756,
 'n_estimators': 3200,
 'num_leaves': 4,
 'max_depth': 4,
 'min_child_samples': 160,
 'scale_pos_weight': 2,
 'feature_fraction': 0.5,
 'bagging_fraction': 0.7000000000000001,
 'bagging_freq': 6}

In [102]:
# Overlay the fixed settings on the tuned ones; on a key clash base_params wins.
best_params = {**best_params, **base_params}

best_params

{'boosting_type': 'gbdt',
 'reg_alpha': 5.9843742075490685,
 'reg_lambda': 9.70367924752612,
 'learning_rate': 0.21054713557022756,
 'n_estimators': 3200,
 'num_leaves': 4,
 'max_depth': 4,
 'min_child_samples': 160,
 'scale_pos_weight': 2,
 'feature_fraction': 0.5,
 'bagging_fraction': 0.7000000000000001,
 'bagging_freq': 6,
 'objective': 'binary',
 'metric': 'auc',
 'verbose': -1,
 'random_state': 42,
 'force_col_wise': True}

In [103]:
# Score the tuned parameters on the original training data ...
cross_validation(X=X, y=y, params=best_params, n_splits=5, xtest=None)
print("-" * 50)
# ... and on the oversampled data. NOTE(review): Xd/yd contain repeated copies
# of the positive rows, so the same row can fall into both the training and the
# validation part of a fold; this second score is not comparable to the first.
cross_validation(X=Xd, y=yd, params=best_params, n_splits=5, xtest=None)

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.919359	valid_1's auc: 0.795017
[200]	training's auc: 0.938818	valid_1's auc: 0.808953
[300]	training's auc: 0.95362	valid_1's auc: 0.805152
Early stopping, best iteration is:
[219]	training's auc: 0.940866	valid_1's auc: 0.812753
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.915972	valid_1's auc: 0.841132
Early stopping, best iteration is:
[36]	training's auc: 0.870666	valid_1's auc: 0.851267
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.920606	valid_1's auc: 0.775847
Early stopping, best iteration is:
[54]	training's auc: 0.90183	valid_1's auc: 0.786186
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.913986	valid_1's auc: 0.852373
Early stopping, best iteration is:
[99]	training's auc: 0.913484	valid_1's auc: 0.854492
Training until validation scores don't improve for 100 rounds
[100]

(0.9532, None)

In [104]:
# Fold-averaged test-set predictions (second item of the returned tuple).
val = cross_validation(X=X, y=y, params=best_params, n_splits=5, xtest=xtest)[1]

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.919359	valid_1's auc: 0.795017
[200]	training's auc: 0.938818	valid_1's auc: 0.808953
[300]	training's auc: 0.95362	valid_1's auc: 0.805152
Early stopping, best iteration is:
[219]	training's auc: 0.940866	valid_1's auc: 0.812753
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.915972	valid_1's auc: 0.841132
Early stopping, best iteration is:
[36]	training's auc: 0.870666	valid_1's auc: 0.851267
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.920606	valid_1's auc: 0.775847
Early stopping, best iteration is:
[54]	training's auc: 0.90183	valid_1's auc: 0.786186
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.913986	valid_1's auc: 0.852373
Early stopping, best iteration is:
[99]	training's auc: 0.913484	valid_1's auc: 0.854492
Training until validation scores don't improve for 100 rounds
[100]

In [98]:
# Put the averaged predictions into the sample-submission template and save it.
# NOTE(review): assumes the template rows are in the same order as test.csv — confirm.
samp_sub = pd.read_csv("dataset/sample_submission.csv").assign(Attrition=val)
samp_sub.to_csv("est_sub_lgb_param_cv.csv", index=False)