In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
import zipfile

# Path of the zip archive and the directory its contents are extracted into.
zip_file_path = "dataset/playground-series-s3e3.zip"
extracted_path = "dataset/"

# The context manager closes the archive once every member has been extracted.
with zipfile.ZipFile(file=zip_file_path, mode="r") as zip_ref:
    zip_ref.extractall(path=extracted_path)

print("Zip file Extracted")

Zip file Extracted


In [3]:
# Read the training and test tables from the extracted CSV files.
df, test = pd.read_csv("dataset/train.csv"), pd.read_csv("dataset/test.csv")
# Last expression of the cell: shows the first rows of the training frame.
df.head()

Unnamed: 0,id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0,36,Travel_Frequently,599,Research & Development,24,3,Medical,1,4,...,80,1,10,2,3,10,0,7,8,0
1,1,35,Travel_Rarely,921,Sales,8,3,Other,1,1,...,80,1,4,3,3,4,2,0,3,0
2,2,32,Travel_Rarely,718,Sales,26,3,Marketing,1,3,...,80,2,4,3,3,3,2,1,2,0
3,3,38,Travel_Rarely,1488,Research & Development,2,3,Medical,1,3,...,80,0,15,1,1,6,0,0,2,0
4,4,50,Travel_Rarely,1017,Research & Development,5,4,Medical,1,2,...,80,0,31,0,3,31,14,4,10,1


In [4]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode every object-typed column of the training frame, applying
# the same mapping to the test frame.
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == "O":
        if col in test.columns:
            # Fit on train and test values together: LabelEncoder.transform
            # raises ValueError for labels it was not fitted on, so a category
            # that only occurs in the test file would otherwise abort the cell.
            # Classes are sorted, so the codes are unchanged whenever the test
            # file brings no new category.
            le.fit(pd.concat([df[col], test[col]], axis=0))
            test[col] = le.transform(test[col])
        else:
            # Column exists only in the training frame (e.g. an object-typed
            # target); there is nothing to encode on the test side.
            le.fit(df[col])
        df[col] = le.transform(df[col])

In [5]:
# Target column, kept as a one-element list (indexing a frame with it yields a DataFrame).
y_cols = ["Attrition"]
# Feature columns: everything in the training frame except "id" and the target.
x_cols = [name for name in df.columns if name not in ("id", "Attrition")]

In [6]:
# Rows whose Attrition flag equals 1.
att = df[df["Attrition"] == 1]
# Oversampled frame: the full training frame followed by three extra copies of those rows.
dff = pd.concat([df] + [att] * 3, axis=0)

# Shuffle every row with a fixed seed, then renumber the index from 0.
shuffled_df = dff.sample(frac=1.0, random_state=42).reset_index(drop=True)


In [7]:
# Feature matrix and (single-column DataFrame) target from the original training frame.
X, y = df[x_cols], df[y_cols]

In [8]:
# Features and target from the oversampled, shuffled frame.
Xd, yd = shuffled_df[x_cols], shuffled_df[y_cols]
# Test features, selected with the same column list as the training features.
xtest = test[x_cols]

In [9]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [10]:
import warnings
# Install a blanket "ignore" filter: no category, message or module restriction is given,
# so every warning raised after this point is suppressed.
warnings.filterwarnings("ignore")

In [11]:
import xgboost as xgb
import lightgbm as lgbm
from lightgbm import early_stopping, log_evaluation
import optuna

In [97]:
def cross_validation(X,y,estimator,n_splits, xtest=None):
    """Run stratified K-fold cross-validation and print the averaged metrics.

    For every fold a fresh copy of ``estimator`` is fitted on the training
    part and scored on the held-out part with accuracy, F1 and ROC AUC. When
    ``xtest`` is given, each fold model also predicts it and the
    positive-class probabilities are averaged over the folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series or single-column pandas.DataFrame
        Target aligned with ``X``. The metrics used here (default ``f1_score``
        and column 1 of ``predict_proba``) assume a binary target.
    estimator : classifier
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        cloned for each fold, so the instance passed in is left untouched.
    n_splits : int
        Number of stratified folds.
    xtest : pandas.DataFrame, optional
        Extra feature matrix to predict with every fold model.

    Returns
    -------
    numpy.ndarray or None
        Fold-averaged positive-class probabilities for ``xtest``, or ``None``
        when ``xtest`` is not supplied.
    """
    # Function-scope import: scikit-learn is already a dependency of this
    # notebook, and this keeps the definition self-contained.
    from sklearn.base import clone

    skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

    acc_scr, f1_scr, auc_scr = [], [], []

    # Accumulator for the averaged test predictions (only when requested).
    preds = np.zeros(len(xtest)) if xtest is not None else None

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        # Flatten so a single-column DataFrame target reaches the estimator
        # and the metrics as a 1-D array instead of a column vector.
        y_train = np.ravel(y.iloc[train_index])
        y_test = np.ravel(y.iloc[test_index])

        # Fit a copy so the caller's estimator (for example an already refit
        # GridSearchCV.best_estimator_) is not overwritten by a fold model.
        # safe=False falls back to a deep copy for non-scikit-learn objects.
        fold_model = clone(estimator, safe=False)
        fold_model.fit(X_train, y_train)

        y_preds = fold_model.predict(X_test)
        y_proba = fold_model.predict_proba(X_test)[:, 1]

        if preds is not None:
            # Each fold contributes 1/n_splits of the final average.
            preds += fold_model.predict_proba(xtest)[:, 1] / n_splits

        acc_scr.append(accuracy_score(y_test, y_preds))
        f1_scr.append(f1_score(y_test, y_preds))
        auc_scr.append(roc_auc_score(y_test, y_proba))

    avg_acc = round(np.mean(acc_scr), 4)
    avg_f1 = round(np.mean(f1_scr), 4)
    avg_roc = round(np.mean(auc_scr), 4)

    print(f"Average accuracy score of {n_splits} is : {avg_acc}")
    print(f"Average f1 score of {n_splits} is       : {avg_f1}")
    print(f"Average roc score of {n_splits} is      : {avg_roc}")

    return preds

        "verbose" :-1,
        "force_col_wise":True,
        "boosting_type": trial.suggest_categorical('boosting_type',['gbdt','dart','goss']),
        'num_iterations' : trial.suggest_int('num_iterations',200,500,step=50),
        "num_leaves": trial.suggest_int("num_leaves", 3, 10),
        "max_depth": trial.suggest_int("max_depth", 3, 5),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5,log=True),
        'scale_pos_weight': trial.suggest_int('scale_pos_weight',2,7),
        'colsample_bytree' :trial.suggest_float('colsample_bytree',0.5,0.8,step=0.1),
        'reg_alpha' : trial.suggest_int('reg_alpha',2,10,step=1),
        'reg_lambda' : trial.suggest_int('reg_lambda',2,10,step=1),
        #"bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        #"bagging_freq": trial.suggest_int("bagging_freq", 1, 8),
        'n_jobs' : -1,
        'metric': 'AUC',
        'verbose': 0  

In [92]:
%%time

# Fixed settings shared by every candidate. Each value is wrapped in a
# one-element list because the dict is used as a GridSearchCV param_grid.
base_params = dict(
    objective=["binary"],
    metric=["auc"],
    verbose=[-100],
    n_jobs=[-1],
    random_state=[42],
    force_col_wise=[True],
)

# Tuned hyper-parameters (one value each, so the grid holds a single candidate).
params = dict(
    boosting_type=['gbdt'],

    # regularisation terms (reg_alpha / reg_lambda aliases)
    reg_alpha=[0],
    reg_lambda=[0.001],
    learning_rate=[0.01],

    # number of boosting rounds
    n_estimators=[500],
    # maximum number of leaves | increase for better score / will overfit, default = 31
    num_leaves=[15],
    # depth of tree, -1 for full depth
    max_depth=[5],
    # min samples in leaf for split
    min_child_samples=[15],
    scale_pos_weight=[7],

    # alias feature_fraction, basically number of columns
    colsample_bytree=[0.5],
    # alias bagging_fraction, fraction of sample of whole train data
    subsample=[0.3],
    # alias bagging_freq, after how many iterations to sample again
    subsample_freq=[3],
)

# Merge the shared settings into the grid (the two dicts have no key in common).
params.update(base_params)

lgbm_clf = lgbm.LGBMClassifier(random_state=42)

# 5-fold grid search scored by ROC AUC over the combined grid.
grid_lgbm = GridSearchCV(
    estimator=lgbm_clf,
    param_grid=params,
    scoring='roc_auc',
    cv=5,
    verbose=True,
)


CPU times: total: 0 ns
Wall time: 998 µs


In [93]:
# Run the grid search on the original (non-oversampled) features and target.
grid_lgbm.fit(X,y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [94]:
# Best parameter combination on the first line, its cross-validated ROC AUC score on the second.
print(grid_lgbm.best_params_, grid_lgbm.best_score_, sep="\n")

{'boosting_type': 'gbdt', 'colsample_bytree': 0.5, 'force_col_wise': True, 'learning_rate': 0.01, 'max_depth': 5, 'metric': 'auc', 'min_child_samples': 15, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 15, 'objective': 'binary', 'random_state': 42, 'reg_alpha': 0, 'reg_lambda': 0.001, 'scale_pos_weight': 7, 'subsample': 0.3, 'subsample_freq': 3, 'verbose': -100}
0.836610112230875


In [95]:
# Keep a handle on the estimator the grid search exposes for its best parameter combination.
best_lgbm = grid_lgbm.best_estimator_

In [98]:
# Baseline: stratified 5-fold CV on the original training data (no test predictions requested).
cross_validation(X,y,best_lgbm,5, None)
print("-"*80)
# Same evaluation on the oversampled frame.
# NOTE(review): Xd/yd contain repeated copies of the Attrition == 1 rows that were created
# before the split, so copies of one row can fall in both the training and validation folds;
# the scores printed for this call are therefore likely optimistic — confirm before relying on them.
cross_validation(Xd,yd,best_lgbm,5, None)

Average accuracy score of 5 is : 0.8712
Average f1 score of 5 is       : 0.4644
Average roc score of 5 is      : 0.8266
--------------------------------------------------------------------------------
Average accuracy score of 5 is : 0.7953
Average f1 score of 5 is       : 0.7719
Average roc score of 5 is      : 0.9535


In [103]:
# Repeat the 5-fold CV on the original data, this time passing xtest so the function
# returns the fold-averaged positive-class probabilities for the test features.
val = cross_validation(X,y,best_lgbm,5, xtest=xtest)

Average accuracy score of 5 is : 0.8712
Average f1 score of 5 is       : 0.4644
Average roc score of 5 is      : 0.8266


In [104]:
# Put the averaged test probabilities into the submission template's Attrition column and save it.
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv — confirm.
samp_sub = pd.read_csv("dataset/sample_submission.csv").assign(Attrition=val)
samp_sub.to_csv("est_sub_lgb_param_cv.csv", index=False)