In [1]:
import numpy as np
import pandas as pd
import os
import random

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

SEED = 42
seed_everything(SEED)

### Preprocessing

In [2]:
df = pd.read_csv('Telco-Customer-Churn.csv')
df[df['TotalCharges'] == " "].shape

(11, 21)

In [3]:
df = pd.read_csv('Telco-Customer-Churn.csv')
df['TotalCharges'] = df['TotalCharges'].replace(" ", 0).astype('float32')
print("df shape: ", df.shape)

cat_features = df.drop(['customerID','TotalCharges', 'MonthlyCharges', 'SeniorCitizen', 'tenure', 'Churn'],axis=1).columns
cat_features

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
ohe.fit(df[cat_features])

dff = ohe.transform(df[cat_features])
dff = pd.DataFrame(dff, columns=ohe.get_feature_names())
dff = pd.concat([dff, df[['SeniorCitizen', 'MonthlyCharges', 'TotalCharges', 'tenure']]], axis=1)

dff['TotalCharges'].astype('float')

bin_dict = {'No':0, 'Yes':1}
df.Churn = df.Churn.map(bin_dict)

from sklearn.model_selection import train_test_split

feature_names = dff.columns

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X = dff
y = df.Churn

X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, shuffle=True, random_state=42)

df shape:  (7043, 21)


In [4]:
print(df.shape)
print("\n")
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
#print("\n")
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

(7043, 21)


X_train:  (5634, 45)
y_train:  (5634,)
X_test:  (1409, 45)
y_test:  (1409,)


## HyperOpt Function for LightGBM, CatBoost and XGBoost

In [5]:
#import required packages
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import gc
from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample
#optional but advised
import warnings
warnings.filterwarnings('ignore')

#GLOBAL HYPEROPT PARAMETERS
NUM_EVALS = 1000 #number of hyperopt evaluation rounds
N_FOLDS = 5 #number of cross-validation folds on data in each evaluation round

#LIGHTGBM PARAMETERS
LGBM_MAX_LEAVES = 2**11 #maximum number of leaves per tree for LightGBM
LGBM_MAX_DEPTH = 25 #maximum tree depth for LightGBM
EVAL_METRIC_LGBM_REG = 'mae' #LightGBM regression metric. Note that 'rmse' is more commonly used 
EVAL_METRIC_LGBM_CLASS = 'auc'#LightGBM classification metric

#XGBOOST PARAMETERS
XGB_MAX_LEAVES = 2**12 #maximum number of leaves when using histogram splitting
XGB_MAX_DEPTH = 25 #maximum tree depth for XGBoost
EVAL_METRIC_XGB_REG = 'mae' #XGBoost regression metric
EVAL_METRIC_XGB_CLASS = 'auc' #XGBoost classification metric

#CATBOOST PARAMETERS
CB_MAX_DEPTH = 8 #maximum tree depth in CatBoost
OBJECTIVE_CB_REG = 'MAE' #CatBoost regression metric
OBJECTIVE_CB_CLASS = 'Logloss' #CatBoost classification metric

#OPTIONAL OUTPUT
BEST_SCORE = 0

def quick_hyperopt(data, labels, package='lgbm', num_evals=NUM_EVALS, diagnostic=False):
    
    #==========
    #LightGBM
    #==========
    
    if package=='lgbm':
        
        print('Running {} rounds of LightGBM parameter optimisation:'.format(num_evals))
        #clear space
        gc.collect()
        
        integer_params = ['max_depth',
                         'num_leaves',
                          'max_bin',
                         'min_data_in_leaf',
                         'min_data_in_bin']
        
        def objective(space_params):
            
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])
            
            #extract nested conditional parameters
            if space_params['boosting']['boosting'] == 'goss':
                top_rate = space_params['boosting'].get('top_rate')
                other_rate = space_params['boosting'].get('other_rate')
                #0 <= top_rate + other_rate <= 1
                top_rate = max(top_rate, 0)
                top_rate = min(top_rate, 0.5)
                other_rate = max(other_rate, 0)
                other_rate = min(other_rate, 0.5)
                space_params['top_rate'] = top_rate
                space_params['other_rate'] = other_rate
            
            subsample = space_params['boosting'].get('subsample', 1.0)
            space_params['boosting'] = space_params['boosting']['boosting']
            space_params['subsample'] = subsample
            
            #for classification, set stratified=True and metrics=EVAL_METRIC_LGBM_CLASS
            cv_results = lgb.cv(space_params, train, nfold = N_FOLDS, stratified=True,
                                early_stopping_rounds=100, metrics=EVAL_METRIC_LGBM_CLASS, seed=42)
            
            #best_loss = cv_results['l1-mean'][-1] #'l2-mean' for rmse
            #for classification, comment out the line above and uncomment the line below:
            best_loss = 1 - cv_results['auc-mean'][-1]
            #if necessary, replace 'auc-mean' with '[your-preferred-metric]-mean'
            return{'loss':best_loss, 'status': STATUS_OK }
        
        train = lgb.Dataset(data, labels, free_raw_data = False)
                
        #integer and string parameters, used with hp.choice()
        boosting_list = [{'boosting': 'gbdt',
                          'subsample': hp.uniform('subsample', 0.5, 1)},
                         {'boosting': 'goss',
                          'subsample': 1.0,
                         'top_rate': hp.uniform('top_rate', 0, 0.5),
                         'other_rate': hp.uniform('other_rate', 0, 0.5)}] #if including 'dart', make sure to set 'n_estimators'
        #metric_list = ['MAE', 'RMSE'] 
        #for classification comment out the line above and uncomment the line below
        metric_list = ['auc'] #modify as required for other classification metrics
        objective_list_reg = ['huber', 'gamma', 'fair', 'tweedie']
        objective_list_class = ['binary', 'cross_entropy']
        #for classification set objective_list = objective_list_class
        objective_list = objective_list_class

        space ={'boosting' : hp.choice('boosting', boosting_list),
                'num_leaves' : hp.quniform('num_leaves', 2, LGBM_MAX_LEAVES, 1),
                'max_depth': hp.quniform('max_depth', 2, LGBM_MAX_DEPTH, 1),
                'max_bin': hp.quniform('max_bin', 32, 255, 1),
                'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 256, 1),
                'min_data_in_bin': hp.quniform('min_data_in_bin', 1, 256, 1),
                'min_gain_to_split' : hp.quniform('min_gain_to_split', 0.1, 5, 0.01),
                'lambda_l1' : hp.uniform('lambda_l1', 0, 5),
                'lambda_l2' : hp.uniform('lambda_l2', 0, 5),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'metric' : hp.choice('metric', metric_list),
                'objective' : hp.choice('objective', objective_list),
                'feature_fraction' : hp.quniform('feature_fraction', 0.5, 1, 0.01),
                'bagging_fraction' : hp.quniform('bagging_fraction', 0.5, 1, 0.01),
                'verbose': -1,
                'n_jobs': 1
            }
        
        #optional: activate GPU for LightGBM
        #follow compilation steps here:
        #https://www.kaggle.com/vinhnguyen/gpu-acceleration-for-lightgbm/
        #then uncomment lines below:
        #space['device'] = 'gpu'
        #space['gpu_platform_id'] = 0,
        #space['gpu_device_id'] =  0

        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
                
        #fmin() will return the index of values chosen from the lists/arrays in 'space'
        #to obtain actual values, index values are used to subset the original lists/arrays
        best['boosting'] = boosting_list[best['boosting']]['boosting']#nested dict, index twice
        best['metric'] = metric_list[best['metric']]
        best['objective'] = objective_list[best['objective']]
                
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    #==========
    #XGBoost
    #==========
    
    if package=='xgb':
        
        print('Running {} rounds of XGBoost parameter optimisation:'.format(num_evals))
        #clear space
        gc.collect()
        
        integer_params = ['max_depth']
        
        def objective(space_params):
            
            for param in integer_params:
                space_params[param] = int(space_params[param])
                
            #extract multiple nested tree_method conditional parameters
            #libera te tutemet ex inferis
            if space_params['tree_method']['tree_method'] == 'hist':
                max_bin = space_params['tree_method'].get('max_bin')
                space_params['max_bin'] = int(max_bin)
                if space_params['tree_method']['grow_policy']['grow_policy']['grow_policy'] == 'depthwise':
                    grow_policy = space_params['tree_method'].get('grow_policy').get('grow_policy').get('grow_policy')
                    space_params['grow_policy'] = grow_policy
                    space_params['tree_method'] = 'hist'
                else:
                    max_leaves = space_params['tree_method']['grow_policy']['grow_policy'].get('max_leaves')
                    space_params['grow_policy'] = 'lossguide'
                    space_params['max_leaves'] = int(max_leaves)
                    space_params['tree_method'] = 'hist'
            else:
                space_params['tree_method'] = space_params['tree_method'].get('tree_method')
                
            #for classification replace EVAL_METRIC_XGB_REG with EVAL_METRIC_XGB_CLASS
            cv_results = xgb.cv(space_params, train, nfold=N_FOLDS, metrics=[EVAL_METRIC_XGB_CLASS],
                             early_stopping_rounds=100, stratified=True, seed=42, verbose_eval=0)
            
            #best_loss = cv_results['test-mae-mean'].iloc[-1] #or 'test-rmse-mean' if using RMSE
            #for classification, comment out the line above and uncomment the line below:
            best_loss = 1 - cv_results['test-auc-mean'].iloc[-1]
            #if necessary, replace 'test-auc-mean' with 'test-[your-preferred-metric]-mean'
            return{'loss':best_loss, 'status': STATUS_OK }
        
        train = xgb.DMatrix(data, labels)
        
        #integer and string parameters, used with hp.choice()
        boosting_list = ['gbtree', 'gblinear'] #if including 'dart', make sure to set 'n_estimators'
        #metric_list = ['MAE', 'RMSE'] 
        #for classification comment out the line above and uncomment the line below
        metric_list = ['auc']
        #modify as required for other classification metrics classification
        
        tree_method = [{'tree_method' : 'exact'},
               {'tree_method' : 'approx'},
               {'tree_method' : 'hist',
                'max_bin': hp.quniform('max_bin', 2**3, 2**7, 1),
                'grow_policy' : {'grow_policy': {'grow_policy':'depthwise'},
                                'grow_policy' : {'grow_policy':'lossguide',
                                                  'max_leaves': hp.quniform('max_leaves', 32, XGB_MAX_LEAVES, 1)}}}]
        
        #if using GPU, replace 'exact' with 'gpu_exact' and 'hist' with
        #'gpu_hist' in the nested dictionary above
        
        objective_list_reg = ['reg:linear', 'reg:gamma', 'reg:tweedie']
        objective_list_class = ['reg:logistic', 'binary:logistic']
        #for classification change line below to 'objective_list = objective_list_class'
        objective_list = objective_list_class
        
        space ={'boosting' : hp.choice('boosting', boosting_list),
                'tree_method' : hp.choice('tree_method', tree_method),
                'max_depth': hp.quniform('max_depth', 2, XGB_MAX_DEPTH, 1),
                'reg_alpha' : hp.uniform('reg_alpha', 0, 5),
                'reg_lambda' : hp.uniform('reg_lambda', 0, 5),
                'min_child_weight' : hp.uniform('min_child_weight', 0, 5),
                'gamma' : hp.uniform('gamma', 0, 5),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'eval_metric' : hp.choice('eval_metric', metric_list),
                'objective' : hp.choice('objective', objective_list),
                'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1, 0.01),
                'colsample_bynode' : hp.quniform('colsample_bynode', 0.1, 1, 0.01),
                'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),
                'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
                'nthread' : -1,
                'verbosity': 0,
            }
        
        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
        
        best['tree_method'] = tree_method[best['tree_method']]['tree_method']
        best['boosting'] = boosting_list[best['boosting']]
        best['eval_metric'] = metric_list[best['eval_metric']]
        best['objective'] = objective_list[best['objective']]
        
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        if 'max_leaves' in best:
            best['max_leaves'] = int(best['max_leaves'])
        if 'max_bin' in best:
            best['max_bin'] = int(best['max_bin'])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    #==========
    #CatBoost
    #==========
    
    if package=='cb':
        
        print('Running {} rounds of CatBoost parameter optimisation:'.format(num_evals))
        
        #clear memory 
        gc.collect()
            
        integer_params = ['depth',
                          #'one_hot_max_size', #for categorical data
                          'min_data_in_leaf',
                          'max_bin']
        
        def objective(space_params):
                        
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])
                
            #extract nested conditional parameters
            if space_params['bootstrap_type']['bootstrap_type'] == 'Bayesian':
                bagging_temp = space_params['bootstrap_type'].get('bagging_temperature')
                space_params['bagging_temperature'] = bagging_temp
                
            if space_params['grow_policy']['grow_policy'] == 'LossGuide':
                max_leaves = space_params['grow_policy'].get('max_leaves')
                space_params['max_leaves'] = int(max_leaves)
                
            space_params['bootstrap_type'] = space_params['bootstrap_type']['bootstrap_type']
            space_params['grow_policy'] = space_params['grow_policy']['grow_policy']
                           
            #random_strength cannot be < 0
            space_params['random_strength'] = max(space_params['random_strength'], 0)
            #fold_len_multiplier cannot be < 1
            space_params['fold_len_multiplier'] = max(space_params['fold_len_multiplier'], 1)
                       
            #for classification set stratified=True
            cv_results = cb.cv(train, space_params, fold_count=N_FOLDS, 
                             early_stopping_rounds=25, stratified=True, partition_random_seed=42)
           
            #best_loss = cv_results['test-MAE-mean'].iloc[-1] #'test-RMSE-mean' for RMSE
            #for classification, comment out the line above and uncomment the line below:
            best_loss = cv_results['test-Logloss-mean'].iloc[-1]
            #if necessary, replace 'test-Logloss-mean' with 'test-[your-preferred-metric]-mean'
            
            return{'loss':best_loss, 'status': STATUS_OK}
        
        train = cb.Pool(data, labels.astype('float32'))
        
        #integer and string parameters, used with hp.choice()
        bootstrap_type = [#{'bootstrap_type':'Poisson'}, 
                           {'bootstrap_type':'Bayesian',
                            'bagging_temperature' : hp.loguniform('bagging_temperature', np.log(1), np.log(50))},
                          {'bootstrap_type':'Bernoulli'}] 
        LEB = ['No', 'AnyImprovement'] #remove 'Armijo' if not using GPU
        #score_function = ['Correlation', 'L2', 'NewtonCorrelation', 'NewtonL2']
        grow_policy = [{'grow_policy':'SymmetricTree'},
                       {'grow_policy':'Depthwise'},
                       {'grow_policy':'Lossguide',
                        'max_leaves': hp.quniform('max_leaves', 2, 32, 1)}]
        eval_metric_list_reg = ['MAE', 'RMSE']
        eval_metric_list_class = ['Logloss', 'AUC']
        #for classification change line below to 'eval_metric_list = eval_metric_list_class'
        eval_metric_list = eval_metric_list_class
                
        space ={'depth': hp.quniform('depth', 2, CB_MAX_DEPTH, 1),
                'max_bin' : hp.quniform('max_bin', 1, 32, 1), #if using CPU just set this to 254
                'l2_leaf_reg' : hp.uniform('l2_leaf_reg', 0, 5),
                'min_data_in_leaf' : hp.quniform('min_data_in_leaf', 1, 50, 1),
                'random_strength' : hp.loguniform('random_strength', np.log(0.005), np.log(5)),
                #'one_hot_max_size' : hp.quniform('one_hot_max_size', 2, 16, 1), #uncomment if using categorical features
                'bootstrap_type' : hp.choice('bootstrap_type', bootstrap_type),
                'learning_rate' : hp.uniform('learning_rate', 0.05, 0.25),
                'eval_metric' : hp.choice('eval_metric', eval_metric_list),
                'objective' : OBJECTIVE_CB_CLASS,
                #'score_function' : hp.choice('score_function', score_function), #crashes kernel - reason unknown
                'leaf_estimation_backtracking' : hp.choice('leaf_estimation_backtracking', LEB),
                'grow_policy': hp.choice('grow_policy', grow_policy),
                #'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),# CPU only
                'fold_len_multiplier' : hp.loguniform('fold_len_multiplier', np.log(1.01), np.log(2.5)),
                'od_type' : 'Iter',
                'od_wait' : 25,
                'task_type' : 'GPU',
                'verbose' : 0,
            }
        
        #optional: run CatBoost without GPU
        #uncomment line below
        space['task_type'] = 'CPU'
            
        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
        
        #unpack nested dicts first
        best['bootstrap_type'] = bootstrap_type[best['bootstrap_type']]['bootstrap_type']
        best['grow_policy'] = grow_policy[best['grow_policy']]['grow_policy']
        best['eval_metric'] = eval_metric_list[best['eval_metric']]
        
        #best['score_function'] = score_function[best['score_function']] 
        #best['leaf_estimation_method'] = LEM[best['leaf_estimation_method']] #CPU only
        best['leaf_estimation_backtracking'] = LEB[best['leaf_estimation_backtracking']]        
        
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        if 'max_leaves' in best:
            best['max_leaves'] = int(best['max_leaves'])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    else:
        print('Package not recognised. Please use "lgbm" for LightGBM, "xgb" for XGBoost or "cb" for CatBoost.')

In [61]:
xgb_params = quick_hyperopt(X_train, y_train, 'xgb', 1000)

Running 1000 rounds of XGBoost parameter optimisation:
100%|███████████████████████████████████████████| 1000/1000 [10:08<00:00,  1.64trial/s, best loss: 0.15438380000000007]
{boosting: gblinear
colsample_bylevel: 0.51
colsample_bynode: 0.78
colsample_bytree: 0.63
eval_metric: auc
gamma: 4.810365717428452
learning_rate: 0.17634171799584678
max_depth: 11
min_child_weight: 4.235461704433606
objective: reg:logistic
reg_alpha: 0.006389914498800886
reg_lambda: 2.1955950101749937
subsample: 0.5
tree_method: approx}


In [62]:
xgb_params

{'boosting': 'gblinear',
 'colsample_bylevel': 0.51,
 'colsample_bynode': 0.78,
 'colsample_bytree': 0.63,
 'eval_metric': 'auc',
 'gamma': 4.810365717428452,
 'learning_rate': 0.17634171799584678,
 'max_depth': 11,
 'min_child_weight': 4.235461704433606,
 'objective': 'reg:logistic',
 'reg_alpha': 0.006389914498800886,
 'reg_lambda': 2.1955950101749937,
 'subsample': 0.5,
 'tree_method': 'approx'}

In [13]:
xgb_params = {
    'boosting': 'gblinear',
    'colsample_bylevel': 0.51,
    'colsample_bynode': 0.78,
    'colsample_bytree': 0.63,
    'eval_metric': 'auc',
    'gamma': 4.810365717428452,
    'learning_rate': 0.17634171799584678,
    'max_depth': 11,
    'min_child_weight': 4.235461704433606,
    'objective': 'reg:logistic',
    'reg_alpha': 0.006389914498800886,
    'reg_lambda': 2.1955950101749937,
    'subsample': 0.5,
    'tree_method': 'approx'
}

In [14]:
clf_xgb = xgb.XGBClassifier()
clf_xgb.set_params(**xgb_params)
clf_xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=5)

Parameters: { boosting } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-auc:0.81563	validation_1-auc:0.83390
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 5 rounds.
[1]	validation_0-auc:0.84048	validation_1-auc:0.85461
[2]	validation_0-auc:0.84531	validation_1-auc:0.85670
[3]	validation_0-auc:0.84714	validation_1-auc:0.85668
[4]	validation_0-auc:0.84980	validation_1-auc:0.85758
[5]	validation_0-auc:0.85298	validation_1-auc:0.85917
[6]	validation_0-auc:0.85312	validation_1-auc:0.85964
[7]	validation_0-auc:0.85593	validation_1-auc:0.85991
[8]	validation_0-auc:0.85692	validation_1-auc:0.86130
[9]	validation_0-auc:0.85690	validation_1-auc:0.86221
[10]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', boosting='gblinear',
              colsample_bylevel=0.51, colsample_bynode=0.78,
              colsample_bytree=0.63, eval_metric='auc', gamma=4.810365717428452,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.17634171799584678, max_delta_step=0, max_depth=11,
              min_child_weight=4.235461704433606, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, objective='reg:logistic', random_state=0,
              reg_alpha=0.006389914498800886, reg_lambda=2.1955950101749937,
              scale_pos_weight=1, subsample=0.5, tree_method='approx',
              validate_parameters=1, verbosity=None)

In [15]:
from sklearn.metrics import roc_auc_score
y_pred_te = clf_xgb.predict(X_test)
roc_auc_score(y_pred_te, y_test)

0.7696973899109426

In [66]:
lgb_params = quick_hyperopt(X_train, y_train, 'lgbm', 1000)

Running 1000 rounds of LightGBM parameter optimisation:
bagging_fraction is set=0.81, subsample=1.0 will be ignored. Current value: bagging_fraction=0.81                      
100%|███████████████████████████████████████████| 1000/1000 [11:45<00:00,  1.42trial/s, best loss: 0.15284779411501825]
{bagging_fraction: 0.92
boosting: goss
feature_fraction: 0.59
lambda_l1: 1.7651925160257511
lambda_l2: 1.7825505465218088
learning_rate: 0.16094579301919001
max_bin: 184
max_depth: 19
metric: auc
min_data_in_bin: 250
min_data_in_leaf: 199
min_gain_to_split: 2.04
num_leaves: 1411
objective: binary
other_rate: 0.22505088017906216
top_rate: 0.35438312429002566}


In [16]:
lgb_params = {
    'bagging_fraction': 0.92,
    'boosting': 'goss',
    'feature_fraction': 0.59,
    'lambda_l1': 1.7651925160257511,
    'lambda_l2': 1.7825505465218088,
    'learning_rate': 0.16094579301919001,
    'max_bin': 184,
    'max_depth': 19,
    'metric': 'auc',
    'min_data_in_bin': 250,
    'min_data_in_leaf': 199,
    'min_gain_to_split': 2.04,
    'num_leaves': 1411,
    'objective': 'binary',
    'other_rate': 0.22505088017906216,
    'top_rate': 0.35438312429002566
}

lgb_params = {
    'bagging_fraction': 0.71,
    'boosting': 'goss',
    'feature_fraction': 0.51,
    'lambda_l1': 1.081403239398178,
    'lambda_l2': 2.0663112415349176,
    'learning_rate': 0.06809338955140036,
    'max_bin': 50,
    'max_depth': 18,
    'metric': 'auc',
    'min_data_in_bin': 195,
    'min_data_in_leaf': 34,
    'min_gain_to_split': 2.48,
    'num_leaves': 1900,
    'objective': 'cross_entropy',
    'other_rate': 0.3489543869599704,
    'top_rate': 0.040916784491823455,
    'random_state': 42
}

In [17]:
clf_lgb = lgb.LGBMClassifier()
clf_lgb.set_params(**lgb_params) 
clf_lgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5)

[1]	valid_0's auc: 0.841141
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.851137
[3]	valid_0's auc: 0.852291
[4]	valid_0's auc: 0.85224
[5]	valid_0's auc: 0.854167
[6]	valid_0's auc: 0.856377
[7]	valid_0's auc: 0.8564
[8]	valid_0's auc: 0.857155
[9]	valid_0's auc: 0.857201
[10]	valid_0's auc: 0.858432
[11]	valid_0's auc: 0.859229
[12]	valid_0's auc: 0.859815
[13]	valid_0's auc: 0.859208
[14]	valid_0's auc: 0.860169
[15]	valid_0's auc: 0.860679
[16]	valid_0's auc: 0.861034
[17]	valid_0's auc: 0.861736
[18]	valid_0's auc: 0.861634
[19]	valid_0's auc: 0.861644
[20]	valid_0's auc: 0.861754
[21]	valid_0's auc: 0.861459
[22]	valid_0's auc: 0.861904
[23]	valid_0's auc: 0.861875
[24]	valid_0's auc: 0.862095
[25]	valid_0's auc: 0.862553
[26]	valid_0's auc: 0.862524
[27]	valid_0's auc: 0.86267
[28]	valid_0's auc: 0.862876
[29]	valid_0's auc: 0.862769
[30]	valid_0's auc: 0.86281
[31]	valid_0's auc: 0.862914
[32]	valid_0's auc: 0.862903
[33]	valid_0's auc: 0.8630

LGBMClassifier(bagging_fraction=0.92, boosting='goss', feature_fraction=0.59,
               lambda_l1=1.7651925160257511, lambda_l2=1.7825505465218088,
               learning_rate=0.16094579301919001, max_bin=184, max_depth=19,
               metric='auc', min_data_in_bin=250, min_data_in_leaf=199,
               min_gain_to_split=2.04, num_leaves=1411, objective='binary',
               other_rate=0.22505088017906216, top_rate=0.35438312429002566)

In [18]:
from sklearn.metrics import roc_auc_score
y_pred_te = clf_lgb.predict(X_test)
roc_auc_score(y_pred_te, y_test)

0.764436705876736

In [21]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
clf_tabnet = TabNetClassifier(n_d=32, n_a=32, seed=0)
clf_tabnet.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=1000 , 
    drop_last=False,
)

Device used : cpu
epoch 0  | loss: 0.68866 | train_auc: 0.71802 | valid_auc: 0.6988  |  0:00:02s
epoch 1  | loss: 0.53482 | train_auc: 0.75225 | valid_auc: 0.7553  |  0:00:04s
epoch 2  | loss: 0.5015  | train_auc: 0.754   | valid_auc: 0.76153 |  0:00:06s
epoch 3  | loss: 0.48533 | train_auc: 0.78012 | valid_auc: 0.81001 |  0:00:09s
epoch 4  | loss: 0.47436 | train_auc: 0.80202 | valid_auc: 0.81279 |  0:00:12s
epoch 5  | loss: 0.46137 | train_auc: 0.8167  | valid_auc: 0.8186  |  0:00:14s
epoch 6  | loss: 0.45532 | train_auc: 0.81701 | valid_auc: 0.82122 |  0:00:16s
epoch 7  | loss: 0.45082 | train_auc: 0.82065 | valid_auc: 0.82274 |  0:00:19s
epoch 8  | loss: 0.4504  | train_auc: 0.8282  | valid_auc: 0.83147 |  0:00:21s
epoch 9  | loss: 0.44048 | train_auc: 0.83433 | valid_auc: 0.83261 |  0:00:24s
epoch 10 | loss: 0.43522 | train_auc: 0.8387  | valid_auc: 0.83114 |  0:00:27s
epoch 11 | loss: 0.43191 | train_auc: 0.84145 | valid_auc: 0.83574 |  0:00:29s
epoch 12 | loss: 0.43115 | train_a

In [22]:
from sklearn.metrics import roc_auc_score
y_pred_te = clf_tabnet.predict(X_test)
roc_auc_score(y_pred_te, y_test)

0.76954185520362

In [6]:
cb_params = {
    'bootstrap_type': 'Bernoulli',
    'depth': 2,
    'eval_metric': 'AUC',
    'fold_len_multiplier': 1.8138033787766388,
    'grow_policy': 'Lossguide',
    'l2_leaf_reg': 2.6145900508841096,
    'leaf_estimation_backtracking': 'AnyImprovement',
    'learning_rate': 0.2211543774884907,
    'max_bin': 32,
    'max_leaves': 17,
    'min_data_in_leaf': 10,
    'random_strength': 2.2878538537803936,
    'random_state': 0
}

In [25]:
clf_cb = cb.CatBoostClassifier(**cb_params, early_stopping_rounds=5)
clf_cb.fit(X_train, y_train, eval_set = (X_test, y_test))

0:	test: 0.7878389	best: 0.7878389 (0)	total: 4.2ms	remaining: 4.2s
1:	test: 0.8237718	best: 0.8237718 (1)	total: 7.3ms	remaining: 3.64s
2:	test: 0.8202252	best: 0.8237718 (1)	total: 10.3ms	remaining: 3.44s
3:	test: 0.8280469	best: 0.8280469 (3)	total: 13.7ms	remaining: 3.42s
4:	test: 0.8387669	best: 0.8387669 (4)	total: 17.1ms	remaining: 3.4s
5:	test: 0.8398990	best: 0.8398990 (5)	total: 20.3ms	remaining: 3.36s
6:	test: 0.8414258	best: 0.8414258 (6)	total: 23.7ms	remaining: 3.36s
7:	test: 0.8461175	best: 0.8461175 (7)	total: 26.8ms	remaining: 3.33s
8:	test: 0.8489214	best: 0.8489214 (8)	total: 30.1ms	remaining: 3.31s
9:	test: 0.8523684	best: 0.8523684 (9)	total: 33.1ms	remaining: 3.28s
10:	test: 0.8535251	best: 0.8535251 (10)	total: 36.1ms	remaining: 3.25s
11:	test: 0.8547440	best: 0.8547440 (11)	total: 39.1ms	remaining: 3.22s
12:	test: 0.8539948	best: 0.8547440 (11)	total: 42.1ms	remaining: 3.19s
13:	test: 0.8540168	best: 0.8547440 (11)	total: 45.3ms	remaining: 3.19s
14:	test: 0.8540

<catboost.core.CatBoostClassifier at 0x14eadeecbb0>

In [26]:
from sklearn.metrics import roc_auc_score
y_pred_te = clf_cb.predict(X_test)
roc_auc_score(y_pred_te, y_test)

0.7704800681410423