## Tuning LightGBM Model

Dataset: 

- https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset?select=diabetes_binary_health_indicators_BRFSS2015.csv

**=========================================================**

### Data Preparation

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

#### Load Dataset

In [2]:
# I will use function to reduce Pandas dataframe memory usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [None]:
df = import_data('diabetes_binary_health_indicators_BRFSS2015(1).csv')

Memory usage of dataframe is 42.58 MB
Memory usage after optimization is: 10.64 MB
Decreased by 75.0%


In [4]:
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


#### Variable Types

LightGBM is able to handle categorical variables out-of-the-box and I will use this feature.

However, in real life scenario, categorical features encoding might get better performance models.

In [5]:
x_columns = list(df.columns)
response_column = 'Diabetes_binary'
x_columns.remove(response_column)

categorical_columns = ['HighBP', 'HighChol', 'CholCheck', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost',
       'DiffWalk', 'Sex',
       'Income']
numerical_columns = list(set(x_columns)-set(categorical_columns))
print("Response column:", response_column)
print("Categorical columns:", categorical_columns)
print("Numerical columns:", numerical_columns)

# This is only needed if you are planning to use Sklearn API
for column in categorical_columns:
    print("Processing column:", column)
    df[column] = df[column].astype('int64') # Convert float to integer then to categorical
    df[column] = df[column].astype('category')

Response column: Diabetes_binary
Categorical columns: ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex', 'Income']
Numerical columns: ['Education', 'Age', 'MentHlth', 'GenHlth', 'BMI', 'PhysHlth']
Processing column: HighBP
Processing column: HighChol
Processing column: CholCheck
Processing column: Smoker
Processing column: Stroke
Processing column: HeartDiseaseorAttack
Processing column: PhysActivity
Processing column: Fruits
Processing column: Veggies
Processing column: HvyAlcoholConsump
Processing column: AnyHealthcare
Processing column: NoDocbcCost
Processing column: DiffWalk
Processing column: Sex
Processing column: Income


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Diabetes_binary       253680 non-null  float16 
 1   HighBP                253680 non-null  category
 2   HighChol              253680 non-null  category
 3   CholCheck             253680 non-null  category
 4   BMI                   253680 non-null  float16 
 5   Smoker                253680 non-null  category
 6   Stroke                253680 non-null  category
 7   HeartDiseaseorAttack  253680 non-null  category
 8   PhysActivity          253680 non-null  category
 9   Fruits                253680 non-null  category
 10  Veggies               253680 non-null  category
 11  HvyAlcoholConsump     253680 non-null  category
 12  AnyHealthcare         253680 non-null  category
 13  NoDocbcCost           253680 non-null  category
 14  GenHlth               253680 non-nul

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[x_columns], 
                                                    df[response_column], 
                                                    test_size=0.20, 
                                                    random_state=314, 
                                                    stratify=df[response_column])

In [8]:
print("Train dimensions:",X_train.shape)
print("Test dimensions:",X_test.shape)

Train dimensions: (202944, 21)
Test dimensions: (50736, 21)


In [9]:
df[response_column].value_counts()

Diabetes_binary
0.0    218334
1.0     35346
Name: count, dtype: int64

In [10]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 202944 entries, 210000 to 68327
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   HighBP                202944 non-null  category
 1   HighChol              202944 non-null  category
 2   CholCheck             202944 non-null  category
 3   BMI                   202944 non-null  float16 
 4   Smoker                202944 non-null  category
 5   Stroke                202944 non-null  category
 6   HeartDiseaseorAttack  202944 non-null  category
 7   PhysActivity          202944 non-null  category
 8   Fruits                202944 non-null  category
 9   Veggies               202944 non-null  category
 10  HvyAlcoholConsump     202944 non-null  category
 11  AnyHealthcare         202944 non-null  category
 12  NoDocbcCost           202944 non-null  category
 13  GenHlth               202944 non-null  float16 
 14  MentHlth              202944 non-null

### Train LightGBM

In [11]:
X_train.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
210000,1,1,1,24.0,1,0,0,1,0,0,0,1,0,4.0,0.0,5.0,1,0,12.0,5.0,6
27201,0,1,1,30.0,0,0,0,1,1,1,0,1,0,2.0,2.0,1.0,0,0,6.0,6.0,7
31381,1,0,1,33.0,0,0,0,1,0,0,0,1,1,4.0,0.0,0.0,0,0,8.0,5.0,7
25244,1,0,1,22.0,0,0,0,1,1,1,0,1,0,2.0,30.0,0.0,1,1,10.0,5.0,5
32644,0,1,1,32.0,1,1,0,1,1,1,0,1,0,3.0,0.0,20.0,0,1,9.0,6.0,7


In [12]:
train_data = lgb.Dataset(data=X_train, label=y_train, params={"verbose":-1}, categorical_feature=categorical_columns)
test_data = lgb.Dataset(data=X_test, label=y_test, params={"verbose":-1}, categorical_feature=categorical_columns)
lgb_clf = lgb.train(params={"verbose":-1},
                    train_set=train_data)

In [13]:
from sklearn.metrics import roc_auc_score
print("AUC score on Test dataset:", roc_auc_score(y_test, lgb_clf.predict(X_test)))
print("AUC score on Train dataset:", roc_auc_score(y_train, lgb_clf.predict(X_train)))

AUC score on Test dataset: 0.8291245162663716
AUC score on Train dataset: 0.8391701749746283


#### Hyper-Parameters Optimization

First try produce pretty close AUC metric for both train and test dataset. I will look for ways to improve it.

In [14]:
from optuna.integration import LightGBMPruningCallback
import tqdm as notebook_tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
study_model_iteractions = {}

def objective(trial, X, y):
    param_grid = {
        "num_iterations": 10000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, step=0.05),
        "num_leaves": trial.suggest_int("num_leaves", 50, 150, step=5),
        "max_depth": trial.suggest_int("max_depth", 5, 20, step=2),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 100, 1000, step=100),
        ##############################
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        #"lambda_l1": trial.suggest_float("lambda_l1", 0.01, 0.1, step=0.01),
        #"lambda_l2": trial.suggest_float("lambda_l2", 0.01, 0.1, step=0.01),
        ########################
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.8, 1.0, step=0.1),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [5]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.8, 1.0, step=0.1),
        #######################
        "is_unbalance": trial.suggest_categorical("is_unbalance",[True, False]),
        ########################
        "verbose": -1,
        "objective":"binary",
        "metric":"auc"
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1121218)

    cv_scores = np.empty(5)
    cv_iteractions = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_valid = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[test_idx]

        train_data = lgb.Dataset(data=X_train, label=y_train, params={"verbose":-1}, categorical_feature=categorical_columns)
        valid_data = lgb.Dataset(data=X_valid, label=y_valid, params={"verbose":-1}, categorical_feature=categorical_columns)
        lgb_clf = lgb.train(params=param_grid,
                            train_set=train_data,
                            valid_sets=[valid_data],
                            categorical_feature=categorical_columns,
                            callbacks=[LightGBMPruningCallback(trial, "auc"),
                                        lgb.early_stopping(stopping_rounds=5)]  
                            )
        preds = lgb_clf.predict(X_valid)
        cv_scores[idx] = roc_auc_score(y_valid, preds)
        cv_iteractions[idx] = lgb_clf.best_iteration
    
    study_model_iteractions[trial.number] = np.mean(cv_iteractions)

    return np.mean(cv_scores)

In [15]:
study = optuna.create_study(direction="maximize", study_name="LGBM Classifier")
func = lambda trial: objective(trial, X_train, y_train)
study.optimize(func, n_trials=100)

[I 2024-12-09 17:28:18,633] A new study created in memory with name: LGBM Classifier


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[45]	valid_0's auc: 0.83364
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[51]	valid_0's auc: 0.825236
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[49]	valid_0's auc: 0.830115
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[44]	valid_0's auc: 0.830377
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:28:21,690] Trial 0 finished with value: 0.8299180999828092 and parameters: {'learning_rate': 0.11, 'num_leaves': 60, 'max_depth': 9, 'min_data_in_leaf': 1000, 'lambda_l1': 2.7746339916922e-05, 'lambda_l2': 0.00020954551165614708, 'bagging_fraction': 1.0, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.


Early stopping, best iteration is:
[49]	valid_0's auc: 0.830223
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[39]	valid_0's auc: 0.833029
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[46]	valid_0's auc: 0.825026
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[38]	valid_0's auc: 0.82998
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[42]	valid_0's auc: 0.830432
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:28:27,030] Trial 1 finished with value: 0.8297834442576921 and parameters: {'learning_rate': 0.11, 'num_leaves': 125, 'max_depth': 15, 'min_data_in_leaf': 1000, 'lambda_l1': 1.8499936338264708e-08, 'lambda_l2': 0.0018418522022393801, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.8, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.


Early stopping, best iteration is:
[46]	valid_0's auc: 0.830451
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[536]	valid_0's auc: 0.832977




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[545]	valid_0's auc: 0.824909




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[423]	valid_0's auc: 0.830017




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[467]	valid_0's auc: 0.830197




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[623]	valid_0's auc: 0.830595


[I 2024-12-09 17:28:50,269] Trial 2 finished with value: 0.8297391152389446 and parameters: {'learning_rate': 0.01, 'num_leaves': 135, 'max_depth': 9, 'min_data_in_leaf': 700, 'lambda_l1': 1.579123217569578e-05, 'lambda_l2': 5.3650334350163566e-05, 'bagging_fraction': 1.0, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': False}. Best is trial 0 with value: 0.8299180999828092.


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[40]	valid_0's auc: 0.832857
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[42]	valid_0's auc: 0.82492
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[33]	valid_0's auc: 0.829399
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[36]	valid_0's auc: 0.830499
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:28:53,004] Trial 3 finished with value: 0.8297010286061879 and parameters: {'learning_rate': 0.11, 'num_leaves': 80, 'max_depth': 9, 'min_data_in_leaf': 100, 'lambda_l1': 6.5295516768436784e-06, 'lambda_l2': 0.5226566403818748, 'bagging_fraction': 1.0, 'bagging_freq': 5, 'feature_fraction': 0.8, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.


Early stopping, best iteration is:
[47]	valid_0's auc: 0.830831
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[620]	valid_0's auc: 0.833315




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[389]	valid_0's auc: 0.824717




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[425]	valid_0's auc: 0.829779




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[360]	valid_0's auc: 0.829796




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[472]	valid_0's auc: 0.829862


[I 2024-12-09 17:29:21,265] Trial 4 finished with value: 0.8294937806161975 and parameters: {'learning_rate': 0.01, 'num_leaves': 100, 'max_depth': 11, 'min_data_in_leaf': 100, 'lambda_l1': 9.251733515129592, 'lambda_l2': 0.17283364094528558, 'bagging_fraction': 0.9, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': False}. Best is trial 0 with value: 0.8299180999828092.
[I 2024-12-09 17:29:21,397] Trial 5 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:21,527] Trial 6 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:21,646] Trial 7 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:21,806] Trial 8 pruned. Trial was pruned at iteration 1.


Training until validation scores don't improve for 5 rounds
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[18]	valid_0's auc: 0.831803
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:22,429] Trial 9 pruned. Trial was pruned at iteration 24.
[I 2024-12-09 17:29:22,552] Trial 10 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:22,680] Trial 11 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:22,803] Trial 12 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:22,927] Trial 13 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:23,096] Trial 14 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:23,267] Trial 15 pruned. Trial was pruned at iteration 1.
[I 2024-12-09 17:29:23,403] Trial 16 pruned. Trial was pruned at iteration 0.


Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:23,549] Trial 17 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:23,684] Trial 18 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:23,823] Trial 19 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:23,966] Trial 20 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:24,140] Trial 21 pruned. Trial was pruned at iteration 1.
[I 2024-12-09 17:29:24,309] Trial 22 pruned. Trial was pruned at iteration 1.


Training until validation scores don't improve for 5 rounds
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:24,460] Trial 23 pruned. Trial was pruned at iteration 1.
[I 2024-12-09 17:29:24,634] Trial 24 pruned. Trial was pruned at iteration 1.


Training until validation scores don't improve for 5 rounds
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:24,826] Trial 25 pruned. Trial was pruned at iteration 1.
[I 2024-12-09 17:29:24,981] Trial 26 pruned. Trial was pruned at iteration 0.


Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:25,139] Trial 27 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:25,444] Trial 28 pruned. Trial was pruned at iteration 11.


Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:25,605] Trial 29 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:25,747] Trial 30 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:25,901] Trial 31 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:26,054] Trial 32 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:26,204] Trial 33 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:26,369] Trial 34 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:26,576] Trial 35 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:26,725] Trial 36 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:26,867] Trial 37 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:27,023] Trial 38 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:27,167] Trial 39 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:27,320] Trial 40 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:27,502] Trial 41 pruned. Trial was pruned at

Training until validation scores don't improve for 5 rounds
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:27,855] Trial 43 pruned. Trial was pruned at iteration 1.
[I 2024-12-09 17:29:28,022] Trial 44 pruned. Trial was pruned at iteration 1.


Training until validation scores don't improve for 5 rounds
Training until validation scores don't improve for 5 rounds




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[39]	valid_0's auc: 0.83301
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[33]	valid_0's auc: 0.825324
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[35]	valid_0's auc: 0.830124
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[32]	valid_0's auc: 0.829924
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:30,526] Trial 45 finished with value: 0.8296739246580594 and parameters: {'learning_rate': 0.16000000000000003, 'num_leaves': 135, 'max_depth': 7, 'min_data_in_leaf': 800, 'lambda_l1': 0.028515121773680256, 'lambda_l2': 0.10638215596383571, 'bagging_fraction': 0.9, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.


Early stopping, best iteration is:
[45]	valid_0's auc: 0.829987
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[43]	valid_0's auc: 0.833364
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[51]	valid_0's auc: 0.825385
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[36]	valid_0's auc: 0.829536
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[34]	valid_0's auc: 0.830387
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:32,979] Trial 46 finished with value: 0.8297563621573184 and parameters: {'learning_rate': 0.16000000000000003, 'num_leaves': 135, 'max_depth': 7, 'min_data_in_leaf': 800, 'lambda_l1': 0.00022994737833337237, 'lambda_l2': 0.04625924115188199, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.
[I 2024-12-09 17:29:33,127] Trial 47 pruned. Trial was pruned at iteration 0.


Early stopping, best iteration is:
[33]	valid_0's auc: 0.83011


[I 2024-12-09 17:29:33,282] Trial 48 pruned. Trial was pruned at iteration 2.
[I 2024-12-09 17:29:33,422] Trial 49 pruned. Trial was pruned at iteration 0.


Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:33,575] Trial 50 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:33,720] Trial 51 pruned. Trial was pruned at iteration 2.
[I 2024-12-09 17:29:33,874] Trial 52 pruned. Trial was pruned at iteration 0.


Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:34,028] Trial 53 pruned. Trial was pruned at iteration 2.


Training until validation scores don't improve for 5 rounds
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:34,554] Trial 54 pruned. Trial was pruned at iteration 36.


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[26]	valid_0's auc: 0.833094




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[32]	valid_0's auc: 0.82448




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[20]	valid_0's auc: 0.829884




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[21]	valid_0's auc: 0.829963


[I 2024-12-09 17:29:36,010] Trial 55 finished with value: 0.8295010377553339 and parameters: {'learning_rate': 0.21000000000000002, 'num_leaves': 135, 'max_depth': 7, 'min_data_in_leaf': 700, 'lambda_l1': 0.019590266359572384, 'lambda_l2': 0.005113077325289361, 'bagging_fraction': 1.0, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[27]	valid_0's auc: 0.830084


[I 2024-12-09 17:29:36,158] Trial 56 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:36,414] Trial 57 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 5 rounds




Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:37,030] Trial 58 pruned. Trial was pruned at iteration 21.
[I 2024-12-09 17:29:37,206] Trial 59 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:37,368] Trial 60 pruned. Trial was pruned at iteration 0.


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[29]	valid_0's auc: 0.833318
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[24]	valid_0's auc: 0.824366
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[32]	valid_0's auc: 0.829868
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[25]	valid_0's auc: 0.830015
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:39,200] Trial 61 finished with value: 0.8295302634317727 and parameters: {'learning_rate': 0.21000000000000002, 'num_leaves': 135, 'max_depth': 7, 'min_data_in_leaf': 700, 'lambda_l1': 0.025323949405570888, 'lambda_l2': 0.00932820277393881, 'bagging_fraction': 1.0, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.


Early stopping, best iteration is:
[27]	valid_0's auc: 0.830084
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:39,581] Trial 62 pruned. Trial was pruned at iteration 28.


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[27]	valid_0's auc: 0.832927




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[25]	valid_0's auc: 0.824212




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[23]	valid_0's auc: 0.829628




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[25]	valid_0's auc: 0.828713
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:41,199] Trial 63 finished with value: 0.828982354894652 and parameters: {'learning_rate': 0.26, 'num_leaves': 135, 'max_depth': 7, 'min_data_in_leaf': 800, 'lambda_l1': 0.06411309526831398, 'lambda_l2': 0.008866282061391405, 'bagging_fraction': 1.0, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.
[I 2024-12-09 17:29:41,343] Trial 64 pruned. Trial was pruned at iteration 0.


Early stopping, best iteration is:
[26]	valid_0's auc: 0.829432


[I 2024-12-09 17:29:41,488] Trial 65 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:41,661] Trial 66 pruned. Trial was pruned at iteration 6.
[I 2024-12-09 17:29:41,803] Trial 67 pruned. Trial was pruned at iteration 0.


Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:41,939] Trial 68 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:42,076] Trial 69 pruned. Trial was pruned at iteration 0.


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[27]	valid_0's auc: 0.83301
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[32]	valid_0's auc: 0.824146
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[28]	valid_0's auc: 0.829812
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[23]	valid_0's auc: 0.829937
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:43,873] Trial 70 finished with value: 0.8294150834277767 and parameters: {'learning_rate': 0.21000000000000002, 'num_leaves': 145, 'max_depth': 7, 'min_data_in_leaf': 700, 'lambda_l1': 1.8146208097540097e-05, 'lambda_l2': 0.0005259351999611559, 'bagging_fraction': 1.0, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.


Early stopping, best iteration is:
[33]	valid_0's auc: 0.83017
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[37]	valid_0's auc: 0.833249
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[36]	valid_0's auc: 0.824508
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[27]	valid_0's auc: 0.829724
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[23]	valid_0's auc: 0.830145
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:45,892] Trial 71 finished with value: 0.8295417742560808 and parameters: {'learning_rate': 0.21000000000000002, 'num_leaves': 135, 'max_depth': 7, 'min_data_in_leaf': 700, 'lambda_l1': 0.005112952518852122, 'lambda_l2': 0.006940538067424893, 'bagging_fraction': 1.0, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.


Early stopping, best iteration is:
[27]	valid_0's auc: 0.830083
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:46,224] Trial 72 pruned. Trial was pruned at iteration 23.


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[37]	valid_0's auc: 0.833249
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[34]	valid_0's auc: 0.824527
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[27]	valid_0's auc: 0.829724
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[23]	valid_0's auc: 0.830145
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:48,468] Trial 73 finished with value: 0.8295454482296378 and parameters: {'learning_rate': 0.21000000000000002, 'num_leaves': 130, 'max_depth': 7, 'min_data_in_leaf': 700, 'lambda_l1': 0.004266291869880052, 'lambda_l2': 0.0029457447229353146, 'bagging_fraction': 1.0, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.


Early stopping, best iteration is:
[27]	valid_0's auc: 0.830083
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:48,690] Trial 74 pruned. Trial was pruned at iteration 3.
[I 2024-12-09 17:29:48,831] Trial 75 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:48,982] Trial 76 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:49,136] Trial 77 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:49,270] Trial 78 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:49,432] Trial 79 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:49,622] Trial 80 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:49,768] Trial 81 pruned. Trial was pruned at iteration 2.


Training until validation scores don't improve for 5 rounds
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[29]	valid_0's auc: 0.833203
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[32]	valid_0's auc: 0.82448
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[20]	valid_0's auc: 0.829884
Training until validation scores don't improve for 5 rounds




Early stopping, best iteration is:
[29]	valid_0's auc: 0.830139
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:52,050] Trial 82 finished with value: 0.8295580312704651 and parameters: {'learning_rate': 0.21000000000000002, 'num_leaves': 130, 'max_depth': 7, 'min_data_in_leaf': 700, 'lambda_l1': 0.014785702939229721, 'lambda_l2': 0.011598167870750506, 'bagging_fraction': 1.0, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': True}. Best is trial 0 with value: 0.8299180999828092.


Early stopping, best iteration is:
[27]	valid_0's auc: 0.830084


[I 2024-12-09 17:29:52,331] Trial 83 pruned. Trial was pruned at iteration 6.


Training until validation scores don't improve for 5 rounds




Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:52,727] Trial 84 pruned. Trial was pruned at iteration 24.
[I 2024-12-09 17:29:52,871] Trial 85 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:53,043] Trial 86 pruned. Trial was pruned at iteration 1.
[I 2024-12-09 17:29:53,185] Trial 87 pruned. Trial was pruned at iteration 0.


Training until validation scores don't improve for 5 rounds




Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:53,540] Trial 88 pruned. Trial was pruned at iteration 21.
[I 2024-12-09 17:29:53,692] Trial 89 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:53,842] Trial 90 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:54,025] Trial 91 pruned. Trial was pruned at iteration 3.
[I 2024-12-09 17:29:54,176] Trial 92 pruned. Trial was pruned at iteration 2.


Training until validation scores don't improve for 5 rounds
Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:54,413] Trial 93 pruned. Trial was pruned at iteration 7.


Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:54,551] Trial 94 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:54,771] Trial 95 pruned. Trial was pruned at iteration 3.


Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:54,925] Trial 96 pruned. Trial was pruned at iteration 0.
[I 2024-12-09 17:29:55,093] Trial 97 pruned. Trial was pruned at iteration 1.
[I 2024-12-09 17:29:55,229] Trial 98 pruned. Trial was pruned at iteration 0.


Training until validation scores don't improve for 5 rounds


[I 2024-12-09 17:29:55,372] Trial 99 pruned. Trial was pruned at iteration 0.


In [16]:
print(f"\tBest value (AUC): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

print("Best model best_iteration:", study_model_iteractions[study.best_trial.number])

	Best value (AUC): 0.82992
	Best params:
		learning_rate: 0.11
		num_leaves: 60
		max_depth: 9
		min_data_in_leaf: 1000
		lambda_l1: 2.7746339916922e-05
		lambda_l2: 0.00020954551165614708
		bagging_fraction: 1.0
		bagging_freq: 5
		feature_fraction: 1.0
		is_unbalance: True
Best model best_iteration: 47.6


In [17]:
best_params = {"verbose": -1,
                "objective":"binary",
                "metric":"auc"
            }
for key,val in study.best_params.items():
    best_params[key] = val

best_params["num_iterations"] = int(study_model_iteractions[study.best_trial.number])
print(best_params)

{'verbose': -1, 'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.11, 'num_leaves': 60, 'max_depth': 9, 'min_data_in_leaf': 1000, 'lambda_l1': 2.7746339916922e-05, 'lambda_l2': 0.00020954551165614708, 'bagging_fraction': 1.0, 'bagging_freq': 5, 'feature_fraction': 1.0, 'is_unbalance': True, 'num_iterations': 47}


#### Train LightGBM Best Parameters

In [18]:
train_data = lgb.Dataset(data=X_train, label=y_train, params={"verbose":-1}, categorical_feature=categorical_columns)
test_data = lgb.Dataset(data=X_test, label=y_test, params={"verbose":-1}, categorical_feature=categorical_columns)
best_lgb = lgb.train(params=best_params,
                    train_set=train_data) 

In [19]:
print("AUC score on Test dataset:", roc_auc_score(y_test, best_lgb.predict(X_test)))
print("AUC score on Train dataset:", roc_auc_score(y_train, best_lgb.predict(X_train)))

AUC score on Test dataset: 0.8296202804787243
AUC score on Train dataset: 0.8382715548697526


#### Summary

I got only very small improvement in AUC on Testing dataset after model tuning. However, even 1% improvement in the AUC just by tuning hyper-parameters is significant.