## Module

In [1]:
import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

## Dataset

In [2]:
train = pd.read_csv('../input/titanic/train.csv')
print(train.shape)
train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test = pd.read_csv('../input/titanic/test.csv')
print(test.shape)
test.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
submission = pd.read_csv('../input/titanic/gender_submission.csv')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## Info

In [5]:
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

## Preprocess

In [6]:
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked'
]

target = 'Survived'

In [7]:
categorical_features = ['Sex', 'Embarked']

for col in categorical_features:
    train[col] = train[col].astype('category')

In [8]:
X_train = train[features]
y_train = train[target]
print(X_train.shape)
print(y_train.shape)

(891, 7)
(891,)


In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=123)

print(X_train.shape)
print(y_valid.shape)
print(X_train.shape)
print(y_valid.shape)

(712, 7)
(179,)
(712, 7)
(179,)


## Params

In [10]:
def objective(trial):
    params = {
        'objective': 'binary',
        'learning_rate': trial.suggest_uniform('learning_rate', 0.05, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 32, 128),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.95),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.95)
    }
    
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)
    
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=10,
        num_boost_round=1000,
        early_stopping_rounds=10
    )
    
    y_pred_valid = model.predict(X_valid, num_iteraition=model.best_iteration)
    score = log_loss(y_valid, y_pred_valid)
    return score

In [11]:
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
study.optimize(objective, n_trials=40)

[32m[I 2023-10-24 08:06:36,935][0m A new study created in memory with name: no-name-76e41d59-712c-479d-96b2-828191905d42[0m
[32m[I 2023-10-24 08:06:37,123][0m Trial 0 finished with value: 0.40156446422610337 and parameters: {'learning_rate': 0.07744067519636624, 'num_leaves': 96, 'max_depth': 6, 'colsample_bytree': 0.9002809661679648, 'subsample': 0.8965381085744438}. Best is trial 0 with value: 0.40156446422610337.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.464268	valid_1's binary_logloss: 0.490495
[20]	training's binary_logloss: 0.383967	valid_1's binary_logloss: 0.436336
[30]	training's binary_logloss: 0.345602	valid_1's binary_logloss: 0.413205
[40]	training's binary_logloss: 0.325533	valid_1's binary_logloss: 0.4043
[50]	training's binary_logloss: 0.309229	valid_1's binary_logloss: 0.402149
[60]	training's binary_logloss: 0.294864	valid_1's binary_logloss: 0.410089
Early stopping, best iteration is:
[51]

[32m[I 2023-10-24 08:06:37,279][0m Trial 1 finished with value: 0.41848166402308457 and parameters: {'learning_rate': 0.08117818483929862, 'num_leaves': 68, 'max_depth': 9, 'colsample_bytree': 0.6198495420611051, 'subsample': 0.6954297031030396}. Best is trial 0 with value: 0.40156446422610337.[0m


No further splits with positive gain, best gain: -inf
[20]	training's binary_logloss: 0.388391	valid_1's binary_logloss: 0.453936
[30]	training's binary_logloss: 0.343775	valid_1's binary_logloss: 0.428968
[40]	training's binary_logloss: 0.316727	valid_1's binary_logloss: 0.426392
[50]	training's binary_logloss: 0.295708	valid_1's binary_logloss: 0.420135
[60]	training's binary_logloss: 0.280414	valid_1's binary_logloss: 0.422201
Early stopping, best iteration is:
[53]	training's binary_logloss: 0.290212	valid_1's binary_logloss: 0.418482
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training 

[32m[I 2023-10-24 08:06:37,412][0m Trial 2 finished with value: 0.40268582691492133 and parameters: {'learning_rate': 0.0738832558660675, 'num_leaves': 71, 'max_depth': 9, 'colsample_bytree': 0.7988155963828762, 'subsample': 0.9239588234024313}. Best is trial 0 with value: 0.40156446422610337.[0m


No further splits with positive gain, best gain: -inf
[50]	training's binary_logloss: 0.291311	valid_1's binary_logloss: 0.407174
Early stopping, best iteration is:
[40]	training's binary_logloss: 0.310466	valid_1's binary_logloss: 0.402686
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.510681	valid_1's binary_logloss: 0.528308
[20]	training's binary_logloss: 0.442558	valid_1's binary_logloss: 0.485009
[30]	training's binary_logloss: 0.396219	valid_1's binary_logloss: 0.453413
[40]	training's binary_l

[32m[I 2023-10-24 08:06:37,650][0m Trial 3 finished with value: 0.4230211708592668 and parameters: {'learning_rate': 0.05355180290989435, 'num_leaves': 41, 'max_depth': 7, 'colsample_bytree': 0.607076439104114, 'subsample': 0.8914169459417782}. Best is trial 0 with value: 0.40156446422610337.[0m
[32m[I 2023-10-24 08:06:37,724][0m Trial 4 finished with value: 0.40083802414665415 and parameters: {'learning_rate': 0.08890783754749253, 'num_leaves': 79, 'max_depth': 3, 'colsample_bytree': 0.9425164197814674, 'subsample': 0.8797054974758531}. Best is trial 4 with value: 0.40083802414665415.[0m


No further splits with positive gain, best gain: -inf
[100]	training's binary_logloss: 0.289923	valid_1's binary_logloss: 0.423979
[110]	training's binary_logloss: 0.282051	valid_1's binary_logloss: 0.423021
[120]	training's binary_logloss: 0.275082	valid_1's binary_logloss: 0.427031
Early stopping, best iteration is:
[110]	training's binary_logloss: 0.282051	valid_1's binary_logloss: 0.423021
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.465651	valid_1's binary_logloss: 0.46619
[20]	training's binar

[32m[I 2023-10-24 08:06:37,898][0m Trial 5 finished with value: 0.4219360186374458 and parameters: {'learning_rate': 0.0730739681126466, 'num_leaves': 51, 'max_depth': 6, 'colsample_bytree': 0.6413960490541266, 'subsample': 0.8239723574646333}. Best is trial 4 with value: 0.40083802414665415.[0m


[30]	training's binary_logloss: 0.367564	valid_1's binary_logloss: 0.43442
[40]	training's binary_logloss: 0.344484	valid_1's binary_logloss: 0.430853
[50]	training's binary_logloss: 0.326487	valid_1's binary_logloss: 0.428971
[60]	training's binary_logloss: 0.31193	valid_1's binary_logloss: 0.424324
[70]	training's binary_logloss: 0.300854	valid_1's binary_logloss: 0.423097
[80]	training's binary_logloss: 0.291749	valid_1's binary_logloss: 0.422069
Early stopping, best iteration is:
[77]	training's binary_logloss: 0.294212	valid_1's binary_logloss: 0.421936
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score

[32m[I 2023-10-24 08:06:38,064][0m Trial 6 finished with value: 0.41623955408346713 and parameters: {'learning_rate': 0.05716766437045232, 'num_leaves': 64, 'max_depth': 5, 'colsample_bytree': 0.7451316789966832, 'subsample': 0.6925944642366194}. Best is trial 4 with value: 0.40083802414665415.[0m
[32m[I 2023-10-24 08:06:38,171][0m Trial 7 finished with value: 0.4067806943181886 and parameters: {'learning_rate': 0.08871168447171084, 'num_leaves': 87, 'max_depth': 7, 'colsample_bytree': 0.7989518821040269, 'subsample': 0.6065764301527243}. Best is trial 4 with value: 0.40083802414665415.[0m


No further splits with positive gain, best gain: -inf
[50]	training's binary_logloss: 0.347204	valid_1's binary_logloss: 0.422871
[60]	training's binary_logloss: 0.335646	valid_1's binary_logloss: 0.420101
[70]	training's binary_logloss: 0.325436	valid_1's binary_logloss: 0.417255
[80]	training's binary_logloss: 0.317782	valid_1's binary_logloss: 0.417095
Early stopping, best iteration is:
[79]	training's binary_logloss: 0.318606	valid_1's binary_logloss: 0.41624
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_lo

[32m[I 2023-10-24 08:06:38,253][0m Trial 8 finished with value: 0.40621579245168005 and parameters: {'learning_rate': 0.08088177485379386, 'num_leaves': 70, 'max_depth': 3, 'colsample_bytree': 0.8159268989061649, 'subsample': 0.9303118274801184}. Best is trial 4 with value: 0.40083802414665415.[0m
[32m[I 2023-10-24 08:06:38,332][0m Trial 9 finished with value: 0.406287040848306 and parameters: {'learning_rate': 0.08409101495517418, 'num_leaves': 90, 'max_depth': 3, 'colsample_bytree': 0.9158220041108944, 'subsample': 0.6347481226256408}. Best is trial 4 with value: 0.40083802414665415.[0m



[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.488738	valid_1's binary_logloss: 0.493156
[20]	training's binary_logloss: 0.421115	valid_1's binary_logloss: 0.432204
[30]	training's binary_logloss: 0.392474	valid_1's binary_logloss: 0.408178
[40]	training's binary_logloss: 0.376935	valid_1's binary_logloss: 0.40683
Early stopping, best iteration is:
[32]	training's binary_logloss: 0.38842	valid_1's binary_logloss: 0.406216
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028

[32m[I 2023-10-24 08:06:38,470][0m Trial 10 finished with value: 0.41320852245738116 and parameters: {'learning_rate': 0.09849045338733745, 'num_leaves': 43, 'max_depth': 9, 'colsample_bytree': 0.7253532584393337, 'subsample': 0.8627401494264597}. Best is trial 4 with value: 0.40083802414665415.[0m


[20]	training's binary_logloss: 0.35537	valid_1's binary_logloss: 0.430622
[30]	training's binary_logloss: 0.315777	valid_1's binary_logloss: 0.417936
[40]	training's binary_logloss: 0.29009	valid_1's binary_logloss: 0.414551
[50]	training's binary_logloss: 0.271366	valid_1's binary_logloss: 0.415227
Early stopping, best iteration is:
[41]	training's binary_logloss: 0.287954	valid_1's binary_logloss: 0.413209
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.483483	valid_1's binary_logloss: 0.496361
[20]

[32m[I 2023-10-24 08:06:38,605][0m Trial 11 finished with value: 0.424852885516663 and parameters: {'learning_rate': 0.08039153343577339, 'num_leaves': 44, 'max_depth': 4, 'colsample_bytree': 0.6134488992654571, 'subsample': 0.8219959202850673}. Best is trial 4 with value: 0.40083802414665415.[0m
[32m[I 2023-10-24 08:06:38,733][0m Trial 12 finished with value: 0.41650196965827235 and parameters: {'learning_rate': 0.09794746343122603, 'num_leaves': 79, 'max_depth': 7, 'colsample_bytree': 0.6564583312597486, 'subsample': 0.8285879139128894}. Best is trial 4 with value: 0.40083802414665415.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.439566	valid_1's binary_logloss: 0.476853
[20]	training's binary_logloss: 0.361859	valid_1's binary_logloss: 0.433436
[30]	training's binary_logloss: 0.324198	valid_1's binary_logloss: 0.420101
[40]	training's binary_logloss: 0.301999	valid_1's binary_logloss: 0.419567
[50]	training's binary_logloss: 0.283398	valid_1's binary_logloss: 0.417195
Early stopping, best iteration is:
[42]	training's binary_logloss: 0.297534	valid_1's binary_logloss: 0.416502
[L

[32m[I 2023-10-24 08:06:38,922][0m Trial 13 finished with value: 0.4147304746092466 and parameters: {'learning_rate': 0.06266458012698911, 'num_leaves': 47, 'max_depth': 7, 'colsample_bytree': 0.685548957200561, 'subsample': 0.6556393542759319}. Best is trial 4 with value: 0.40083802414665415.[0m


No further splits with positive gain, best gain: -inf
[30]	training's binary_logloss: 0.367263	valid_1's binary_logloss: 0.435739
[40]	training's binary_logloss: 0.339358	valid_1's binary_logloss: 0.432126
[50]	training's binary_logloss: 0.318564	valid_1's binary_logloss: 0.423872
[60]	training's binary_logloss: 0.305353	valid_1's binary_logloss: 0.420524
[70]	training's binary_logloss: 0.294005	valid_1's binary_logloss: 0.416615
[80]	training's binary_logloss: 0.284073	valid_1's binary_logloss: 0.416589
Early stopping, best iteration is:
[77]	training's binary_logloss: 0.287631	valid_1's binary_logloss: 0.41473
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscor

[32m[I 2023-10-24 08:06:39,069][0m Trial 14 finished with value: 0.4076945901456776 and parameters: {'learning_rate': 0.05551875705821526, 'num_leaves': 117, 'max_depth': 3, 'colsample_bytree': 0.6483640329720148, 'subsample': 0.6688038265880187}. Best is trial 4 with value: 0.40083802414665415.[0m


No further splits with positive gain, best gain: -inf
[50]	training's binary_logloss: 0.394794	valid_1's binary_logloss: 0.415958
[60]	training's binary_logloss: 0.384534	valid_1's binary_logloss: 0.411505
[70]	training's binary_logloss: 0.376713	valid_1's binary_logloss: 0.409515
[80]	training's binary_logloss: 0.369484	valid_1's binary_logloss: 0.409105
Early stopping, best iteration is:
[76]	training's binary_logloss: 0.373373	valid_1's binary_logloss: 0.407695
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_l

[32m[I 2023-10-24 08:06:39,262][0m Trial 15 finished with value: 0.42547390184249956 and parameters: {'learning_rate': 0.06843625853304822, 'num_leaves': 101, 'max_depth': 9, 'colsample_bytree': 0.6339854465275714, 'subsample': 0.8932807176245814}. Best is trial 4 with value: 0.40083802414665415.[0m


No further splits with positive gain, best gain: -inf
[60]	training's binary_logloss: 0.299065	valid_1's binary_logloss: 0.426462
[70]	training's binary_logloss: 0.286457	valid_1's binary_logloss: 0.42708
Early stopping, best iteration is:
[67]	training's binary_logloss: 0.289772	valid_1's binary_logloss: 0.425474
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.506428	valid_1's binary_logloss: 0.527124
[20]	training's binary_logloss: 0.430554	valid_1's binary_logloss: 0.471063
[30]	training's binary_lo

[32m[I 2023-10-24 08:06:39,436][0m Trial 16 finished with value: 0.4164561763022766 and parameters: {'learning_rate': 0.054804920394698156, 'num_leaves': 125, 'max_depth': 6, 'colsample_bytree': 0.7640279205766956, 'subsample': 0.9418663808666179}. Best is trial 4 with value: 0.40083802414665415.[0m
[32m[I 2023-10-24 08:06:39,534][0m Trial 17 finished with value: 0.41281767028328203 and parameters: {'learning_rate': 0.08024227598725231, 'num_leaves': 105, 'max_depth': 3, 'colsample_bytree': 0.7545988224370959, 'subsample': 0.942855355084455}. Best is trial 4 with value: 0.40083802414665415.[0m


No further splits with positive gain, best gain: -inf
Early stopping, best iteration is:
[76]	training's binary_logloss: 0.308365	valid_1's binary_logloss: 0.416456
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.492331	valid_1's binary_logloss: 0.495663
[20]	training's binary_logloss: 0.432211	valid_1's binary_logloss: 0.440998
[30]	training's binary_logloss: 0.402154	valid_1's binary_logloss: 0.417816
[40]	training's binary_logloss: 0.384747	valid_1's binary_logloss: 0.412818
[50]	training's binary_l

[32m[I 2023-10-24 08:06:39,663][0m Trial 18 finished with value: 0.40337564997440994 and parameters: {'learning_rate': 0.06797222319846608, 'num_leaves': 91, 'max_depth': 5, 'colsample_bytree': 0.8410314139820196, 'subsample': 0.9081665612384084}. Best is trial 4 with value: 0.40083802414665415.[0m
[32m[I 2023-10-24 08:06:39,768][0m Trial 19 finished with value: 0.4041193295144094 and parameters: {'learning_rate': 0.09591177331810724, 'num_leaves': 99, 'max_depth': 5, 'colsample_bytree': 0.7978161033117064, 'subsample': 0.9027858964568847}. Best is trial 4 with value: 0.40083802414665415.[0m


[40]	training's binary_logloss: 0.346762	valid_1's binary_logloss: 0.404938
[50]	training's binary_logloss: 0.332037	valid_1's binary_logloss: 0.404559
Early stopping, best iteration is:
[44]	training's binary_logloss: 0.340465	valid_1's binary_logloss: 0.403376
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.445451	valid_1's binary_logloss: 0.478458
[20]	training's binary_logloss: 0.372274	valid_1's binary_logloss: 0.425652
[30]	training's binary_logloss: 0.342737	valid_1's binary_logloss: 0.405995
[4

[32m[I 2023-10-24 08:06:39,867][0m Trial 20 finished with value: 0.39787349400339006 and parameters: {'learning_rate': 0.07544844803335073, 'num_leaves': 46, 'max_depth': 4, 'colsample_bytree': 0.9224051635830199, 'subsample': 0.6290893724207108}. Best is trial 20 with value: 0.39787349400339006.[0m


No further splits with positive gain, best gain: -inf
[20]	training's binary_logloss: 0.410175	valid_1's binary_logloss: 0.431479
[30]	training's binary_logloss: 0.376533	valid_1's binary_logloss: 0.405863
[40]	training's binary_logloss: 0.358238	valid_1's binary_logloss: 0.398314
Early stopping, best iteration is:
[39]	training's binary_logloss: 0.359776	valid_1's binary_logloss: 0.397873
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.49463	valid_1's binary_logloss: 0.515471
[20]	training's binary_lo

[32m[I 2023-10-24 08:06:40,022][0m Trial 21 finished with value: 0.41330138489022966 and parameters: {'learning_rate': 0.06388592806405163, 'num_leaves': 43, 'max_depth': 5, 'colsample_bytree': 0.6461292518415372, 'subsample': 0.850714521441498}. Best is trial 20 with value: 0.39787349400339006.[0m
[32m[I 2023-10-24 08:06:40,159][0m Trial 22 finished with value: 0.39911763251944066 and parameters: {'learning_rate': 0.06447030464736006, 'num_leaves': 51, 'max_depth': 9, 'colsample_bytree': 0.8052795271835291, 'subsample': 0.6070376411656228}. Best is trial 20 with value: 0.39787349400339006.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.48172	valid_1's binary_logloss: 0.506811
[20]	training's binary_logloss: 0.39771	valid_1's binary_logloss: 0.44571
[30]	training's binary_logloss: 0.350117	valid_1's binary_logloss: 0.414103
[40]	training's binary_logloss: 0.322954	valid_1's binary_logloss: 0.402958
[50]	training's binary_logloss: 0.30206	valid_1's binary_logloss: 0.400674
[60]	training's binary_logloss: 0.286494	valid_1's binary_logloss: 0.404937
Early stopping, best iteration is:
[51]	t

[32m[I 2023-10-24 08:06:40,251][0m Trial 23 finished with value: 0.4045963695900828 and parameters: {'learning_rate': 0.09144700146086816, 'num_leaves': 62, 'max_depth': 3, 'colsample_bytree': 0.8372357878786805, 'subsample': 0.6945027906172576}. Best is trial 20 with value: 0.39787349400339006.[0m
[32m[I 2023-10-24 08:06:40,338][0m Trial 24 finished with value: 0.41191386015172743 and parameters: {'learning_rate': 0.08675970110612974, 'num_leaves': 45, 'max_depth': 3, 'colsample_bytree': 0.6870636002319853, 'subsample': 0.8016550670462429}. Best is trial 20 with value: 0.39787349400339006.[0m


No further splits with positive gain, best gain: -inf
[40]	training's binary_logloss: 0.369326	valid_1's binary_logloss: 0.406429
Early stopping, best iteration is:
[39]	training's binary_logloss: 0.371205	valid_1's binary_logloss: 0.404596
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.485094	valid_1's binary_logloss: 0.488796
[20]	training's binary_logloss: 0.425856	valid_1's binary_logloss: 0.436299
[30]	training's binary_logloss: 0.398228	valid_1's binary_logloss: 0.416141
[40]	training's binary_l

[32m[I 2023-10-24 08:06:40,443][0m Trial 25 finished with value: 0.4092609925559531 and parameters: {'learning_rate': 0.07960209656359196, 'num_leaves': 98, 'max_depth': 3, 'colsample_bytree': 0.6780785714242163, 'subsample': 0.9334621540309447}. Best is trial 20 with value: 0.39787349400339006.[0m
[32m[I 2023-10-24 08:06:40,570][0m Trial 26 finished with value: 0.3998070276073272 and parameters: {'learning_rate': 0.07235626893088137, 'num_leaves': 36, 'max_depth': 6, 'colsample_bytree': 0.8448177463611265, 'subsample': 0.7041029327992967}. Best is trial 20 with value: 0.39787349400339006.[0m


No further splits with positive gain, best gain: -inf
[60]	training's binary_logloss: 0.362273	valid_1's binary_logloss: 0.412939
Early stopping, best iteration is:
[50]	training's binary_logloss: 0.373237	valid_1's binary_logloss: 0.409261
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.471892	valid_1's binary_logloss: 0.496172
[20]	training's binary_logloss: 0.392132	valid_1's binary_logloss: 0.441137
[30]	training's binary_logloss: 0.351489	valid_1's binary_logloss: 0.412587
[40]	training's binary_l

[32m[I 2023-10-24 08:06:40,657][0m Trial 27 finished with value: 0.41245442418607986 and parameters: {'learning_rate': 0.09068989098512387, 'num_leaves': 107, 'max_depth': 3, 'colsample_bytree': 0.6874286964792423, 'subsample': 0.7096264161810828}. Best is trial 20 with value: 0.39787349400339006.[0m
[32m[I 2023-10-24 08:06:40,768][0m Trial 28 finished with value: 0.40656163003558293 and parameters: {'learning_rate': 0.09827081102599636, 'num_leaves': 112, 'max_depth': 7, 'colsample_bytree': 0.8308839442242945, 'subsample': 0.7866221888708269}. Best is trial 20 with value: 0.39787349400339006.[0m


No further splits with positive gain, best gain: -inf
[30]	training's binary_logloss: 0.395797	valid_1's binary_logloss: 0.415187
[40]	training's binary_logloss: 0.37751	valid_1's binary_logloss: 0.41397
Early stopping, best iteration is:
[33]	training's binary_logloss: 0.39013	valid_1's binary_logloss: 0.412454
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.434637	valid_1's binary_logloss: 0.476313
[20]	training's binary_logloss: 0.351961	valid_1's binary_logloss: 0.425474
[30]	training's binary_logl

[32m[I 2023-10-24 08:06:40,907][0m Trial 29 finished with value: 0.4079590035626215 and parameters: {'learning_rate': 0.0615266511753165, 'num_leaves': 64, 'max_depth': 7, 'colsample_bytree': 0.8122376249447735, 'subsample': 0.6067176194082667}. Best is trial 20 with value: 0.39787349400339006.[0m


No further splits with positive gain, best gain: -inf
[20]	training's binary_logloss: 0.406918	valid_1's binary_logloss: 0.452193
[30]	training's binary_logloss: 0.359457	valid_1's binary_logloss: 0.421119
[40]	training's binary_logloss: 0.334252	valid_1's binary_logloss: 0.410704
[50]	training's binary_logloss: 0.316642	valid_1's binary_logloss: 0.409909
Early stopping, best iteration is:
[44]	training's binary_logloss: 0.326541	valid_1's binary_logloss: 0.407959
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_l

[32m[I 2023-10-24 08:06:41,021][0m Trial 30 finished with value: 0.4100346989038031 and parameters: {'learning_rate': 0.06507874083372747, 'num_leaves': 45, 'max_depth': 3, 'colsample_bytree': 0.7342117787622318, 'subsample': 0.8622094430034536}. Best is trial 20 with value: 0.39787349400339006.[0m
[32m[I 2023-10-24 08:06:41,131][0m Trial 31 finished with value: 0.4209392918287468 and parameters: {'learning_rate': 0.0939226095138021, 'num_leaves': 111, 'max_depth': 6, 'colsample_bytree': 0.6323308613113143, 'subsample': 0.7239163317293336}. Best is trial 20 with value: 0.39787349400339006.[0m
[32m[I 2023-10-24 08:06:41,213][0m Trial 32 finished with value: 0.4034733663471659 and parameters: {'learning_rate': 0.07759081295666295, 'num_leaves': 76, 'max_depth': 3, 'colsample_bytree': 0.828236144500591, 'subsample': 0.7509964524018908}. Best is trial 20 with value: 0.39787349400339006.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.451292	valid_1's binary_logloss: 0.478669
[20]	training's binary_logloss: 0.383409	valid_1's binary_logloss: 0.446458
[30]	training's binary_logloss: 0.344922	valid_1's binary_logloss: 0.424436
[40]	training's binary_logloss: 0.324961	valid_1's binary_logloss: 0.424317
Early stopping, best iteration is:
[34]	training's binary_logloss: 0.335922	valid_1's binary_logloss: 0.420939
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set

[32m[I 2023-10-24 08:06:41,323][0m Trial 33 finished with value: 0.4022628452969863 and parameters: {'learning_rate': 0.09482732979255315, 'num_leaves': 108, 'max_depth': 9, 'colsample_bytree': 0.9444147844774512, 'subsample': 0.6911193246555916}. Best is trial 20 with value: 0.39787349400339006.[0m
[32m[I 2023-10-24 08:06:41,424][0m Trial 34 finished with value: 0.41373005287906794 and parameters: {'learning_rate': 0.076851126068789, 'num_leaves': 75, 'max_depth': 3, 'colsample_bytree': 0.6348491811887839, 'subsample': 0.7233090814201866}. Best is trial 20 with value: 0.39787349400339006.[0m


[binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.427519	valid_1's binary_logloss: 0.463983
[20]	training's binary_logloss: 0.350394	valid_1's binary_logloss: 0.414982
[30]	training's binary_logloss: 0.313557	valid_1's binary_logloss: 0.405162
Early stopping, best iteration is:
[28]	training's binary_logloss: 0.320492	valid_1's binary_logloss: 0.402263
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't

[32m[I 2023-10-24 08:06:41,543][0m Trial 35 finished with value: 0.4031795844344815 and parameters: {'learning_rate': 0.07346245869951211, 'num_leaves': 69, 'max_depth': 8, 'colsample_bytree': 0.9166267104733297, 'subsample': 0.6131457843398491}. Best is trial 20 with value: 0.39787349400339006.[0m


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.466976	valid_1's binary_logloss: 0.495848
[20]	training's binary_logloss: 0.383299	valid_1's binary_logloss: 0.440953
[30]	training's binary_logloss: 0.339701	valid_1's binary_logloss: 0.410657
[40]	training's binary_logloss: 0.316241	valid_1's binary_logloss: 0.404269
Early stopping, best iteration is:
[39]	training's binary_logloss: 0.318298	valid_1's binary_logloss: 0.40318
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028

[32m[I 2023-10-24 08:06:41,679][0m Trial 36 finished with value: 0.40370254932019023 and parameters: {'learning_rate': 0.07541577242894455, 'num_leaves': 121, 'max_depth': 8, 'colsample_bytree': 0.896802880262782, 'subsample': 0.8825616355537538}. Best is trial 20 with value: 0.39787349400339006.[0m
[32m[I 2023-10-24 08:06:41,769][0m Trial 37 finished with value: 0.39424180520712665 and parameters: {'learning_rate': 0.07845503693072967, 'num_leaves': 67, 'max_depth': 4, 'colsample_bytree': 0.9439012359388177, 'subsample': 0.7306517627741391}. Best is trial 37 with value: 0.39424180520712665.[0m


No further splits with positive gain, best gain: -inf
Early stopping, best iteration is:
[39]	training's binary_logloss: 0.316778	valid_1's binary_logloss: 0.403703
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.467421	valid_1's binary_logloss: 0.473012
[20]	training's binary_logloss: 0.397108	valid_1's binary_logloss: 0.410631
[30]	training's binary_logloss: 0.368908	valid_1's binary_logloss: 0.394242
[40]	training's binary_logloss: 0.350838	valid_1's binary_logloss: 0.397266
Early stopping, best ite

[32m[I 2023-10-24 08:06:41,886][0m Trial 38 finished with value: 0.4266295687756471 and parameters: {'learning_rate': 0.07100376848953054, 'num_leaves': 82, 'max_depth': 7, 'colsample_bytree': 0.7279236371921936, 'subsample': 0.6058197908068536}. Best is trial 37 with value: 0.39424180520712665.[0m
[32m[I 2023-10-24 08:06:42,019][0m Trial 39 finished with value: 0.40597352066340436 and parameters: {'learning_rate': 0.06153711677033979, 'num_leaves': 103, 'max_depth': 8, 'colsample_bytree': 0.8554966968484203, 'subsample': 0.6600703870415041}. Best is trial 37 with value: 0.39424180520712665.[0m


No further splits with positive gain, best gain: -inf
[40]	training's binary_logloss: 0.327009	valid_1's binary_logloss: 0.430539
Early stopping, best iteration is:
[34]	training's binary_logloss: 0.340526	valid_1's binary_logloss: 0.42663
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.487243	valid_1's binary_logloss: 0.510987
[20]	training's binary_logloss: 0.40465	valid_1's binary_logloss: 0.452046
[30]	training's binary_logloss: 0.356581	valid_1's binary_logloss: 0.420011
[40]	training's binary_log

In [12]:
params = {
    'objective': 'binary'
}

for i, j in study.best_params.items():
    params[i] = j

params

{'objective': 'binary',
 'learning_rate': 0.07845503693072967,
 'num_leaves': 67,
 'max_depth': 4,
 'colsample_bytree': 0.9439012359388177,
 'subsample': 0.7306517627741391}

## Model

In [13]:
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)
    
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10
)

[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.467421	valid_1's binary_logloss: 0.473012
[20]	training's binary_logloss: 0.397108	valid_1's binary_logloss: 0.410631
[30]	training's binary_logloss: 0.368908	valid_1's binary_logloss: 0.394242
[40]	training's binary_logloss: 0.350838	valid_1's binary_logloss: 0.397266
Early stopping, best iteration is:
[30]	training's binary_logloss: 0.368908	valid_1's binary_logloss: 0.394242




## Evaluate

In [14]:
y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
y_pred_valid = (y_pred_valid > 0.5).astype(int)

In [15]:
f1_score(y_valid, y_pred_valid, average='macro')

0.8040503557744937

## Submit

In [16]:
for col in categorical_features:
    test[col] = test[col].astype('category')

In [17]:
X_test = test[features]

In [18]:
y_pred_test = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_test = (y_pred_test > 0.5).astype(int)

In [19]:
submit = pd.DataFrame(y_pred_test, index=test['PassengerId'], columns=['Survived'])
submit.to_csv('submit.csv')