## Module

In [1]:
import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

## Dataset

In [2]:
train = pd.read_csv('../input/titanic/train.csv')
print(train.shape)
train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test = pd.read_csv('../input/titanic/test.csv')
print(test.shape)
test.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
submission = pd.read_csv('../input/titanic/gender_submission.csv')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## Info

In [5]:
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

## Preprocess

In [6]:
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked'
]

target = 'Survived'

In [7]:
categorical_features = ['Sex', 'SibSp', 'Parch', 'Embarked']

for col in categorical_features:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [8]:
X_train = train[features]
y_train = train[target]
X_test = test[features]

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(891, 7)
(891,)
(418, 7)


## Params

In [9]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, stratify=y_train, random_state=123)

print(X_tr.shape)
print(y_val.shape)
print(X_tr.shape)
print(y_val.shape)

(623, 7)
(268,)
(623, 7)
(268,)


In [10]:
def objective(trial):
    params = {
        'objective': 'binary',
        'metrics': 'binary_logloss',
        'verbose': 0,
        'seed': 71,
        'learning_rate': trial.suggest_uniform('learning_rate', 0.05, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 32, 128),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.95),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.95)
    }
    
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)
    
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=10,
        num_boost_round=1000,
        early_stopping_rounds=10
    )
    
    y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
    score = log_loss(y_val, y_pred_val)
    return score

In [11]:
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
study.optimize(objective, n_trials=40)

[32m[I 2023-10-27 03:27:27,129][0m A new study created in memory with name: no-name-e6bccc35-84a5-4817-a7fe-9784d7bf83d4[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.472263	valid_1's binary_logloss: 0.498624

[32m[I 2023-10-27 03:27:27,323][0m Trial 0 finished with value: 0.42831811240700346 and parameters: {'learning_rate': 0.07744067519636624, 'num_leaves': 96, 'max_depth': 6, 'colsample_bytree': 0.9002809661679648, 'subsample': 0.8965381085744438}. Best is trial 0 with value: 0.42831811240700346.[0m


No further splits with positive gain, best gain: -inf
[20]	training's binary_logloss: 0.396195	valid_1's binary_logloss: 0.452639
[30]	training's binary_logloss: 0.360638	valid_1's binary_logloss: 0.436248
[40]	training's binary_logloss: 0.335618	valid_1's binary_logloss: 0.430864
Early stopping, best iteration is:
[39]	training's binary_logloss: 0.338269	valid_1's binary_logloss: 0.428318
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds

[32m[I 2023-10-27 03:27:27,531][0m Trial 1 finished with value: 0.4377418181055332 and parameters: {'learning_rate': 0.08117818483929862, 'num_leaves': 68, 'max_depth': 9, 'colsample_bytree': 0.6198495420611051, 'subsample': 0.6954297031030396}. Best is trial 0 with value: 0.42831811240700346.[0m


No further splits with positive gain, best gain: -inf
[10]	training's binary_logloss: 0.503431	valid_1's binary_logloss: 0.538564
[20]	training's binary_logloss: 0.416823	valid_1's binary_logloss: 0.475954
[30]	training's binary_logloss: 0.371101	valid_1's binary_logloss: 0.452517
[40]	training's binary_logloss: 0.338742	valid_1's binary_logloss: 0.442239
[50]	training's binary_logloss: 0.315537	valid_1's binary_logloss: 0.440805
[60]	training's binary_logloss: 0.296566	valid_1's binary_logloss: 0.438874
Early stopping, best iteration is:
[54]	training's binary_logloss: 0.308216	valid_1's binary_logloss: 0.437742
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 10 rounds


[32m[I 2023-10-27 03:27:28,901][0m Trial 2 finished with value: 0.4313150192797928 and parameters: {'learning_rate': 0.0738832558660675, 'num_leaves': 71, 'max_depth': 9, 'colsample_bytree': 0.7988155963828762, 'subsample': 0.9239588234024313}. Best is trial 0 with value: 0.42831811240700346.[0m


[10]	training's binary_logloss: 0.475575	valid_1's binary_logloss: 0.504564
[20]	training's binary_logloss: 0.394276	valid_1's binary_logloss: 0.456591
[30]	training's binary_logloss: 0.353719	valid_1's binary_logloss: 0.44205
[40]	training's binary_logloss: 0.323648	valid_1's binary_logloss: 0.432191
[50]	training's binary_logloss: 0.30026	valid_1's binary_logloss: 0.437866
Early stopping, best iteration is:
[41]	training's binary_logloss: 0.321594	valid_1's binary_logloss: 0.431315
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.544876	valid_1's binary_logloss: 0.57009
[20]	training's binary_logloss: 0.465452	valid_1's binary_logloss: 0.509343
[30]	training's binary_logloss: 0.419427	valid_1's binary_logloss: 0.477158
[40]	training's binary_logloss: 0.389044	valid_1's binary_logloss: 0.460091
[50]	training's binary_lo

[32m[I 2023-10-27 03:27:29,052][0m Trial 3 finished with value: 0.4350344538963764 and parameters: {'learning_rate': 0.05355180290989435, 'num_leaves': 41, 'max_depth': 7, 'colsample_bytree': 0.607076439104114, 'subsample': 0.8914169459417782}. Best is trial 0 with value: 0.42831811240700346.[0m
[32m[I 2023-10-27 03:27:29,145][0m Trial 4 finished with value: 0.4249307320559762 and parameters: {'learning_rate': 0.08890783754749253, 'num_leaves': 79, 'max_depth': 3, 'colsample_bytree': 0.9425164197814674, 'subsample': 0.8797054974758531}. Best is trial 4 with value: 0.4249307320559762.[0m


You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.469615	valid_1's binary_logloss: 0.48395
[20]	training's binary_logloss: 0.418654	valid_1's binary_logloss: 0.444148
[30]	training's binary_logloss: 0.394233	valid_1's binary_logloss: 0.430609
[40]	training's binary_logloss: 0.379465	valid_1's binary_logloss: 0.425456
[50]	training's binary_logloss: 0.366023	valid_1's binary_logloss: 0.424987
[60]	training's binary_logloss: 0.350625	valid_1's binary_logloss: 0.426245
Early stopping, best iteration is:
[53]	training's binary_logloss: 0.361029	valid_1's binary_logloss: 0.424931
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.51576	valid_1's binary_logloss: 0.546064
[20]	training's binary_logloss: 0.432377	valid_1's binary

[32m[I 2023-10-27 03:27:29,281][0m Trial 5 finished with value: 0.4362702029503283 and parameters: {'learning_rate': 0.0730739681126466, 'num_leaves': 51, 'max_depth': 6, 'colsample_bytree': 0.6413960490541266, 'subsample': 0.8239723574646333}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:29,417][0m Trial 6 finished with value: 0.42978394515826746 and parameters: {'learning_rate': 0.05716766437045232, 'num_leaves': 64, 'max_depth': 5, 'colsample_bytree': 0.7451316789966832, 'subsample': 0.6925944642366194}. Best is trial 4 with value: 0.4249307320559762.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.526017	valid_1's binary_logloss: 0.545206
[20]	training's binary_logloss: 0.448526	valid_1's binary_logloss: 0.485027
[30]	training's binary_logloss: 0.404388	valid_1's binary_logloss: 0.453597
[40]	training's binary_logloss: 0.377592	valid_1's binary_logloss: 0.440667
[50]	training's binary_logloss: 0.360159	valid_1's binary_logloss: 0.43413
[60]	training's binary_logloss: 0.347449	valid_1's binary_logloss: 0.430267
[70]	training's binary_logloss: 0.333818	valid_1's binary_logloss: 0.4313
Early stopping, best iteration is:
[64]	training's binary_logloss: 0.342443	valid_1's binary_logloss: 0.429784
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 r

[32m[I 2023-10-27 03:27:29,515][0m Trial 7 finished with value: 0.43199040169820885 and parameters: {'learning_rate': 0.08871168447171084, 'num_leaves': 87, 'max_depth': 7, 'colsample_bytree': 0.7989518821040269, 'subsample': 0.6065764301527243}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:29,588][0m Trial 8 finished with value: 0.429721793680524 and parameters: {'learning_rate': 0.08088177485379386, 'num_leaves': 70, 'max_depth': 3, 'colsample_bytree': 0.8159268989061649, 'subsample': 0.9303118274801184}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:29,665][0m Trial 9 finished with value: 0.4289471358311631 and parameters: {'learning_rate': 0.08409101495517418, 'num_leaves': 90, 'max_depth': 3, 'colsample_bytree': 0.9158220041108944, 'subsample': 0.6347481226256408}. Best is trial 4 with value: 0.4249307320559762.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.490875	valid_1's binary_logloss: 0.502794
[20]	training's binary_logloss: 0.429517	valid_1's binary_logloss: 0.455302
[30]	training's binary_logloss: 0.403653	valid_1's binary_logloss: 0.435584
[40]	training's binary_logloss: 0.390801	valid_1's binary_logloss: 0.429868
Early stopping, best iteration is:
[39]	training's binary_logloss: 0.391793	valid_1's binary_logloss: 0.429722
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.487195	valid_1's binary_logloss: 0.49971
[20]	training's binary_logloss: 0.42652	valid_1's binary_logloss: 0.453967
[30]	training's binary_logloss: 0.402325	valid_1's binary_logloss: 0.

[32m[I 2023-10-27 03:27:29,771][0m Trial 10 finished with value: 0.4344228331858549 and parameters: {'learning_rate': 0.09849045338733745, 'num_leaves': 43, 'max_depth': 9, 'colsample_bytree': 0.7253532584393337, 'subsample': 0.8627401494264597}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:29,881][0m Trial 11 finished with value: 0.44159812117927244 and parameters: {'learning_rate': 0.08039153343577339, 'num_leaves': 44, 'max_depth': 4, 'colsample_bytree': 0.6134488992654571, 'subsample': 0.8219959202850673}. Best is trial 4 with value: 0.4249307320559762.[0m



[40]	training's binary_logloss: 0.303698	valid_1's binary_logloss: 0.434423
[50]	training's binary_logloss: 0.280749	valid_1's binary_logloss: 0.440591
Early stopping, best iteration is:
[40]	training's binary_logloss: 0.303698	valid_1's binary_logloss: 0.434423
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.515668	valid_1's binary_logloss: 0.541067
[20]	training's binary_logloss: 0.438014	valid_1's binary_logloss: 0.477058
[30]	training's binary_logloss: 0.400199	valid_1's binary_logloss: 0.45412
[40]	training's binary_logloss: 0.377674	valid_1's binary_logloss: 0.446405
[50]	training's binary_logloss: 0.361952	valid_1's binary_logloss: 0.443126
[60]	training's binary_logloss: 0.34933	valid_1's binary_logloss: 0.443171
Early stopping, best iteration is:
[58]	training's binary_logloss: 0.352065	valid_1's binary_loglos

[32m[I 2023-10-27 03:27:29,979][0m Trial 12 finished with value: 0.4317995215571732 and parameters: {'learning_rate': 0.09794746343122603, 'num_leaves': 79, 'max_depth': 7, 'colsample_bytree': 0.6564583312597486, 'subsample': 0.8285879139128894}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:30,121][0m Trial 13 finished with value: 0.4349604631419792 and parameters: {'learning_rate': 0.06266458012698911, 'num_leaves': 47, 'max_depth': 7, 'colsample_bytree': 0.685548957200561, 'subsample': 0.6556393542759319}. Best is trial 4 with value: 0.4249307320559762.[0m


No further splits with positive gain, best gain: -inf
[30]	training's binary_logloss: 0.341569	valid_1's binary_logloss: 0.433567
[40]	training's binary_logloss: 0.312293	valid_1's binary_logloss: 0.435767
Early stopping, best iteration is:
[33]	training's binary_logloss: 0.332874	valid_1's binary_logloss: 0.4318
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.512511	valid_1's binary_logloss: 0.537463
[20]	training's binary_logloss: 0.43104	valid_1's binary_logloss: 0.478788
[30]	training's binary_logloss: 0.38487	valid_1's binary_logloss: 0.450023
[40]	training's binary_logloss: 0.355453	valid_1's binary_logloss: 0.438613
[50]	training's binary_logloss: 0.335239	valid_1's binary_logloss: 0.437953
[60]	training's binary_logloss: 0.321894	valid_1's binary_logloss: 0.437705
Early stopping, best iteration is:
[54]	training

[32m[I 2023-10-27 03:27:30,227][0m Trial 14 finished with value: 0.430078103183474 and parameters: {'learning_rate': 0.05551875705821526, 'num_leaves': 117, 'max_depth': 3, 'colsample_bytree': 0.6483640329720148, 'subsample': 0.6688038265880187}. Best is trial 4 with value: 0.4249307320559762.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.542354	valid_1's binary_logloss: 0.551533
[20]	training's binary_logloss: 0.474487	valid_1's binary_logloss: 0.493675
[30]	training's binary_logloss: 0.435427	valid_1's binary_logloss: 0.458703
[40]	training's binary_logloss: 0.414029	valid_1's binary_logloss: 0.445867
[50]	training's binary_logloss: 0.400222	valid_1's binary_logloss: 0.438267
[60]	training's binary_logloss: 0.389836	valid_1's binary_logloss: 0.434115
[70]	training's binary_logloss: 0.38187	valid_1's binary_logloss: 0.430943
[80]	training's binary_logloss: 0.372495	valid_1's binary_logloss: 0.431073
Early stopping, best iteration is:
[73]	training's binary_logloss: 0.379502	valid_1's binary_logloss: 0.430078
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set 

[32m[I 2023-10-27 03:27:30,349][0m Trial 15 finished with value: 0.4373845975756149 and parameters: {'learning_rate': 0.06843625853304822, 'num_leaves': 101, 'max_depth': 9, 'colsample_bytree': 0.6339854465275714, 'subsample': 0.8932807176245814}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:30,473][0m Trial 16 finished with value: 0.4302673002646252 and parameters: {'learning_rate': 0.054804920394698156, 'num_leaves': 125, 'max_depth': 6, 'colsample_bytree': 0.7640279205766956, 'subsample': 0.9418663808666179}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:30,554][0m Trial 17 finished with value: 0.4292297602182267 and parameters: {'learning_rate': 0.08024227598725231, 'num_leaves': 105, 'max_depth': 3, 'colsample_bytree': 0.7545988224370959, 'subsample': 0.942855355084455}. Best is trial 4 with value: 0.4249307320559762.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.527423	valid_1's binary_logloss: 0.548115
[20]	training's binary_logloss: 0.448285	valid_1's binary_logloss: 0.48799
[30]	training's binary_logloss: 0.401054	valid_1's binary_logloss: 0.455739
[40]	training's binary_logloss: 0.372842	valid_1's binary_logloss: 0.442503
[50]	training's binary_logloss: 0.353061	valid_1's binary_logloss: 0.437484
[60]	training's binary_logloss: 0.339083	valid_1's binary_logloss: 0.431466
[70]	training's binary_logloss: 0.32741	valid_1's binary_logloss: 0.431078
Early stopping, best iteration is:
[65]	training's binary_logloss: 0.33319	valid_1's binary_logloss: 0.430267
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 r

[32m[I 2023-10-27 03:27:30,659][0m Trial 18 finished with value: 0.42886911474882633 and parameters: {'learning_rate': 0.06797222319846608, 'num_leaves': 91, 'max_depth': 5, 'colsample_bytree': 0.8410314139820196, 'subsample': 0.9081665612384084}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:30,755][0m Trial 19 finished with value: 0.4317608664333229 and parameters: {'learning_rate': 0.09591177331810724, 'num_leaves': 99, 'max_depth': 5, 'colsample_bytree': 0.7978161033117064, 'subsample': 0.9027858964568847}. Best is trial 4 with value: 0.4249307320559762.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.489174	valid_1's binary_logloss: 0.512008
[20]	training's binary_logloss: 0.413875	valid_1's binary_logloss: 0.460361
[30]	training's binary_logloss: 0.377625	valid_1's binary_logloss: 0.440391
[40]	training's binary_logloss: 0.354615	valid_1's binary_logloss: 0.431693
[50]	training's binary_logloss: 0.338865	valid_1's binary_logloss: 0.430393
[60]	training's binary_logloss: 0.322137	valid_1's binary_logloss: 0.43048
Early stopping, best iteration is:
[54]	training's binary_logloss: 0.33162	valid_1's binary_logloss: 0.428869
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.452584	valid_1's binary_logloss: 0.

[32m[I 2023-10-27 03:27:30,841][0m Trial 20 finished with value: 0.42542636791283706 and parameters: {'learning_rate': 0.07544844803335073, 'num_leaves': 46, 'max_depth': 4, 'colsample_bytree': 0.9224051635830199, 'subsample': 0.6290893724207108}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:30,962][0m Trial 21 finished with value: 0.4293348538877541 and parameters: {'learning_rate': 0.06388592806405163, 'num_leaves': 43, 'max_depth': 5, 'colsample_bytree': 0.6461292518415372, 'subsample': 0.850714521441498}. Best is trial 4 with value: 0.4249307320559762.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.485819	valid_1's binary_logloss: 0.502227
[20]	training's binary_logloss: 0.414958	valid_1's binary_logloss: 0.448511
[30]	training's binary_logloss: 0.383447	valid_1's binary_logloss: 0.431597
[40]	training's binary_logloss: 0.363623	valid_1's binary_logloss: 0.425426
[50]	training's binary_logloss: 0.347548	valid_1's binary_logloss: 0.426135
Early stopping, best iteration is:
[40]	training's binary_logloss: 0.363623	valid_1's binary_logloss: 0.425426
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.514985	valid_1's binary_logloss: 0.536326
[20]	training's binary_logloss: 0.436704	valid_1's binary_logloss: 0.476395
[30]	training's binary_logloss: 0.394126	valid_1's binary_logloss: 0.447165
[40]	training's binary_logloss: 0.369

[32m[I 2023-10-27 03:27:31,083][0m Trial 22 finished with value: 0.4332813619096118 and parameters: {'learning_rate': 0.06447030464736006, 'num_leaves': 51, 'max_depth': 9, 'colsample_bytree': 0.8052795271835291, 'subsample': 0.6070376411656228}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:31,161][0m Trial 23 finished with value: 0.43093245724279866 and parameters: {'learning_rate': 0.09144700146086816, 'num_leaves': 62, 'max_depth': 3, 'colsample_bytree': 0.8372357878786805, 'subsample': 0.6945027906172576}. Best is trial 4 with value: 0.4249307320559762.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.49092	valid_1's binary_logloss: 0.515285
[20]	training's binary_logloss: 0.408203	valid_1's binary_logloss: 0.46517
[30]	training's binary_logloss: 0.365962	valid_1's binary_logloss: 0.441981
[40]	training's binary_logloss: 0.336079	valid_1's binary_logloss: 0.434926
[50]	training's binary_logloss: 0.314298	valid_1's binary_logloss: 0.434088
Early stopping, best iteration is:
[42]	training's binary_logloss: 0.331932	valid_1's binary_logloss: 0.433281
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.479387	valid_1's binary_logloss: 0.492034
[20]	training's binary_logloss: 0.420749	valid_1's binary_logloss: 0.448313
[30]	training's binary_logloss: 0.39861	valid_1's binary_logloss: 0.432807
[40]	training's binary_logloss: 0.382087

[32m[I 2023-10-27 03:27:31,252][0m Trial 24 finished with value: 0.4296390998134703 and parameters: {'learning_rate': 0.08675970110612974, 'num_leaves': 45, 'max_depth': 3, 'colsample_bytree': 0.6870636002319853, 'subsample': 0.8016550670462429}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:31,352][0m Trial 25 finished with value: 0.42739792586806186 and parameters: {'learning_rate': 0.07960209656359196, 'num_leaves': 98, 'max_depth': 3, 'colsample_bytree': 0.6780785714242163, 'subsample': 0.9334621540309447}. Best is trial 4 with value: 0.4249307320559762.[0m


[10]	training's binary_logloss: 0.501663	valid_1's binary_logloss: 0.51562
[20]	training's binary_logloss: 0.434465	valid_1's binary_logloss: 0.462585
[30]	training's binary_logloss: 0.404414	valid_1's binary_logloss: 0.43664
[40]	training's binary_logloss: 0.389751	valid_1's binary_logloss: 0.430849
Early stopping, best iteration is:
[39]	training's binary_logloss: 0.392142	valid_1's binary_logloss: 0.429639
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.509968	valid_1's binary_logloss: 0.522449
[20]	training's binary_logloss: 0.44178	valid_1's binary_logloss: 0.468458
[30]	training's binary_logloss: 0.410077	valid_1's binary_logloss: 0.4404
[40]	training's binary_logloss: 0.394104	valid_1's binary_logloss: 0.4333
[50]	training's binary_logloss: 0.380969	valid_1's binary_logloss: 0.431595
[60]	training's binary_loglos

[32m[I 2023-10-27 03:27:31,477][0m Trial 26 finished with value: 0.4306755673788025 and parameters: {'learning_rate': 0.07235626893088137, 'num_leaves': 36, 'max_depth': 6, 'colsample_bytree': 0.8448177463611265, 'subsample': 0.7041029327992967}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:31,559][0m Trial 27 finished with value: 0.42869898301476655 and parameters: {'learning_rate': 0.09068989098512387, 'num_leaves': 107, 'max_depth': 3, 'colsample_bytree': 0.6874286964792423, 'subsample': 0.7096264161810828}. Best is trial 4 with value: 0.4249307320559762.[0m


[20]	training's binary_logloss: 0.40206	valid_1's binary_logloss: 0.456518
[30]	training's binary_logloss: 0.366884	valid_1's binary_logloss: 0.439992
[40]	training's binary_logloss: 0.341896	valid_1's binary_logloss: 0.433452
[50]	training's binary_logloss: 0.321669	valid_1's binary_logloss: 0.431983
[60]	training's binary_logloss: 0.303063	valid_1's binary_logloss: 0.432099
Early stopping, best iteration is:
[57]	training's binary_logloss: 0.308621	valid_1's binary_logloss: 0.430676
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.497503	valid_1's binary_logloss: 0.511889
[20]	training's binary_logloss: 0.431085	valid_1's binary_logloss: 0.459936
[30]	training's binary_logloss: 0.401989	valid_1's binary_logloss: 0.435561
[40]	training's binary_logloss: 0.387429	valid_1's binary_logloss: 0.429327
Early stopping, best it

[32m[I 2023-10-27 03:27:31,657][0m Trial 28 finished with value: 0.43135351165836744 and parameters: {'learning_rate': 0.09827081102599636, 'num_leaves': 112, 'max_depth': 7, 'colsample_bytree': 0.8308839442242945, 'subsample': 0.7866221888708269}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:31,769][0m Trial 29 finished with value: 0.4360140226551951 and parameters: {'learning_rate': 0.0615266511753165, 'num_leaves': 64, 'max_depth': 7, 'colsample_bytree': 0.8122376249447735, 'subsample': 0.6067176194082667}. Best is trial 4 with value: 0.4249307320559762.[0m


No further splits with positive gain, best gain: -inf
[20]	training's binary_logloss: 0.369168	valid_1's binary_logloss: 0.442662
[30]	training's binary_logloss: 0.333341	valid_1's binary_logloss: 0.434556
[40]	training's binary_logloss: 0.305205	valid_1's binary_logloss: 0.438257
Early stopping, best iteration is:
[32]	training's binary_logloss: 0.32812	valid_1's binary_logloss: 0.431354
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.496627	valid_1's binary_logloss: 0.519077
[20]	training's binary_logloss: 0.416096	valid_1's binary_logloss: 0.466068
[30]	training's binary_logloss: 0.375063	valid_1's binary_logloss: 0.443078
[40]	training's binary_logloss: 0.346995	valid_1's binary_logloss: 0.437907
[50]	training's binary_logloss: 0.325999	valid_1's binary_logloss: 0.438812
Early stopping, best iteration is:
[44]	train

[32m[I 2023-10-27 03:27:31,876][0m Trial 30 finished with value: 0.4312132179975152 and parameters: {'learning_rate': 0.06507874083372747, 'num_leaves': 45, 'max_depth': 3, 'colsample_bytree': 0.7342117787622318, 'subsample': 0.8622094430034536}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:31,987][0m Trial 31 finished with value: 0.43483329319172026 and parameters: {'learning_rate': 0.0939226095138021, 'num_leaves': 111, 'max_depth': 6, 'colsample_bytree': 0.6323308613113143, 'subsample': 0.7239163317293336}. Best is trial 4 with value: 0.4249307320559762.[0m


No further splits with positive gain, best gain: -inf
[10]	training's binary_logloss: 0.528387	valid_1's binary_logloss: 0.539189
[20]	training's binary_logloss: 0.459402	valid_1's binary_logloss: 0.481881
[30]	training's binary_logloss: 0.421825	valid_1's binary_logloss: 0.452801
[40]	training's binary_logloss: 0.40438	valid_1's binary_logloss: 0.438586
[50]	training's binary_logloss: 0.392218	valid_1's binary_logloss: 0.434608
[60]	training's binary_logloss: 0.382982	valid_1's binary_logloss: 0.432635
[70]	training's binary_logloss: 0.374659	valid_1's binary_logloss: 0.431213
[80]	training's binary_logloss: 0.36489	valid_1's binary_logloss: 0.432107
Early stopping, best iteration is:
[70]	training's binary_logloss: 0.374659	valid_1's binary_logloss: 0.431213
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.490164	valid

[32m[I 2023-10-27 03:27:32,087][0m Trial 32 finished with value: 0.42959340268327284 and parameters: {'learning_rate': 0.07759081295666295, 'num_leaves': 76, 'max_depth': 3, 'colsample_bytree': 0.828236144500591, 'subsample': 0.7509964524018908}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:32,196][0m Trial 33 finished with value: 0.43192228927859533 and parameters: {'learning_rate': 0.09482732979255315, 'num_leaves': 108, 'max_depth': 9, 'colsample_bytree': 0.9444147844774512, 'subsample': 0.6911193246555916}. Best is trial 4 with value: 0.4249307320559762.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.494773	valid_1's binary_logloss: 0.506332
[20]	training's binary_logloss: 0.432569	valid_1's binary_logloss: 0.458327
[30]	training's binary_logloss: 0.406413	valid_1's binary_logloss: 0.437863
[40]	training's binary_logloss: 0.394334	valid_1's binary_logloss: 0.431282
[50]	training's binary_logloss: 0.381209	valid_1's binary_logloss: 0.43135
[60]	training's binary_logloss: 0.367709	valid_1's binary_logloss: 0.432138
Early stopping, best iteration is:
[54]	training's binary_logloss: 0.375959	valid_1's binary_logloss: 0.429593
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.432679	valid_1's binary_logloss: 0.476477
[20]	training's binary_logloss: 0.360616	valid_1's binary_logloss: 0.444248
[30]	training's binary_logloss: 0.3207

[32m[I 2023-10-27 03:27:32,311][0m Trial 34 finished with value: 0.43339793983560143 and parameters: {'learning_rate': 0.076851126068789, 'num_leaves': 75, 'max_depth': 3, 'colsample_bytree': 0.6348491811887839, 'subsample': 0.7233090814201866}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:32,416][0m Trial 35 finished with value: 0.43125846922812106 and parameters: {'learning_rate': 0.07346245869951211, 'num_leaves': 69, 'max_depth': 8, 'colsample_bytree': 0.9166267104733297, 'subsample': 0.6131457843398491}. Best is trial 4 with value: 0.4249307320559762.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.528859	valid_1's binary_logloss: 0.545064
[20]	training's binary_logloss: 0.457402	valid_1's binary_logloss: 0.483555
[30]	training's binary_logloss: 0.422881	valid_1's binary_logloss: 0.456224
[40]	training's binary_logloss: 0.403932	valid_1's binary_logloss: 0.444354
[50]	training's binary_logloss: 0.387188	valid_1's binary_logloss: 0.440304
[60]	training's binary_logloss: 0.376391	valid_1's binary_logloss: 0.436977
[70]	training's binary_logloss: 0.366359	valid_1's binary_logloss: 0.435024
[80]	training's binary_logloss: 0.360887	valid_1's binary_logloss: 0.434128
Early stopping, best iteration is:
[73]	training's binary_logloss: 0.364475	valid_1's binary_logloss: 0.433398
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.476

[32m[I 2023-10-27 03:27:32,529][0m Trial 36 finished with value: 0.43131350339948277 and parameters: {'learning_rate': 0.07541577242894455, 'num_leaves': 121, 'max_depth': 8, 'colsample_bytree': 0.896802880262782, 'subsample': 0.8825616355537538}. Best is trial 4 with value: 0.4249307320559762.[0m
[32m[I 2023-10-27 03:27:32,629][0m Trial 37 finished with value: 0.42043194243489046 and parameters: {'learning_rate': 0.07845503693072967, 'num_leaves': 67, 'max_depth': 4, 'colsample_bytree': 0.9439012359388177, 'subsample': 0.7306517627741391}. Best is trial 37 with value: 0.42043194243489046.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.473165	valid_1's binary_logloss: 0.501197
[20]	training's binary_logloss: 0.393088	valid_1's binary_logloss: 0.455428
[30]	training's binary_logloss: 0.353973	valid_1's binary_logloss: 0.440175
[40]	training's binary_logloss: 0.325877	valid_1's binary_logloss: 0.434493
[50]	training's binary_logloss: 0.303535	valid_1's binary_logloss: 0.43664
Early stopping, best iteration is:
[43]	training's binary_logloss: 0.319676	valid_1's binary_logloss: 0.431314
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.469027	valid_1's binary_logloss: 0.484627
[20]	training's binary_logloss: 0.408849	valid_1's binary_logloss: 0.441896
[30]	training's binary_logloss: 0.380382	valid_1's binary_logloss: 0.433417
[40]	training's binary_logloss: 0.3594

[32m[I 2023-10-27 03:27:32,748][0m Trial 38 finished with value: 0.42903912175490905 and parameters: {'learning_rate': 0.07100376848953054, 'num_leaves': 82, 'max_depth': 7, 'colsample_bytree': 0.7279236371921936, 'subsample': 0.6058197908068536}. Best is trial 37 with value: 0.42043194243489046.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.499563	valid_1's binary_logloss: 0.526537
[20]	training's binary_logloss: 0.41863	valid_1's binary_logloss: 0.469175
[30]	training's binary_logloss: 0.373023	valid_1's binary_logloss: 0.441968
[40]	training's binary_logloss: 0.344956	valid_1's binary_logloss: 0.432567
[50]	training's binary_logloss: 0.324753	valid_1's binary_logloss: 0.430176
[60]	training's binary_logloss: 0.30884	valid_1's binary_logloss: 0.432987
Early stopping, best iteration is:
[55]	training's binary_logloss: 0.317764	valid_1's binary_logloss: 0.429039
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.495889	valid_1's binary_logloss: 0.519005
[20]	training's binary_logloss: 0.413986	valid_1's binary_logloss: 0.468302
[30]	training's binary_logloss: 0.37193

[32m[I 2023-10-27 03:27:32,870][0m Trial 39 finished with value: 0.4323948020876845 and parameters: {'learning_rate': 0.06153711677033979, 'num_leaves': 103, 'max_depth': 8, 'colsample_bytree': 0.8554966968484203, 'subsample': 0.6600703870415041}. Best is trial 37 with value: 0.42043194243489046.[0m


In [12]:
params = {
    'objective': 'binary',
    'metrics': 'binary_logloss',
    'verbose': 0,
    'seed': 71
}

for i, j in study.best_params.items():
    params[i] = j

params

{'objective': 'binary',
 'metrics': 'binary_logloss',
 'verbose': 0,
 'seed': 71,
 'learning_rate': 0.07845503693072967,
 'num_leaves': 67,
 'max_depth': 4,
 'colsample_bytree': 0.9439012359388177,
 'subsample': 0.7306517627741391}

## Model

In [13]:
scores = []
y_preds_test = []

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=456)

for tr_idx, val_idx in kf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)
    
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=10,
        num_boost_round=1000,
        early_stopping_rounds=10
    )
    
    y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred_val = (y_pred_val > 0.5).astype(int)
    score = f1_score(y_val, y_pred_val, average='macro')
    scores.append(score)
    
    y_pred_test = model.predict(X_test, num_iteration=model.best_iteration)
    y_preds_test.append(y_pred_test)

y_preds_test = np.mean(y_preds_test, axis=0)
y_preds_test = (y_preds_test > 0.5).astype(int)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.474589	valid_1's binary_logloss: 0.471442
[20]	training's binary_logloss: 0.415452	valid_1's binary_logloss: 0.415623
[30]	training's binary_logloss: 0.387418	valid_1's binary_logloss: 0.3924
[40]	training's binary_logloss: 0.367716	valid_1's binary_logloss: 0.37639
[50]	training's binary_logloss: 0.348471	valid_1's binary_logloss: 0.364311
[60]	training's binary_logloss: 0.33319	valid_1's binary_logloss: 0.359127
[70]	training's binary_logloss: 0.322953	valid_1's binary_logloss: 0.35326
[80]	training's binary_logloss: 0.313104	valid_1's binary_logloss: 0.350679
[90]	training's binary_logloss: 0.304523	valid_1's binary_logloss: 0.34781
[100]	training's binary_logloss: 0.294805	valid_1's binary_logloss: 0.347412
[110]	training's binary_logloss: 0.286279	valid_1's binary_log

In [14]:
print('===CV scores===')
print(scores)
print(np.mean(scores))

===CV scores===
[0.8688033049040511, 0.7682291666666667, 0.830360789652825, 0.7926260964912281, 0.7161084529505584]
0.7952255621330659


## Submit

In [15]:
submit = pd.DataFrame(y_preds_test, index=test['PassengerId'], columns=['Survived'])
submit.to_csv('submit.csv')