## Module

In [1]:
import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

## Dataset

In [2]:
train = pd.read_csv('../input/titanic/train.csv')
print(train.shape)
train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test = pd.read_csv('../input/titanic/test.csv')
print(test.shape)
test.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
submission = pd.read_csv('../input/titanic/gender_submission.csv')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## Info

In [5]:
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

## Preprocess

In [6]:
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked'
]

target = 'Survived'

In [7]:
categorical_features = ['Sex', 'Embarked']

for col in categorical_features:
    train[col] = train[col].astype('category')

In [8]:
X_train = train[features]
y_train = train[target]
print(X_train.shape)
print(y_train.shape)

(891, 7)
(891,)


In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=123)

print(X_train.shape)
print(y_valid.shape)
print(X_train.shape)
print(y_valid.shape)

(712, 7)
(179,)
(712, 7)
(179,)


## Params

In [10]:
def objective(trial):
    params = {
        'objective': 'binary',
        'learning_rate': trial.suggest_uniform('learning_rate', 0.05, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 32, 128),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'min_child_weight': trial.suggest_uniform('min_child_weight', 0.1, 10),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.95),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.95)
    }
    
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)
    
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=10,
        num_boost_round=1000,
        early_stopping_rounds=10
    )
    
    y_pred_valid = model.predict(X_valid, num_iteraition=model.best_iteration)
    score = log_loss(y_valid, y_pred_valid)
    return score

In [11]:
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
study.optimize(objective, n_trials=40)

[32m[I 2023-10-24 07:38:05,923][0m A new study created in memory with name: no-name-40dd0592-3378-489c-aa7e-ba053b674919[0m
[32m[I 2023-10-24 07:38:06,123][0m Trial 0 finished with value: 0.41925370688760233 and parameters: {'learning_rate': 0.07744067519636624, 'num_leaves': 96, 'max_depth': 6, 'min_child_weight': 8.593661614465292, 'colsample_bytree': 0.8965381085744438, 'subsample': 0.8182472938750903}. Best is trial 0 with value: 0.41925370688760233.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.487561	valid_1's binary_logloss: 0.49683
[20]	training's binary_logloss: 0.423575	valid_1's binary_logloss: 0.445045
[30]	training's binary_logloss: 0.393816	valid_1's binary_logloss: 0.424092
[40]	training's binary_logloss: 0.377427	valid_1's binary_logloss: 0.420263
[50]	training's binary_logloss: 0.366581	valid_1's binary_logloss: 0.420217
Early stopping, best iteration is:
[43]	training's binary_logloss: 0.37385	valid_1's binary_logloss: 0.419254
[Lig

[32m[I 2023-10-24 07:38:06,292][0m Trial 1 finished with value: 0.4165414642345387 and parameters: {'learning_rate': 0.069219085364635, 'num_leaves': 119, 'max_depth': 9, 'min_child_weight': 0.6614584754426874, 'colsample_bytree': 0.6954297031030396, 'subsample': 0.7671827910624724}. Best is trial 1 with value: 0.4165414642345387.[0m
[32m[I 2023-10-24 07:38:06,377][0m Trial 2 finished with value: 0.4071961899570929 and parameters: {'learning_rate': 0.09060843643877467, 'num_leaves': 78, 'max_depth': 3, 'min_child_weight': 3.9885694813982147, 'colsample_bytree': 0.8926275672380821, 'subsample': 0.7180886561460439}. Best is trial 2 with value: 0.4071961899570929.[0m


No further splits with positive gain, best gain: -inf
[30]	training's binary_logloss: 0.35042	valid_1's binary_logloss: 0.428552
[40]	training's binary_logloss: 0.322393	valid_1's binary_logloss: 0.426548
[50]	training's binary_logloss: 0.300367	valid_1's binary_logloss: 0.421309
[60]	training's binary_logloss: 0.285732	valid_1's binary_logloss: 0.421751
[70]	training's binary_logloss: 0.272405	valid_1's binary_logloss: 0.418796
Early stopping, best iteration is:
[65]	training's binary_logloss: 0.278358	valid_1's binary_logloss: 0.416541
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training u

[32m[I 2023-10-24 07:38:06,487][0m Trial 3 finished with value: 0.41866136386505043 and parameters: {'learning_rate': 0.08240859360255987, 'num_leaves': 112, 'max_depth': 6, 'min_child_weight': 9.575836073635159, 'colsample_bytree': 0.6491227731444258, 'subsample': 0.9045305404254527}. Best is trial 2 with value: 0.4071961899570929.[0m


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.488254	valid_1's binary_logloss: 0.494348
[20]	training's binary_logloss: 0.428372	valid_1's binary_logloss: 0.444876
[30]	training's binary_logloss: 0.400906	valid_1's binary_logloss: 0.427163
[40]	training's binary_logloss: 0.384984	valid_1's binary_logloss: 0.421057
[50]	training's binary_logloss: 0.373212	valid_1's binary_logloss: 0.420512
Early stopping, best iteration is:
[47]	training's binary_logloss: 0.375986	valid_1's binary_logloss: 0.418661
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[L

[32m[I 2023-10-24 07:38:06,607][0m Trial 4 finished with value: 0.4094404145082083 and parameters: {'learning_rate': 0.07368040226368552, 'num_leaves': 81, 'max_depth': 6, 'min_child_weight': 5.252727047556927, 'colsample_bytree': 0.8376078355416361, 'subsample': 0.8522214291540708}. Best is trial 2 with value: 0.4071961899570929.[0m
[32m[I 2023-10-24 07:38:06,706][0m Trial 5 finished with value: 0.41314852373872335 and parameters: {'learning_rate': 0.07910098960375536, 'num_leaves': 89, 'max_depth': 3, 'min_child_weight': 5.266298385325709, 'colsample_bytree': 0.7451316789966832, 'subsample': 0.6925944642366194}. Best is trial 2 with value: 0.4071961899570929.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.49433	valid_1's binary_logloss: 0.496491
[20]	training's binary_logloss: 0.434116	valid_1's binary_logloss: 0.441024
[30]	training's binary_logloss: 0.405079	valid_1's binary_logloss: 0.418526
[40]	training's binary_logloss: 0.38998	valid_1's binary_logloss: 0.414972
[50]	training's binary_logloss: 0.378675	valid_1's binary_logloss: 0.41384
Early stopping, best iteration is:
[45]	training's binary_logloss: 0.384846	valid_1's binary_logloss: 0.413149
[Ligh

[32m[I 2023-10-24 07:38:06,836][0m Trial 6 finished with value: 0.41362007849160165 and parameters: {'learning_rate': 0.08871168447171084, 'num_leaves': 87, 'max_depth': 7, 'min_child_weight': 5.72749609379962, 'colsample_bytree': 0.6065764301527243, 'subsample': 0.8161724239765569}. Best is trial 2 with value: 0.4071961899570929.[0m
[32m[I 2023-10-24 07:38:06,946][0m Trial 7 finished with value: 0.4137253446092469 and parameters: {'learning_rate': 0.08060478613612107, 'num_leaves': 49, 'max_depth': 7, 'min_child_weight': 6.8500209611244856, 'colsample_bytree': 0.725827765200825, 'subsample': 0.7529611838297695}. Best is trial 2 with value: 0.4071961899570929.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.477963	valid_1's binary_logloss: 0.493198
[20]	training's binary_logloss: 0.408793	valid_1's binary_logloss: 0.439718
[30]	training's binary_logloss: 0.378097	valid_1's binary_logloss: 0.417119
[40]	training's binary_logloss: 0.359914	valid_1's binary_logloss: 0.414446
[50]	training's binary_logloss: 0.347761	valid_1's binary_logloss: 0.414534
Early stopping, best iteration is:
[41]	training's binary_logloss: 0.358117	valid_1's binary_logloss: 0.413725
[L

[32m[I 2023-10-24 07:38:07,061][0m Trial 8 finished with value: 0.4098327116339549 and parameters: {'learning_rate': 0.08488155979636325, 'num_leaves': 73, 'max_depth': 4, 'min_child_weight': 6.70099048291211, 'colsample_bytree': 0.8347232543663557, 'subsample': 0.6736338963758443}. Best is trial 2 with value: 0.4071961899570929.[0m
[32m[I 2023-10-24 07:38:07,195][0m Trial 9 finished with value: 0.411007102932985 and parameters: {'learning_rate': 0.05644631488274267, 'num_leaves': 46, 'max_depth': 6, 'min_child_weight': 3.700736632331964, 'colsample_bytree': 0.7995688696462578, 'subsample': 0.7535105297118121}. Best is trial 2 with value: 0.4071961899570929.[0m


No further splits with positive gain, best gain: -inf
Early stopping, best iteration is:
[32]	training's binary_logloss: 0.378053	valid_1's binary_logloss: 0.409833
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.498929	valid_1's binary_logloss: 0.518938
[20]	training's binary_logloss: 0.420973	valid_1's binary_logloss: 0.458474
[30]	training's binary_logloss: 0.374457	valid_1's binary_logloss: 0.425952
[40]	training's binary_logloss: 0.350257	valid_1's binary_logloss: 0.414526
[50]	training's binary_l

[32m[I 2023-10-24 07:38:07,308][0m Trial 10 finished with value: 0.40708326406256384 and parameters: {'learning_rate': 0.09941869190296132, 'num_leaves': 100, 'max_depth': 7, 'min_child_weight': 1.696964227061463, 'colsample_bytree': 0.8285879139128894, 'subsample': 0.6886520608889237}. Best is trial 10 with value: 0.40708326406256384.[0m
[32m[I 2023-10-24 07:38:07,425][0m Trial 11 finished with value: 0.4125458679492675 and parameters: {'learning_rate': 0.07331553864281531, 'num_leaves': 90, 'max_depth': 8, 'min_child_weight': 6.780047990364305, 'colsample_bytree': 0.7110206097242536, 'subsample': 0.8724209187090681}. Best is trial 10 with value: 0.40708326406256384.[0m


[10]	training's binary_logloss: 0.433375	valid_1's binary_logloss: 0.474448
[20]	training's binary_logloss: 0.353309	valid_1's binary_logloss: 0.424536
[30]	training's binary_logloss: 0.32251	valid_1's binary_logloss: 0.408884
[40]	training's binary_logloss: 0.301408	valid_1's binary_logloss: 0.408842
Early stopping, best iteration is:
[34]	training's binary_logloss: 0.312463	valid_1's binary_logloss: 0.407083
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.487456	valid_1's binary_logloss: 0.501626
[20

[32m[I 2023-10-24 07:38:07,536][0m Trial 12 finished with value: 0.40208711717628304 and parameters: {'learning_rate': 0.0974785526725371, 'num_leaves': 127, 'max_depth': 8, 'min_child_weight': 0.23435919255988735, 'colsample_bytree': 0.8179961334413343, 'subsample': 0.8357808707925263}. Best is trial 12 with value: 0.40208711717628304.[0m
[32m[I 2023-10-24 07:38:07,640][0m Trial 13 finished with value: 0.421208282183147 and parameters: {'learning_rate': 0.0985972501249833, 'num_leaves': 35, 'max_depth': 5, 'min_child_weight': 5.145281329527011, 'colsample_bytree': 0.6195001427955622, 'subsample': 0.7579057250823248}. Best is trial 12 with value: 0.40208711717628304.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.434183	valid_1's binary_logloss: 0.475957
[20]	training's binary_logloss: 0.350013	valid_1's binary_logloss: 0.42316
[30]	training's binary_logloss: 0.315023	valid_1's binary_logloss: 0.404772
[40]	training's binary_logloss: 0.293528	valid_1's binary_logloss: 0.407443
Early stopping, best iteration is:
[33]	training's binary_logloss: 0.306808	valid_1's binary_logloss: 0.402087
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set 

[32m[I 2023-10-24 07:38:07,785][0m Trial 14 finished with value: 0.42116692874701467 and parameters: {'learning_rate': 0.05099938327043794, 'num_leaves': 42, 'max_depth': 5, 'min_child_weight': 9.797908615246012, 'colsample_bytree': 0.7258055623892625, 'subsample': 0.7683127357926569}. Best is trial 12 with value: 0.40208711717628304.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.534235	valid_1's binary_logloss: 0.535641
[20]	training's binary_logloss: 0.472213	valid_1's binary_logloss: 0.47882
[30]	training's binary_logloss: 0.436988	valid_1's binary_logloss: 0.448906
[40]	training's binary_logloss: 0.417048	valid_1's binary_logloss: 0.435031
[50]	training's binary_logloss: 0.402713	valid_1's binary_logloss: 0.429309
[60]	training's binary_logloss: 0.392415	valid_1's binary_logloss: 0.425045
[70]	training's binary_logloss: 0.384466	valid_1's binary_logloss: 0.421517
[80]	training's binary_log

[32m[I 2023-10-24 07:38:07,899][0m Trial 15 finished with value: 0.4205128269940967 and parameters: {'learning_rate': 0.08443305914028852, 'num_leaves': 94, 'max_depth': 6, 'min_child_weight': 9.190531116985232, 'colsample_bytree': 0.6758877481696399, 'subsample': 0.7978161033117064}. Best is trial 12 with value: 0.40208711717628304.[0m
[32m[I 2023-10-24 07:38:07,992][0m Trial 16 finished with value: 0.42021412304392197 and parameters: {'learning_rate': 0.09325512806526925, 'num_leaves': 82, 'max_depth': 6, 'min_child_weight': 9.175557244792662, 'colsample_bytree': 0.9224051635830199, 'subsample': 0.6290893724207108}. Best is trial 12 with value: 0.40208711717628304.[0m


No further splits with positive gain, best gain: -inf
[40]	training's binary_logloss: 0.380091	valid_1's binary_logloss: 0.421179
[50]	training's binary_logloss: 0.368422	valid_1's binary_logloss: 0.421813
Early stopping, best iteration is:
[41]	training's binary_logloss: 0.378553	valid_1's binary_logloss: 0.420513
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.47085	valid_1's binary_logloss: 0.481697
[20]	training's binary_logloss: 0.411558	valid_1's binary_logloss: 0.435382
[30]	training's binary_lo

[32m[I 2023-10-24 07:38:08,133][0m Trial 17 finished with value: 0.4026775894060062 and parameters: {'learning_rate': 0.06388592806405163, 'num_leaves': 43, 'max_depth': 5, 'min_child_weight': 1.4047988378034826, 'colsample_bytree': 0.850714521441498, 'subsample': 0.7012921325315203}. Best is trial 12 with value: 0.40208711717628304.[0m
[32m[I 2023-10-24 07:38:08,256][0m Trial 18 finished with value: 0.40660486013226504 and parameters: {'learning_rate': 0.059159568100355844, 'num_leaves': 74, 'max_depth': 6, 'min_child_weight': 0.29906470725618617, 'colsample_bytree': 0.890129010226077, 'subsample': 0.6016434166673914}. Best is trial 12 with value: 0.40208711717628304.[0m


No further splits with positive gain, best gain: -inf
[30]	training's binary_logloss: 0.371552	valid_1's binary_logloss: 0.418185
[40]	training's binary_logloss: 0.348937	valid_1's binary_logloss: 0.407291
[50]	training's binary_logloss: 0.333942	valid_1's binary_logloss: 0.403326
[60]	training's binary_logloss: 0.322979	valid_1's binary_logloss: 0.40528
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.33265	valid_1's binary_logloss: 0.402678
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_log

[32m[I 2023-10-24 07:38:08,362][0m Trial 19 finished with value: 0.4055958186516944 and parameters: {'learning_rate': 0.0838908268398115, 'num_leaves': 35, 'max_depth': 9, 'min_child_weight': 7.378420819013689, 'colsample_bytree': 0.9367659907911033, 'subsample': 0.6870636002319853}. Best is trial 12 with value: 0.40208711717628304.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.461779	valid_1's binary_logloss: 0.474349
[20]	training's binary_logloss: 0.400039	valid_1's binary_logloss: 0.422872
[30]	training's binary_logloss: 0.370496	valid_1's binary_logloss: 0.408066
[40]	training's binary_logloss: 0.355049	valid_1's binary_logloss: 0.407478
Early stopping, best iteration is:
[37]	training's binary_logloss: 0.359434	valid_1's binary_logloss: 0.405596
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set

[32m[I 2023-10-24 07:38:08,482][0m Trial 20 finished with value: 0.413539084275825 and parameters: {'learning_rate': 0.07880786672089185, 'num_leaves': 104, 'max_depth': 5, 'min_child_weight': 5.765293867329646, 'colsample_bytree': 0.6780785714242163, 'subsample': 0.9334621540309447}. Best is trial 12 with value: 0.40208711717628304.[0m
[32m[I 2023-10-24 07:38:08,595][0m Trial 21 finished with value: 0.41294173474337437 and parameters: {'learning_rate': 0.07235626893088137, 'num_leaves': 36, 'max_depth': 6, 'min_child_weight': 7.024844825643292, 'colsample_bytree': 0.7041029327992967, 'subsample': 0.884829236895867}. Best is trial 12 with value: 0.40208711717628304.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.490446	valid_1's binary_logloss: 0.502825
[20]	training's binary_logloss: 0.421417	valid_1's binary_logloss: 0.4464
[30]	training's binary_logloss: 0.387426	valid_1's binary_logloss: 0.423057
[40]	training's binary_logloss: 0.37023	valid_1's binary_logloss: 0.415412
[50]	training's binary_logloss: 0.357428	valid_1's binary_logloss: 0.413577
Early stopping, best iteration is:
[49]	training's binary_logloss: 0.358269	valid_1's binary_logloss: 0.412942
[Ligh

[32m[I 2023-10-24 07:38:08,705][0m Trial 22 finished with value: 0.4082288066352989 and parameters: {'learning_rate': 0.06982528704234924, 'num_leaves': 88, 'max_depth': 3, 'min_child_weight': 5.854601439095001, 'colsample_bytree': 0.9086073766491984, 'subsample': 0.842386056527218}. Best is trial 12 with value: 0.40208711717628304.[0m
[32m[I 2023-10-24 07:38:08,803][0m Trial 23 finished with value: 0.4120084550923528 and parameters: {'learning_rate': 0.08626271399098204, 'num_leaves': 93, 'max_depth': 6, 'min_child_weight': 6.47550297237341, 'colsample_bytree': 0.7483492669953629, 'subsample': 0.8122376249447735}. Best is trial 12 with value: 0.40208711717628304.[0m


No further splits with positive gain, best gain: -inf
Early stopping, best iteration is:
[57]	training's binary_logloss: 0.37656	valid_1's binary_logloss: 0.408229
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.468626	valid_1's binary_logloss: 0.487453
[20]	training's binary_logloss: 0.399971	valid_1's binary_logloss: 0.436068
[30]	training's binary_logloss: 0.370589	valid_1's binary_logloss: 0.417722
[40]	training's binary_logloss: 0.355754	valid_1's binary_logloss: 0.415082
Early stopping, best iter

[32m[I 2023-10-24 07:38:08,986][0m Trial 24 finished with value: 0.4217675987745577 and parameters: {'learning_rate': 0.05095965991546668, 'num_leaves': 117, 'max_depth': 8, 'min_child_weight': 2.9511109431964773, 'colsample_bytree': 0.7342117787622318, 'subsample': 0.8622094430034536}. Best is trial 12 with value: 0.40208711717628304.[0m


No further splits with positive gain, best gain: -inf
[40]	training's binary_logloss: 0.358473	valid_1's binary_logloss: 0.43561
[50]	training's binary_logloss: 0.335501	valid_1's binary_logloss: 0.427288
[60]	training's binary_logloss: 0.319294	valid_1's binary_logloss: 0.425212
[70]	training's binary_logloss: 0.308179	valid_1's binary_logloss: 0.422817
[80]	training's binary_logloss: 0.299464	valid_1's binary_logloss: 0.422209
Early stopping, best iteration is:
[77]	training's binary_logloss: 0.301851	valid_1's binary_logloss: 0.421768
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training u

[32m[I 2023-10-24 07:38:09,109][0m Trial 25 finished with value: 0.41634603850335267 and parameters: {'learning_rate': 0.0939226095138021, 'num_leaves': 111, 'max_depth': 6, 'min_child_weight': 1.014501505662892, 'colsample_bytree': 0.7239163317293336, 'subsample': 0.7931356906966406}. Best is trial 12 with value: 0.40208711717628304.[0m
[32m[I 2023-10-24 07:38:09,288][0m Trial 26 finished with value: 0.42275027717119545 and parameters: {'learning_rate': 0.051681254674941605, 'num_leaves': 120, 'max_depth': 8, 'min_child_weight': 3.2778726895976256, 'colsample_bytree': 0.677441939814324, 'subsample': 0.6494423667240639}. Best is trial 12 with value: 0.40208711717628304.[0m


Early stopping, best iteration is:
[36]	training's binary_logloss: 0.321833	valid_1's binary_logloss: 0.416346
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.509717	valid_1's binary_logloss: 0.530251
[20]	training's binary_logloss: 0.431822	valid_1's binary_logloss: 0.475228
[30]	training's binary_logloss: 0.386221	valid_1's binary_logloss: 0.447446
[40]	training's binary_logloss: 0.357171	valid_1's binary_logloss: 0.436817
[50]	training's binary_logloss: 0.336218	valid_1's binary_logloss: 0.430953
[6

[32m[I 2023-10-24 07:38:09,424][0m Trial 27 finished with value: 0.4071930749295739 and parameters: {'learning_rate': 0.054862996353158694, 'num_leaves': 67, 'max_depth': 9, 'min_child_weight': 2.6773751831153065, 'colsample_bytree': 0.7879578824815229, 'subsample': 0.7567741601547554}. Best is trial 12 with value: 0.40208711717628304.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.49967	valid_1's binary_logloss: 0.521298
[20]	training's binary_logloss: 0.418231	valid_1's binary_logloss: 0.461324
[30]	training's binary_logloss: 0.367391	valid_1's binary_logloss: 0.426215
[40]	training's binary_logloss: 0.338919	valid_1's binary_logloss: 0.411754
[50]	training's binary_logloss: 0.318941	valid_1's binary_logloss: 0.407602
Early stopping, best iteration is:
[46]	training's binary_logloss: 0.325985	valid_1's binary_logloss: 0.407193
[Li

[32m[I 2023-10-24 07:38:09,552][0m Trial 28 finished with value: 0.40810895453725987 and parameters: {'learning_rate': 0.05497845445554056, 'num_leaves': 116, 'max_depth': 5, 'min_child_weight': 4.745566822503397, 'colsample_bytree': 0.8943990454249597, 'subsample': 0.9166267104733297}. Best is trial 12 with value: 0.40208711717628304.[0m


No further splits with positive gain, best gain: -inf
[50]	training's binary_logloss: 0.355718	valid_1's binary_logloss: 0.410536
[60]	training's binary_logloss: 0.346931	valid_1's binary_logloss: 0.409093
Early stopping, best iteration is:
[55]	training's binary_logloss: 0.350938	valid_1's binary_logloss: 0.408109
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.507885	valid_1's binary_logloss: 0.527037
[20]	training's binary_logloss: 0.430098	valid_1's binary_logloss: 0.465678
[30]	training's binary_l

[32m[I 2023-10-24 07:38:09,709][0m Trial 29 finished with value: 0.403399507866762 and parameters: {'learning_rate': 0.05187796919140702, 'num_leaves': 59, 'max_depth': 6, 'min_child_weight': 1.7517903791801468, 'colsample_bytree': 0.8726678568631625, 'subsample': 0.9027266534608724}. Best is trial 12 with value: 0.40208711717628304.[0m
[32m[I 2023-10-24 07:38:09,817][0m Trial 30 finished with value: 0.404764509846706 and parameters: {'learning_rate': 0.07056983618777274, 'num_leaves': 99, 'max_depth': 6, 'min_child_weight': 0.42890161413515493, 'colsample_bytree': 0.9439012359388177, 'subsample': 0.7306517627741391}. Best is trial 12 with value: 0.40208711717628304.[0m


[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.465238	valid_1's binary_logloss: 0.487795
[20]	training's binary_logloss: 0.387768	valid_1's binary_logloss: 0.430694
[30]	training's binary_logloss: 0.351659	valid_1's binary_logloss: 0.410166
[40]	training's binary_logloss: 0.330428	valid_1's binary_logloss: 0.406294
Early stopping, best iteration is:
[36]	training's binary_logloss: 0.338522	valid_1's binary_logloss: 0.404765
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set

[32m[I 2023-10-24 07:38:09,956][0m Trial 31 finished with value: 0.42790261312936745 and parameters: {'learning_rate': 0.07100376848953054, 'num_leaves': 82, 'max_depth': 7, 'min_child_weight': 3.7184114520077625, 'colsample_bytree': 0.6058197908068536, 'subsample': 0.6807598173923785}. Best is trial 12 with value: 0.40208711717628304.[0m
[32m[I 2023-10-24 07:38:10,062][0m Trial 32 finished with value: 0.4116213748910306 and parameters: {'learning_rate': 0.08824558494984286, 'num_leaves': 93, 'max_depth': 8, 'min_child_weight': 7.524992562823994, 'colsample_bytree': 0.7187913368073314, 'subsample': 0.7713421279118827}. Best is trial 12 with value: 0.40208711717628304.[0m


No further splits with positive gain, best gain: -inf
[60]	training's binary_logloss: 0.313002	valid_1's binary_logloss: 0.428164
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.324521	valid_1's binary_logloss: 0.427903
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.471612	valid_1's binary_logloss: 0.485654
[20]	training's binary_logloss: 0.405979	valid_1's binary_logloss: 0.433216
[30]	training's binary_logloss: 0.376405	valid_1's binary_logloss: 0.414175
[40]	training's binary_l

[32m[I 2023-10-24 07:38:10,181][0m Trial 33 finished with value: 0.42236291298509254 and parameters: {'learning_rate': 0.06694925583898892, 'num_leaves': 68, 'max_depth': 3, 'min_child_weight': 7.957607263238464, 'colsample_bytree': 0.678373640821133, 'subsample': 0.7208730882439159}. Best is trial 12 with value: 0.40208711717628304.[0m
[32m[I 2023-10-24 07:38:10,293][0m Trial 34 finished with value: 0.41779616264025615 and parameters: {'learning_rate': 0.09640406467327955, 'num_leaves': 40, 'max_depth': 8, 'min_child_weight': 0.41520540235994774, 'colsample_bytree': 0.6576429547742695, 'subsample': 0.8175174405249173}. Best is trial 12 with value: 0.40208711717628304.[0m


No further splits with positive gain, best gain: -inf
[40]	training's binary_logloss: 0.408067	valid_1's binary_logloss: 0.427709
[50]	training's binary_logloss: 0.396622	valid_1's binary_logloss: 0.423798
[60]	training's binary_logloss: 0.388259	valid_1's binary_logloss: 0.422482
Early stopping, best iteration is:
[59]	training's binary_logloss: 0.388981	valid_1's binary_logloss: 0.422363
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.440382	valid_1's binary_logloss: 0.478052
[20]	training's binary_l

[32m[I 2023-10-24 07:38:10,388][0m Trial 35 finished with value: 0.4176075702304769 and parameters: {'learning_rate': 0.07886142943020838, 'num_leaves': 68, 'max_depth': 3, 'min_child_weight': 9.348718579455458, 'colsample_bytree': 0.8148880845880635, 'subsample': 0.7874714810587353}. Best is trial 12 with value: 0.40208711717628304.[0m
[32m[I 2023-10-24 07:38:10,502][0m Trial 36 finished with value: 0.41429156473717244 and parameters: {'learning_rate': 0.07949549881772855, 'num_leaves': 65, 'max_depth': 5, 'min_child_weight': 5.5870001073359585, 'colsample_bytree': 0.674336674611608, 'subsample': 0.6778551285353865}. Best is trial 12 with value: 0.40208711717628304.[0m


[10]	training's binary_logloss: 0.496908	valid_1's binary_logloss: 0.497884
[20]	training's binary_logloss: 0.439241	valid_1's binary_logloss: 0.445058
[30]	training's binary_logloss: 0.413065	valid_1's binary_logloss: 0.424058
[40]	training's binary_logloss: 0.398776	valid_1's binary_logloss: 0.418117
[50]	training's binary_logloss: 0.389352	valid_1's binary_logloss: 0.419129
Early stopping, best iteration is:
[43]	training's binary_logloss: 0.395273	valid_1's binary_logloss: 0.417608
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[1

[32m[I 2023-10-24 07:38:10,630][0m Trial 37 finished with value: 0.413154129181515 and parameters: {'learning_rate': 0.060937468686838595, 'num_leaves': 76, 'max_depth': 8, 'min_child_weight': 4.575879446830914, 'colsample_bytree': 0.9395828390836352, 'subsample': 0.8381906419174727}. Best is trial 12 with value: 0.40208711717628304.[0m


[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.47963	valid_1's binary_logloss: 0.501867
[20]	training's binary_logloss: 0.404135	valid_1's binary_logloss: 0.445337
[30]	training's binary_logloss: 0.366216	valid_1's binary_logloss: 0.421906
[40]	training's binary_logloss: 0.341737	valid_1's binary_logloss: 0.413856
[50]	training's binary_logloss: 0.326771	valid_1's binary_logloss: 0.41406
Early stopping, best iteration is:
[44]	training's binary_logloss: 0.334068	valid_1's binary_logloss: 0.413154
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[Lig

[32m[I 2023-10-24 07:38:10,792][0m Trial 38 finished with value: 0.41025649734255976 and parameters: {'learning_rate': 0.05426477829293503, 'num_leaves': 54, 'max_depth': 9, 'min_child_weight': 4.9295932731537455, 'colsample_bytree': 0.9083515967705862, 'subsample': 0.9417415353075649}. Best is trial 12 with value: 0.40208711717628304.[0m
[32m[I 2023-10-24 07:38:10,902][0m Trial 39 finished with value: 0.4216785338484761 and parameters: {'learning_rate': 0.08088289580479402, 'num_leaves': 73, 'max_depth': 8, 'min_child_weight': 8.560674445076073, 'colsample_bytree': 0.8603420908584256, 'subsample': 0.7675087139665625}. Best is trial 12 with value: 0.40208711717628304.[0m


No further splits with positive gain, best gain: -inf
[50]	training's binary_logloss: 0.340021	valid_1's binary_logloss: 0.41583
[60]	training's binary_logloss: 0.328519	valid_1's binary_logloss: 0.413997
[70]	training's binary_logloss: 0.319809	valid_1's binary_logloss: 0.411158
Early stopping, best iteration is:
[69]	training's binary_logloss: 0.320914	valid_1's binary_logloss: 0.410256
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.482888	valid_1's binary_logloss: 0.492987
[20]	training's binary_lo

In [12]:
params = {
    'objective': 'binary'
}

for i, j in study.best_params.items():
    params[i] = j

params

{'objective': 'binary',
 'learning_rate': 0.0974785526725371,
 'num_leaves': 127,
 'max_depth': 8,
 'min_child_weight': 0.23435919255988735,
 'colsample_bytree': 0.8179961334413343,
 'subsample': 0.8357808707925263}

## Model

In [13]:
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)
    
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10
)

[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.434183	valid_1's binary_logloss: 0.475957
[20]	training's binary_logloss: 0.350013	valid_1's binary_logloss: 0.42316
[30]	training's binary_logloss: 0.315023	valid_1's binary_logloss: 0.404772
[40]	training's binary_logloss: 0.293528	valid_1's binary_logloss: 0.407443
Early stopping, best iteration is:
[33]	training's binary_logloss: 0.306808	valid_1's binary_logloss: 0.402087




## Evaluate

In [14]:
y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
y_pred_valid = (y_pred_valid > 0.5).astype(int)

In [15]:
f1_score(y_valid, y_pred_valid, average='macro')

0.7946693657219974

# Submit

In [16]:
for col in categorical_features:
    test[col] = test[col].astype('category')

In [17]:
X_test = test[features]

In [18]:
y_pred_test = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_test = (y_pred_test > 0.5).astype(int)

In [19]:
submit = pd.DataFrame(y_pred_test, index=test['PassengerId'], columns=['Survived'])
submit.to_csv('submit.csv')