## Module

In [1]:
import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

## Dataset

In [2]:
train = pd.read_csv('../input/titanic/train.csv')
print(train.shape)
train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test = pd.read_csv('../input/titanic/test.csv')
print(test.shape)
test.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
submission = pd.read_csv('../input/titanic/gender_submission.csv')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## Info

In [5]:
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

## Preprocess

In [6]:
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked'
]

target = 'Survived'

In [7]:
categorical_features = ['Sex', 'Embarked']

for col in categorical_features:
    train[col] = train[col].astype('category')

In [8]:
X_train = train[features]
y_train = train[target]
print(X_train.shape)
print(y_train.shape)

(891, 7)
(891,)


## Params

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, stratify=y_train, random_state=123)

print(X_train.shape)
print(y_valid.shape)
print(X_train.shape)
print(y_valid.shape)

(623, 7)
(268,)
(623, 7)
(268,)


In [10]:
def objective(trial):
    params = {
        'objective': 'binary',
        'learning_rate': trial.suggest_uniform('learning_rate', 0.05, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 32, 128),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.95),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.95)
    }
    
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)
    
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=10,
        num_boost_round=1000,
        early_stopping_rounds=10
    )
    
    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    score = log_loss(y_valid, y_pred_valid)
    return score

In [11]:
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
study.optimize(objective, n_trials=40)

[32m[I 2023-10-26 03:31:48,559][0m A new study created in memory with name: no-name-61bf701b-f210-4e5b-876e-301dbadd2c2a[0m


[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179


[32m[I 2023-10-26 03:31:48,714][0m Trial 0 finished with value: 0.4267343342536637 and parameters: {'learning_rate': 0.07744067519636624, 'num_leaves': 96, 'max_depth': 6, 'colsample_bytree': 0.9002809661679648, 'subsample': 0.8965381085744438}. Best is trial 0 with value: 0.4267343342536637.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.464283	valid_1's binary_logloss: 0.49758
[20]	training's binary_logloss: 0.389499	valid_1's binary_logloss: 0.449843
[30]	training's binary_logloss: 0.34895	valid_1's binary_logloss: 0.428189
[40]	training's binary_logloss: 0.326736	valid_1's binary_logloss: 0.42684
[50]	training's binary_logloss: 0.310143	valid_1's binary_logloss: 0.428178
Early stopping, best iteration is:
[41]	training's binary_logloss: 0.324613	valid_1's binary_logloss: 0.426734
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Train

[32m[I 2023-10-26 03:31:48,845][0m Trial 1 finished with value: 0.4316685894406363 and parameters: {'learning_rate': 0.08117818483929862, 'num_leaves': 68, 'max_depth': 9, 'colsample_bytree': 0.6198495420611051, 'subsample': 0.6954297031030396}. Best is trial 0 with value: 0.4267343342536637.[0m
[32m[I 2023-10-26 03:31:48,960][0m Trial 2 finished with value: 0.42594674329919446 and parameters: {'learning_rate': 0.0738832558660675, 'num_leaves': 71, 'max_depth': 9, 'colsample_bytree': 0.7988155963828762, 'subsample': 0.9239588234024313}. Best is trial 2 with value: 0.42594674329919446.[0m


No further splits with positive gain, best gain: -inf
[50]	training's binary_logloss: 0.299409	valid_1's binary_logloss: 0.43273
Early stopping, best iteration is:
[41]	training's binary_logloss: 0.320781	valid_1's binary_logloss: 0.431669
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.466798	valid_1's binary_logloss: 0.503361
[20]	training's binary_logloss: 0.384823	valid_1's binary_logloss: 0.453348
[30]	training's binary_logloss: 0.340995	valid_1's binary_logloss: 0.428553
[40]	training's binary_lo

[32m[I 2023-10-26 03:31:49,168][0m Trial 3 finished with value: 0.4340558562922278 and parameters: {'learning_rate': 0.05355180290989435, 'num_leaves': 41, 'max_depth': 7, 'colsample_bytree': 0.607076439104114, 'subsample': 0.8914169459417782}. Best is trial 2 with value: 0.42594674329919446.[0m


No further splits with positive gain, best gain: -inf
[20]	training's binary_logloss: 0.443934	valid_1's binary_logloss: 0.488417
[30]	training's binary_logloss: 0.396439	valid_1's binary_logloss: 0.459489
[40]	training's binary_logloss: 0.370785	valid_1's binary_logloss: 0.449728
[50]	training's binary_logloss: 0.348318	valid_1's binary_logloss: 0.440265
[60]	training's binary_logloss: 0.332766	valid_1's binary_logloss: 0.437002
[70]	training's binary_logloss: 0.320333	valid_1's binary_logloss: 0.436036
[80]	training's binary_logloss: 0.308318	valid_1's binary_logloss: 0.435062
[90]	training's binary_logloss: 0.29788	valid_1's binary_logloss: 0.434169
[100]	training's binary_logloss: 0.288729	valid_1's binary_logloss: 0.434997
Early stopping, best iteration is:
[91]	training's binary_logloss: 0.297074	valid_1's binary_logloss: 0.434056
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

[32m[I 2023-10-26 03:31:49,259][0m Trial 4 finished with value: 0.4159477318690093 and parameters: {'learning_rate': 0.08890783754749253, 'num_leaves': 79, 'max_depth': 3, 'colsample_bytree': 0.9425164197814674, 'subsample': 0.8797054974758531}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[40]	training's binary_logloss: 0.369678	valid_1's binary_logloss: 0.415948
[50]	training's binary_logloss: 0.356138	valid_1's binary_logloss: 0.418927
Early stopping, best iteration is:
[40]	training's binary_logloss: 0.369678	valid_1's binary_logloss: 0.415948
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.48006	valid_1's binary_logloss: 0.512703
[20]	training's binary_logloss: 0.411796	valid_1's binary_logloss: 0.468505
[30]	training's binary_lo

[32m[I 2023-10-26 03:31:49,428][0m Trial 5 finished with value: 0.4276396882961306 and parameters: {'learning_rate': 0.0730739681126466, 'num_leaves': 51, 'max_depth': 6, 'colsample_bytree': 0.6413960490541266, 'subsample': 0.8239723574646333}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:49,582][0m Trial 6 finished with value: 0.4260771634125801 and parameters: {'learning_rate': 0.05716766437045232, 'num_leaves': 64, 'max_depth': 5, 'colsample_bytree': 0.7451316789966832, 'subsample': 0.6925944642366194}. Best is trial 4 with value: 0.4159477318690093.[0m


[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505257	valid_1's binary_logloss: 0.530118
[20]	training's binary_logloss: 0.432611	valid_1's binary_logloss: 0.476667
[30]	training's binary_logloss: 0.39217	valid_1's binary_logloss: 0.448914
[40]	training's binary_logloss: 0.367622	valid_1's binary_logloss: 0.438541
[50]	training's binary_logloss: 0.348108	valid_1's binary_logloss: 0.43171
[60]	training's binary_logloss: 0.335504	valid_1's binary_logloss: 0.427595
[70]	training's binary_logloss: 0.32517

[32m[I 2023-10-26 03:31:49,699][0m Trial 7 finished with value: 0.4227068952570973 and parameters: {'learning_rate': 0.08871168447171084, 'num_leaves': 87, 'max_depth': 7, 'colsample_bytree': 0.7989518821040269, 'subsample': 0.6065764301527243}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:49,792][0m Trial 8 finished with value: 0.41946992658197035 and parameters: {'learning_rate': 0.08088177485379386, 'num_leaves': 70, 'max_depth': 3, 'colsample_bytree': 0.8159268989061649, 'subsample': 0.9303118274801184}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[30]	training's binary_logloss: 0.331752	valid_1's binary_logloss: 0.423964
[40]	training's binary_logloss: 0.30732	valid_1's binary_logloss: 0.425628
Early stopping, best iteration is:
[37]	training's binary_logloss: 0.313157	valid_1's binary_logloss: 0.422707
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.487186	valid_1's binary_logloss: 0.502163
[20]	training's binary_logloss: 0.424591	valid_1's binary_logloss: 0.44735
[30]	training's binary_log

[32m[I 2023-10-26 03:31:49,879][0m Trial 9 finished with value: 0.4177087179853438 and parameters: {'learning_rate': 0.08409101495517418, 'num_leaves': 90, 'max_depth': 3, 'colsample_bytree': 0.9158220041108944, 'subsample': 0.6347481226256408}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:49,983][0m Trial 10 finished with value: 0.4323477767081574 and parameters: {'learning_rate': 0.09849045338733745, 'num_leaves': 43, 'max_depth': 9, 'colsample_bytree': 0.7253532584393337, 'subsample': 0.8627401494264597}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[40]	training's binary_logloss: 0.37695	valid_1's binary_logloss: 0.418016
[50]	training's binary_logloss: 0.362142	valid_1's binary_logloss: 0.419469
Early stopping, best iteration is:
[41]	training's binary_logloss: 0.375277	valid_1's binary_logloss: 0.417709
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.438447	valid_1's binary_logloss: 0.488656
[20]	training's binary_logloss: 0.357984	valid_1's binary_logloss: 0.447699
[30]	training's binary_lo

[32m[I 2023-10-26 03:31:50,112][0m Trial 11 finished with value: 0.4249613053302447 and parameters: {'learning_rate': 0.08039153343577339, 'num_leaves': 44, 'max_depth': 4, 'colsample_bytree': 0.6134488992654571, 'subsample': 0.8219959202850673}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:50,222][0m Trial 12 finished with value: 0.4327435631958559 and parameters: {'learning_rate': 0.09794746343122603, 'num_leaves': 79, 'max_depth': 7, 'colsample_bytree': 0.6564583312597486, 'subsample': 0.8285879139128894}. Best is trial 4 with value: 0.4159477318690093.[0m



[30]	training's binary_logloss: 0.385424	valid_1's binary_logloss: 0.437871
[40]	training's binary_logloss: 0.365834	valid_1's binary_logloss: 0.431588
[50]	training's binary_logloss: 0.350727	valid_1's binary_logloss: 0.427628
[60]	training's binary_logloss: 0.340298	valid_1's binary_logloss: 0.425471
Early stopping, best iteration is:
[59]	training's binary_logloss: 0.341364	valid_1's binary_logloss: 0.424961
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.440394	valid_1's binary_logloss: 0.488739
[

[32m[I 2023-10-26 03:31:50,364][0m Trial 13 finished with value: 0.4304924548808705 and parameters: {'learning_rate': 0.06266458012698911, 'num_leaves': 47, 'max_depth': 7, 'colsample_bytree': 0.685548957200561, 'subsample': 0.6556393542759319}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.490448	valid_1's binary_logloss: 0.523134
[20]	training's binary_logloss: 0.411871	valid_1's binary_logloss: 0.469991
[30]	training's binary_logloss: 0.368223	valid_1's binary_logloss: 0.44651
[40]	training's binary_logloss: 0.342322	valid_1's binary_logloss: 0.438465
[50]	training's binary_logloss: 0.320877	valid_1's binary_logloss: 0.43267
[60]	training's binary_logloss: 0.306501	valid_1's binary_logloss: 0.430492
[70]	training's binary_logloss: 0.294605	valid_1's binary_logloss: 0.43265
Early stopping, best iteration is:
[60]	training's binary_logloss: 0.306501	valid_1's binary_logloss: 0.430492
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features:

[32m[I 2023-10-26 03:31:50,490][0m Trial 14 finished with value: 0.4197625684761274 and parameters: {'learning_rate': 0.05551875705821526, 'num_leaves': 117, 'max_depth': 3, 'colsample_bytree': 0.6483640329720148, 'subsample': 0.6688038265880187}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:50,628][0m Trial 15 finished with value: 0.4379375365986779 and parameters: {'learning_rate': 0.06843625853304822, 'num_leaves': 101, 'max_depth': 9, 'colsample_bytree': 0.6339854465275714, 'subsample': 0.8932807176245814}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[70]	training's binary_logloss: 0.375706	valid_1's binary_logloss: 0.421692
[80]	training's binary_logloss: 0.365721	valid_1's binary_logloss: 0.421072
Early stopping, best iteration is:
[76]	training's binary_logloss: 0.370244	valid_1's binary_logloss: 0.419763
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.486023	valid_1's binary_logloss: 0.518618
[20]	training's binary_logloss: 0.412757	valid_1's binary_logloss: 0.472787
[30]	training's binary_logloss: 0.365786	valid_1's binary_logloss: 0.447622
[40]	trainin

[32m[I 2023-10-26 03:31:50,779][0m Trial 16 finished with value: 0.43016952366352945 and parameters: {'learning_rate': 0.054804920394698156, 'num_leaves': 125, 'max_depth': 6, 'colsample_bytree': 0.7640279205766956, 'subsample': 0.9418663808666179}. Best is trial 4 with value: 0.4159477318690093.[0m


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.506254	valid_1's binary_logloss: 0.531845
[20]	training's binary_logloss: 0.430928	valid_1's binary_logloss: 0.478964
[30]	training's binary_logloss: 0.387158	valid_1's binary_logloss: 0.452056
[40]	training's binary_logloss: 0.360423	valid_1's binary_logloss: 0.439528
[50]	training's binary_logloss: 0.34067	valid_1's binary_logloss: 0.433065
[60]	training's binary_logloss: 0.326785	valid_1's binary_logloss: 0.43114
[70]	training's binary_logloss: 0.315051	valid_1's binary_logloss: 0.430763
[80]	training's binary_logloss: 0.305685	valid_1's binary_logloss: 0.430271
Early stopping, best iteration is:
[71]	training's binary_logloss: 0.314364	valid_1's binary_logloss: 0.43017
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row

[32m[I 2023-10-26 03:31:50,896][0m Trial 17 finished with value: 0.42065399007562954 and parameters: {'learning_rate': 0.08024227598725231, 'num_leaves': 105, 'max_depth': 3, 'colsample_bytree': 0.7545988224370959, 'subsample': 0.942855355084455}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:51,006][0m Trial 18 finished with value: 0.41973691515276873 and parameters: {'learning_rate': 0.06797222319846608, 'num_leaves': 91, 'max_depth': 5, 'colsample_bytree': 0.8410314139820196, 'subsample': 0.9081665612384084}. Best is trial 4 with value: 0.4159477318690093.[0m



[60]	training's binary_logloss: 0.359896	valid_1's binary_logloss: 0.421906
Early stopping, best iteration is:
[58]	training's binary_logloss: 0.362474	valid_1's binary_logloss: 0.420654
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.483938	valid_1's binary_logloss: 0.508288
[20]	training's binary_logloss: 0.41017	valid_1's binary_logloss: 0.453729
[30]	training's binary_logloss: 0.370029	valid_1's binary_logloss: 0.426804
[40]	training's binary_logloss: 0.348036	valid_1's binary_logloss: 0.419737
[5

[32m[I 2023-10-26 03:31:51,108][0m Trial 19 finished with value: 0.422789983564475 and parameters: {'learning_rate': 0.09591177331810724, 'num_leaves': 99, 'max_depth': 5, 'colsample_bytree': 0.7978161033117064, 'subsample': 0.9027858964568847}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:51,206][0m Trial 20 finished with value: 0.4195414138738696 and parameters: {'learning_rate': 0.07544844803335073, 'num_leaves': 46, 'max_depth': 4, 'colsample_bytree': 0.9224051635830199, 'subsample': 0.6290893724207108}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[20]	training's binary_logloss: 0.376965	valid_1's binary_logloss: 0.437428
[30]	training's binary_logloss: 0.347221	valid_1's binary_logloss: 0.423655
[40]	training's binary_logloss: 0.328864	valid_1's binary_logloss: 0.425549
Early stopping, best iteration is:
[33]	training's binary_logloss: 0.340906	valid_1's binary_logloss: 0.42279
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.479707	valid_1's binary_logloss: 0.499397
[20]	training's binary_lo

[32m[I 2023-10-26 03:31:51,343][0m Trial 21 finished with value: 0.42761425428939503 and parameters: {'learning_rate': 0.06388592806405163, 'num_leaves': 43, 'max_depth': 5, 'colsample_bytree': 0.6461292518415372, 'subsample': 0.850714521441498}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[30]	training's binary_logloss: 0.380903	valid_1's binary_logloss: 0.444215
[40]	training's binary_logloss: 0.357041	valid_1's binary_logloss: 0.436206
[50]	training's binary_logloss: 0.339153	valid_1's binary_logloss: 0.430154
[60]	training's binary_logloss: 0.326548	valid_1's binary_logloss: 0.427621
[70]	training's binary_logloss: 0.31682	valid_1's binary_logloss: 0.428976
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.325623	valid_1's binary_logloss: 0.427614
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training u

[32m[I 2023-10-26 03:31:51,482][0m Trial 22 finished with value: 0.4253559203561167 and parameters: {'learning_rate': 0.06447030464736006, 'num_leaves': 51, 'max_depth': 9, 'colsample_bytree': 0.8052795271835291, 'subsample': 0.6070376411656228}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:51,575][0m Trial 23 finished with value: 0.4175170865354572 and parameters: {'learning_rate': 0.09144700146086816, 'num_leaves': 62, 'max_depth': 3, 'colsample_bytree': 0.8372357878786805, 'subsample': 0.6945027906172576}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:51,665][0m Trial 24 finished with value: 0.42464622055375384 and parameters: {'learning_rate': 0.08675970110612974, 'num_leaves': 45, 'max_depth': 3, 'colsample_bytree': 0.6870636002319853, 'subsample': 0.8016550670462429}. Best is trial 4 with value: 0.4159477318690093.[0m


[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.476047	valid_1's binary_logloss: 0.490668
[20]	training's binary_logloss: 0.414337	valid_1's binary_logloss: 0.440642
[30]	training's binary_logloss: 0.389073	valid_1's binary_logloss: 0.421864
[40]	training's binary_logloss: 0.370198	valid_1's binary_logloss: 0.418039
[50]	training's binary_logloss: 0.358102	valid_1's binary_logloss: 0.421238
Early stopping, best iteration is:
[41]	training's binary_logloss: 0.368719	valid_1's binary_logloss: 0.417517
[L

[32m[I 2023-10-26 03:31:51,767][0m Trial 25 finished with value: 0.4212911343489471 and parameters: {'learning_rate': 0.07960209656359196, 'num_leaves': 98, 'max_depth': 3, 'colsample_bytree': 0.6780785714242163, 'subsample': 0.9334621540309447}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:51,889][0m Trial 26 finished with value: 0.4246188083288403 and parameters: {'learning_rate': 0.07235626893088137, 'num_leaves': 36, 'max_depth': 6, 'colsample_bytree': 0.8448177463611265, 'subsample': 0.7041029327992967}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[10]	training's binary_logloss: 0.491332	valid_1's binary_logloss: 0.507808
[20]	training's binary_logloss: 0.43115	valid_1's binary_logloss: 0.455952
[30]	training's binary_logloss: 0.401356	valid_1's binary_logloss: 0.435421
[40]	training's binary_logloss: 0.384599	valid_1's binary_logloss: 0.424759
[50]	training's binary_logloss: 0.372419	valid_1's binary_logloss: 0.421303
Early stopping, best iteration is:
[49]	training's binary_logloss: 0.372961	valid_1's binary_logloss: 0.421291
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training u

[32m[I 2023-10-26 03:31:51,988][0m Trial 27 finished with value: 0.4234865703453209 and parameters: {'learning_rate': 0.09068989098512387, 'num_leaves': 107, 'max_depth': 3, 'colsample_bytree': 0.6874286964792423, 'subsample': 0.7096264161810828}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:52,084][0m Trial 28 finished with value: 0.4269332976485521 and parameters: {'learning_rate': 0.09827081102599636, 'num_leaves': 112, 'max_depth': 7, 'colsample_bytree': 0.8308839442242945, 'subsample': 0.7866221888708269}. Best is trial 4 with value: 0.4159477318690093.[0m


[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.479269	valid_1's binary_logloss: 0.497643
[20]	training's binary_logloss: 0.420639	valid_1's binary_logloss: 0.447936
[30]	training's binary_logloss: 0.393724	valid_1's binary_logloss: 0.431192
[40]	training's binary_logloss: 0.376832	valid_1's binary_logloss: 0.424787
[50]	training's binary_logloss: 0.363382	valid_1's binary_logloss: 0.424513
[60]	training's binary_logloss: 0.353969	valid_1's binary_logloss: 0.424504
Early stopping, best iteration is:
[5

[32m[I 2023-10-26 03:31:52,202][0m Trial 29 finished with value: 0.42294008346331735 and parameters: {'learning_rate': 0.0615266511753165, 'num_leaves': 64, 'max_depth': 7, 'colsample_bytree': 0.8122376249447735, 'subsample': 0.6067176194082667}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:52,310][0m Trial 30 finished with value: 0.4210395976647868 and parameters: {'learning_rate': 0.06507874083372747, 'num_leaves': 45, 'max_depth': 3, 'colsample_bytree': 0.7342117787622318, 'subsample': 0.8622094430034536}. Best is trial 4 with value: 0.4159477318690093.[0m


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.489476	valid_1's binary_logloss: 0.517169
[20]	training's binary_logloss: 0.41067	valid_1's binary_logloss: 0.462174
[30]	training's binary_logloss: 0.36422	valid_1's binary_logloss: 0.433165
[40]	training's binary_logloss: 0.33747	valid_1's binary_logloss: 0.425297
[50]	training's binary_logloss: 0.318841	valid_1's binary_logloss: 0.424052
Early stopping, best iteration is:
[44]	training's binary_logloss: 0.330254	valid_1's binary_logloss: 0.42294
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[Light

[32m[I 2023-10-26 03:31:52,434][0m Trial 31 finished with value: 0.42682713077704365 and parameters: {'learning_rate': 0.0939226095138021, 'num_leaves': 111, 'max_depth': 6, 'colsample_bytree': 0.6323308613113143, 'subsample': 0.7239163317293336}. Best is trial 4 with value: 0.4159477318690093.[0m


[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.452011	valid_1's binary_logloss: 0.492464
[20]	training's binary_logloss: 0.387285	valid_1's binary_logloss: 0.454787
[30]	training's binary_logloss: 0.348678	valid_1's binary_logloss: 0.440572
[40]	training's binary_logloss: 0.329693	valid_1's binary_logloss: 0.430997
[50]	training's binary_logloss: 0.311354	valid_1's binary_logloss: 0.427732
[60]	training's binary_logloss: 0.29841	valid_1's binary_logloss: 0.428482
Early stopping, best iteration is:
[51

[32m[I 2023-10-26 03:31:52,536][0m Trial 32 finished with value: 0.4167695552227848 and parameters: {'learning_rate': 0.07759081295666295, 'num_leaves': 76, 'max_depth': 3, 'colsample_bytree': 0.828236144500591, 'subsample': 0.7509964524018908}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:52,628][0m Trial 33 finished with value: 0.4326300768008115 and parameters: {'learning_rate': 0.09482732979255315, 'num_leaves': 108, 'max_depth': 9, 'colsample_bytree': 0.9444147844774512, 'subsample': 0.6911193246555916}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:52,726][0m Trial 34 finished with value: 0.42159750601403745 and parameters: {'learning_rate': 0.076851126068789, 'num_leaves': 75, 'max_depth': 3, 'colsample_bytree': 0.6348491811887839, 'subsample': 0.7233090814201866}. Best is trial 4 with value: 0.4159477318690093.[0m


[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.431511	valid_1's binary_logloss: 0.475842
[20]	training's binary_logloss: 0.353411	valid_1's binary_logloss: 0.436099
[30]	training's binary_logloss: 0.311863	valid_1's binary_logloss: 0.437595
Early stopping, best iteration is:
[22]	training's binary_logloss: 0.345275	valid_1's binary_logloss: 0.43263
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y

[32m[I 2023-10-26 03:31:52,839][0m Trial 35 finished with value: 0.42629706231486686 and parameters: {'learning_rate': 0.07346245869951211, 'num_leaves': 69, 'max_depth': 8, 'colsample_bytree': 0.9166267104733297, 'subsample': 0.6131457843398491}. Best is trial 4 with value: 0.4159477318690093.[0m


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.468049	valid_1's binary_logloss: 0.503677
[20]	training's binary_logloss: 0.387636	valid_1's binary_logloss: 0.452655
[30]	training's binary_logloss: 0.343182	valid_1's binary_logloss: 0.428733
[40]	training's binary_logloss: 0.316136	valid_1's binary_logloss: 0.428514
Early stopping, best iteration is:
[33]	training's binary_logloss: 0.333512	valid_1's binary_logloss: 0.426297
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.47417

[32m[I 2023-10-26 03:31:52,969][0m Trial 36 finished with value: 0.4217535083213516 and parameters: {'learning_rate': 0.07541577242894455, 'num_leaves': 121, 'max_depth': 8, 'colsample_bytree': 0.896802880262782, 'subsample': 0.8825616355537538}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:53,071][0m Trial 37 finished with value: 0.41882067988461846 and parameters: {'learning_rate': 0.07845503693072967, 'num_leaves': 67, 'max_depth': 4, 'colsample_bytree': 0.9439012359388177, 'subsample': 0.7306517627741391}. Best is trial 4 with value: 0.4159477318690093.[0m


[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.468108	valid_1's binary_logloss: 0.486422
[20]	training's binary_logloss: 0.401044	valid_1's binary_logloss: 0.434818
[30]	training's binary_logloss: 0.370936	valid_1's binary_logloss: 0.422109
[40]	training's binary_logloss: 0.356412	valid_1's binary_logloss: 0.418826
[50]	training's binary_logloss: 0.34225	valid_1's binary_logloss: 0.420173
Early stopping, best iteration is:
[42]	training's binary_logloss: 0.353517	valid_1's binary_logloss: 0.418821
[Li

[32m[I 2023-10-26 03:31:53,206][0m Trial 38 finished with value: 0.43322525406535745 and parameters: {'learning_rate': 0.07100376848953054, 'num_leaves': 82, 'max_depth': 7, 'colsample_bytree': 0.7279236371921936, 'subsample': 0.6058197908068536}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-26 03:31:53,333][0m Trial 39 finished with value: 0.4274479762552598 and parameters: {'learning_rate': 0.06153711677033979, 'num_leaves': 103, 'max_depth': 8, 'colsample_bytree': 0.8554966968484203, 'subsample': 0.6600703870415041}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
Early stopping, best iteration is:
[55]	training's binary_logloss: 0.303651	valid_1's binary_logloss: 0.433225
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.488331	valid_1's binary_logloss: 0.517902
[20]	training's binary_logloss: 0.407102	valid_1's binary_logloss: 0.46513
[30]	training's binary_logloss: 0.35961	valid_1's binary_logloss: 0.434289
[40]	training's binary_logloss: 0.332209	valid_1's binary_logloss: 0.428763
[50]	training's binary_log

In [12]:
params = {
    'objective': 'binary'
}

for i, j in study.best_params.items():
    params[i] = j

params

{'objective': 'binary',
 'learning_rate': 0.08890783754749253,
 'num_leaves': 79,
 'max_depth': 3,
 'colsample_bytree': 0.9425164197814674,
 'subsample': 0.8797054974758531}

## Model

In [13]:
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10
)

[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.469491	valid_1's binary_logloss: 0.485555
[20]	training's binary_logloss: 0.409649	valid_1's binary_logloss: 0.436881
[30]	training's binary_logloss: 0.384941	valid_1's binary_logloss: 0.420527
[40]	training's binary_logloss: 0.369678	valid_1's binary_logloss: 0.415948
[50]	training's binary_logloss: 0.356138	valid_1's binary_logloss: 0.418927
Early stopping, best iteration is:
[40]	training's binary_logloss: 0.369678	valid_1's binary_logloss: 0.415948




## Evaluate

In [14]:
y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
y_pred_valid = (y_pred_valid > 0.5).astype(int)

In [15]:
f1_score(y_valid, y_pred_valid, average='macro')

0.8091017016224773

## Submit

In [16]:
for col in categorical_features:
    test[col] = test[col].astype('category')

In [17]:
X_test = test[features]

In [18]:
y_pred_test = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_test = (y_pred_test > 0.5).astype(int)

In [19]:
submit = pd.DataFrame(y_pred_test, index=test['PassengerId'], columns=['Survived'])
submit.to_csv('submit.csv')