## Module

In [1]:
import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

## Dataset

In [2]:
train = pd.read_csv('../input/titanic/train.csv')
print(train.shape)
train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test = pd.read_csv('../input/titanic/test.csv')
print(test.shape)
test.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
submission = pd.read_csv('../input/titanic/gender_submission.csv')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## Info

In [5]:
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

## Preprocess

In [6]:
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked'
]

target = 'Survived'

In [7]:
categorical_features = ['Sex', 'Embarked']

for col in categorical_features:
    train[col] = train[col].astype('category')

In [8]:
X_train = train[features]
y_train = train[target]
print(X_train.shape)
print(y_train.shape)

(891, 7)
(891,)


## Params

In [9]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, stratify=y_train, random_state=123)

print(X_tr.shape)
print(y_val.shape)
print(X_tr.shape)
print(y_val.shape)

(623, 7)
(268,)
(623, 7)
(268,)


In [10]:
def objective(trial):
    params = {
        'objective': 'binary',
        'metrics': 'binary_logloss',
        'verbose': 0,
        'seed': 71,
        'learning_rate': trial.suggest_uniform('learning_rate', 0.05, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 32, 128),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.95),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.95)
    }
    
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)
    
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=10,
        num_boost_round=1000,
        early_stopping_rounds=10
    )
    
    y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
    score = log_loss(y_val, y_pred_val)
    return score

In [11]:
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
study.optimize(objective, n_trials=40)

[32m[I 2023-10-27 03:20:51,974][0m A new study created in memory with name: no-name-3321c606-955b-46dc-8e1f-6d1e1e26b40a[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.465868	valid_1's binary_logloss: 0.500733
[20]	training's binary_logloss: 0.387444	valid_1's binary_logloss: 0.451664
[30]	training's binary_logloss: 0.34973	valid_1's binary_logloss: 0.432515

[32m[I 2023-10-27 03:20:52,103][0m Trial 0 finished with value: 0.4271066691504046 and parameters: {'learning_rate': 0.07744067519636624, 'num_leaves': 96, 'max_depth': 6, 'colsample_bytree': 0.9002809661679648, 'subsample': 0.8965381085744438}. Best is trial 0 with value: 0.4271066691504046.[0m
[32m[I 2023-10-27 03:20:52,209][0m Trial 1 finished with value: 0.4351876825336749 and parameters: {'learning_rate': 0.08117818483929862, 'num_leaves': 68, 'max_depth': 9, 'colsample_bytree': 0.6198495420611051, 'subsample': 0.6954297031030396}. Best is trial 0 with value: 0.4271066691504046.[0m


No further splits with positive gain, best gain: -inf
[40]	training's binary_logloss: 0.328638	valid_1's binary_logloss: 0.428269
[50]	training's binary_logloss: 0.312449	valid_1's binary_logloss: 0.431298
Early stopping, best iteration is:
[45]	training's binary_logloss: 0.319944	valid_1's binary_logloss: 0.427107
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.495627	valid_1's binary_logloss: 0.52963
[20]	training's binary_logloss: 0.406254	valid_1's binary_logloss: 0.47278
[30]	training's binary_logloss: 0.360116	valid_1's binary_logloss: 0.44909
[40]	training's binary_logloss: 0.329819	valid_1's binary_logloss: 0.441265
[50]	training's binary_logloss: 0.307885	valid_1's binary_logloss: 0.438873
[60]	training's binary_logloss: 0.289891	valid_1's binary_logloss: 0.437418
[70]	training's binary_logloss: 0.275873	valid_

[32m[I 2023-10-27 03:20:52,293][0m Trial 2 finished with value: 0.4279525708540608 and parameters: {'learning_rate': 0.0738832558660675, 'num_leaves': 71, 'max_depth': 9, 'colsample_bytree': 0.7988155963828762, 'subsample': 0.9239588234024313}. Best is trial 0 with value: 0.4271066691504046.[0m
[32m[I 2023-10-27 03:20:52,419][0m Trial 3 finished with value: 0.42774057558486556 and parameters: {'learning_rate': 0.05355180290989435, 'num_leaves': 41, 'max_depth': 7, 'colsample_bytree': 0.607076439104114, 'subsample': 0.8914169459417782}. Best is trial 0 with value: 0.4271066691504046.[0m


[50]	training's binary_logloss: 0.292366	valid_1's binary_logloss: 0.430956
Early stopping, best iteration is:
[43]	training's binary_logloss: 0.307269	valid_1's binary_logloss: 0.427953
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.538679	valid_1's binary_logloss: 0.562184
[20]	training's binary_logloss: 0.457839	valid_1's binary_logloss: 0.503352
[30]	training's binary_logloss: 0.411003	valid_1's binary_logloss: 0.471521
[40]	training's binary_logloss: 0.378535	valid_1's binary_logloss: 0.45461
[50]	training's binary_logloss: 0.353675	valid_1's binary_logloss: 0.443308
[60]	training's binary_logloss: 0.337723	valid_1's binary_logloss: 0.436329
[70]	training's binary_logloss: 0.32327	valid_1's binary_logloss: 0.432916
[80]	training's binary_logloss: 0.312074	valid_1's binary_logloss: 0.432495
[90]	training's binary_l

[32m[I 2023-10-27 03:20:52,477][0m Trial 4 finished with value: 0.4159477318690093 and parameters: {'learning_rate': 0.08890783754749253, 'num_leaves': 79, 'max_depth': 3, 'colsample_bytree': 0.9425164197814674, 'subsample': 0.8797054974758531}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-27 03:20:52,605][0m Trial 5 finished with value: 0.4271734666260176 and parameters: {'learning_rate': 0.0730739681126466, 'num_leaves': 51, 'max_depth': 6, 'colsample_bytree': 0.6413960490541266, 'subsample': 0.8239723574646333}. Best is trial 4 with value: 0.4159477318690093.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.509184	valid_1's binary_logloss: 0.537713
[20]	training's binary_logloss: 0.425273	valid_1's binary_logloss: 0.477921
[30]	training's binary_logloss: 0.381282	valid_1's binary_logloss: 0.454296
[40]	training's binary_logloss: 0.352019	valid_1's binary_logloss: 0.44268
[50]	training's binary_logloss: 0.332167	valid_1's binary_logloss: 0.435739
[60]	training's binary_logloss: 0.319244	valid_1's binary_logloss: 0.430977
[70]	training's binary_logloss: 0.306202	valid_1's binary_logloss: 0.428433
[80]	training's binary_logloss: 0.296976	valid_1's binary_logloss: 0.428641
[90]	training's binary_logloss: 0.28749	valid_1's binary_logloss: 0.427715
Early stopping, best iteration is:
[84]	training's binary_logloss: 0.294384	valid_1's binary_logloss: 0.427173
You can set `force_row_w



No further splits with positive gain, best gain: -inf
[10]	training's binary_logloss: 0.523953	valid_1's binary_logloss: 0.542864
[20]	training's binary_logloss: 0.443595	valid_1's binary_logloss: 0.480454
[30]	training's binary_logloss: 0.397266	valid_1's binary_logloss: 0.450191
[40]	training's binary_logloss: 0.370878	valid_1's binary_logloss: 0.436513
[50]	training's binary_logloss: 0.351974	valid_1's binary_logloss: 0.431059
[60]	training's binary_logloss: 0.338187	valid_1's binary_logloss: 0.428204
[70]	training's binary_logloss: 0.328177	valid_1's binary_logloss: 0.426192

[32m[I 2023-10-27 03:20:52,710][0m Trial 6 finished with value: 0.4254828424312325 and parameters: {'learning_rate': 0.05716766437045232, 'num_leaves': 64, 'max_depth': 5, 'colsample_bytree': 0.7451316789966832, 'subsample': 0.6925944642366194}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-27 03:20:52,785][0m Trial 7 finished with value: 0.4272552717610297 and parameters: {'learning_rate': 0.08871168447171084, 'num_leaves': 87, 'max_depth': 7, 'colsample_bytree': 0.7989518821040269, 'subsample': 0.6065764301527243}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[80]	training's binary_logloss: 0.319142	valid_1's binary_logloss: 0.427783
Early stopping, best iteration is:
[72]	training's binary_logloss: 0.326464	valid_1's binary_logloss: 0.425483
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.44832	valid_1's binary_logloss: 0.487119
[20]	training's binary_logloss: 0.37084	valid_1's binary_logloss: 0.443606
[30]	training's binary_logloss: 0.332972	valid_1's binary_logloss: 0.429964
[40]	training's binary_logloss: 0.309608	valid_1's binary_logloss: 0.429221
[50]	training's binary_logloss: 0.289958	valid_1's binary_logloss: 0.431876
Early stopping, best iteration is:
[44]	training's binary_logloss: 0.30273	valid_1's binary_logloss: 0.427255
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y

[32m[I 2023-10-27 03:20:52,851][0m Trial 8 finished with value: 0.4175170868174815 and parameters: {'learning_rate': 0.08088177485379386, 'num_leaves': 70, 'max_depth': 3, 'colsample_bytree': 0.8159268989061649, 'subsample': 0.9303118274801184}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[50]	training's binary_logloss: 0.364416	valid_1's binary_logloss: 0.420291
Early stopping, best iteration is:
[44]	training's binary_logloss: 0.373333	valid_1's binary_logloss: 0.417517
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.485647	valid_1's binary_logloss: 0.498036
[20]	training's binary_logloss: 0.423043	valid_1's binary_logloss: 0.446706
[30]	training's binary_logloss: 0.394397	valid_1's binary_logloss: 0.426157

[32m[I 2023-10-27 03:20:52,915][0m Trial 9 finished with value: 0.4172986534843328 and parameters: {'learning_rate': 0.08409101495517418, 'num_leaves': 90, 'max_depth': 3, 'colsample_bytree': 0.9158220041108944, 'subsample': 0.6347481226256408}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-27 03:20:52,997][0m Trial 10 finished with value: 0.4273019848276266 and parameters: {'learning_rate': 0.09849045338733745, 'num_leaves': 43, 'max_depth': 9, 'colsample_bytree': 0.7253532584393337, 'subsample': 0.8627401494264597}. Best is trial 4 with value: 0.4159477318690093.[0m



[40]	training's binary_logloss: 0.37828	valid_1's binary_logloss: 0.417456
[50]	training's binary_logloss: 0.364235	valid_1's binary_logloss: 0.418955
Early stopping, best iteration is:
[41]	training's binary_logloss: 0.377118	valid_1's binary_logloss: 0.417299
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.452984	valid_1's binary_logloss: 0.498625
[20]	training's binary_logloss: 0.368999	valid_1's binary_logloss: 0.448001
[30]	training's binary_logloss: 0.325155	valid_1's binary_logloss: 0.433843
[40]	training's binary_logloss: 0.297732	valid_1's binary_logloss: 0.427532
[50]	training's binary_logloss: 0.278465	valid_1's binary_logloss: 0.428024
Early stopping, best iteration is:
[43]	training's binary_logloss: 0.290482	valid_1's binary_logloss: 0.427302
You can set `force_row_wise=true` to remove the overhead.
And i

[32m[I 2023-10-27 03:20:53,081][0m Trial 11 finished with value: 0.42665233197945923 and parameters: {'learning_rate': 0.08039153343577339, 'num_leaves': 44, 'max_depth': 4, 'colsample_bytree': 0.6134488992654571, 'subsample': 0.8219959202850673}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[30]	training's binary_logloss: 0.394267	valid_1's binary_logloss: 0.446314
[40]	training's binary_logloss: 0.372046	valid_1's binary_logloss: 0.437608
[50]	training's binary_logloss: 0.355756	valid_1's binary_logloss: 0.431619
[60]	training's binary_logloss: 0.345399	valid_1's binary_logloss: 0.42734
Early stopping, best iteration is:
[59]	training's binary_logloss: 0.346744	valid_1's binary_logloss: 0.426652


[32m[I 2023-10-27 03:20:53,164][0m Trial 12 finished with value: 0.42731094147563864 and parameters: {'learning_rate': 0.09794746343122603, 'num_leaves': 79, 'max_depth': 7, 'colsample_bytree': 0.6564583312597486, 'subsample': 0.8285879139128894}. Best is trial 4 with value: 0.4159477318690093.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.455262	valid_1's binary_logloss: 0.498944
[20]	training's binary_logloss: 0.373813	valid_1's binary_logloss: 0.446336
[30]	training's binary_logloss: 0.333289	valid_1's binary_logloss: 0.431083
[40]	training's binary_logloss: 0.308791	valid_1's binary_logloss: 0.428254
[50]	training's binary_logloss: 0.290886	valid_1's binary_logloss: 0.428181
Early stopping, best iteration is:
[46]	training's binary_logloss: 0.297167	valid_1's binary_logloss: 0.427311
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.507477	valid_1's binary_logloss: 0.536761
[20]	training's binary_logloss: 0.423521	valid_1's binary_logloss: 

[32m[I 2023-10-27 03:20:53,269][0m Trial 13 finished with value: 0.4280015169720938 and parameters: {'learning_rate': 0.06266458012698911, 'num_leaves': 47, 'max_depth': 7, 'colsample_bytree': 0.685548957200561, 'subsample': 0.6556393542759319}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[60]	training's binary_logloss: 0.312577	valid_1's binary_logloss: 0.42929
[70]	training's binary_logloss: 0.300572	valid_1's binary_logloss: 0.429472
Early stopping, best iteration is:
[66]	training's binary_logloss: 0.305295	valid_1's binary_logloss: 0.428002
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds

[32m[I 2023-10-27 03:20:53,354][0m Trial 14 finished with value: 0.420392655001365 and parameters: {'learning_rate': 0.05551875705821526, 'num_leaves': 117, 'max_depth': 3, 'colsample_bytree': 0.6483640329720148, 'subsample': 0.6688038265880187}. Best is trial 4 with value: 0.4159477318690093.[0m



[10]	training's binary_logloss: 0.541809	valid_1's binary_logloss: 0.550332
[20]	training's binary_logloss: 0.471529	valid_1's binary_logloss: 0.49089
[30]	training's binary_logloss: 0.432154	valid_1's binary_logloss: 0.4561
[40]	training's binary_logloss: 0.409689	valid_1's binary_logloss: 0.440153
[50]	training's binary_logloss: 0.394144	valid_1's binary_logloss: 0.431162
[60]	training's binary_logloss: 0.383671	valid_1's binary_logloss: 0.424772
[70]	training's binary_logloss: 0.375671	valid_1's binary_logloss: 0.420745
[80]	training's binary_logloss: 0.36729	valid_1's binary_logloss: 0.421573
Early stopping, best iteration is:
[75]	training's binary_logloss: 0.371619	valid_1's binary_logloss: 0.420393
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.514244	valid_1's binary_logloss: 0.542865
[20]	training's binary_lo

[32m[I 2023-10-27 03:20:53,478][0m Trial 15 finished with value: 0.43636401224098365 and parameters: {'learning_rate': 0.06843625853304822, 'num_leaves': 101, 'max_depth': 9, 'colsample_bytree': 0.6339854465275714, 'subsample': 0.8932807176245814}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[60]	training's binary_logloss: 0.307658	valid_1's binary_logloss: 0.438649
[70]	training's binary_logloss: 0.29344	valid_1's binary_logloss: 0.43693
[80]	training's binary_logloss: 0.278576	valid_1's binary_logloss: 0.438227
Early stopping, best iteration is:
[71]	training's binary_logloss: 0.292211	valid_1's binary_logloss: 0.436364


[32m[I 2023-10-27 03:20:53,595][0m Trial 16 finished with value: 0.4252478788236215 and parameters: {'learning_rate': 0.054804920394698156, 'num_leaves': 125, 'max_depth': 6, 'colsample_bytree': 0.7640279205766956, 'subsample': 0.9418663808666179}. Best is trial 4 with value: 0.4159477318690093.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.522647	valid_1's binary_logloss: 0.547661
[20]	training's binary_logloss: 0.440894	valid_1's binary_logloss: 0.485822
[30]	training's binary_logloss: 0.392815	valid_1's binary_logloss: 0.45433
[40]	training's binary_logloss: 0.365292	valid_1's binary_logloss: 0.44034
[50]	training's binary_logloss: 0.345811	valid_1's binary_logloss: 0.432765
[60]	training's binary_logloss: 0.33055	valid_1's binary_logloss: 0.427724
[70]	training's binary_logloss: 0.320068	valid_1's binary_logloss: 0.425694
Early stopping, best iteration is:
[68]	training's binary_logloss: 0.322143	valid_1's binary_logloss: 0.425248
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 r

[32m[I 2023-10-27 03:20:53,668][0m Trial 17 finished with value: 0.4222780165247629 and parameters: {'learning_rate': 0.08024227598725231, 'num_leaves': 105, 'max_depth': 3, 'colsample_bytree': 0.7545988224370959, 'subsample': 0.942855355084455}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[40]	training's binary_logloss: 0.385466	valid_1's binary_logloss: 0.427421
[50]	training's binary_logloss: 0.37281	valid_1's binary_logloss: 0.424457
[60]	training's binary_logloss: 0.363309	valid_1's binary_logloss: 0.423227
Early stopping, best iteration is:
[58]	training's binary_logloss: 0.365522	valid_1's binary_logloss: 0.422278
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.485514	valid_1's binary_logloss: 0.512316

[32m[I 2023-10-27 03:20:53,755][0m Trial 18 finished with value: 0.4271085466827357 and parameters: {'learning_rate': 0.06797222319846608, 'num_leaves': 91, 'max_depth': 5, 'colsample_bytree': 0.8410314139820196, 'subsample': 0.9081665612384084}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-27 03:20:53,821][0m Trial 19 finished with value: 0.4235854502993674 and parameters: {'learning_rate': 0.09591177331810724, 'num_leaves': 99, 'max_depth': 5, 'colsample_bytree': 0.7978161033117064, 'subsample': 0.9027858964568847}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[20]	training's binary_logloss: 0.408406	valid_1's binary_logloss: 0.456859
[30]	training's binary_logloss: 0.371037	valid_1's binary_logloss: 0.436705
[40]	training's binary_logloss: 0.34903	valid_1's binary_logloss: 0.428937
[50]	training's binary_logloss: 0.334333	valid_1's binary_logloss: 0.427614
[60]	training's binary_logloss: 0.323083	valid_1's binary_logloss: 0.430775
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.33322	valid_1's binary_logloss: 0.427109
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.447927	valid_1's binary_logloss: 0.481884
[20]	training's binary_logloss: 0.37686	valid_1's binary_logloss: 0.435439
[30]	training's binary_logloss: 0.343549	valid_1's binary_logloss: 0.425616
[40]	training's binary_logloss: 0.324903	valid_



Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.481813	valid_1's binary_logloss: 0.499588
[20]	training's binary_logloss: 0.409776	valid_1's binary_logloss: 0.448668
[30]	training's binary_logloss: 0.375211	valid_1's binary_logloss: 0.425738
[40]	training's binary_logloss: 0.355912	valid_1's binary_logloss: 0.421077
[50]	training's binary_logloss: 0.342145	valid_1's binary_logloss: 0.422967
Early stopping, best iteration is:
[40]	training's binary_logloss: 0.355912	valid_1's binary_logloss: 0.421077


[32m[I 2023-10-27 03:20:53,909][0m Trial 20 finished with value: 0.4210771395789712 and parameters: {'learning_rate': 0.07544844803335073, 'num_leaves': 46, 'max_depth': 4, 'colsample_bytree': 0.9224051635830199, 'subsample': 0.6290893724207108}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-27 03:20:53,994][0m Trial 21 finished with value: 0.42352144171516065 and parameters: {'learning_rate': 0.06388592806405163, 'num_leaves': 43, 'max_depth': 5, 'colsample_bytree': 0.6461292518415372, 'subsample': 0.850714521441498}. Best is trial 4 with value: 0.4159477318690093.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.511933	valid_1's binary_logloss: 0.534177
[20]	training's binary_logloss: 0.431023	valid_1's binary_logloss: 0.47262
[30]	training's binary_logloss: 0.386733	valid_1's binary_logloss: 0.444639
[40]	training's binary_logloss: 0.361854	valid_1's binary_logloss: 0.43275
[50]	training's binary_logloss: 0.343907	valid_1's binary_logloss: 0.428321
[60]	training's binary_logloss: 0.331344	valid_1's binary_logloss: 0.424641
[70]	training's binary_logloss: 0.321397	valid_1's binary_logloss: 0.424641
Early stopping, best iteration is:
[67]	training's binary_logloss: 0.324737	valid_1's binary_logloss: 0.423521
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 

[32m[I 2023-10-27 03:20:54,090][0m Trial 22 finished with value: 0.4311977265555951 and parameters: {'learning_rate': 0.06447030464736006, 'num_leaves': 51, 'max_depth': 9, 'colsample_bytree': 0.8052795271835291, 'subsample': 0.6070376411656228}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[40]	training's binary_logloss: 0.325156	valid_1's binary_logloss: 0.434794
[50]	training's binary_logloss: 0.304128	valid_1's binary_logloss: 0.431765
[60]	training's binary_logloss: 0.287294	valid_1's binary_logloss: 0.434699
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.302301	valid_1's binary_logloss: 0.431198
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.477933	valid_1's binary_logloss: 0.491064

[32m[I 2023-10-27 03:20:54,155][0m Trial 23 finished with value: 0.41621599869174164 and parameters: {'learning_rate': 0.09144700146086816, 'num_leaves': 62, 'max_depth': 3, 'colsample_bytree': 0.8372357878786805, 'subsample': 0.6945027906172576}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-27 03:20:54,217][0m Trial 24 finished with value: 0.4240037146349545 and parameters: {'learning_rate': 0.08675970110612974, 'num_leaves': 45, 'max_depth': 3, 'colsample_bytree': 0.6870636002319853, 'subsample': 0.8016550670462429}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[20]	training's binary_logloss: 0.416527	valid_1's binary_logloss: 0.442512
[30]	training's binary_logloss: 0.389754	valid_1's binary_logloss: 0.42276
[40]	training's binary_logloss: 0.374487	valid_1's binary_logloss: 0.416291
[50]	training's binary_logloss: 0.359053	valid_1's binary_logloss: 0.41854
Early stopping, best iteration is:
[41]	training's binary_logloss: 0.373668	valid_1's binary_logloss: 0.416216
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.499346	valid_1's binary_logloss: 0.513647
[20]	training's binary_logloss: 0.43016	valid_1's binary_logloss: 0.456512
[30]	training's binary_logloss: 0.397538	valid_1's binary_logloss: 0.434192
[40]	training's binary_logloss: 0.38027	valid_1's binary_logloss: 0.425682
[50]	training's binary_logloss: 0.367468	valid_1

[32m[I 2023-10-27 03:20:54,285][0m Trial 25 finished with value: 0.42012451288415553 and parameters: {'learning_rate': 0.07960209656359196, 'num_leaves': 98, 'max_depth': 3, 'colsample_bytree': 0.6780785714242163, 'subsample': 0.9334621540309447}. Best is trial 4 with value: 0.4159477318690093.[0m



[40]	training's binary_logloss: 0.385924	valid_1's binary_logloss: 0.427338
[50]	training's binary_logloss: 0.372926	valid_1's binary_logloss: 0.421675
[60]	training's binary_logloss: 0.362395	valid_1's binary_logloss: 0.423419
Early stopping, best iteration is:
[54]	training's binary_logloss: 0.369532	valid_1's binary_logloss: 0.420125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.472568	valid_1's binary_logloss: 0.507009

[32m[I 2023-10-27 03:20:54,375][0m Trial 26 finished with value: 0.4290719054265302 and parameters: {'learning_rate': 0.07235626893088137, 'num_leaves': 36, 'max_depth': 6, 'colsample_bytree': 0.8448177463611265, 'subsample': 0.7041029327992967}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-27 03:20:54,434][0m Trial 27 finished with value: 0.42259986501009805 and parameters: {'learning_rate': 0.09068989098512387, 'num_leaves': 107, 'max_depth': 3, 'colsample_bytree': 0.6874286964792423, 'subsample': 0.7096264161810828}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[20]	training's binary_logloss: 0.393923	valid_1's binary_logloss: 0.4549
[30]	training's binary_logloss: 0.355307	valid_1's binary_logloss: 0.434789
[40]	training's binary_logloss: 0.332562	valid_1's binary_logloss: 0.431155
[50]	training's binary_logloss: 0.316259	valid_1's binary_logloss: 0.429516
[60]	training's binary_logloss: 0.305154	valid_1's binary_logloss: 0.43037
Early stopping, best iteration is:
[52]	training's binary_logloss: 0.314289	valid_1's binary_logloss: 0.429072
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.494609	valid_1's binary_logloss: 0.5097
[20]	training's binary_logloss: 0.42596	valid_1's binary_logloss: 0.45317
[30]	training's binary_logloss: 0.39434	valid_1's binary_logloss: 0.431115
[40]	training's binary_logloss: 0.377216	valid_1's b

[32m[I 2023-10-27 03:20:54,511][0m Trial 28 finished with value: 0.42390363835946177 and parameters: {'learning_rate': 0.09827081102599636, 'num_leaves': 112, 'max_depth': 7, 'colsample_bytree': 0.8308839442242945, 'subsample': 0.7866221888708269}. Best is trial 4 with value: 0.4159477318690093.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.437488	valid_1's binary_logloss: 0.479674
[20]	training's binary_logloss: 0.360881	valid_1's binary_logloss: 0.436586
[30]	training's binary_logloss: 0.326063	valid_1's binary_logloss: 0.429057
[40]	training's binary_logloss: 0.303522	valid_1's binary_logloss: 0.424334
Early stopping, best iteration is:
[37]	training's binary_logloss: 0.310353	valid_1's binary_logloss: 0.423904

[32m[I 2023-10-27 03:20:54,599][0m Trial 29 finished with value: 0.42801030925119316 and parameters: {'learning_rate': 0.0615266511753165, 'num_leaves': 64, 'max_depth': 7, 'colsample_bytree': 0.8122376249447735, 'subsample': 0.6067176194082667}. Best is trial 4 with value: 0.4159477318690093.[0m


Auto-choosing row-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.492672	valid_1's binary_logloss: 0.519724
[20]	training's binary_logloss: 0.41026	valid_1's binary_logloss: 0.463859
[30]	training's binary_logloss: 0.36666	valid_1's binary_logloss: 0.43961
[40]	training's binary_logloss: 0.33969	valid_1's binary_logloss: 0.431457
[50]	training's binary_logloss: 0.321664	valid_1's binary_logloss: 0.42801
[60]	training's binary_logloss: 0.307065	valid_1's binary_logloss: 0.431289
Early stopping, best iteration is:
[50]	training's binary_logloss: 0.321664	valid_1's binary_logloss: 0.42801
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve f

[32m[I 2023-10-27 03:20:54,678][0m Trial 30 finished with value: 0.4213142306893759 and parameters: {'learning_rate': 0.06507874083372747, 'num_leaves': 45, 'max_depth': 3, 'colsample_bytree': 0.7342117787622318, 'subsample': 0.8622094430034536}. Best is trial 4 with value: 0.4159477318690093.[0m


[70]	training's binary_logloss: 0.365353	valid_1's binary_logloss: 0.422187
Early stopping, best iteration is:
[64]	training's binary_logloss: 0.371986	valid_1's binary_logloss: 0.421314
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.480916	valid_1's binary_logloss: 0.516923
[20]	training's binary_logloss: 0.396376	valid_1's binary_logloss: 0.462829

[32m[I 2023-10-27 03:20:54,768][0m Trial 31 finished with value: 0.43110815820265713 and parameters: {'learning_rate': 0.0939226095138021, 'num_leaves': 111, 'max_depth': 6, 'colsample_bytree': 0.6323308613113143, 'subsample': 0.7239163317293336}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-27 03:20:54,830][0m Trial 32 finished with value: 0.41689135912649944 and parameters: {'learning_rate': 0.07759081295666295, 'num_leaves': 76, 'max_depth': 3, 'colsample_bytree': 0.828236144500591, 'subsample': 0.7509964524018908}. Best is trial 4 with value: 0.4159477318690093.[0m



[30]	training's binary_logloss: 0.357467	valid_1's binary_logloss: 0.443232
[40]	training's binary_logloss: 0.332815	valid_1's binary_logloss: 0.4348
[50]	training's binary_logloss: 0.314784	valid_1's binary_logloss: 0.432813
[60]	training's binary_logloss: 0.300727	valid_1's binary_logloss: 0.431716
Early stopping, best iteration is:
[52]	training's binary_logloss: 0.310932	valid_1's binary_logloss: 0.431108
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.493347	valid_1's binary_logloss: 0.504865
[20]	training's binary_logloss: 0.429826	valid_1's binary_logloss: 0.451036
[30]	training's binary_logloss: 0.399558	valid_1's binary_logloss: 0.428985
[40]	training's binary_logloss: 0.383746	valid_1's binary_logloss: 0.420068
[50]	training's binary_logloss: 0.369259	valid_1's binary_logloss: 0.417767
Early stopping, best it

[32m[I 2023-10-27 03:20:54,899][0m Trial 33 finished with value: 0.4326300768008115 and parameters: {'learning_rate': 0.09482732979255315, 'num_leaves': 108, 'max_depth': 9, 'colsample_bytree': 0.9444147844774512, 'subsample': 0.6911193246555916}. Best is trial 4 with value: 0.4159477318690093.[0m



[20]	training's binary_logloss: 0.353411	valid_1's binary_logloss: 0.436099
[30]	training's binary_logloss: 0.311863	valid_1's binary_logloss: 0.437595
Early stopping, best iteration is:
[22]	training's binary_logloss: 0.345275	valid_1's binary_logloss: 0.43263
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds

[32m[I 2023-10-27 03:20:54,979][0m Trial 34 finished with value: 0.4236535923361078 and parameters: {'learning_rate': 0.076851126068789, 'num_leaves': 75, 'max_depth': 3, 'colsample_bytree': 0.6348491811887839, 'subsample': 0.7233090814201866}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[10]	training's binary_logloss: 0.524391	valid_1's binary_logloss: 0.536448
[20]	training's binary_logloss: 0.451088	valid_1's binary_logloss: 0.477605
[30]	training's binary_logloss: 0.414492	valid_1's binary_logloss: 0.449609
[40]	training's binary_logloss: 0.393365	valid_1's binary_logloss: 0.436621
[50]	training's binary_logloss: 0.377851	valid_1's binary_logloss: 0.429085
[60]	training's binary_logloss: 0.36747	valid_1's binary_logloss: 0.426176
[70]	training's binary_logloss: 0.359446	valid_1's binary_logloss: 0.423935
Early stopping, best iteration is:
[68]	training's binary_logloss: 0.360326	valid_1's binary_logloss: 0.423654
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.470783	valid_1's binary_logloss: 0.50463
[20]	training's binary_logloss: 0.387573	valid

[32m[I 2023-10-27 03:20:55,062][0m Trial 35 finished with value: 0.4249250672384724 and parameters: {'learning_rate': 0.07346245869951211, 'num_leaves': 69, 'max_depth': 8, 'colsample_bytree': 0.9166267104733297, 'subsample': 0.6131457843398491}. Best is trial 4 with value: 0.4159477318690093.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.467743	valid_1's binary_logloss: 0.50239
[20]	training's binary_logloss: 0.385773	valid_1's binary_logloss: 0.450704
[30]	training's binary_logloss: 0.344384	valid_1's binary_logloss: 0.431367
[40]	training's binary_logloss: 0.31885	valid_1's binary_logloss: 0.427054

[32m[I 2023-10-27 03:20:55,140][0m Trial 36 finished with value: 0.4266822186964179 and parameters: {'learning_rate': 0.07541577242894455, 'num_leaves': 121, 'max_depth': 8, 'colsample_bytree': 0.896802880262782, 'subsample': 0.8825616355537538}. Best is trial 4 with value: 0.4159477318690093.[0m
[32m[I 2023-10-27 03:20:55,206][0m Trial 37 finished with value: 0.41882067988461846 and parameters: {'learning_rate': 0.07845503693072967, 'num_leaves': 67, 'max_depth': 4, 'colsample_bytree': 0.9439012359388177, 'subsample': 0.7306517627741391}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
Early stopping, best iteration is:
[36]	training's binary_logloss: 0.329338	valid_1's binary_logloss: 0.426682
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.468108	valid_1's binary_logloss: 0.486422
[20]	training's binary_logloss: 0.401044	valid_1's binary_logloss: 0.434818
[30]	training's binary_logloss: 0.370936	valid_1's binary_logloss: 0.422109
[40]	training's binary_logloss: 0.356412	valid_1's binary_logloss: 0.418826
[50]	training's binary_logloss: 0.34225	valid_1's binary_logloss: 0.420173
Early stopping, best iteration is:
[42]	training's binary_logloss: 0.353517	valid_1's binary_logloss: 0.418821
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't i

[32m[I 2023-10-27 03:20:55,298][0m Trial 38 finished with value: 0.4259037846394846 and parameters: {'learning_rate': 0.07100376848953054, 'num_leaves': 82, 'max_depth': 7, 'colsample_bytree': 0.7279236371921936, 'subsample': 0.6058197908068536}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[60]	training's binary_logloss: 0.302672	valid_1's binary_logloss: 0.426907
Early stopping, best iteration is:
[54]	training's binary_logloss: 0.311131	valid_1's binary_logloss: 0.425904
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds

[32m[I 2023-10-27 03:20:55,377][0m Trial 39 finished with value: 0.4276920169704596 and parameters: {'learning_rate': 0.06153711677033979, 'num_leaves': 103, 'max_depth': 8, 'colsample_bytree': 0.8554966968484203, 'subsample': 0.6600703870415041}. Best is trial 4 with value: 0.4159477318690093.[0m


No further splits with positive gain, best gain: -inf
[10]	training's binary_logloss: 0.491796	valid_1's binary_logloss: 0.520161
[20]	training's binary_logloss: 0.408038	valid_1's binary_logloss: 0.464354
[30]	training's binary_logloss: 0.362905	valid_1's binary_logloss: 0.439769
[40]	training's binary_logloss: 0.333355	valid_1's binary_logloss: 0.429584
[50]	training's binary_logloss: 0.313894	valid_1's binary_logloss: 0.428704
Early stopping, best iteration is:
[44]	training's binary_logloss: 0.325499	valid_1's binary_logloss: 0.427692


In [12]:
params = {
    'objective': 'binary',
    'metrics': 'binary_logloss',
    'verbose': 0,
    'seed': 71
}

for i, j in study.best_params.items():
    params[i] = j

params

{'objective': 'binary',
 'metrics': 'binary_logloss',
 'verbose': 0,
 'seed': 71,
 'learning_rate': 0.08890783754749253,
 'num_leaves': 79,
 'max_depth': 3,
 'colsample_bytree': 0.9425164197814674,
 'subsample': 0.8797054974758531}

## Evaluate

In [13]:
scores = []

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=456)

for tr_idx, val_idx in kf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)
    
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=10,
        num_boost_round=1000,
        early_stopping_rounds=10
    )
    
    y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred_val = (y_pred_val > 0.5).astype(int)
    score = f1_score(y_val, y_pred_val, average='macro')
    scores.append(score)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.471307	valid_1's binary_logloss: 0.458247
[20]	training's binary_logloss: 0.418395	valid_1's binary_logloss: 0.400157
[30]	training's binary_logloss: 0.394634	valid_1's binary_logloss: 0.376365
[40]	training's binary_logloss: 0.380933	valid_1's binary_logloss: 0.363845
[50]	training's binary_logloss: 0.371964	valid_1's binary_logloss: 0.358689
[60]	training's binary_logloss: 0.360443	valid_1's binary_logloss: 0.352732
[70]	training's binary_logloss: 0.351467	valid_1's binary_logloss: 0.351082
[80]	training's binary_logloss: 0.341607	valid_1's binary_logloss: 0.347326
[90]	training's binary_logloss: 0.332836	valid_1's binary_logloss: 0.345431
[100]	training's binary_logloss: 0.325733	valid_1's binary_logloss: 0.34236
[110]	training's binary_logloss: 0.319525	valid_1's binar

In [14]:
print('===CV scores===')
print(scores)
print(np.mean(scores))

===CV scores===
[0.8615529475064734, 0.7872485742581019, 0.8034506556245686, 0.8098561458481699, 0.7495219195381563]
0.802326048555094


## Model

In [15]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, stratify=y_train, random_state=123)

print(X_tr.shape)
print(y_val.shape)
print(X_tr.shape)
print(y_val.shape)

(623, 7)
(268,)
(623, 7)
(268,)


In [16]:
lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.469491	valid_1's binary_logloss: 0.485555
[20]	training's binary_logloss: 0.409649	valid_1's binary_logloss: 0.436881
[30]	training's binary_logloss: 0.384941	valid_1's binary_logloss: 0.420527
[40]	training's binary_logloss: 0.369678	valid_1's binary_logloss: 0.415948
[50]	training's binary_logloss: 0.356138	valid_1's binary_logloss: 0.418927
Early stopping, best iteration is:
[40]	training's binary_logloss: 0.369678	valid_1's binary_logloss: 0.415948




## Submit

In [17]:
for col in categorical_features:
    test[col] = test[col].astype('category')

In [18]:
X_test = test[features]

In [19]:
y_pred_test = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_test = (y_pred_test > 0.5).astype(int)

In [20]:
submit = pd.DataFrame(y_pred_test, index=test['PassengerId'], columns=['Survived'])
submit.to_csv('submit.csv')