This notebook is a sample code with Japanese comments.

# 2.6 機械学習アルゴリズムのお気持ち？！　ハイパーパラメータを調整してみよう

In [2]:
import numpy as np
import pandas as pd

train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
gender_submission = pd.read_csv('../input/titanic/gender_submission.csv')

data = pd.concat([train, test], sort=False)

data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
data['Embarked'].fillna(('S'), inplace=True)
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)
data['Age'].fillna(data['Age'].median(), inplace=True)
data['FamilySize'] = data['Parch'] + data['SibSp'] + 1
data['IsAlone'] = 0
data.loc[data['FamilySize'] == 1, 'IsAlone'] = 1

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,2,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,2,0
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,1,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,2,0
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,1,1


In [4]:
delete_columns = ['Name', 'PassengerId', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

train = data[:len(train)]
test = data[len(train):]

y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)

In [5]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,3,0,22.0,1,0,7.25,0,2,0
1,1,1,38.0,1,0,71.2833,1,2,0
2,3,1,26.0,0,0,7.925,0,1,1
3,1,1,35.0,1,0,53.1,0,2,0
4,3,0,35.0,0,0,8.05,0,1,1


## LightGBM

In [8]:
!pip install lightgbm



In [9]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = \
    train_test_split(X_train, y_train, test_size=0.3,
                     random_state=0, stratify=y_train)

In [10]:
categorical_features = ['Embarked', 'Pclass', 'Sex']

# 手動で調整

In [7]:
params = {
    'objective': 'binary'
}

In [11]:
import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, y_train,
                        categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train,
                       categorical_feature=categorical_features)

params = {
    'objective': 'binary'
}

model = lgb.train(params, lgb_train,
                  valid_sets=[lgb_train, lgb_eval],
                  num_boost_round=1000,
                  callbacks=[
                      lgb.early_stopping(
                          stopping_rounds=10,
                          verbose=True
                      ),
                      lgb.log_evaluation(10)
                  ]
                 )

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

[LightGBM] [Info] Number of positive: 167, number of negative: 269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.423553	valid_1's binary_logloss: 0.482127
[20]	training's binary_logloss: 0.341829	valid_1's binary_logloss: 0.444758
[30]	training's binary_logloss: 0.291611	valid_1's binary_logloss: 0.443136
Early stopping, best iteration is:
[23]	training's binary_logloss: 0.324238	valid_1's binary_logloss: 0.437735


In [12]:
y_pred[:10]

array([0.14665841, 0.42711079, 0.17556106, 0.05032947, 0.30673957,
       0.35161146, 0.69605657, 0.29078791, 0.69132888, 0.04533161])

In [13]:
# 手動でハイパーパラメータの調整
# https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
params = {
    'objective': 'binary',
    # 各特徴量の最大の分割数
    'max_bin': 300,
    # 学習率
    # 小さめの値を設定することで「丁寧に」対応関係を学習するようになる
    'learning_rate': 0.05,
    # 1つの決定木における分岐の末端の最大数
    'num_leaves': 40
}

In [14]:
lgb_train = lgb.Dataset(X_train, y_train,
                        categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train,
                       categorical_feature=categorical_features)

model = lgb.train(params, lgb_train,
                  valid_sets=[lgb_train, lgb_eval],
                  num_boost_round=1000,
                  callbacks=[
                      lgb.early_stopping(
                          stopping_rounds=10,
                          verbose=True
                      ),
                      lgb.log_evaluation(10)
                  ]
                 )

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

[LightGBM] [Info] Number of positive: 167, number of negative: 269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000577 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504016	valid_1's binary_logloss: 0.534687
[20]	training's binary_logloss: 0.425372	valid_1's binary_logloss: 0.483284
[30]	training's binary_logloss: 0.374997	valid_1's binary_logloss: 0.45517
[40]	training's binary_logloss: 0.342624	valid_1's binary_logloss: 0.442745
[50]	training's binary_logloss: 0.312931	valid_1's binary_logloss: 0.437842
Early stop

In [15]:
y_pred[:10]

array([0.15197363, 0.41973104, 0.14480326, 0.05653175, 0.30467589,
       0.34876784, 0.69478187, 0.29543687, 0.6888932 , 0.05074279])

In [16]:
y_pred = (y_pred > 0.5).astype(int)
y_pred[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [17]:
sub = pd.read_csv('../input/titanic/gender_submission.csv')

sub['Survived'] = y_pred
sub.to_csv('submission_lightgbm_handtuning.csv', index=False)

sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


# Optunaを使う

In [19]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.1.0


In [20]:
import optuna
from sklearn.metrics import log_loss

def objective(trial):
    params = {
        'objective': 'binary',
        # 探索範囲を定義
        'max_bin': trial.suggest_int('max_bin', 255, 500),
        # 一般に低いほど高い性能が得られるため、探索範囲には含めず必要に応じて手動で低い値に変更
        'learning_rate': 0.05,
        # 探索範囲を定義
        'num_leaves': trial.suggest_int('num_leaves', 32, 128),
    }

    lgb_train = lgb.Dataset(X_train, y_train,
                            categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train,
                           categorical_feature=categorical_features)

    model = lgb.train(params, lgb_train,
                  valid_sets=[lgb_train, lgb_eval],
                  num_boost_round=1000,
                  callbacks=[lgb.early_stopping(stopping_rounds=10,
                                                verbose=True),
                             lgb.log_evaluation(10)])

    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    score = log_loss(y_valid, y_pred_valid)
    return score

In [21]:
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
study.optimize(objective, n_trials=40)

[I 2025-01-12 12:57:36,754] A new study created in memory with name: no-name-acc95d47-4b1f-4d31-84a2-3ac5eab83f26
[I 2025-01-12 12:57:36,818] Trial 0 finished with value: 0.43700461618069675 and parameters: {'max_bin': 390, 'num_leaves': 101}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:36,861] Trial 1 finished with value: 0.43700461618069675 and parameters: {'max_bin': 403, 'num_leaves': 84}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:36,891] Trial 2 finished with value: 0.43700461618069675 and parameters: {'max_bin': 359, 'num_leaves': 94}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:36,923] Trial 3 finished with value: 0.43700461618069675 and parameters: {'max_bin': 362, 'num_leaves': 118}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:36,948] Trial 4 finished with value: 0.43700461618069675 and parameters: {'max_bin': 492, 'num_leaves': 69}. Best is trial 0 with value: 0.4370046161806

[LightGBM] [Info] Number of positive: 167, number of negative: 269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000242 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504016	valid_1's binary_logloss: 0.534687
[20]	training's binary_logloss: 0.425372	valid_1's binary_logloss: 0.483284
[30]	training's binary_logloss: 0.374997	valid_1's binary_logloss: 0.45517
[40]	training's binary_logloss: 0.342624	valid_1's binary_logloss: 0.442745
[50]	training's binary_logloss: 0.312931	valid_1's binary_logloss: 0.437842
Early stop

[I 2025-01-12 12:57:36,989] Trial 5 finished with value: 0.43700461618069675 and parameters: {'max_bin': 449, 'num_leaves': 83}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,029] Trial 6 finished with value: 0.43700461618069675 and parameters: {'max_bin': 394, 'num_leaves': 121}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,077] Trial 7 finished with value: 0.43700461618069675 and parameters: {'max_bin': 272, 'num_leaves': 40}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,121] Trial 8 finished with value: 0.43700461618069675 and parameters: {'max_bin': 259, 'num_leaves': 112}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,157] Trial 9 finished with value: 0.43700461618069675 and parameters: {'max_bin': 446, 'num_leaves': 116}. Best is trial 0 with value: 0.43700461618069675.


[30]	training's binary_logloss: 0.374997	valid_1's binary_logloss: 0.45517
[40]	training's binary_logloss: 0.342624	valid_1's binary_logloss: 0.442745
[50]	training's binary_logloss: 0.312931	valid_1's binary_logloss: 0.437842
Early stopping, best iteration is:
[47]	training's binary_logloss: 0.320167	valid_1's binary_logloss: 0.437005
[LightGBM] [Info] Number of positive: 167, number of negative: 269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504016	valid_1's 

[I 2025-01-12 12:57:37,185] Trial 10 finished with value: 0.43700461618069675 and parameters: {'max_bin': 495, 'num_leaves': 109}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,217] Trial 11 finished with value: 0.43700461618069675 and parameters: {'max_bin': 368, 'num_leaves': 107}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,253] Trial 12 finished with value: 0.43700461618069675 and parameters: {'max_bin': 284, 'num_leaves': 94}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,275] Trial 13 finished with value: 0.43700461618069675 and parameters: {'max_bin': 290, 'num_leaves': 123}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,313] Trial 14 finished with value: 0.43700461618069675 and parameters: {'max_bin': 383, 'num_leaves': 72}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,344] Trial 15 finished with value: 0.43700461618069675 and parameters: {'max_bi

[50]	training's binary_logloss: 0.312931	valid_1's binary_logloss: 0.437842
Early stopping, best iteration is:
[47]	training's binary_logloss: 0.320167	valid_1's binary_logloss: 0.437005
[LightGBM] [Info] Number of positive: 167, number of negative: 269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504016	valid_1's binary_logloss: 0.534687
[20]	training's binary_logloss: 0.425372	valid_1's binary_logloss: 0.483284
[30]	training's binary_logloss: 0.374997	valid_1's

[I 2025-01-12 12:57:37,375] Trial 16 finished with value: 0.43700461618069675 and parameters: {'max_bin': 367, 'num_leaves': 87}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,410] Trial 17 finished with value: 0.43700461618069675 and parameters: {'max_bin': 259, 'num_leaves': 91}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,441] Trial 18 finished with value: 0.43700461618069675 and parameters: {'max_bin': 405, 'num_leaves': 91}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,463] Trial 19 finished with value: 0.43700461618069675 and parameters: {'max_bin': 487, 'num_leaves': 98}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,495] Trial 20 finished with value: 0.43700461618069675 and parameters: {'max_bin': 343, 'num_leaves': 74}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,523] Trial 21 finished with value: 0.43700461618069675 and parameters: {'max_bin':

[LightGBM] [Info] Number of positive: 167, number of negative: 269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504016	valid_1's binary_logloss: 0.534687
[20]	training's binary_logloss: 0.425372	valid_1's binary_logloss: 0.483284
[30]	training's binary_logloss: 0.374997	valid_1's binary_logloss: 0.45517
[40]	training's binary_logloss: 0.342624	valid_1's binary_logloss: 0.442745
[50]	training's binary_logloss: 0.312931	valid_1's binary_logloss: 0.437842
Early stop

[I 2025-01-12 12:57:37,595] Trial 23 finished with value: 0.43700461618069675 and parameters: {'max_bin': 306, 'num_leaves': 44}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,626] Trial 24 finished with value: 0.43700461618069675 and parameters: {'max_bin': 332, 'num_leaves': 67}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,658] Trial 25 finished with value: 0.43700461618069675 and parameters: {'max_bin': 395, 'num_leaves': 74}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,685] Trial 26 finished with value: 0.43700461618069675 and parameters: {'max_bin': 498, 'num_leaves': 41}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,720] Trial 27 finished with value: 0.43700461618069675 and parameters: {'max_bin': 306, 'num_leaves': 47}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,778] Trial 28 finished with value: 0.43700461618069675 and parameters: {'max_bin':

[50]	training's binary_logloss: 0.312931	valid_1's binary_logloss: 0.437842
Early stopping, best iteration is:
[47]	training's binary_logloss: 0.320167	valid_1's binary_logloss: 0.437005
[LightGBM] [Info] Number of positive: 167, number of negative: 269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504016	valid_1's binary_logloss: 0.534687
[20]	training's binary_logloss: 0.425372	valid_1's binary_logloss: 0.483284
[30]	training's binary_logloss: 0.374997	valid_1's

[I 2025-01-12 12:57:37,824] Trial 29 finished with value: 0.43700461618069675 and parameters: {'max_bin': 369, 'num_leaves': 55}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,849] Trial 30 finished with value: 0.43700461618069675 and parameters: {'max_bin': 294, 'num_leaves': 42}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,882] Trial 31 finished with value: 0.43700461618069675 and parameters: {'max_bin': 416, 'num_leaves': 45}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,923] Trial 32 finished with value: 0.43700461618069675 and parameters: {'max_bin': 303, 'num_leaves': 67}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,955] Trial 33 finished with value: 0.43700461618069675 and parameters: {'max_bin': 456, 'num_leaves': 41}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:37,991] Trial 34 finished with value: 0.43700461618069675 and parameters: {'max_bin':

[20]	training's binary_logloss: 0.425372	valid_1's binary_logloss: 0.483284
[30]	training's binary_logloss: 0.374997	valid_1's binary_logloss: 0.45517
[40]	training's binary_logloss: 0.342624	valid_1's binary_logloss: 0.442745
[50]	training's binary_logloss: 0.312931	valid_1's binary_logloss: 0.437842
Early stopping, best iteration is:
[47]	training's binary_logloss: 0.320167	valid_1's binary_logloss: 0.437005
[LightGBM] [Info] Number of positive: 167, number of negative: 269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don

[I 2025-01-12 12:57:38,043] Trial 35 finished with value: 0.43700461618069675 and parameters: {'max_bin': 495, 'num_leaves': 77}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:38,077] Trial 36 finished with value: 0.43700461618069675 and parameters: {'max_bin': 495, 'num_leaves': 90}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:38,111] Trial 37 finished with value: 0.43700461618069675 and parameters: {'max_bin': 436, 'num_leaves': 35}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:38,141] Trial 38 finished with value: 0.43700461618069675 and parameters: {'max_bin': 324, 'num_leaves': 43}. Best is trial 0 with value: 0.43700461618069675.
[I 2025-01-12 12:57:38,178] Trial 39 finished with value: 0.43700461618069675 and parameters: {'max_bin': 327, 'num_leaves': 43}. Best is trial 0 with value: 0.43700461618069675.


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504016	valid_1's binary_logloss: 0.534687
[20]	training's binary_logloss: 0.425372	valid_1's binary_logloss: 0.483284
[30]	training's binary_logloss: 0.374997	valid_1's binary_logloss: 0.45517
[40]	training's binary_logloss: 0.342624	valid_1's binary_logloss: 0.442745
[50]	training's binary_logloss: 0.312931	valid_1's binary_logloss: 0.437842
Early stopping, best iteration is:
[47]	training's binary_logloss: 0.320167	valid_1's binary_logloss: 0.437005
[LightGBM] [Info] Number of positive: 167, number of negative: 269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

In [22]:
study.best_params

{'max_bin': 390, 'num_leaves': 101}

In [23]:
params = {
    'objective': 'binary',
    'max_bin': study.best_params['max_bin'],
    'learning_rate': 0.05,
    'num_leaves': study.best_params['num_leaves']
}

lgb_train = lgb.Dataset(X_train, y_train,
                        categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train,
                       categorical_feature=categorical_features)

model = lgb.train(params, lgb_train,
                  valid_sets=[lgb_train, lgb_eval],
                  num_boost_round=1000,
                  callbacks=[lgb.early_stopping(stopping_rounds=10,
                                                verbose=True),
                             lgb.log_evaluation(10)])

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

[LightGBM] [Info] Number of positive: 167, number of negative: 269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001576 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504016	valid_1's binary_logloss: 0.534687
[20]	training's binary_logloss: 0.425372	valid_1's binary_logloss: 0.483284
[30]	training's binary_logloss: 0.374997	valid_1's binary_logloss: 0.45517
[40]	training's binary_logloss: 0.342624	valid_1's binary_logloss: 0.442745
[50]	training's binary_logloss: 0.312931	valid_1's binary_logloss: 0.437842
Early stop

In [24]:
y_pred = (y_pred > 0.5).astype(int)

sub['Survived'] = y_pred
sub.to_csv('submission_lightgbm_optuna.csv', index=False)

sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
