In [1]:
import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
data = pd.concat([train, test], sort=False)

data['FamilySize'] = data['Parch'] + data['SibSp'] + 1
data['IsAlone'] = 0
data.loc[data['FamilySize']==1, 'IsAlone'] = 1
# data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
data['Embarked'].fillna('S', inplace=True)
# data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)
data['Age'].fillna(data['Age'].median(), inplace=True)

delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

train = data[:len(train)]
test = data[len(train):]

y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)

X_train.dtypes

Pclass          int64
Sex            object
Age           float64
Fare          float64
Embarked       object
FamilySize      int64
IsAlone         int64
dtype: object

In [3]:
categorical_features = ['Embarked', 'Sex']

for i in categorical_features:
    X_train[i] = X_train[i].astype('category')
    X_test[i] = X_test[i].astype('category')

### Base

In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=0, stratify=y_train)

In [5]:
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

params = {
    'objective': 'binary',
    'max_bin': 300,
    'learning_rate': 0.05,
    'num_leaves': 40
}

model = lgb.train(params, lgb_train,
                  valid_sets=[lgb_train, lgb_eval],
                  verbose_eval=10, num_boost_round=1000,
                  early_stopping_rounds=10)

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504312	valid_1's binary_logloss: 0.530865
[20]	training's binary_logloss: 0.420987	valid_1's binary_logloss: 0.477988
[30]	training's binary_logloss: 0.37117	valid_1's binary_logloss: 0.455629
[40]	training's binary_logloss: 0.342521	valid_1's binary_logloss: 0.44594
[50]	training's binary_logloss: 0.320631	valid_1's binary_logloss: 0.445363
[60]	training's binary_logloss: 0.301265	valid_1's binary_logloss: 0.440947
[70]	training's binary_logloss: 0.283113	valid_1's binary_logloss: 0.441209
Early stopping, best iterat



In [6]:
y_pred[:10]

array([0.04198491, 0.55452289, 0.11008306, 0.06332245, 0.53269321,
       0.4972651 , 0.71883733, 0.14084128, 0.74880735, 0.03988284])

In [7]:
y_pred = (y_pred > 0.5).astype(int)
y_pred[:10]

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

### Optuna

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=0, stratify=y_train)

In [9]:
def objective(trial):
    params = {
        'objective': 'binary',
        'max_bin': trial.suggest_int('max_bin', 255, 500),
        'learning_rate': 0.05,
        'num_leaves': trial.suggest_int('num_leaves', 32, 128)
    }
    
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)
    
    model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      verbose_eval=10,
                      num_boost_round=1000,
                      early_stopping_rounds=10)
    
    y_pred_valid = model.predict(X_valid, num_iteraition=model.best_iteration)
    score = log_loss(y_valid, y_pred_valid)
    return score

In [10]:
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
study.optimize(objective, n_trials=40)

[32m[I 2022-07-19 16:55:10,101][0m A new study created in memory with name: no-name-dfcf46af-bd54-4974-a2d0-e0f9f119826b[0m


[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159


[32m[I 2022-07-19 16:55:10,209][0m Trial 0 finished with value: 0.44029878145831425 and parameters: {'max_bin': 390, 'num_leaves': 101}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:10,305][0m Trial 1 finished with value: 0.44029878145831425 and parameters: {'max_bin': 403, 'num_leaves': 84}. Best is trial 0 with value: 0.44029878145831425.[0m


[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss:

[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159


[32m[I 2022-07-19 16:55:10,399][0m Trial 2 finished with value: 0.44029878145831425 and parameters: {'max_bin': 359, 'num_leaves': 94}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:10,485][0m Trial 3 finished with value: 0.44029878145831425 and parameters: {'max_bin': 362, 'num_leaves': 118}. Best is trial 0 with value: 0.44029878145831425.[0m


[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss:

[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049


[32m[I 2022-07-19 16:55:10,584][0m Trial 4 finished with value: 0.44029878145831425 and parameters: {'max_bin': 492, 'num_leaves': 69}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:10,678][0m Trial 5 finished with value: 0.44029878145831425 and parameters: {'max_bin': 449, 'num_leaves': 83}. Best is trial 0 with value: 0.44029878145831425.[0m


[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss:

[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744


[32m[I 2022-07-19 16:55:10,768][0m Trial 6 finished with value: 0.44029878145831425 and parameters: {'max_bin': 394, 'num_leaves': 121}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:10,855][0m Trial 7 finished with value: 0.44029878145831425 and parameters: {'max_bin': 272, 'num_leaves': 40}. Best is trial 0 with value: 0.44029878145831425.[0m


[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss:

[32m[I 2022-07-19 16:55:10,946][0m Trial 8 finished with value: 0.44029878145831425 and parameters: {'max_bin': 259, 'num_leaves': 112}. Best is trial 0 with value: 0.44029878145831425.[0m


[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7


[32m[I 2022-07-19 16:55:11,034][0m Trial 9 finished with value: 0.44029878145831425 and parameters: {'max_bin': 446, 'num_leaves': 116}. Best is trial 0 with value: 0.44029878145831425.[0m


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used fe

[32m[I 2022-07-19 16:55:11,116][0m Trial 10 finished with value: 0.44029878145831425 and parameters: {'max_bin': 495, 'num_leaves': 109}. Best is trial 0 with value: 0.44029878145831425.[0m


[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159


[32m[I 2022-07-19 16:55:11,207][0m Trial 11 finished with value: 0.44029878145831425 and parameters: {'max_bin': 368, 'num_leaves': 107}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:11,297][0m Trial 12 finished with value: 0.44029878145831425 and parameters: {'max_bin': 284, 'num_leaves': 94}. Best is trial 0 with value: 0.44029878145831425.[0m


[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss:



[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049


[32m[I 2022-07-19 16:55:11,389][0m Trial 13 finished with value: 0.44029878145831425 and parameters: {'max_bin': 290, 'num_leaves': 123}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:11,477][0m Trial 14 finished with value: 0.44029878145831425 and parameters: {'max_bin': 383, 'num_leaves': 72}. Best is trial 0 with value: 0.44029878145831425.[0m


[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss:

[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744


[32m[I 2022-07-19 16:55:11,573][0m Trial 15 finished with value: 0.44029878145831425 and parameters: {'max_bin': 320, 'num_leaves': 107}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:11,658][0m Trial 16 finished with value: 0.44029878145831425 and parameters: {'max_bin': 367, 'num_leaves': 87}. Best is trial 0 with value: 0.44029878145831425.[0m


[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss:

[32m[I 2022-07-19 16:55:11,752][0m Trial 17 finished with value: 0.44029878145831425 and parameters: {'max_bin': 259, 'num_leaves': 91}. Best is trial 0 with value: 0.44029878145831425.[0m


[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718


[32m[I 2022-07-19 16:55:11,839][0m Trial 18 finished with value: 0.44029878145831425 and parameters: {'max_bin': 405, 'num_leaves': 91}. Best is trial 0 with value: 0.44029878145831425.[0m


[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=

[32m[I 2022-07-19 16:55:11,923][0m Trial 19 finished with value: 0.44029878145831425 and parameters: {'max_bin': 487, 'num_leaves': 98}. Best is trial 0 with value: 0.44029878145831425.[0m


[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159


[32m[I 2022-07-19 16:55:12,017][0m Trial 20 finished with value: 0.44029878145831425 and parameters: {'max_bin': 343, 'num_leaves': 74}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:12,099][0m Trial 21 finished with value: 0.44029878145831425 and parameters: {'max_bin': 426, 'num_leaves': 37}. Best is trial 0 with value: 0.44029878145831425.[0m


[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss:



[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049


[32m[I 2022-07-19 16:55:12,192][0m Trial 22 finished with value: 0.44029878145831425 and parameters: {'max_bin': 419, 'num_leaves': 97}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:12,283][0m Trial 23 finished with value: 0.44029878145831425 and parameters: {'max_bin': 306, 'num_leaves': 44}. Best is trial 0 with value: 0.44029878145831425.[0m


[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[4

[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744


[32m[I 2022-07-19 16:55:12,389][0m Trial 24 finished with value: 0.44029878145831425 and parameters: {'max_bin': 332, 'num_leaves': 67}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:12,469][0m Trial 25 finished with value: 0.44029878145831425 and parameters: {'max_bin': 395, 'num_leaves': 74}. Best is trial 0 with value: 0.44029878145831425.[0m


[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss:

[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744


[32m[I 2022-07-19 16:55:12,583][0m Trial 26 finished with value: 0.44029878145831425 and parameters: {'max_bin': 498, 'num_leaves': 41}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:12,665][0m Trial 27 finished with value: 0.44029878145831425 and parameters: {'max_bin': 306, 'num_leaves': 47}. Best is trial 0 with value: 0.44029878145831425.[0m


[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss:

[32m[I 2022-07-19 16:55:12,754][0m Trial 28 finished with value: 0.44029878145831425 and parameters: {'max_bin': 415, 'num_leaves': 56}. Best is trial 0 with value: 0.44029878145831425.[0m


No further splits with positive gain, best gain: -inf
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds


[32m[I 2022-07-19 16:55:12,844][0m Trial 29 finished with value: 0.44029878145831425 and parameters: {'max_bin': 369, 'num_leaves': 55}. Best is trial 0 with value: 0.44029878145831425.[0m


[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 

[32m[I 2022-07-19 16:55:12,930][0m Trial 30 finished with value: 0.44029878145831425 and parameters: {'max_bin': 294, 'num_leaves': 42}. Best is trial 0 with value: 0.44029878145831425.[0m


[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535


[32m[I 2022-07-19 16:55:13,029][0m Trial 31 finished with value: 0.44029878145831425 and parameters: {'max_bin': 416, 'num_leaves': 45}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:13,110][0m Trial 32 finished with value: 0.44029878145831425 and parameters: {'max_bin': 303, 'num_leaves': 67}. Best is trial 0 with value: 0.44029878145831425.[0m


[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss:

[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938


[32m[I 2022-07-19 16:55:13,211][0m Trial 33 finished with value: 0.44029878145831425 and parameters: {'max_bin': 456, 'num_leaves': 41}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:13,295][0m Trial 34 finished with value: 0.44029878145831425 and parameters: {'max_bin': 461, 'num_leaves': 41}. Best is trial 0 with value: 0.44029878145831425.[0m


[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss:

[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744


[32m[I 2022-07-19 16:55:13,388][0m Trial 35 finished with value: 0.44029878145831425 and parameters: {'max_bin': 495, 'num_leaves': 77}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:13,483][0m Trial 36 finished with value: 0.44029878145831425 and parameters: {'max_bin': 495, 'num_leaves': 90}. Best is trial 0 with value: 0.44029878145831425.[0m


[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss:

[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986


[32m[I 2022-07-19 16:55:13,582][0m Trial 37 finished with value: 0.44029878145831425 and parameters: {'max_bin': 436, 'num_leaves': 35}. Best is trial 0 with value: 0.44029878145831425.[0m
[32m[I 2022-07-19 16:55:13,671][0m Trial 38 finished with value: 0.44029878145831425 and parameters: {'max_bin': 324, 'num_leaves': 43}. Best is trial 0 with value: 0.44029878145831425.[0m


Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299
[LightGBM] [Info] Number of positive: 167, number of negative: 269
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383028 -> initscore=-0.476718
[LightGBM] [Info] Start training from score -0.476718
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.504508	valid_1's binary_logloss: 0.535535
[20]	training's binary_logloss: 0.426798	valid_1's binary_logloss: 0.481159
[30]	training's binary_logloss: 0.379595	valid_1's binary_logloss: 0.459938
[40]	training's binary_logloss: 0.344263	valid_1's binary_logloss: 0.448049
[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss:

[32m[I 2022-07-19 16:55:13,776][0m Trial 39 finished with value: 0.44029878145831425 and parameters: {'max_bin': 327, 'num_leaves': 43}. Best is trial 0 with value: 0.44029878145831425.[0m


[50]	training's binary_logloss: 0.315298	valid_1's binary_logloss: 0.440744
[60]	training's binary_logloss: 0.293677	valid_1's binary_logloss: 0.445986
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.312794	valid_1's binary_logloss: 0.440299


In [11]:
study.best_params

{'max_bin': 390, 'num_leaves': 101}

### Cross Validation

In [12]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
data = pd.concat([train, test], sort=False)

data['FamilySize'] = data['Parch'] + data['SibSp'] + 1
data['IsAlone'] = 0
data.loc[data['FamilySize']==1, 'IsAlone'] = 1
# data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
data['Embarked'].fillna('S', inplace=True)
# data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)
data['Age'].fillna(data['Age'].median(), inplace=True)

delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

train = data[:len(train)]
test = data[len(train):]

y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)

X_train.dtypes

Pclass          int64
Sex            object
Age           float64
Fare          float64
Embarked       object
FamilySize      int64
IsAlone         int64
dtype: object

In [13]:
categorical_features = ['Embarked', 'Sex']

for i in categorical_features:
    X_train[i] = X_train[i].astype('category')
    X_test[i] = X_test[i].astype('category')

In [14]:
y_preds = []
models = []
oof_train = np.zeros((len(X_train)))
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

params = {
    'objective': 'binary',
    'max_bin': 300,
    'learning_rate': 0.05,
    'num_leaves': 40
}

for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
    print(f'===Fold {fold_id}===')
    X_tr = X_train.loc[train_index, :]
    X_val = X_train.loc[valid_index, :]
    y_tr = y_train.loc[train_index]
    y_val = y_train.loc[valid_index]
    
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)
    
    model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      verbose_eval=10,
                      num_boost_round=1000,
                      early_stopping_rounds=10)
    
    oof_train[valid_index] = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    
    y_preds.append(y_pred)
    models.append(model)



===Fold 0===
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 199
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.503827	valid_1's binary_logloss: 0.506999
[20]	training's binary_logloss: 0.424757	valid_1's binary_logloss: 0.43869
[30]	training's binary_logloss: 0.377808	valid_1's binary_logloss: 0.400629
[40]	training's binary_logloss: 0.347689	valid_1's binary_logloss: 0.387221
[50]	training's binary_logloss: 0.324947	valid_1's binary_logloss: 0.384495
Early stopping, best iteration is:
[49]	training's binary_logloss: 0.327075	valid_1's binary_logloss: 0.383785
===Fold 1===
[LightGBM] [Info] Number of positive: 274

[50]	training's binary_logloss: 0.323009	valid_1's binary_logloss: 0.383873
[60]	training's binary_logloss: 0.302526	valid_1's binary_logloss: 0.3856
Early stopping, best iteration is:
[53]	training's binary_logloss: 0.315729	valid_1's binary_logloss: 0.383328
===Fold 2===
[LightGBM] [Info] Number of positive: 274, number of negative: 439
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505381	valid_1's binary_logloss: 0.534263
[20]	training's binary_logloss: 0.428702	valid_1's binary_logloss: 0.473779
[30]	training's binary_logloss: 0.377932	valid_1's binary_logloss: 0.434827
[40]	training's binary_logloss: 0.345395	valid_1's bina

[50]	training's binary_logloss: 0.319057	valid_1's binary_logloss: 0.427899
[60]	training's binary_logloss: 0.299992	valid_1's binary_logloss: 0.427857
Early stopping, best iteration is:
[55]	training's binary_logloss: 0.309446	valid_1's binary_logloss: 0.425886
===Fold 4===
[LightGBM] [Info] Number of positive: 273, number of negative: 440
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382889 -> initscore=-0.477303
[LightGBM] [Info] Start training from score -0.477303
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.490492	valid_1's binary_logloss: 0.554932
[20]	training's binary_logloss: 0.411194	valid_1's binary_logloss: 0.512784
[30]	training's binary_logloss: 0.361044	valid_1's binary_logloss: 0.490877
[40]	training's binary_logloss: 0.327953	valid_1's bi

In [15]:
scores = [m.best_score['valid_1']['binary_logloss'] for m in models]
score = sum(scores) / len(scores)
print('===CV scores===')
print(scores)
print(score)

===CV scores===
[0.3837846953952286, 0.3833277157878286, 0.4032715933710547, 0.42588582547674164, 0.48146659093606975]
0.41554728419338466
