## 2.7　Submitのその前に！「Cross Validation」の大切さを知ろう

In [1]:
import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [2]:
# Load both CSVs and stack them so the same feature engineering is applied to
# the training rows and the test rows in one pass.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
n_train = len(train)  # remembered so the stacked frame can be split again below
data = pd.concat([train, test], sort=False)

# FamilySize is Parch + SibSp + 1; IsAlone marks rows where that equals 1.
data['FamilySize'] = data['Parch'] + data['SibSp'] + 1
data['IsAlone'] = 0
data.loc[data['FamilySize'] == 1, 'IsAlone'] = 1

# Sex and Embarked are intentionally NOT integer-encoded here: they stay as
# strings and are cast to pandas 'category' dtype further down.

# Fill missing values by assignment. Calling fillna(inplace=True) on a column
# selection is a chained in-place update, which pandas warns about and which
# does not modify `data` when Copy-on-Write is enabled.
data['Embarked'] = data['Embarked'].fillna('S')
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
data['Age'] = data['Age'].fillna(data['Age'].median())

delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data = data.drop(columns=delete_columns)

# Split the stacked frame back into its training and test parts by position.
train = data.iloc[:n_train]
test = data.iloc[n_train:]

y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)

# Cast the string columns to 'category' dtype; the same column list is later
# passed to lgb.Dataset as categorical_feature.
categorical_features = ['Embarked', 'Sex']
category_dtypes = {col: 'category' for col in categorical_features}
X_train = X_train.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)

X_train.dtypes

Pclass           int64
Sex           category
Age            float64
Fare           float64
Embarked      category
FamilySize       int64
IsAlone          int64
dtype: object

### 2.7.3 Cross Validation

In [3]:
# 5-fold cross validation with plain KFold: train one LightGBM model per fold,
# store out-of-fold predictions for the training rows in `oof_train`, and
# collect each fold's test-set predictions in `y_preds`.
y_preds = []
models = []
oof_train = np.zeros((len(X_train)))
cv = KFold(n_splits=5, shuffle=True, random_state=0)

params = {
    'objective': 'binary',
    'max_bin': 300,
    'learning_rate': 0.05,
    'num_leaves': 40
}

for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train)):
    print('===Fold {}==='.format(fold_id))
    # split() yields positional row numbers, so select with .iloc. Label-based
    # .loc only gives the same rows while the index happens to be 0..n-1.
    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[valid_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)

    # valid_sets order matters: the second entry is reported as 'valid_1',
    # which is the key the score-summary cell reads from best_score.
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases; newer releases are believed to expect
    # callbacks instead. Confirm against the installed version before upgrading.
    model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      verbose_eval=10,
                      num_boost_round=1000,
                      early_stopping_rounds=10)

    # Predict with the early-stopped best iteration, not the last one trained.
    oof_train[valid_index] = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    y_preds.append(y_pred)
    models.append(model)

===Fold 0===
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 200
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.507967	valid_1's binary_logloss: 0.516065
[20]	training's binary_logloss: 0.42728	valid_1's binary_logloss: 0.44405
[30]	training's binary_logloss: 0.377783	valid_1's binary_logloss: 0.40641
[40]	training's binary_logloss: 0.347257	valid_1's binary_logloss: 0.38835
[50]	training's binary_logloss: 0.323818	valid_1's binary_logloss: 0.381184
[60]	training's binary_logloss: 0.305707	valid_1's binary_logloss: 0.378986
[70]	training's binary_logloss: 0.287565	valid_1's binary_logloss: 0.374994
[80]	training's 



[20]	training's binary_logloss: 0.420604	valid_1's binary_logloss: 0.48032
[30]	training's binary_logloss: 0.368313	valid_1's binary_logloss: 0.451685
[40]	training's binary_logloss: 0.336337	valid_1's binary_logloss: 0.443882
[50]	training's binary_logloss: 0.315167	valid_1's binary_logloss: 0.443434
[60]	training's binary_logloss: 0.295864	valid_1's binary_logloss: 0.441838
Early stopping, best iteration is:
[55]	training's binary_logloss: 0.305388	valid_1's binary_logloss: 0.441191
===Fold 2===
[LightGBM] [Info] Number of positive: 284, number of negative: 429
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 203
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.398317 -> initscore=-0.412483
[LightGBM] [Info] Start training from score -0.412483
Training until validation scores don't improve for 

===Fold 3===
[LightGBM] [Info] Number of positive: 275, number of negative: 438
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.385694 -> initscore=-0.465448
[LightGBM] [Info] Start training from score -0.465448
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.497945	valid_1's binary_logloss: 0.524956
[20]	training's binary_logloss: 0.418251	valid_1's binary_logloss: 0.469826
[30]	training's binary_logloss: 0.373537	valid_1's binary_logloss: 0.446586
[40]	training's binary_logloss: 0.341566	valid_1's binary_logloss: 0.43267
[50]	training's binary_logloss: 0.316843	valid_1's binary_logloss: 0.427101
[60]	training's binary_logloss: 0.297129	valid_1's binary_logloss: 0.431308
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.314296	valid_1's bin

In [4]:
# Each trained booster records its best validation log loss under
# best_score['valid_1']; average those per-fold values into one CV score.
scores = list(map(lambda booster: booster.best_score['valid_1']['binary_logloss'], models))
score = sum(scores) / len(scores)
print('===CV scores===', scores, score, sep='\n')

===CV scores===
[0.3623438225615162, 0.4411906327808854, 0.37069461044413654, 0.4269596067698506, 0.4324191310118583]
0.4067215607136494


### 2.7.4　データセットの分割方法

In [5]:
# Compare the positive-class rate of the training part and the validation part
# of every fold when the rows are split with plain (unstratified) KFold.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train)):
    # split() yields positional row numbers, so select with .iloc; indexing the
    # Series with [] would look those numbers up as index labels instead.
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    print(f'fold_id: {fold_id}')
    # For a 0/1 label the mean equals the share of rows with y == 1.
    print(f'y_tr y==1 rate: {y_tr.mean()}')
    print(f'y_val y==1rate: {y_val.mean()}')

fold_id: 0
y_tr y==1 rate: 0.38342696629213485
y_val y==1rate: 0.3854748603351955
fold_id: 1
y_tr y==1 rate: 0.3856942496493689
y_val y==1rate: 0.37640449438202245
fold_id: 2
y_tr y==1 rate: 0.39831697054698456
y_val y==1rate: 0.3258426966292135
fold_id: 3
y_tr y==1 rate: 0.3856942496493689
y_val y==1rate: 0.37640449438202245
fold_id: 4
y_tr y==1 rate: 0.36605890603085556
y_val y==1rate: 0.4550561797752809


In [6]:
# Same class-rate check as for plain KFold, but with StratifiedKFold, which
# receives y_train in split() so it can balance the label across folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
    # split() yields positional row numbers, so select with .iloc; indexing the
    # Series with [] would look those numbers up as index labels instead.
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    print(f'fold_id: {fold_id}')
    # For a 0/1 label the mean equals the share of rows with y == 1.
    print(f'y_tr y==1 rate: {y_tr.mean()}')
    print(f'y_val y==1rate: {y_val.mean()}')

fold_id: 0
y_tr y==1 rate: 0.38342696629213485
y_val y==1rate: 0.3854748603351955
fold_id: 1
y_tr y==1 rate: 0.38429172510518933
y_val y==1rate: 0.38202247191011235
fold_id: 2
y_tr y==1 rate: 0.38429172510518933
y_val y==1rate: 0.38202247191011235
fold_id: 3
y_tr y==1 rate: 0.38429172510518933
y_val y==1rate: 0.38202247191011235
fold_id: 4
y_tr y==1 rate: 0.38288920056100983
y_val y==1rate: 0.38764044943820225


In [7]:
# 5-fold cross validation with StratifiedKFold: train one LightGBM model per
# fold, store out-of-fold predictions for the training rows in `oof_train`,
# and collect each fold's test-set predictions in `y_preds`.
y_preds = []
models = []
oof_train = np.zeros((len(X_train)))
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

params = {
    'objective': 'binary',
    'max_bin': 300,
    'learning_rate': 0.05,
    'num_leaves': 40
}

# y_train is passed to split() so the folds are stratified on the label.
for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
    print('===Fold {}==='.format(fold_id))
    # split() yields positional row numbers, so select with .iloc. Label-based
    # .loc only gives the same rows while the index happens to be 0..n-1.
    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[valid_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)

    # valid_sets order matters: the second entry is reported as 'valid_1',
    # which is the key the score-summary cell reads from best_score.
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases; newer releases are believed to expect
    # callbacks instead. Confirm against the installed version before upgrading.
    model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      verbose_eval=10,
                      num_boost_round=1000,
                      early_stopping_rounds=10)

    # Predict with the early-stopped best iteration, not the last one trained.
    oof_train[valid_index] = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    y_preds.append(y_pred)
    models.append(model)

===Fold 0===
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 199
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.503827	valid_1's binary_logloss: 0.506999
[20]	training's binary_logloss: 0.424757	valid_1's binary_logloss: 0.43869
[30]	training's binary_logloss: 0.377808	valid_1's binary_logloss: 0.400629
[40]	training's binary_logloss: 0.347689	valid_1's binary_logloss: 0.387221
[50]	training's binary_logloss: 0.324947	valid_1's binary_logloss: 0.384495
Early stopping, best iteration is:
[49]	training's binary_logloss: 0.327075	valid_1's binary_logloss: 0.383785
===Fold 1===
[LightGBM] [Info] Number of positive: 274

[50]	training's binary_logloss: 0.323009	valid_1's binary_logloss: 0.383873




[60]	training's binary_logloss: 0.302526	valid_1's binary_logloss: 0.3856
Early stopping, best iteration is:
[53]	training's binary_logloss: 0.315729	valid_1's binary_logloss: 0.383328
===Fold 2===
[LightGBM] [Info] Number of positive: 274, number of negative: 439
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505381	valid_1's binary_logloss: 0.534263
[20]	training's binary_logloss: 0.428702	valid_1's binary_logloss: 0.473779
[30]	training's binary_logloss: 0.377932	valid_1's binary_logloss: 0.434827
[40]	training's binary_logloss: 0.345395	valid_1's binary_logloss: 0.416499
[50]	training's binary_logloss: 0.320578	valid_1's bina

[30]	training's binary_logloss: 0.370269	valid_1's binary_logloss: 0.443773
[40]	training's binary_logloss: 0.341403	valid_1's binary_logloss: 0.433661
[50]	training's binary_logloss: 0.319057	valid_1's binary_logloss: 0.427899
[60]	training's binary_logloss: 0.299992	valid_1's binary_logloss: 0.427857
Early stopping, best iteration is:
[55]	training's binary_logloss: 0.309446	valid_1's binary_logloss: 0.425886
===Fold 4===
[LightGBM] [Info] Number of positive: 273, number of negative: 440
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382889 -> initscore=-0.477303
[LightGBM] [Info] Start training from score -0.477303
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.490492	valid_1's binary_logloss: 0.554932
[20]	training's binary_logloss: 0.411194	valid_1's bi

In [8]:
# Each trained booster records its best validation log loss under
# best_score['valid_1']; average those per-fold values into one CV score.
scores = list(map(lambda booster: booster.best_score['valid_1']['binary_logloss'], models))
score = sum(scores) / len(scores)
print('===CV scores===', scores, score, sep='\n')

===CV scores===
[0.3837846953952286, 0.3833277157878286, 0.4032715933710547, 0.42588582547674164, 0.48146659093606975]
0.41554728419338466
