This notebook contains sample code with Japanese comments.

# 2.7 submitのその前に！　「Cross Validation」の大切さを知ろう

In [1]:
!pip install lightgbm



In [2]:
import numpy as np
import pandas as pd

# Load the Titanic train/test files and the sample submission file.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
gender_submission = pd.read_csv('../input/titanic/gender_submission.csv')

# Stack train and test so the same preprocessing is applied to both.
data = pd.concat([train, test], sort=False)

# Results are assigned back to the column rather than calling `inplace=True`
# on a column selection: that chained in-place pattern is deprecated in
# pandas 2.x and does not update `data` once Copy-on-Write is enabled.

# Encode sex as 0/1. `map` + `astype(int)` fails loudly on any unexpected
# value instead of silently leaving a string in the column.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1}).astype(int)

# Missing embarkation ports are filled with 'S' before integer encoding.
data['Embarked'] = data['Embarked'].fillna('S')
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

# Impute missing fares with the mean and missing ages with the median.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
data['Age'] = data['Age'].fillna(data['Age'].median())

# Family size = parents/children + siblings/spouses + the passenger.
data['FamilySize'] = data['Parch'] + data['SibSp'] + 1
# Flag passengers whose family size is exactly 1.
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

In [3]:
# Preview the first rows of the preprocessed train+test frame.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,2,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,2,0
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,1,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,2,0
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,1,1


In [4]:
# Columns removed from `data` before modelling.
delete_columns = ['Name', 'PassengerId', 'Ticket', 'Cabin']
data.drop(columns=delete_columns, inplace=True)

# `data` was built as train followed by test, so the first len(train) rows
# are the training rows and the remainder are the test rows.
train, test = data[:len(train)], data[len(train):]

# Separate the target from the features; the test features simply discard
# the 'Survived' column.
y_train = train['Survived']
X_train = train.drop(columns='Survived')
X_test = test.drop(columns='Survived')

In [5]:
# Preview the first rows of the training feature matrix.
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,3,0,22.0,1,0,7.25,0,2,0
1,1,1,38.0,1,0,71.2833,1,2,0
2,3,1,26.0,0,0,7.925,0,1,1
3,1,1,35.0,1,0,53.1,0,2,0
4,3,0,35.0,0,0,8.05,0,1,1


# ホールドアウト検証

In [6]:
from sklearn.model_selection import train_test_split

# Split the training dataset (30% held out for validation, stratified on the
# target) before training LightGBM.
# NOTE: this rebinds X_train / y_train to the 70% training portion.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=0,
    stratify=y_train,
)

In [7]:
# Feature columns passed to LightGBM as categorical features.
categorical_features = [
    'Embarked',
    'Pclass',
    'Sex',
]

In [8]:
# LightGBM training parameters (binary objective).
params = dict(
    objective='binary',
    max_bin=300,
    learning_rate=0.05,
    num_leaves=40,
)

In [9]:
import lightgbm as lgb

# Wrap the hold-out split in LightGBM Datasets; the validation Dataset is
# created with the training Dataset as its reference.
lgb_train = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=categorical_features,
)
lgb_eval = lgb.Dataset(
    X_valid,
    label=y_valid,
    reference=lgb_train,
    categorical_feature=categorical_features,
)

# Train for at most 1000 rounds, evaluating on both sets. Early stopping uses
# a patience of 10 rounds and metrics are logged every 10 rounds.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10, verbose=True),
        lgb.log_evaluation(10),
    ],
)

# Predict test-set probabilities using the best iteration recorded on the model.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

[LightGBM] [Info] Number of positive: 239, number of negative: 384
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000338 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	trai

In [10]:
# Show the first ten predicted probabilities for the test set.
y_pred[:10]

array([0.03935242, 0.52380086, 0.11487096, 0.07538967, 0.39761111,
       0.41886672, 0.7023978 , 0.13190461, 0.74563405, 0.0485806 ])

In [11]:
# Turn the predicted probabilities into 0/1 labels with a 0.5 cut-off.
y_pred = np.greater(y_pred, 0.5).astype(int)
y_pred[:10]

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0])

In [12]:
# Start from the sample submission file, replace its 'Survived' column with
# the hold-out model's labels, and write the result to disk.
sub = pd.read_csv('../input/titanic/gender_submission.csv').assign(Survived=y_pred)
sub.to_csv('submission_lightgbm_holdout.csv', index=False)

sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0


# 交差検証（Cross Validation）
複数回にわたって異なる方法でデータセットを分割し、それぞれでホールドアウト検証を実行する方法

In [13]:
# Reload the raw files and redo the preprocessing: the hold-out section
# rebound X_train / y_train to the 70% training portion, and cross validation
# needs the full training set again.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
gender_submission = pd.read_csv('../input/titanic/gender_submission.csv')

# Stack train and test so the same preprocessing is applied to both.
data = pd.concat([train, test], sort=False)

# Results are assigned back to the column rather than calling `inplace=True`
# on a column selection: that chained in-place pattern is deprecated in
# pandas 2.x and does not update `data` once Copy-on-Write is enabled.

# Encode sex as 0/1. `map` + `astype(int)` fails loudly on any unexpected
# value instead of silently leaving a string in the column.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1}).astype(int)

# Missing embarkation ports are filled with 'S' before integer encoding.
data['Embarked'] = data['Embarked'].fillna('S')
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

# Impute missing fares with the mean and missing ages with the median.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
data['Age'] = data['Age'].fillna(data['Age'].median())

# Family size = parents/children + siblings/spouses + the passenger.
data['FamilySize'] = data['Parch'] + data['SibSp'] + 1
# Flag passengers whose family size is exactly 1.
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

# Remove the columns that are not used as features.
delete_columns = ['Name', 'PassengerId', 'Ticket', 'Cabin']
data = data.drop(columns=delete_columns)

# `data` was built as train followed by test, so split it back by position.
n_train = len(train)
train = data.iloc[:n_train]
test = data.iloc[n_train:]

# Separate the target from the features.
y_train = train['Survived']
X_train = train.drop(columns='Survived')
X_test = test.drop(columns='Survived')

X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,3,0,22.0,1,0,7.25,0,2,0
1,1,1,38.0,1,0,71.2833,1,2,0
2,3,1,26.0,0,0,7.925,0,1,1
3,1,1,35.0,1,0,53.1,0,2,0
4,3,0,35.0,0,0,8.05,0,1,1


In [14]:
from sklearn.model_selection import KFold

# Per-fold test predictions and trained boosters.
y_preds = []
models = []
# Out-of-fold predictions: each training row is filled in by the fold in
# which it belongs to the validation split.
oof_train = np.zeros((len(X_train),))
# Cross-validation splitter
cv = KFold(
    # number of splits
    n_splits=5,
    shuffle=True,
    random_state=0
)

categorical_features = ['Embarked', 'Pclass', 'Sex']

params = {
    'objective': 'binary',
    'max_bin': 300,
    'learning_rate': 0.05,
    'num_leaves': 40
}

for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train)):
    # cv.split yields positional row indices, so select by position (.iloc).
    # Label-based selection would only be correct while the index happens to
    # be exactly 0..n-1.
    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[valid_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    lgb_train = lgb.Dataset(X_tr, y_tr,
                            categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train,
                           categorical_feature=categorical_features)

    # Up to 1000 rounds, early stopping with a patience of 10 rounds,
    # metrics logged every 10 rounds.
    model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      num_boost_round=1000,
                      callbacks=[lgb.early_stopping(stopping_rounds=10,
                                                    verbose=True),
                                 lgb.log_evaluation(10)])

    # Store this fold's validation predictions in their original row positions.
    oof_train[valid_index] = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    y_preds.append(y_pred)
    models.append(model)

[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001831 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 214
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.506339	valid_1's binary_logloss: 0.516151
[20]	training's binary_logloss: 0.426275	valid_1's binary_logloss: 0.446237
[30]	training's binary_logloss: 0.377059	valid_1's binary_logloss: 0.404444
[40]	training's binary_logloss: 0.346209	valid_1's binary_logloss: 0.386344
[50]	training's binary_logloss: 0.322399	valid_1's binary_logloss: 0.380195
[60]	trai

In [15]:
# Save the out-of-fold predictions of the KFold run.
pd.DataFrame(oof_train).to_csv('oof_train_kfold.csv', index=False)

# Read each fold's validation logloss from best_score and average over folds.
scores = [
    booster.best_score['valid_1']['binary_logloss'] for booster in models
]
score = sum(scores) / len(scores)
print('===CV scores===', scores, score, sep='\n')

===CV scores===
[0.36879329957340645, 0.43982467718089535, 0.37133848188218943, 0.4304457066410963, 0.4321380114935291]
0.4085080353542233


In [16]:
from sklearn.metrics import accuracy_score

# Threshold the out-of-fold probabilities at 0.5 and compare with the labels.
y_pred_oof = np.greater(oof_train, 0.5).astype(int)
accuracy_score(y_true=y_train, y_pred=y_pred_oof)

0.8294051627384961

In [17]:
# Number of test-set prediction arrays collected (one is appended per fold).
len(y_preds)

5

In [18]:
# First ten test-set probabilities predicted by the first fold's model.
y_preds[0][:10]

array([0.07903405, 0.3569554 , 0.04071679, 0.36745113, 0.41165963,
       0.62777401, 0.6920704 , 0.16400441, 0.82285313, 0.02543809])

In [19]:
# Average the per-fold test probabilities, then cut at 0.5 to get 0/1 labels.
y_sub = np.greater(sum(y_preds) / len(y_preds), 0.5).astype(int)
y_sub[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [20]:
# Replace the submission's 'Survived' column with the KFold-averaged labels
# and write the file.
sub = sub.assign(Survived=y_sub)
sub.to_csv('submission_lightgbm_kfold.csv', index=False)

sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


# データセットの分割方法

In [21]:
from sklearn.model_selection import KFold

# Print the share of positive labels (y == 1) in the train/validation part of
# every plain KFold split.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train)):
    # cv.split yields positional row indices, so select by position (.iloc).
    # Label-based selection would only be correct while the index happens to
    # be exactly 0..n-1.
    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[valid_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    print(f'fold_id: {fold_id}')
    print(f'y_tr y==1 rate: {sum(y_tr)/len(y_tr)}')
    print(f'y_val y==1 rate: {sum(y_val)/len(y_val)}')

fold_id: 0
y_tr y==1 rate: 0.38342696629213485
y_val y==1 rate: 0.3854748603351955
fold_id: 1
y_tr y==1 rate: 0.3856942496493689
y_val y==1 rate: 0.37640449438202245
fold_id: 2
y_tr y==1 rate: 0.39831697054698456
y_val y==1 rate: 0.3258426966292135
fold_id: 3
y_tr y==1 rate: 0.3856942496493689
y_val y==1 rate: 0.37640449438202245
fold_id: 4
y_tr y==1 rate: 0.36605890603085556
y_val y==1 rate: 0.4550561797752809


In [22]:
from sklearn.model_selection import StratifiedKFold

# Run CV while preserving the ratio of the target variable in every split,
# and print the share of positive labels (y == 1) per fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
    # cv.split yields positional row indices, so select by position (.iloc).
    # Label-based selection would only be correct while the index happens to
    # be exactly 0..n-1.
    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[valid_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    print(f'fold_id: {fold_id}')
    print(f'y_tr y==1 rate: {sum(y_tr)/len(y_tr)}')
    print(f'y_val y==1 rate: {sum(y_val)/len(y_val)}')

fold_id: 0
y_tr y==1 rate: 0.38342696629213485
y_val y==1 rate: 0.3854748603351955
fold_id: 1
y_tr y==1 rate: 0.38429172510518933
y_val y==1 rate: 0.38202247191011235
fold_id: 2
y_tr y==1 rate: 0.38429172510518933
y_val y==1 rate: 0.38202247191011235
fold_id: 3
y_tr y==1 rate: 0.38429172510518933
y_val y==1 rate: 0.38202247191011235
fold_id: 4
y_tr y==1 rate: 0.38288920056100983
y_val y==1 rate: 0.38764044943820225


In [23]:
from sklearn.model_selection import StratifiedKFold

# Per-fold test predictions and trained boosters.
y_preds = []
models = []
# Out-of-fold predictions: each training row is filled in by the fold in
# which it belongs to the validation split.
oof_train = np.zeros((len(X_train),))
# Stratified splitter: folds are built from the target passed to cv.split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

categorical_features = ['Embarked', 'Pclass', 'Sex']

params = {
    'objective': 'binary',
    'max_bin': 300,
    'learning_rate': 0.05,
    'num_leaves': 40
}

for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
    # cv.split yields positional row indices, so select by position (.iloc).
    # Label-based selection would only be correct while the index happens to
    # be exactly 0..n-1.
    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[valid_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    lgb_train = lgb.Dataset(X_tr, y_tr,
                            categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train,
                           categorical_feature=categorical_features)

    # Up to 1000 rounds, early stopping with a patience of 10 rounds,
    # metrics logged every 10 rounds.
    model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      num_boost_round=1000,
                      callbacks=[lgb.early_stopping(stopping_rounds=10,
                                                    verbose=True),
                                 lgb.log_evaluation(10)])

    # Store this fold's validation predictions in their original row positions.
    oof_train[valid_index] = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    y_preds.append(y_pred)
    models.append(model)

[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 213
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.503721	valid_1's binary_logloss: 0.507892
[20]	training's binary_logloss: 0.426394	valid_1's binary_logloss: 0.439962
[30]	training's binary_logloss: 0.37914	valid_1's binary_logloss: 0.401837
[40]	training's binary_logloss: 0.34819	valid_1's binary_logloss: 0.389454
[50]	training's binary_logloss: 0.325355	valid_1's binary_logloss: 0.384696
[60]	traini

In [24]:
# Save the out-of-fold predictions of the StratifiedKFold run and show a few.
pd.DataFrame(oof_train).to_csv('oof_train_skfold.csv', index=False)
print(oof_train[:10])

# Read each fold's validation logloss from best_score and average over folds.
scores = [
    booster.best_score['valid_1']['binary_logloss'] for booster in models
]
score = sum(scores) / len(scores)
print('===CV scores===', scores, score, sep='\n')

[0.10743915 0.9336148  0.24343427 0.97174844 0.23722008 0.05967737
 0.10523416 0.27028808 0.37330822 0.9413475 ]
===CV scores===
[0.38009408187107163, 0.37924619589793596, 0.40508420679400237, 0.44506993291806346, 0.48255560255294944]
0.41841000400680456


In [25]:
from sklearn.metrics import accuracy_score

# Threshold the out-of-fold probabilities at 0.5 and compare with the labels.
y_pred_oof = np.greater(oof_train, 0.5).astype(int)
accuracy_score(y_true=y_train, y_pred=y_pred_oof)

0.8294051627384961

In [26]:
# Average the per-fold test probabilities, then cut at 0.5 to get 0/1 labels.
y_sub = np.greater(sum(y_preds) / len(y_preds), 0.5).astype(int)
y_sub[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [27]:
# Replace the submission's 'Survived' column with the StratifiedKFold-averaged
# labels and write the file.
sub = sub.assign(Survived=y_sub)
sub.to_csv('submission_lightgbm_skfold.csv', index=False)

sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
