## 7.3　スタッキング

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train_x =train.drop(['Survived'], axis=1)
train_y = train['Survived']
test_x = test.copy()

train_x = train_x.drop(['PassengerId'], axis=1)
test_x = test_x.drop(['PassengerId'], axis=1)

train_x = train_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)
test_x = test_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)

num_cols = [col for col in train_x.columns if train_x[col].dtype != 'object']
cat_cols = [col for col in train_x.columns if train_x[col].dtype == 'object']

train_x = train_x[num_cols]
test_x = test_x[num_cols]

train_x = train_x.fillna(train_x.mean())
test_x = test_x.fillna(test_x.mean())

print(train_x.columns)
train_x.head()

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05


### 7.3.3　スタッキングの実装

In [3]:
def predict_cv(model, train_x, train_y, test_x):
    preds = []
    preds_test = []
    va_idxes = []
    
    kf = KFold(n_splits=4, shuffle=True, random_state=71)
    
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)
        
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]
    
    preds_test = np.mean(preds_test, axis=0)
    return pred_train, preds_test

`preds[order]` で、バリデーションの予測値を元のレコード順に並べ直している。

In [4]:
model_1a = RandomForestClassifier()
pred_train_1a, pred_test_1a = predict_cv(model_1a, train_x, train_y, test_x)

model_1b = lgb.LGBMClassifier()
pred_train_1b, pred_test_1b = predict_cv(model_1b, train_x, train_y, test_x)

print(f'logloss: {log_loss(train_y, pred_train_1a, eps=1e-7):.4f}')
print(f'logloss: {log_loss(train_y, pred_train_1b, eps=1e-7):.4f}')

logloss: 5.0471
logloss: 4.7938


In [5]:
train_x_2 = pd.DataFrame({'pred_1a': pred_train_1a, 'pred_1b': pred_train_1b})
test_x_2 = pd.DataFrame({'pred_1a': pred_test_1a, 'pred_1b': pred_test_1b})

model_2 = LogisticRegression()
pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, train_y, test_x_2)
print(f'logloss: {log_loss(train_y, pred_train_2, eps=1e-7):.4f}')

logloss: 4.6491


### Supplement

In [6]:
x = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]])

kf = KFold(n_splits=4, shuffle=True, random_state=71)
for i, (tr_idx, va_idx) in enumerate(kf.split(x)):
    print('i:', i, 'tr_idx:', tr_idx, 'va_idx:', va_idx)

i: 0 tr_idx: [0 2 3 4 5 7] va_idx: [1 6]
i: 1 tr_idx: [0 1 3 5 6 7] va_idx: [2 4]
i: 2 tr_idx: [1 2 3 4 5 6] va_idx: [0 7]
i: 3 tr_idx: [0 1 2 4 6 7] va_idx: [3 5]


In [7]:
a = np.array([2, 3, 1])
b = np.argsort(a)
print(b)

[2 0 1]


値を昇順に並べ替えて、元のインデックスを返す。