## 3.5.5　target encoding

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train_x =train.drop(['Survived'], axis=1)
train_y = train['Survived']
test_x = test.copy()

train_x = train_x.drop(['PassengerId'], axis=1)
test_x = test_x.drop(['PassengerId'], axis=1)

train_x = train_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)
test_x = test_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)

num_cols = [col for col in train_x.columns if train_x[col].dtype != 'object']
cat_cols = [col for col in train_x.columns if train_x[col].dtype == 'object']

In [3]:
train_x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


### target encoding の手法・実装

In [4]:
for c in cat_cols:
    data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y})
    target_mean = data_tmp.groupby(c)['target'].mean()
    test_x[c] = test_x[c].map(target_mean)
    
    tmp = np.repeat(np.nan, train_x.shape[0])
    
    kf = KFold(n_splits=4, shuffle=True, random_state=72)
    
    for idx_1, idx_2 in kf.split(train_x):
        target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
        tmp[idx_2] = train_x[c].iloc[idx_2].map(target_mean)
    
    train_x[c] = tmp

テストデータ<br>
・`cat_col`　を 1 つ取り出し、学習データ全体の目的変数の平均値を計算し、テストデータに代入する。<br>

学習データ<br>
・`KFold` でバリデーションを行い、`idx_1` で求めた目的変数の平均値を `idx_2` に代入する。<br>
・その後、元データ `train_x` の `cat_col` の値と置き換える。

In [5]:
train_x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0.185882,22.0,1,0,7.25,0.35
1,1,0.742222,38.0,1,0,71.2833,0.567164
2,3,0.718367,26.0,0,0,7.925,0.334016
3,1,0.742222,35.0,1,0,53.1,0.322105
4,3,0.191344,35.0,0,0,8.05,0.341513


In [6]:
x = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]])

kf = KFold(n_splits=4, shuffle=True, random_state=72)
for train_idx, test_idx in kf.split(x):
    print('Fold 2, 3, 4:', train_idx, 'Fold 1:', test_idx)

Fold 2, 3, 4: [0 1 2 3 5 6] Fold 1: [4 7]
Fold 2, 3, 4: [0 2 3 4 5 7] Fold 1: [1 6]
Fold 2, 3, 4: [0 1 3 4 6 7] Fold 1: [2 5]
Fold 2, 3, 4: [1 2 4 5 6 7] Fold 1: [0 3]


### target encoding の手法・実装ークロスバリデーションを行う場合

上の方法に、`tr_idx` と `va_idx` の分割（クロスバリデーション）を追加する。<br>
書籍における「クロスバリデーションの fold」は、最初の `KFold` で分割した fold を指している。

In [7]:
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
    
    tr_x, va_x = train_x.iloc[tr_idx].copy(), train_x.iloc[va_idx].copy()
    tr_y, va_y = train_y.iloc[tr_idx].copy(), train_y.iloc[va_idx]
    
    for c in cat_cols:
        data_tmp = pd.DataFrame({c: tr_x[c], 'target': tr_y})
        target_mean = data_tmp.groupby(c)['target'].mean()
        va_x.loc[:, c] = va_x[c].map(target_mean)
        
        tmp = np.repeat(np.nan, tr_x.shape[0])
        kf_encoding = KFold(n_splits=4, shuffle=True, random_state=72)
        for idx_1, idx_2 in kf_encoding.split(tr_x):
            target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
            tmp[idx_2] = tr_x[c].iloc[idx_2].map(target_mean)
            
        tr_x.loc[:, c] = tmp

この処理は、パラメータチューニングやモデル評価のためのクロスバリデーションにおいて<br>
fold ごとに target encoding で特徴量を変換する場合を想定している。<br>
テストデータに関しては、通常の target encoding で対応すれば良い。

In [8]:
tr_x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0.195402,22.0,1,0,7.25,0.371681
2,3,0.804878,26.0,0,0,7.925,0.368421
3,1,0.765957,35.0,1,0,53.1,0.359551
4,3,0.147059,35.0,0,0,8.05,0.293333
5,3,0.139241,,0,0,8.4583,0.555556


In [9]:
x = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]])

kf = KFold(n_splits=4, shuffle=True, random_state=71)
for i, (tr_idx, va_idx) in enumerate(kf.split(x)):
    print('i:', i, 'tr_idx:', tr_idx, 'va_idx:', va_idx)

i: 0 tr_idx: [0 2 3 4 5 7] va_idx: [1 6]
i: 1 tr_idx: [0 1 3 5 6 7] va_idx: [2 4]
i: 2 tr_idx: [1 2 3 4 5 6] va_idx: [0 7]
i: 3 tr_idx: [0 1 2 4 6 7] va_idx: [3 5]
