## 3.5.5　target encoding

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

In [2]:
# Read the raw train/test tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Separate the target from the features; the test table has no target column
# to remove, so it is simply copied.
train_y = train['Survived']
train_x = train.drop(['Survived'], axis=1)
test_x = test.copy()

# Remove the identifier plus the Name/Ticket/Cabin columns from both tables
# so that train and test keep the same feature columns.
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_x = train_x.drop(drop_cols, axis=1)
test_x = test_x.drop(drop_cols, axis=1)

# Categorical columns are the string-typed ones. `is_string_dtype` on the
# column dtype matches classic `object` columns and also pandas' dedicated
# string dtypes, whereas a bare `dtype == 'object'` comparison misses the
# latter and would leave `cat_cols` empty. Everything else is numeric.
cat_cols = [col for col in train_x.columns
            if pd.api.types.is_string_dtype(train_x[col].dtype)]
num_cols = [col for col in train_x.columns if col not in cat_cols]

In [3]:
# Preview the first five rows of the training features (5 is head's default).
train_x.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [4]:
# Target encoding: replace every categorical value with the mean of the
# target observed for that category.
#
# The fold assignment does not depend on the column being encoded, so the
# splitter is built and evaluated once and reused for every column. With a
# fixed integer random_state this yields the same folds the per-column
# construction produced.
kf = KFold(n_splits=4, shuffle=True, random_state=72)
folds = list(kf.split(train_x))

for c in cat_cols:
    # Category values side by side with the target, for the group means.
    data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y})

    # Test rows: use the mean computed over the whole training set.
    # Categories absent from the training data map to NaN.
    target_mean = data_tmp.groupby(c)['target'].mean()
    test_x[c] = test_x[c].map(target_mean)

    # Training rows: encode out-of-fold, so a row's own target value never
    # contributes to the mean that encodes it.
    encoded = np.full(train_x.shape[0], np.nan)

    for idx_1, idx_2 in folds:
        # idx_1: rows the mean is computed from; idx_2: rows being encoded.
        target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
        # A category missing from idx_1 (or a missing value) stays NaN.
        encoded[idx_2] = train_x[c].iloc[idx_2].map(target_mean)

    # Overwrite the original categorical column with its encoded values.
    train_x[c] = encoded

In [5]:
# Preview the first five rows again: the columns listed in cat_cols now hold
# the target-encoded values written by the encoding loop.
train_x.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0.185882,22.0,1,0,7.25,0.35
1,1,0.742222,38.0,1,0,71.2833,0.567164
2,3,0.718367,26.0,0,0,7.925,0.334016
3,1,0.742222,35.0,1,0,53.1,0.322105
4,3,0.191344,35.0,0,0,8.05,0.341513


上のセルで `kf.split(train_x)` が返す `idx_1` は下の例の `train_index`（平均を計算する側の行）、`idx_2` は `test_index`（エンコードされる側の行）に対応している。

In [6]:
# Toy input with eight rows, used only to show which row positions KFold
# hands out for each split.
x = np.array([[1, 2], [3, 4], [1, 2]] + [[3, 4]] * 5)

# Same splitter settings as the target-encoding cell: 4 shuffled folds with a
# fixed seed. Each iteration yields (training positions, held-out positions).
kf = KFold(n_splits=4, shuffle=True, random_state=72)
for train_index, test_index in kf.split(x):
    print('train_index:', train_index, 'test_index:', test_index)

train_index: [0 1 2 3 5 6] test_index: [4 7]
train_index: [0 2 3 4 5 7] test_index: [1 6]
train_index: [0 1 3 4 6 7] test_index: [2 5]
train_index: [1 2 4 5 6 7] test_index: [0 3]
