<a href="https://colab.research.google.com/github/tsato-code/colab_notebooks/blob/main/target_encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

元ネタはこちら。  
[Python: Target Encoding のやり方について | CUBE SUGAR CONTAINER 技術系のこと書きます。](https://blog.amedama.jp/entry/target-mean-encoding-types)

In [1]:
import pandas as pd

In [2]:
data = {
    'category': ['apple', 'apple',
        'banana', 'banana', 'banana',
        'cherry', 'cherry', 'cherry', 'cherry',
        'durian'],
    'label': [0, 1,
        0, 0, 1,
        0, 1, 1, 1,
        1],
 }

df = pd.DataFrame(data=data)
df

Unnamed: 0,category,label
0,apple,0
1,apple,1
2,banana,0
3,banana,0
4,banana,1
5,cherry,0
6,cherry,1
7,cherry,1
8,cherry,1
9,durian,1


In [3]:
# Greedy TS
ts = df.groupby('category', as_index=False).agg({'label': 'mean'})
ts

Unnamed: 0,category,label
0,apple,0.5
1,banana,0.333333
2,cherry,0.75
3,durian,1.0


In [4]:
pd.merge(df, ts, on='category', right_index=True)

Unnamed: 0,category,label_x,label_y
0,apple,0,0.5
1,apple,1,0.5
2,banana,0,0.333333
3,banana,0,0.333333
4,banana,1,0.333333
5,cherry,0,0.75
6,cherry,1,0.75
7,cherry,1,0.75
8,cherry,1,0.75
9,durian,1,1.0


In [5]:
# Leave-one_out TS
agg_df = df.groupby('category').agg({'label': ['sum', 'count']})
agg_df

Unnamed: 0_level_0,label,label
Unnamed: 0_level_1,sum,count
category,Unnamed: 1_level_2,Unnamed: 2_level_2
apple,1,2
banana,1,3
cherry,3,4
durian,1,1


In [6]:
def loo_ts(row):
    group_ts = agg_df.loc[row.category]
    loo_sum = group_ts.loc[('label', 'sum')] - row.label
    loo_count = group_ts.loc[('label', 'count')] - 1
    return loo_sum / (loo_count + 1)

In [7]:
ts = df.apply(loo_ts, axis=1)
ts.name = 'loo_ts'
ts

0    0.500000
1    0.000000
2    0.333333
3    0.333333
4    0.000000
5    0.750000
6    0.500000
7    0.500000
8    0.500000
9    0.000000
Name: loo_ts, dtype: float64

In [8]:
df.join(ts)

Unnamed: 0,category,label,loo_ts
0,apple,0,0.5
1,apple,1,0.0
2,banana,0,0.333333
3,banana,0,0.333333
4,banana,1,0.0
5,cherry,0,0.75
6,cherry,1,0.5
7,cherry,1,0.5
8,cherry,1,0.5
9,durian,1,0.0


In [9]:
# Holdout TS
agg_df = df.groupby('category').agg({'label': ['sum', 'count']})
agg_df

Unnamed: 0_level_0,label,label
Unnamed: 0_level_1,sum,count
category,Unnamed: 1_level_2,Unnamed: 2_level_2
apple,1,2
banana,1,3
cherry,3,4
durian,1,1


In [10]:
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=3,
                        shuffle=True,
                        random_state=42)

In [11]:
import numpy as np

ts = pd.Series(np.empty(df.shape[0]), index=df.index)
ts

0     0.000000e+00
1     0.000000e+00
2    4.940656e-324
3    4.940656e-324
4    4.940656e-324
5    9.881313e-324
6    9.881313e-324
7    9.881313e-324
8    9.881313e-324
9    1.482197e-323
dtype: float64

In [12]:
for i, (_, holdout_idx) in enumerate(folds.split(df, df.label)):
    print(f'===== iteration {i} =====')
    print(holdout_idx)
    holdout_df = df.iloc[holdout_idx]
    holdout_agg_df = holdout_df.groupby('category').agg({'sum', 'count'})
    print(holdout_agg_df)
    train_agg_df = agg_df - holdout_agg_df
    oof_ts = holdout_df.apply(lambda row: train_agg_df.loc[row.category][('label', 'sum')] \
                              / (train_agg_df.loc[row.category][('label', 'count')] + 1), axis=1)
    ts[oof_ts.index] = oof_ts

ts.name = 'holdout_ts'
df.join(ts)

===== iteration 0 =====
[0 3 4 7]
         label    
         count sum
category          
apple        1   0
banana       2   1
cherry       1   1
===== iteration 1 =====
[1 5 9]
         label    
         count sum
category          
apple        1   1
cherry       1   0
durian       1   1
===== iteration 2 =====
[2 6 8]
         label    
         count sum
category          
banana       1   0
cherry       2   2


Unnamed: 0,category,label,holdout_ts
0,apple,0,0.5
1,apple,1,0.0
2,banana,0,0.333333
3,banana,0,0.0
4,banana,1,0.0
5,cherry,0,0.75
6,cherry,1,0.333333
7,cherry,1,0.5
8,cherry,1,0.333333
9,durian,1,0.0


In [13]:
# Ordered TS
np.random.seed(42)
artificial_time = np.random.permutation(df.index)
artificial_time

array([8, 1, 5, 0, 7, 2, 9, 4, 3, 6])

In [14]:
group_col = 'category'
target_col = 'label'
smooth = False

counter_name = 'Train'
assert counter_name not in df.columns, f'Oops! need to rename {counter_name} column'
df[counter_name] = ~df[target_col].isnull()

In [15]:
sorted_indices =np.argsort(artificial_time)
df_shifted = df.iloc[sorted_indices].groupby(group_col).shift(1)
df_shifted[group_col] = df.iloc[sorted_indices][group_col]
df_shifted

Unnamed: 0,label,Train,category
3,,,banana
1,,,apple
5,,,cherry
8,0.0,True,cherry
7,1.0,True,cherry
2,0.0,True,banana
9,,,durian
4,0.0,True,banana
0,1.0,True,apple
6,1.0,True,cherry


In [16]:
df_shifted[target_col].fillna(value=0, inplace=True)
df_shifted[counter_name].fillna(value=False, inplace=True)
df_shifted

Unnamed: 0,label,Train,category
3,0.0,False,banana
1,0.0,False,apple
5,0.0,False,cherry
8,0.0,True,cherry
7,1.0,True,cherry
2,0.0,True,banana
9,0.0,False,durian
4,0.0,True,banana
0,1.0,True,apple
6,1.0,True,cherry


In [17]:
gdf = df_shifted.groupby(group_col)
agg_df = gdf.agg({target_col: 'cumsum', counter_name: 'cumsum'})
ordered_ts = agg_df[target_col] / (agg_df[counter_name] + int(smooth))
ordered_ts[ts.index]

0    1.000000
1         NaN
2    0.000000
3         NaN
4    0.000000
5         NaN
6    0.666667
7    0.500000
8    0.000000
9         NaN
dtype: float64

In [18]:
df.join(ordered_ts[df.index].rename('ordered_ts'))

Unnamed: 0,category,label,Train,ordered_ts
0,apple,0,True,1.0
1,apple,1,True,
2,banana,0,True,0.0
3,banana,0,True,
4,banana,1,True,0.0
5,cherry,0,True,
6,cherry,1,True,0.666667
7,cherry,1,True,0.5
8,cherry,1,True,0.0
9,durian,1,True,
