In [1]:
import os  # local import: this config cell runs before the notebook's import cell

# Folder that holds train.csv / test.csv. Set the SAFE_DRIVER_DATA_DIR environment
# variable to point somewhere else; otherwise the original location is used.
base_path = os.environ.get("SAFE_DRIVER_DATA_DIR") or "C:\\Users\\hasee\\workspace\\Kaggle\\safe_driver\\" # your folder
# File names are appended with plain string concatenation (base_path + "train.csv"),
# so the value must end with a path separator.
if not base_path.endswith(("\\", "/")):
    base_path += os.sep

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
def add_noise(series, noise_level, random_state=None):
    """
    Perturb a numeric Series with multiplicative Gaussian noise.

    Every value v becomes v * (1 + noise_level * z), z being a standard normal draw.

    series : pd.Series of numeric values
    noise_level (float) : scale of the relative noise; 0 leaves the values unchanged
    random_state : optional source of randomness for reproducible noise.
        None (default) draws from numpy's global random state, exactly as before.
        An int seeds a private np.random.RandomState; an object that already
        exposes ``standard_normal`` (RandomState or Generator) is used as is.

    Returns a new Series with the same index as ``series``.
    """
    n_values = len(series)
    if random_state is None:
        # Historical behaviour: consume numpy's global random stream.
        draws = np.random.randn(n_values)
    else:
        if hasattr(random_state, "standard_normal"):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)
        draws = rng.standard_normal(n_values)
    return series * (1 + noise_level * draws)

def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Encode a categorical feature by a smoothed per-category mean of the target.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (must carry the same name as trn_series)
    target : target data as a pd.Series (same length as trn_series)
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level (float) : scale of the multiplicative noise applied to both outputs
        through add_noise; with 0 (the default) the encodings are returned as
        computed and no random numbers are drawn

    Although the three series default to None, all of them are required.

    Returns a tuple (encoded training feature, encoded test feature). Both are
    float Series named "<feature name>_mean" that keep the index of their input.
    Values without a per-category estimate are encoded with the prior, i.e. the
    global mean of the target.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Per-category mean of the target and number of rows behind it
    stats = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Sigmoid weight in (0, 1): the bigger the count, the more the category mean is trusted.
    # Stored under its own name so the `smoothing` argument is not overwritten.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    # Global target mean, used for rare categories and as the fallback value
    prior = target.mean()
    # The bigger the count the less the prior is taken into account
    encoding = prior * (1 - weight) + stats["mean"] * weight
    encoded_name = trn_series.name + '_mean'

    def _encode(series):
        # Look every value up in the per-category table. map() keeps the index
        # of `series`, so no merge / index restoration is needed.
        return (series.map(encoding)
                      .astype(float)  # guards against a non-float result, e.g. for categorical input
                      .fillna(prior)
                      .rename(encoded_name))

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)
    if noise_level == 0:
        # No noise requested: skip add_noise so the global random stream is untouched.
        return ft_trn_series, ft_tst_series
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [31]:
# Read the train and test CSVs from `base_path`; the first column of each file becomes the index.
trn_df, sub_df = (
    pd.read_csv(base_path + csv_name, index_col=0)
    for csv_name in ("train.csv", "test.csv")
)

In [48]:
# Put the categorical feature and the target side by side, aligned on the row index.
temp = pd.concat(
    objs=[trn_df["ps_car_11_cat"], trn_df["target"]],
    axis="columns",
)
temp

Unnamed: 0_level_0,ps_car_11_cat,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
7,12,0
9,19,0
13,60,0
16,104,0
17,82,0
19,104,0
20,99,0
22,30,0
26,68,0
28,104,1


In [33]:
# Name carried by the target column when it is pulled out as a Series.
trn_df["target"].name

'target'

In [51]:
# Per-category statistics of the target:
# "mean" is the average target of the group, "count" the number of rows in it.
averages = (
    temp
    .groupby(by=trn_df["ps_car_11_cat"].name)[trn_df["target"].name]
    .agg(["mean", "count"])
)
averages

Unnamed: 0_level_0,mean,count
ps_car_11_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.040528,3331
2,0.037995,2553
3,0.051177,3185
4,0.060777,1596
5,0.041956,12513
6,0.037097,2480
7,0.025473,5653
8,0.037485,2481
9,0.028042,2211
10,0.025653,8732


In [35]:
# Series name of the categorical feature; the groupby and merge cells use it as their key.
trn_df["ps_car_11_cat"].name

'ps_car_11_cat'

In [36]:
# Sigmoid weight per category, centred on a count of 100 with a scale of 10:
# close to 1 for categories with many more than 100 rows, close to 0 for far fewer.
smoothing = 1 / (1 + np.exp((100 - averages["count"]) / 10))
# Global mean of the target over all training rows
prior = trn_df["target"].mean()
prior

0.036447517859182946

In [37]:
# Blend each category mean with the global prior, weighted by `smoothing`.
averages[trn_df["target"].name] = averages["mean"] * smoothing + prior * (1 - smoothing)

In [38]:
# Drop the raw statistics and keep only the blended column. Reassigning instead of
# mutating in place, together with errors="ignore", lets this cell be re-run
# after the columns are already gone without raising a KeyError.
averages = averages.drop(columns=["mean", "count"], errors="ignore")

In [23]:
# Display `averages`; in the captured output only the blended `target` column remains.
averages

Unnamed: 0_level_0,target
ps_car_11_cat,Unnamed: 1_level_1
1,0.040528
2,0.037995
3,0.051177
4,0.060777
5,0.041956
6,0.037097
7,0.025473
8,0.037485
9,0.028042
10,0.025653


In [42]:
# Left-join the per-category table onto every training row and keep the joined
# 'average' column; rows whose category has no entry receive the prior.
# The merge does not carry over trn_df's index (the result is numbered from 0).
ft_trn_series = (
    trn_df["ps_car_11_cat"]
    .to_frame(trn_df["ps_car_11_cat"].name)
    .merge(
        averages.reset_index().rename(
            columns={'index': trn_df["target"].name, trn_df["target"].name: 'average'}
        ),
        on=trn_df["ps_car_11_cat"].name,
        how='left',
    )['average']
    .rename(trn_df["ps_car_11_cat"].name + '_mean')
    .fillna(prior)
)

In [43]:
# Inspect the encoded training feature (missing values were already filled with `prior`).
ft_trn_series

0         0.038642
1         0.023936
2         0.031281
3         0.045015
4         0.026266
5         0.045015
6         0.023003
7         0.030577
8         0.034335
9         0.045015
10        0.044913
11        0.038891
12        0.038289
13        0.045015
14        0.026667
15        0.075110
16        0.036158
17        0.045015
18        0.018423
19        0.028728
20        0.026266
21        0.026667
22        0.036266
23        0.027544
24        0.045015
25        0.039099
26        0.041956
27        0.023003
28        0.045015
29        0.042431
            ...   
595182    0.025653
595183    0.028728
595184    0.040389
595185    0.030932
595186    0.031626
595187    0.045015
595188    0.029752
595189    0.025974
595190    0.028728
595191    0.027544
595192    0.026266
595193    0.045015
595194    0.023715
595195    0.040786
595196    0.040389
595197    0.033761
595198    0.038289
595199    0.045593
595200    0.032176
595201    0.023936
595202    0.029633
595203    0.

In [40]:
# Right-hand side of the merge: the category becomes a regular column and the
# blended target column is renamed to 'average'.
averages.reset_index().rename(
    columns={
        'index': trn_df["target"].name,
        trn_df["target"].name: 'average',
    }
)

Unnamed: 0,ps_car_11_cat,average
0,1,0.040528
1,2,0.037995
2,3,0.051177
3,4,0.060777
4,5,0.041956
5,6,0.037097
6,7,0.025473
7,8,0.037485
8,9,0.028042
9,10,0.025653


In [44]:
# Left-hand side of the merge: a one-column frame with the raw category codes.
trn_df["ps_car_11_cat"].to_frame(name=trn_df["ps_car_11_cat"].name)

Unnamed: 0_level_0,ps_car_11_cat
id,Unnamed: 1_level_1
7,12
9,19
13,60
16,104
17,82
19,104
20,99
22,30
26,68
28,104
