In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv', parse_dates=['activation_date']).sort_values('activation_date').reset_index(drop=True)
test = pd.read_csv('test.csv', parse_dates=['activation_date'])

In [3]:
train['price'] = train.price.fillna(-1)
test['price'] = test.price.fillna(-1)

In [4]:
sorted_prices = sorted(train.price.values)

In [5]:
best_bin = None
best_std = None

for bin_size in range(10, 310, 10):
    feat_name = 'price_binned_{}'.format(bin_size)
    
    _, bins = pd.qcut(sorted_prices, bin_size, labels=None, retbins=True, precision=3, duplicates='drop')
    train.loc[:, 'price_binned_{}'.format(bin_size)] = \
        pd.cut(train.price, bins=bins, labels=np.arange(len(bins)-1), duplicates='drop', retbins=False)
    
    avg_std = train.groupby(feat_name)['deal_probability'].std().mean()
    print(feat_name, ' dp std =',avg_std)
    print('real categories # =', train[feat_name].nunique())
    
    if best_std is None or avg_std < best_std:
        best_std = avg_std
        best_bin = bin_size

print(best_std, best_bin)

price_binned_10  dp std = 0.25536325051866854
real categories # = 10
price_binned_20  dp std = 0.2538109216102096
real categories # = 17
price_binned_30  dp std = 0.2534462040140496
real categories # = 27
price_binned_40  dp std = 0.24722297462536957
real categories # = 34
price_binned_50  dp std = 0.2487651950565567
real categories # = 41
price_binned_60  dp std = 0.2522740323114417
real categories # = 42
price_binned_70  dp std = 0.2492686325771821
real categories # = 50
price_binned_80  dp std = 0.24613363742369912
real categories # = 55
price_binned_90  dp std = 0.2484157060273815
real categories # = 55
price_binned_100  dp std = 0.2467635687313918
real categories # = 66
price_binned_110  dp std = 0.24534813158000984
real categories # = 65
price_binned_120  dp std = 0.24600023755026326
real categories # = 67
price_binned_130  dp std = 0.24606164148227266
real categories # = 68
price_binned_140  dp std = 0.2439071454638305
real categories # = 76
price_binned_150  dp std = 0.24144138

> the best bin size param is 819200, which minimize internal group std

In [6]:
all_df = pd.concat([train,test]).reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [7]:
sorted_prices = sorted(all_df.price.values)

In [8]:
_, bins = pd.qcut(sorted_prices, 250, labels=None, retbins=True, precision=3, duplicates='drop')
all_df.loc[:, 'price_binned'] = \
    pd.cut(all_df.price, bins=bins, labels=np.arange(len(bins)-1), duplicates='drop', retbins=False)

In [9]:
all_df.loc[:train.shape[0]-1, 'price_binned'].value_counts()

17     117997
27      92121
12      68110
32      57018
9       49179
37      48719
14      44816
43      36756
6       36608
21      32724
23      32126
50      31712
19      31326
7       29005
40      28688
10      28340
13      25904
47      23856
61      20120
29      18434
45      16698
53      14450
55      13687
57      13179
25      12968
15      12853
49      12464
4       12018
68      12000
30      11026
        ...  
56       5334
22       5312
103      5268
95       5214
3        5180
91       5054
77       4992
79       4940
39       4912
1        4819
97       4721
0        4646
100      4618
106      4519
80       4156
24       3815
36       3622
26       2621
60       2512
65       2487
62       2053
41       1713
11       1011
46       1011
52        973
16        717
58        551
72        521
8         119
48         78
Name: price_binned, Length: 112, dtype: int64

In [10]:
all_df.loc[train.shape[0]:, 'price_binned'].value_counts()

17     36796
27     29031
12     21345
32     17775
9      16301
37     15573
14     14570
43     12276
6      12147
50     11136
21     10409
19     10017
23      9846
7       9768
40      9518
10      9422
13      8302
47      8198
61      6924
29      6060
45      5698
53      4975
55      4831
57      4649
49      4400
15      4236
25      4183
68      4140
4       4061
59      3974
       ...  
110     1920
56      1916
44      1913
1       1912
95      1888
76      1871
79      1820
22      1795
33      1766
97      1764
100     1712
77      1711
39      1686
106     1639
80      1591
36      1279
24      1275
65       965
60       932
26       891
62       813
41       647
11       357
52       339
46       332
16       265
58       209
72       183
8         46
48        25
Name: price_binned, Length: 112, dtype: int64

In [11]:
from sklearn.model_selection import KFold
fold_num = 10
kf = KFold(fold_num, shuffle=True, random_state=61458379)

mean_enc_cols = ['price_binned', 'item_seq_number']
train.loc[:, 'price_binned'] = all_df.loc[:train.shape[0]-1, 'price_binned']

for col in mean_enc_cols:    
    
    all_df.loc[:, col+'_dp_mean_enc'] = np.zeros((all_df.shape[0],))
    all_df.loc[:, col+'_dp_std_enc'] = np.zeros((all_df.shape[0],))
    
    for train_ix, val_ix in kf.split(train):
        tr_X = train.loc[train_ix, :]
    
        gp = tr_X.groupby(col)['deal_probability']
        mapping, mapping_std = gp.mean(), gp.std()
        
        all_df.loc[val_ix, col+'_dp_mean_enc'] = all_df.loc[val_ix, col].map(mapping)
        all_df.loc[val_ix, col+'_dp_std_enc'] = all_df.loc[val_ix, col].map(mapping_std)
        
        all_df.loc[train.shape[0]:, col+'_dp_mean_enc'] += all_df.loc[train.shape[0]:, col].map(mapping)
        all_df.loc[train.shape[0]:, col+'_dp_std_enc'] += all_df.loc[train.shape[0]:, col].map(mapping_std)
     
    all_df.loc[train.shape[0]:, col+'_dp_mean_enc'] /= fold_num
    all_df.loc[train.shape[0]:, col+'_dp_std_enc'] /= fold_num
    print(col + ' processed.')
    
encoded_cols = ['price_binned_dp_mean_enc', 'item_seq_number_dp_mean_enc', 
                'price_binned_dp_std_enc', 'item_seq_number_dp_std_enc']

all_df.loc[:,encoded_cols] = all_df[encoded_cols].fillna(0.)

price_binned processed.
item_seq_number processed.


In [12]:
all_df.loc[:train.shape[0]-1, encoded_cols+['price_binned']].to_csv('train_price_binned_item_seq_features.csv', index=False)
all_df.loc[train.shape[0]:, encoded_cols+['price_binned']].to_csv('test_price_binned_item_seq_features.csv', index=False)