In [1]:
import pandas as pd
import numpy as np
import gc; gc.enable()
import pickle
from tqdm import tqdm

In [2]:
train_X = pd.read_csv('data/train.csv', usecols = ['price', 'item_seq_number', 'deal_probability', 'activation_date'],
                    parse_dates=['activation_date']) \
            .sort_values('activation_date') \
            .drop('activation_date', axis=1) \
            .reset_index(drop=True)

test_X = pd.read_csv('data/test.csv', usecols = ['price', 'item_seq_number'])

train_X.head(1), test_X.head(1)

(   price  item_seq_number  deal_probability
 0  500.0               12               0.0,    price  item_seq_number
 0    NaN               66)

In [3]:
train_X.loc[:, 'price'] = train_X['price'].fillna(-1)
test_X.loc[:, 'price'] = test_X['price'].fillna(-1)

In [4]:
prefix = 'nmf' # lda or nmf
with open(prefix+'_features.pickle', 'rb') as handle: # nmf_features or lda_features
    features = pickle.load(handle)

In [5]:
features[:3,:]

array([[  0.00000000e+00,   3.05850260e-04,   2.36557387e-04,
          0.00000000e+00,   0.00000000e+00,   1.50118352e-14,
          2.26818574e-04,   0.00000000e+00,   0.00000000e+00,
          4.61121312e-04],
       [  0.00000000e+00,   7.43872466e-55,   2.22401801e-05,
          0.00000000e+00,   3.99799082e-09,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   2.56303215e-09,
          9.67677640e-04],
       [  5.41536183e-02,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00]])

In [6]:
features.shape

(2011862, 10)

In [7]:
features = 1./features.sum(axis=1).reshape((features.shape[0],1)) * features
features[:3,:]

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


array([[  0.00000000e+00,   2.48588510e-01,   1.92268754e-01,
          0.00000000e+00,   0.00000000e+00,   1.22012966e-11,
          1.84353256e-01,   0.00000000e+00,   0.00000000e+00,
          3.74789480e-01],
       [  0.00000000e+00,   7.51443726e-52,   2.24665445e-02,
          0.00000000e+00,   4.03868305e-06,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   2.58911913e-06,
          9.77526828e-01],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00]])

In [8]:
features = np.where(np.isnan(features), 0., features)

In [9]:
train_X.loc[:, prefix+'_topic'] = np.argmax(features[:train_X.shape[0],:], axis=1)
test_X.loc[:, prefix+'_topic'] = np.argmax(features[train_X.shape[0]:,:], axis=1)

In [10]:
feature = prefix+'_topic'

cnt_mapping = train_X[feature].value_counts()
train_X.loc[:, feature+'_cnt'] = train_X[feature].map(cnt_mapping)
test_X.loc[:, feature+'_cnt'] = test_X[feature].map(cnt_mapping)

In [11]:
gp = train_X.groupby(feature)

In [12]:
train_features = features[:train_X.shape[0], :]
test_features = features[train_X.shape[0]:, :]

for feature in ['price', 'item_seq_number']:
    mapping = gp[feature].apply(lambda x: x[x>=0].mean())
    train_X.loc[:, prefix+'_'+feature+'_avg'] = train_X[prefix+'_topic'].map(mapping)
    test_X.loc[:, prefix+'_'+feature+'_avg'] = test_X[prefix+'_topic'].map(mapping)
    train_X.loc[:, prefix+'_'+feature+'_weighted_avg'] = [(vf*mapping.values).sum() for vf in tqdm(train_features)]
    test_X.loc[:, prefix+'_'+feature+'_weighted_avg'] = [(vf*mapping.values).sum() for vf in tqdm(test_features)]
    
    mapping = gp[feature].apply(lambda x: x[x>=0].median())
    train_X.loc[:, prefix+'_'+feature+'_med'] = train_X[prefix+'_topic'].map(mapping)
    test_X.loc[:, prefix+'_'+feature+'_med'] = test_X[prefix+'_topic'].map(mapping)
    train_X.loc[:, prefix+'_'+feature+'_weighted_med'] = [(vf*mapping.values).sum() for vf in tqdm(train_features)]
    test_X.loc[:, prefix+'_'+feature+'_weighted_med'] = [(vf*mapping.values).sum() for vf in tqdm(test_features)]

100%|████████████████████████████████████████████████████████████████| 1503424/1503424 [00:06<00:00, 248828.86it/s]
100%|██████████████████████████████████████████████████████████████████| 508438/508438 [00:02<00:00, 227999.10it/s]
100%|████████████████████████████████████████████████████████████████| 1503424/1503424 [00:06<00:00, 246220.78it/s]
100%|██████████████████████████████████████████████████████████████████| 508438/508438 [00:02<00:00, 253585.02it/s]
100%|████████████████████████████████████████████████████████████████| 1503424/1503424 [00:06<00:00, 249738.21it/s]
100%|██████████████████████████████████████████████████████████████████| 508438/508438 [00:02<00:00, 252954.20it/s]
100%|████████████████████████████████████████████████████████████████| 1503424/1503424 [00:05<00:00, 254990.51it/s]
100%|██████████████████████████████████████████████████████████████████| 508438/508438 [00:02<00:00, 252702.80it/s]


In [13]:
from sklearn.model_selection import KFold
fold_num = 10
kf = KFold(fold_num, shuffle=True, random_state=610412898)

for col in [prefix+'_topic']:    
    
    train_X.loc[:, col+'_dp_mean_enc'] = np.zeros((train_X.shape[0],))
    train_X.loc[:, col+'_dp_std_enc'] = np.zeros((train_X.shape[0],))
    test_X.loc[:, col+'_dp_mean_enc'] = np.zeros((test_X.shape[0],))
    test_X.loc[:, col+'_dp_std_enc'] = np.zeros((test_X.shape[0],))
    
    for train_ix, val_ix in kf.split(train_X):
        tr_X = train_X.loc[train_ix, :]
        
        gp = tr_X.groupby(col)['deal_probability']
        mapping, mapping_std = gp.mean(), gp.std()
        
        train_X.loc[val_ix, col+'_dp_mean_enc'] = train_X.loc[val_ix, col].map(mapping).fillna(0.)
        train_X.loc[val_ix, col+'_dp_std_enc'] = train_X.loc[val_ix, col].map(mapping_std).fillna(0.)
        
        test_X.loc[:, col+'_dp_mean_enc'] += test_X.loc[:, col].map(mapping).fillna(0.)
        test_X.loc[:, col+'_dp_std_enc'] += test_X.loc[:, col].map(mapping_std).fillna(0.)
     
    
        # weighted scores:
        val_features = features[val_ix, :]
        train_X.loc[val_ix, col+'_weighted_dp_mean_enc'] = [(vf*mapping.values).sum() for vf in tqdm(val_features)]
        train_X.loc[val_ix, col+'_weighted_dp_std_enc'] = [(vf*mapping_std.values).sum() for vf in tqdm(val_features)]
        
        test_features = features[train_X.shape[0]:, :]
        test_X.loc[:, col+'_weighted_dp_mean_enc'] = [(vf*mapping.values).sum() for vf in tqdm(test_features)]
        test_X.loc[:, col+'_weighted_dp_std_enc'] = [(vf*mapping_std.values).sum() for vf in tqdm(test_features)]
        
        del gp
        
    test_X.loc[:, col+'_dp_mean_enc'] /= fold_num
    test_X.loc[:, col+'_dp_std_enc'] /= fold_num
    test_X.loc[:, col+'_weighted_dp_mean_enc'] /= fold_num
    test_X.loc[:, col+'_weighted_dp_std_enc'] /= fold_num
    print(col + ' processed.')

100%|██████████████████████████████████████████████████████████████████| 150343/150343 [00:00<00:00, 253957.77it/s]
100%|██████████████████████████████████████████████████████████████████| 150343/150343 [00:00<00:00, 253102.65it/s]
100%|██████████████████████████████████████████████████████████████████| 508438/508438 [00:01<00:00, 254856.12it/s]
100%|██████████████████████████████████████████████████████████████████| 508438/508438 [00:02<00:00, 249478.92it/s]
100%|██████████████████████████████████████████████████████████████████| 150343/150343 [00:00<00:00, 252677.30it/s]
100%|██████████████████████████████████████████████████████████████████| 150343/150343 [00:00<00:00, 249325.17it/s]
100%|██████████████████████████████████████████████████████████████████| 508438/508438 [00:02<00:00, 253332.34it/s]
100%|██████████████████████████████████████████████████████████████████| 508438/508438 [00:02<00:00, 247656.12it/s]
100%|██████████████████████████████████████████████████████████████████|

nmf_topic processed.


In [14]:
train_X.head(10)

Unnamed: 0,price,item_seq_number,deal_probability,nmf_topic,nmf_topic_cnt,nmf_price_avg,nmf_price_weighted_avg,nmf_price_med,nmf_price_weighted_med,nmf_item_seq_number_avg,nmf_item_seq_number_weighted_avg,nmf_item_seq_number_med,nmf_item_seq_number_weighted_med,nmf_topic_dp_mean_enc,nmf_topic_dp_std_enc,nmf_topic_weighted_dp_mean_enc,nmf_topic_weighted_dp_std_enc
0,500.0,12,0.0,9,634988,677368.253095,311690.084882,2500.0,1516.095906,1468.678421,698.547583,32.0,31.325511,0.157913,0.265723,0.138797,0.257145
1,100.0,5,0.0,9,634988,677368.253095,663505.527408,2500.0,2466.320741,1468.678421,1445.816785,32.0,32.067316,0.158004,0.265728,0.157665,0.265958
2,1500.0,32,0.0,0,125302,11946.124992,11946.124992,500.0,500.0,179.104691,179.104691,22.0,22.0,0.079857,0.198563,0.079857,0.198563
3,350.0,17,0.0,2,115796,60488.374426,249281.342483,1000.0,1351.659524,451.463401,556.419636,35.0,30.165295,0.143247,0.276358,0.153313,0.274284
4,1500.0,7,0.27307,8,85720,256313.503356,137574.502698,12000.0,6482.853389,145.192545,174.06925,12.0,23.635717,0.197375,0.284781,0.134587,0.241257
5,800.0,4,0.0,0,125302,11946.124992,11946.124992,500.0,500.0,179.104691,179.104691,22.0,22.0,0.079227,0.197704,0.079227,0.197704
6,399.0,71051,0.0,9,634988,677368.253095,634388.815016,2500.0,2792.946437,1468.678421,1356.423719,32.0,31.033366,0.157913,0.265723,0.15929,0.267029
7,950000.0,7,0.16934,9,634988,677368.253095,481503.725996,2500.0,5553.961891,1468.678421,874.053713,32.0,24.377251,0.157979,0.265693,0.171656,0.273393
8,-1.0,19,0.78503,9,634988,677368.253095,532868.418081,2500.0,2668.008209,1468.678421,1109.460155,32.0,29.864958,0.157776,0.265549,0.159463,0.269255
9,1.0,5,0.11508,9,634988,677368.253095,677224.953494,2500.0,2499.624011,1468.678421,1468.406513,32.0,32.000859,0.157928,0.265647,0.157908,0.265632


In [15]:
test_X.head(10)

Unnamed: 0,price,item_seq_number,nmf_topic,nmf_topic_cnt,nmf_price_avg,nmf_price_weighted_avg,nmf_price_med,nmf_price_weighted_med,nmf_item_seq_number_avg,nmf_item_seq_number_weighted_avg,nmf_item_seq_number_med,nmf_item_seq_number_weighted_med,nmf_topic_dp_mean_enc,nmf_topic_dp_std_enc,nmf_topic_weighted_dp_mean_enc,nmf_topic_weighted_dp_std_enc
0,-1.0,66,0,125302,11946.124992,0.0,500.0,0.0,179.104691,0.0,22.0,0.0,0.079629,0.198157,0.0,0.0
1,3000.0,4,8,85720,256313.503356,230801.157816,12000.0,8415.922271,145.192545,129.189484,12.0,15.910057,0.197872,0.28491,0.01862,0.028285
2,15000.0,15,6,215616,10395.670451,249911.808801,750.0,1721.152636,203.115242,505.813855,36.0,30.300881,0.06551,0.192153,0.012499,0.024194
3,4500.0,70,1,98963,178097.8917,177978.607977,1000.0,1000.0,95.962198,96.32276,24.0,24.011157,0.161182,0.277684,0.01616,0.027811
4,4900.0,15,1,98963,178097.8917,283933.648766,1000.0,3322.556198,95.962198,356.500905,24.0,23.236222,0.161182,0.277684,0.016773,0.027715
5,500.0,39,9,634988,677368.253095,418819.537499,2500.0,2404.11267,1468.678421,949.277036,32.0,32.313332,0.157909,0.265651,0.012751,0.024077
6,20990.0,57316,9,634988,677368.253095,435941.144776,2500.0,1896.757839,1468.678421,1049.539822,32.0,33.257134,0.157909,0.265651,0.014258,0.025918
7,990.0,851,9,634988,677368.253095,635236.653809,2500.0,3360.432652,1468.678421,1341.567065,32.0,30.186262,0.157909,0.265651,0.016158,0.026756
8,1200.0,10020,6,215616,10395.670451,271008.869687,750.0,1631.796956,203.115242,687.829914,36.0,34.016467,0.06551,0.192153,0.01034,0.022205
9,400.0,16,6,215616,10395.670451,48158.261113,750.0,927.560078,203.115242,240.977685,36.0,33.522723,0.06551,0.192153,0.010468,0.023189


In [16]:
train_X.isnull().sum()

price                               0
item_seq_number                     0
deal_probability                    0
nmf_topic                           0
nmf_topic_cnt                       0
nmf_price_avg                       0
nmf_price_weighted_avg              0
nmf_price_med                       0
nmf_price_weighted_med              0
nmf_item_seq_number_avg             0
nmf_item_seq_number_weighted_avg    0
nmf_item_seq_number_med             0
nmf_item_seq_number_weighted_med    0
nmf_topic_dp_mean_enc               0
nmf_topic_dp_std_enc                0
nmf_topic_weighted_dp_mean_enc      0
nmf_topic_weighted_dp_std_enc       0
dtype: int64

In [17]:
test_X.isnull().sum()

price                               0
item_seq_number                     0
nmf_topic                           0
nmf_topic_cnt                       0
nmf_price_avg                       0
nmf_price_weighted_avg              0
nmf_price_med                       0
nmf_price_weighted_med              0
nmf_item_seq_number_avg             0
nmf_item_seq_number_weighted_avg    0
nmf_item_seq_number_med             0
nmf_item_seq_number_weighted_med    0
nmf_topic_dp_mean_enc               0
nmf_topic_dp_std_enc                0
nmf_topic_weighted_dp_mean_enc      0
nmf_topic_weighted_dp_std_enc       0
dtype: int64

In [18]:
res_cols = test_X.columns.tolist()[2:]
res_cols

['nmf_topic',
 'nmf_topic_cnt',
 'nmf_price_avg',
 'nmf_price_weighted_avg',
 'nmf_price_med',
 'nmf_price_weighted_med',
 'nmf_item_seq_number_avg',
 'nmf_item_seq_number_weighted_avg',
 'nmf_item_seq_number_med',
 'nmf_item_seq_number_weighted_med',
 'nmf_topic_dp_mean_enc',
 'nmf_topic_dp_std_enc',
 'nmf_topic_weighted_dp_mean_enc',
 'nmf_topic_weighted_dp_std_enc']

In [19]:
train_X[res_cols].to_csv('train_'+prefix+'_features.csv', index=False)
test_X[res_cols].to_csv('test_'+prefix+'_features.csv', index=False)