In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm; tqdm.pandas()
# Widen the notebook display limits so wide/long frames are shown without truncation
# (up to 202 columns and 300 rows).
pd.set_option('display.max_columns', 202)
pd.set_option('display.max_rows', 300)

In [2]:
# Load the pickled train/test frames.
# NOTE(review): unpickling executes arbitrary code -- only load files you produced yourself.
train = pd.read_pickle('data/train_original.pkl')
test = pd.read_pickle('data/test_original.pkl')

# Feature columns are exactly the float64 columns of `train`; every other
# column is treated as "special" and kept out of the feature list.
feature_cols = [name for name, dtype in train.dtypes.items() if dtype == np.float64]
special_cols = [name for name in train.columns if name not in feature_cols]

In [3]:
def add_noise(series, noise_level, random_state=None):
    """Multiply every element of ``series`` by ``1 + noise_level * N(0, 1)``.

    Parameters
    ----------
    series : pd.Series
        Values to perturb. Index and name are carried over to the result.
    noise_level : float
        Scale of the relative Gaussian noise. ``0`` leaves the values unchanged.
    random_state : None, int or np.random.RandomState, optional
        Source of randomness. ``None`` (the default) draws from numpy's
        global generator exactly as before, so ``np.random.seed`` still
        controls the outcome. An int seed or a ``RandomState`` instance makes
        the call reproducible without touching global state.

    Returns
    -------
    pd.Series
        ``series * (1 + noise_level * z)`` with ``z`` standard normal.
    """
    if random_state is None:
        noise = np.random.randn(len(series))
    else:
        rng = (random_state
               if isinstance(random_state, np.random.RandomState)
               else np.random.RandomState(random_state))
        noise = rng.randn(len(series))
    # The noise is multiplicative: values equal to zero are never perturbed.
    return series * (1 + noise_level * noise)
                     
def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """Smoothed mean-target encoding of one feature for a train and a test series.

    For every distinct value of ``trn_series`` the mean of ``target`` is
    blended with the global target mean (the prior):

        weight   = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))
        encoding = prior * (1 - weight) + value_mean * weight

    so values seen many times keep their own mean while rare values fall back
    towards the prior. Values with no match in ``trn_series`` get the prior.

    Parameters
    ----------
    trn_series : pd.Series
        Training feature; the encoding table is estimated from it.
    tst_series : pd.Series
        Test feature; must carry the same ``name`` as ``trn_series``.
    target : pd.Series
        Training target, same length as ``trn_series`` (paired by index).
    min_samples_leaf : int
        Count at which a value's own mean and the prior are weighted 50/50.
    smoothing : float
        Steepness of the transition between prior and per-value mean.
    noise_level : float
        Relative Gaussian noise applied to BOTH outputs via ``add_noise``.
        When it is 0 no noise is drawn at all.

    Returns
    -------
    (pd.Series, pd.Series)
        Encoded train and test series. Each keeps the index of its input and
        is named ``'<feature name>_mean'`` (unnamed when the input is unnamed).

    Raises
    ------
    AssertionError
        If the lengths of ``trn_series`` and ``target`` differ, or the two
        feature series have different names.

    Notes
    -----
    The train encoding is computed from statistics that include each row's
    own target (no out-of-fold scheme); ``noise_level`` is the only
    regularisation against that leakage.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    # Work with fixed internal column names so the function also handles
    # unnamed series and a feature that shares its name with the target.
    key_col, target_col, avg_col = '__te_key__', '__te_target__', '__te_average__'
    out_name = None if trn_series.name is None else '{}_mean'.format(trn_series.name)

    temp = pd.concat([trn_series.rename(key_col), target.rename(target_col)], axis=1)
    # Per-value target mean and number of occurrences
    stats = temp.groupby(by=key_col)[target_col].agg(["mean", "count"])
    # Sigmoid weight: the bigger the count, the less the prior is taken into account
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    # Two-column lookup table (value -> smoothed mean), built once for both merges
    lookup = (prior * (1 - weight) + stats["mean"] * weight).rename(avg_col).reset_index()

    def _encode(series):
        """Look up the smoothed mean for every element of ``series``."""
        encoded = pd.merge(
            series.to_frame(key_col),
            lookup,
            on=key_col,
            how='left')[avg_col].fillna(prior)
        encoded.name = out_name
        # pd.merge does not keep the index so restore it
        encoded.index = series.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)

    if not noise_level:
        # Zero noise multiplies by exactly 1, so skip drawing random numbers.
        return ft_trn_series, ft_tst_series
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [5]:
# DISABLED EXPERIMENT: the whole cell body below is a single triple-quoted
# string, so none of it executes. The "Processing var_..." lines and the
# *_me table rendered under this cell cannot come from the string and are
# therefore left over from an earlier run when the code was live.
#
# What the disabled code does: target-encodes each column in `target_cols`
# separately (min_samples_leaf=100, smoothing=10, 1% noise), adds row-wise
# max/min/sum/std over the encoded columns and pickles both frames under
# 'features/'.
#
# Caution if re-enabled: it rebinds the global `feature_cols` to the *_me
# column names, which the later cells in this notebook rely on.
'''
th = 100
trn_res_df = pd.DataFrame(index=train.index)
tst_res_df = pd.DataFrame(index=test.index)

target_cols = ['var_68', 'var_91', 'var_108', 'var_103','var_12',
 'var_161','var_148', 'var_71', 'var_43', 'var_25',
 'var_125', 'var_166', 'var_169', 'var_15', 'var_133',
 'var_131', 'var_34', 'var_23', 'var_93', 'var_95',
 'var_42', 'var_126', 'var_50', 'var_53', 'var_98',
 'var_57', 'var_28', 'var_130', 'var_59', 'var_6',
 'var_156', 'var_105', 'var_144', 'var_197', 'var_189',
 'var_114', 'var_111', 'var_66', 'var_9', 'var_79',
 'var_181', 'var_162', 'var_195', 'var_132', 'var_64',
 'var_192', 'var_27', 'var_112', 'var_4', 'var_116',
 'var_121']
for col in target_cols:
    
    print('Processing {}'.format(col))
    trn_me, tst_me = target_encode(train[col], 
                                    test[col], 
                                    target=train.target,  
                                    min_samples_leaf=th,
                                    smoothing=10,
                                    noise_level=0.01)
    
    trn_res_df[col+'_me'] = trn_me
    tst_res_df[col+'_me'] = tst_me

feature_cols = trn_res_df.columns.tolist()
for df in [trn_res_df, tst_res_df]:
    df['target_encode_max'] = df[feature_cols].max(axis=1)
    df['target_encode_min'] = df[feature_cols].min(axis=1)
    df['target_encode_sum'] = df[feature_cols].sum(axis=1)
    df['target_encode_std'] = df[feature_cols].std(axis=1)
    
trn_res_df.to_pickle('features/top_{}_common_target_encode_train'.format(len(target_cols)))
tst_res_df.to_pickle('features/top_{}_common_target_encode_test'.format(len(target_cols)))
trn_res_df.head(10)
'''

Processing var_68
Processing var_91
Processing var_108
Processing var_103
Processing var_12
Processing var_161
Processing var_148
Processing var_71
Processing var_43
Processing var_25
Processing var_125
Processing var_166
Processing var_169
Processing var_15
Processing var_133
Processing var_131
Processing var_34
Processing var_23
Processing var_93
Processing var_95
Processing var_42
Processing var_126
Processing var_50
Processing var_53
Processing var_98
Processing var_57
Processing var_28
Processing var_130
Processing var_59
Processing var_6
Processing var_156
Processing var_105
Processing var_144
Processing var_197
Processing var_189
Processing var_114
Processing var_111
Processing var_66
Processing var_9
Processing var_79
Processing var_181
Processing var_162
Processing var_195
Processing var_132
Processing var_64
Processing var_192
Processing var_27
Processing var_112
Processing var_4
Processing var_116
Processing var_121


Unnamed: 0,var_68_me,var_91_me,var_108_me,var_103_me,var_12_me,var_161_me,var_148_me,var_71_me,var_43_me,var_25_me,var_125_me,var_166_me,var_169_me,var_15_me,var_133_me,var_131_me,var_34_me,var_23_me,var_93_me,var_95_me,var_42_me,var_126_me,var_50_me,var_53_me,var_98_me,var_57_me,var_28_me,var_130_me,var_59_me,var_6_me,var_156_me,var_105_me,var_144_me,var_197_me,var_189_me,var_114_me,var_111_me,var_66_me,var_9_me,var_79_me,var_181_me,var_162_me,var_195_me,var_132_me,var_64_me,var_192_me,var_27_me,var_112_me,var_4_me,var_116_me,var_121_me,target_encode_max,target_encode_min,target_encode_sum,target_encode_std
0,0.086476,0.099314,0.101245,0.100912,0.099752,0.100054,0.100586,0.101377,0.099165,0.100512,0.100509,0.101263,0.10067,0.100775,0.09988,0.099535,0.099288,0.10123,0.102293,0.10061,0.098875,0.10027,0.101069,0.097741,0.099678,0.101364,0.10092,0.100237,0.099616,0.101553,0.101275,0.102064,0.100187,0.100845,0.100892,0.101739,0.102751,0.100869,0.098749,0.099637,0.100405,0.099675,0.100877,0.101472,0.098083,0.100703,0.099339,0.099954,0.100433,0.10109,0.100801,0.102751,0.086476,5.108607,0.002198
1,0.091794,0.100437,0.103139,0.100567,0.099708,0.099371,0.100746,0.100808,0.099647,0.100058,0.102566,0.098296,0.097892,0.099549,0.101017,0.100898,0.098612,0.100228,0.099651,0.09827,0.099674,0.100044,0.10046,0.100409,0.100249,0.099029,0.099644,0.100011,0.100206,0.100874,0.099201,0.099666,0.098426,0.100575,0.100285,0.099815,0.099498,0.098234,0.100163,0.101436,0.102298,0.098986,0.102303,0.100237,0.100558,0.101443,0.099749,0.100096,0.099601,0.101248,0.102043,0.103139,0.091794,5.099713,0.001626
2,0.075303,0.102238,0.100432,0.100187,0.097975,0.101399,0.100325,0.099397,0.100502,0.10088,0.100134,0.101131,0.098995,0.098955,0.099722,0.100054,0.101658,0.100231,0.100224,0.10081,0.099217,0.102365,0.100216,0.101171,0.101109,0.101108,0.100027,0.099936,0.100362,0.098909,0.100405,0.099996,0.099953,0.100414,0.099474,0.099462,0.098792,0.10084,0.099766,0.097858,0.099151,0.099778,0.100668,0.100627,0.099999,0.101223,0.100517,0.100882,0.100883,0.100217,0.100323,0.102365,0.075303,5.086201,0.003606
3,0.091867,0.101359,0.099624,0.101679,0.100256,0.100617,0.099642,0.100781,0.100387,0.102331,0.098818,0.101642,0.101018,0.09879,0.099986,0.101227,0.102611,0.099268,0.10069,0.099756,0.099884,0.100424,0.099338,0.09975,0.100805,0.102001,0.101408,0.100148,0.100244,0.1006,0.100838,0.099959,0.100413,0.101211,0.100121,0.101637,0.10087,0.101697,0.098188,0.100987,0.098522,0.100577,0.101402,0.101008,0.100968,0.101015,0.099535,0.102358,0.099678,0.101051,0.098916,0.102611,0.091867,5.1179,0.001573
4,0.089689,0.102239,0.09938,0.099332,0.099329,0.101423,0.100819,0.099332,0.098679,0.101002,0.09977,0.098812,0.102181,0.101691,0.102449,0.100234,0.101865,0.099579,0.099775,0.100554,0.100105,0.100566,0.100799,0.10012,0.099301,0.100865,0.098843,0.100848,0.100769,0.100517,0.099422,0.100867,0.100758,0.100994,0.098144,0.099779,0.100778,0.099966,0.100977,0.100972,0.101182,0.10176,0.098111,0.101763,0.10015,0.100962,0.103216,0.099525,0.100276,0.098929,0.100549,0.103216,0.089689,5.109947,0.001864
5,0.120055,0.100664,0.100977,0.10022,0.101698,0.098017,0.098092,0.09989,0.099075,0.099537,0.100122,0.100466,0.10015,0.099722,0.099666,0.10046,0.102061,0.100184,0.101553,0.100519,0.101192,0.100436,0.099478,0.100609,0.100968,0.100009,0.102206,0.100571,0.101151,0.101248,0.099533,0.100806,0.099544,0.100407,0.100131,0.102147,0.100956,0.100126,0.098989,0.100527,0.099014,0.10011,0.100573,0.099104,0.100676,0.100894,0.100192,0.101152,0.097673,0.100489,0.101179,0.120055,0.097673,5.135218,0.002931
6,0.109932,0.100532,0.09864,0.10141,0.100226,0.102977,0.097968,0.099621,0.100486,0.100309,0.100105,0.101565,0.100507,0.100304,0.099319,0.100284,0.099934,0.100665,0.100672,0.098194,0.100731,0.101124,0.099211,0.1002,0.100573,0.100325,0.101013,0.100744,0.100481,0.099971,0.101552,0.100026,0.099699,0.099973,0.101617,0.100197,0.101139,0.102384,0.100887,0.100624,0.09988,0.100861,0.100584,0.098877,0.100132,0.100862,0.10088,0.099003,0.099029,0.100266,0.099135,0.109932,0.097968,5.125627,0.001647
7,0.087062,0.10074,0.09875,0.101207,0.101351,0.098985,0.099203,0.101418,0.098368,0.099418,0.10106,0.099597,0.100005,0.100382,0.101949,0.100969,0.099901,0.101643,0.0984,0.100523,0.101054,0.100804,0.099399,0.100217,0.102472,0.100325,0.100078,0.098995,0.099678,0.100401,0.099753,0.101355,0.098749,0.101086,0.100981,0.099318,0.102321,0.099654,0.102296,0.098292,0.099632,0.09942,0.101258,0.100018,0.101656,0.100126,0.100278,0.099723,0.098982,0.099863,0.101257,0.102472,0.087062,5.100371,0.002136
8,0.108113,0.101182,0.100407,0.100817,0.099692,0.101123,0.099428,0.101833,0.100316,0.102799,0.099917,0.101289,0.101255,0.10206,0.100695,0.098992,0.102275,0.100327,0.10177,0.099947,0.099728,0.101904,0.09939,0.100294,0.101172,0.099769,0.099175,0.100797,0.100415,0.101198,0.100716,0.099446,0.100356,0.101053,0.099555,0.098223,0.099923,0.100698,0.101493,0.099225,0.10237,0.100755,0.1019,0.099364,0.100089,0.099528,0.100047,0.099273,0.100392,0.102231,0.09963,0.108113,0.098223,5.134349,0.001475
9,0.099275,0.0993,0.100321,0.100356,0.09952,0.101438,0.101244,0.101188,0.099966,0.100082,0.101135,0.101159,0.100211,0.098207,0.101728,0.100372,0.099896,0.100896,0.102218,0.099493,0.098796,0.100239,0.099687,0.101101,0.101127,0.100542,0.10064,0.101512,0.101338,0.100962,0.098649,0.099437,0.098747,0.098999,0.101316,0.101776,0.100632,0.099622,0.100946,0.100372,0.100284,0.10009,0.101796,0.101543,0.101098,0.100722,0.099821,0.101095,0.102372,0.100508,0.098819,0.102372,0.098207,5.122592,0.000984


In [5]:
# Stack every feature value of every training row into one long column.
# The order is row-major: all features of row 0, then all features of row 1, ...
# The row's target is repeated once per feature so both columns stay aligned.
# np.repeat / ravel do this in vectorised form instead of building one Python
# list per training row.
flatten_train = pd.DataFrame({
    'original_values': train[feature_cols].values.ravel(),
    'target': np.repeat(train['target'].values, len(feature_cols)),
})
print(flatten_train.shape)
flatten_train.head(3)

(40000000, 2)


Unnamed: 0,original_values,target
0,8.9255,0
1,-6.7863,0
2,11.9081,0


In [6]:
# Show the first raw training row as a reference for the column layout
# before any column is overwritten further down.
train.head(1)

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50,var_51,var_52,var_53,var_54,var_55,var_56,var_57,var_58,var_59,var_60,var_61,var_62,var_63,var_64,var_65,var_66,var_67,var_68,var_69,var_70,var_71,var_72,var_73,var_74,var_75,var_76,var_77,var_78,var_79,var_80,var_81,var_82,var_83,var_84,var_85,var_86,var_87,var_88,var_89,var_90,var_91,var_92,var_93,var_94,var_95,var_96,var_97,var_98,var_99,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109,var_110,var_111,var_112,var_113,var_114,var_115,var_116,var_117,var_118,var_119,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128,var_129,var_130,var_131,var_132,var_133,var_134,var_135,var_136,var_137,var_138,var_139,var_140,var_141,var_142,var_143,var_144,var_145,var_146,var_147,var_148,var_149,var_150,var_151,var_152,var_153,var_154,var_155,var_156,var_157,var_158,var_159,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_167,var_168,var_169,var_170,var_171,var_172,var_173,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,2.9252,3.1821,14.0137,0.5745,8.7989,14.5691,5.7487,-7.2393,4.284,30.7133,10.535,16.2191,2.5791,2.4716,14.3831,13.4325,-5.1488,-0.4073,4.9306,5.9965,-0.3085,12.9041,-3.8766,16.8911,11.192,10.5785,0.6764,7.8871,4.6667,3.8743,-5.2387,7.3746,11.5767,12.0446,11.6418,-7.017,5.9226,-14.2136,16.0283,5.3253,12.9194,29.046,-0.694,5.1736,-0.7474,14.8322,11.2668,5.3822,2.0183,10.1166,16.1828,4.959,2.0771,-0.2154,8.6748,9.5319,5.8056,22.4321,5.0109,-4.701,21.6374,0.5663,5.1999,8.86,43.1127,18.3816,-2.344,23.4104,6.5199,12.1983,13.6468,13.8372,1.3675,2.9423,-4.5213,21.4669,9.3225,16.4597,7.9984,-1.7069,-21.4494,6.7806,11.0924,9.9913,14.8421,0.1812,8.9642,16.2572,2.1743,-3.4132,9.4763,13.3102,26.5376,1.4403,14.71,6.0454,9.5426,17.1554,14.1104,24.3627,2.0323,6.7602,3.9141,-0.4851,2.524,1.5093,2.5516,15.5752,-13.4221,7.2739,16.0094,9.7268,0.8897,0.7754,4.2218,12.0039,13.8571,-0.7338,-1.9245,15.4462,12.8287,0.3587,9.6508,6.5674,5.1726,3.1345,29.4547,31.4045,2.8279,15.6599,8.3307,-5.6011,19.0614,11.2663,8.6989,8.3694,11.5659,-16.4727,4.0288,17.9244,18.5177,10.78,9.0056,16.6964,10.4838,1.6573,12.1749,-13.1324,17.6054,11.5423,15.4576,5.3133,3.6159,5.0384,6.676,12.6644,2.7004,-0.6975,9.5981,5.4879,-4.7645,-8.4254,20.8773,3.1531,18.5618,7.7423,-10.1245,13.7241,-3.5189,1.7202,-8.4051,9.0164,3.0657,14.3691,25.8398,5.8764,11.8411,-19.7159,17.5743,0.5857,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914


In [7]:
# Same row-major flattening as for the training frame, without a target column.
# ravel() flattens the 2-D value array directly (and also copes with an empty
# frame) instead of concatenating it row by row.
flatten_test = pd.DataFrame({
    'original_values': test[feature_cols].values.ravel(),
})

In [8]:
# Fraction of flattened train values that also occur somewhere in test.
flatten_train['original_values'].isin(flatten_test['original_values']).mean()

0.99359275

In [9]:
# Fraction of flattened test values that also occur somewhere in train.
flatten_test['original_values'].isin(flatten_train['original_values']).mean()

0.9970897

In [10]:
# Target-encode the pooled values of ALL features at once: a value is treated
# as the same category regardless of which var_* column it came from.
trn_me, tst_me = target_encode(
    trn_series=flatten_train['original_values'],
    tst_series=flatten_test['original_values'],
    target=flatten_train['target'],
    min_samples_leaf=500,
    smoothing=10,
    noise_level=0.01,
)

In [11]:
# The 30 largest encoded values in the flattened training data.
trn_me.sort_values(ascending=False).head(30)

25363668    0.139258
22747868    0.138860
21988468    0.138798
23594468    0.138401
27779068    0.138266
10093468    0.138256
34205468    0.138222
17296468    0.138150
29091579    0.138148
8501268     0.138018
34435868    0.137926
25313268    0.137901
27953068    0.137898
2195668     0.137774
26693668    0.137730
27057468    0.137693
6910468     0.137673
38521268    0.137667
31688268    0.137584
15934414    0.137541
12564712    0.137487
27764068    0.137361
15873868    0.137332
17186868    0.137247
22288068    0.137242
31699006    0.137193
9929468     0.137106
11011068    0.137102
39852268    0.137095
5839868     0.137072
Name: original_values_mean, dtype: float64

In [12]:
# Attach the encodings positionally (plain arrays, no index alignment).
flatten_train['target_enc'] = np.asarray(trn_me)
flatten_test['target_enc'] = np.asarray(tst_me)

In [13]:
# One encoding per distinct raw value: because noise was added per row, the
# rows sharing a value are averaged to obtain a single lookup entry.
trn_me_mapping = flatten_train['target_enc'].groupby(flatten_train['original_values']).mean()
tst_me_mapping = flatten_test['target_enc'].groupby(flatten_test['original_values']).mean()

In [14]:
# Replace every raw feature value by its encoding, column by column, using the
# value -> encoding lookups built from the flattened frames.
# WARNING: this overwrites `train` / `test` in place, so the cell is not
# idempotent. Re-running it without reloading the data looks up the already
# encoded values in the raw-value mappings; Series.map returns NaN for keys it
# cannot find, so that would most likely blank the columns.
for col in tqdm(feature_cols):
    train[col] = train[col].map(trn_me_mapping)
    test[col] = test[col].map(tst_me_mapping)

100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:09<00:00, 22.01it/s]


In [15]:
# Inspect the first rows after the mapping above: the feature columns now hold
# encoded values instead of the raw feature values.
train.head(3)

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50,var_51,var_52,var_53,var_54,var_55,var_56,var_57,var_58,var_59,var_60,var_61,var_62,var_63,var_64,var_65,var_66,var_67,var_68,var_69,var_70,var_71,var_72,var_73,var_74,var_75,var_76,var_77,var_78,var_79,var_80,var_81,var_82,var_83,var_84,var_85,var_86,var_87,var_88,var_89,var_90,var_91,var_92,var_93,var_94,var_95,var_96,var_97,var_98,var_99,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109,var_110,var_111,var_112,var_113,var_114,var_115,var_116,var_117,var_118,var_119,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128,var_129,var_130,var_131,var_132,var_133,var_134,var_135,var_136,var_137,var_138,var_139,var_140,var_141,var_142,var_143,var_144,var_145,var_146,var_147,var_148,var_149,var_150,var_151,var_152,var_153,var_154,var_155,var_156,var_157,var_158,var_159,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_167,var_168,var_169,var_170,var_171,var_172,var_173,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,0.100587,0.100537,0.100501,0.100336,0.100516,0.100489,0.100426,0.100376,0.100503,0.100414,0.10047,0.100469,0.100522,0.10055,0.100468,0.100582,0.100467,0.100213,0.100446,0.099931,0.100385,0.100383,0.100465,0.100567,0.100482,0.100439,0.10023,0.100529,0.10057,0.100464,0.10057,0.100509,0.100331,0.100451,0.100465,0.100617,0.100506,0.100385,0.100472,0.100414,0.100368,0.100491,0.100353,0.100471,0.100327,0.100354,0.100535,0.100236,0.100323,0.100537,0.100459,0.100704,0.100483,0.100427,0.100567,0.100483,0.100452,0.100386,0.100623,0.100579,0.10051,0.100394,0.100306,0.100408,0.100483,0.10048,0.100366,0.10073,0.091127,0.100412,0.100603,0.100452,0.100571,0.100618,0.102697,0.10047,0.100292,0.100185,0.100523,0.100495,0.100417,0.100489,0.100453,0.100636,0.100555,0.100578,0.100335,0.100512,0.100542,0.100446,0.101316,0.100508,0.100535,0.100439,0.100416,0.100501,0.100513,0.100542,0.100402,0.100663,0.100428,0.100513,0.100565,0.100542,0.100433,0.100504,0.100418,0.10038,0.100593,0.100249,0.100632,0.100462,0.100538,0.100472,0.100627,0.100428,0.100306,0.10071,0.100279,0.1004,0.100549,0.10041,0.100468,0.100481,0.100504,0.100507,0.100395,0.100537,0.100422,0.100475,0.100374,0.100532,0.10052,0.100533,0.100516,0.100474,0.100954,0.100267,0.100514,0.100553,0.100509,0.100367,0.100219,0.10047,0.100398,0.100399,0.100535,0.100605,0.100481,0.100392,0.100467,0.100394,0.100521,0.100432,0.100473,0.100533,0.100591,0.100427,0.100647,0.100538,0.100599,0.100389,0.100486,0.100441,0.100406,0.100398,0.100371,0.10049,0.100429,0.100547,0.100696,0.100374,0.100477,0.100576,0.100485,0.100315,0.100559,0.100534,0.100577,0.100351,0.100767,0.100671,0.100411,0.100525,0.100615,0.100558,0.100462,0.100413,0.100578,0.100416,0.100529,0.100474,0.100599,0.100511,0.100297,0.100654,0.10045,0.100515,0.100454,0.100348
1,train_1,0,0.100425,0.100389,0.100605,0.100402,0.100485,0.100502,0.100562,0.100328,0.1006,0.100432,0.100606,0.100545,0.100486,0.100445,0.100519,0.10052,0.100343,0.100723,0.100467,0.100383,0.10048,0.100674,0.100449,0.100512,0.100482,0.100509,0.100912,0.100602,0.100456,0.100504,0.100479,0.100485,0.100502,0.100527,0.100526,0.100588,0.100428,0.100432,0.100564,0.100495,0.100569,0.100446,0.100421,0.100497,0.100462,0.100044,0.100539,0.100526,0.100889,0.100392,0.10048,0.100429,0.100434,0.100444,0.10027,0.100594,0.10048,0.100467,0.100448,0.100576,0.100505,0.100614,0.100465,0.100495,0.100384,0.10038,0.100509,0.100508,0.094772,0.10049,0.100757,0.100402,0.100514,0.100385,0.10062,0.100522,0.100435,0.100781,0.100542,0.100456,0.100395,0.100562,0.100625,0.10067,0.10039,0.100447,0.100649,0.100495,0.100519,0.10055,0.100646,0.100499,0.100393,0.100382,0.100467,0.100442,0.100596,0.100636,0.100448,0.100549,0.100434,0.10038,0.100264,0.100558,0.100254,0.100269,0.100608,0.100399,0.100491,0.10044,0.100631,0.100545,0.100539,0.100502,0.100486,0.100387,0.100548,0.100721,0.10048,0.100552,0.100517,0.100464,0.100444,0.100409,0.100568,0.100482,0.100237,0.100424,0.100552,0.100455,0.100509,0.100372,0.100556,0.100444,0.100508,0.100353,0.100434,0.100714,0.100407,0.100389,0.10059,0.100535,0.100503,0.100511,0.100544,0.100481,0.100615,0.10052,0.100597,0.100551,0.100512,0.100585,0.100483,0.100532,0.10051,0.100606,0.100455,0.100701,0.100689,0.100447,0.100366,0.100435,0.100442,0.100272,0.100516,0.100432,0.100511,0.100333,0.100391,0.100411,0.100449,0.100507,0.100622,0.100423,0.100322,0.100472,0.100796,0.100394,0.100643,0.100341,0.100379,0.100497,0.100989,0.100351,0.100587,0.100408,0.100574,0.100401,0.10041,0.100439,0.100465,0.100492,0.100521,0.1006,0.100327,0.100592,0.100486,0.100456,0.100443,0.100432
2,train_2,0,0.100559,0.100564,0.100586,0.100514,0.100544,0.100696,0.10053,0.100473,0.100095,0.100561,0.100639,0.100165,0.100517,0.100463,0.100619,0.100541,0.10056,0.100464,0.100422,0.100739,0.100488,0.100368,0.100529,0.100532,0.10036,0.100568,0.100453,0.10064,0.100532,0.100383,0.100336,0.10044,0.100552,0.100368,0.100421,0.100568,0.100508,0.100429,0.10045,0.100373,0.100662,0.100487,0.100445,0.100527,0.100493,0.100201,0.100559,0.100409,0.100305,0.100503,0.100429,0.100782,0.100484,0.100658,0.10062,0.100509,0.100481,0.100478,0.100429,0.100591,0.100605,0.100567,0.100424,0.100609,0.10042,0.10049,0.100391,0.100573,0.072664,0.100487,0.100454,0.100408,0.100516,0.100483,0.100676,0.100351,0.100515,0.100701,0.10055,0.100506,0.100517,0.100443,0.100448,0.100556,0.100378,0.100506,0.100583,0.100364,0.100438,0.100549,0.10015,0.100371,0.100473,0.100537,0.100508,0.100443,0.100571,0.100596,0.100493,0.100375,0.100354,0.100556,0.100511,0.100394,0.100628,0.100454,0.100564,0.100455,0.100496,0.100432,0.100509,0.100467,0.100347,0.100429,0.100423,0.100338,0.100403,0.100077,0.100241,0.100377,0.100614,0.100414,0.100423,0.100315,0.100433,0.100618,0.100527,0.100428,0.100458,0.100221,0.100489,0.100482,0.100445,0.100486,0.100449,0.100571,0.100305,0.100442,0.100493,0.100486,0.100488,0.100483,0.100504,0.100375,0.100452,0.100538,0.100424,0.100475,0.10054,0.100413,0.10041,0.100446,0.100453,0.100655,0.100217,0.10056,0.100595,0.100425,0.099961,0.10045,0.100517,0.100549,0.100527,0.100355,0.100605,0.100339,0.100643,0.100608,0.100462,0.100526,0.10069,0.100272,0.100469,0.10046,0.100484,0.100477,0.100536,0.100558,0.100534,0.100475,0.100549,0.100602,0.100274,0.100391,0.100337,0.100533,0.100436,0.100488,0.100276,0.100496,0.100498,0.100489,0.100366,0.100563,0.100593,0.100536,0.100329,0.100493,0.100456,0.100462


In [16]:
# Row-wise summary statistics over the encoded feature columns, added in place
# to both frames. Run this before `feature_cols` is extended with the names of
# these statistics, otherwise they would feed back into themselves.
for df in (train, test):
    df['target_encode_max'] = df.loc[:, feature_cols].max(axis=1)
    df['target_encode_min'] = df.loc[:, feature_cols].min(axis=1)
    df['target_encode_sum'] = df.loc[:, feature_cols].sum(axis=1)
    df['target_encode_std'] = df.loc[:, feature_cols].std(axis=1)

In [17]:
# Register the row-wise statistics as features. Names already present are
# skipped so re-running this cell does not append duplicate column names.
feature_cols += [
    name
    for name in ('target_encode_max', 'target_encode_min', 'target_encode_sum', 'target_encode_std')
    if name not in feature_cols
]

In [18]:
# Spearman rank correlation between the target and the four row-wise statistics.
train.loc[
    :,
    ['target', 'target_encode_sum', 'target_encode_max', 'target_encode_min', 'target_encode_std'],
].corr(method='spearman')

Unnamed: 0,target,target_encode_sum,target_encode_max,target_encode_min,target_encode_std
target,1.0,0.031171,0.021625,0.025723,-0.001871
target_encode_sum,0.031171,1.0,0.742015,0.796106,-0.072921
target_encode_max,0.021625,0.742015,1.0,0.396267,0.386386
target_encode_min,0.025723,0.796106,0.396267,1.0,-0.475371
target_encode_std,-0.001871,-0.072921,0.386386,-0.475371,1.0


In [19]:
# Spearman correlation of every feature with the target, computed on the first
# 10,000 rows only; just the 'target' column of the matrix is displayed.
train[['target'] + feature_cols].head(10000).corr(method='spearman')[['target']]

Unnamed: 0,target
target,1.0
var_0,0.007825
var_1,-0.018439
var_2,0.007362
var_3,0.002949
var_4,-0.007872
var_5,0.017825
var_6,0.012781
var_7,0.006978
var_8,0.000224


In [20]:
# Persist only the feature columns (encoded vars plus row-wise statistics);
# ID_code and target are not written.
train.loc[:, feature_cols].to_pickle('features/all_items_target_encode_train_v2.pkl')
test.loc[:, feature_cols].to_pickle('features/all_items_target_encode_test_v2.pkl')