## Feature selection


In [1]:
import pandas as pd
import numpy as np
import os 
from pathlib import Path

from datetime import datetime, timedelta
import time 
from dateutil.relativedelta import relativedelta

from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
import pyarrow.parquet as pq
import pyarrow as pa

In [3]:
import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_rows=999
pd.options.display.max_columns=999

In [4]:
# Location of the aggregated training set, relative to the working directory.
# Built with os.path.join so the separator matches the host OS: the previous
# hard-coded backslash only resolved on Windows (elsewhere it was read as a
# single file name containing a backslash). The value is still a plain str.
train_file = os.path.join("amex", "agg_train_all_rev.parquet")

In [5]:
%%time
# Read the aggregated training frame from parquet using the pyarrow engine;
# the %%time cell magic above reports how long the load takes.
df=pd.read_parquet(train_file, engine='pyarrow')


Wall time: 4.63 s


In [6]:
# Base columns that have a log-transformed companion column (see log_feats).
# Spelled as numeric suffixes per family; the resulting order is significant
# because positional indices into the feature list are reported later on.
to_log_feats = (
    [f'B_{n}' for n in (11, 12, 13, 21, 22, 23, 24, 26, 27, 28, 29, 3, 32, 36,
                        4, 40, 41, 42, 5, 9)]
    + [f'D_{n}' for n in (106, 107, 108, 109, 113, 115, 118, 119, 123, 125, 131, 133,
                          135, 136, 137, 138, 140,
                          39, 41, 43, 44, 45, 49, 51)]
)

In [7]:
# Small constant; not referenced in this cell.
# NOTE(review): presumably the offset used when the log columns were created
# upstream -- confirm before relying on it.
eps = 1e-8

# Names of the log-transformed columns: 'log_' + each base column name.
log_feats = [f'log_{name}' for name in to_log_feats]

### Feature groups (float, categorical and log-transformed columns)

In [8]:
# Every column of the aggregated frame apart from the log_* columns and the
# target, in the fixed order used throughout this notebook.
all_cols = [
    'cnt',
    # B_* columns (one-hot style counts carry an '=<level>' suffix)
    'B_1', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19',
    'B_2', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_26', 'B_27', 'B_28', 'B_29',
    'B_3', 'B_30=0.0', 'B_30=1.0', 'B_30=2.0', 'B_31=0', 'B_31=1', 'B_32', 'B_33', 'B_36', 'B_37',
    'B_38=1.0', 'B_38=2.0', 'B_38=3.0', 'B_38=4.0', 'B_38=5.0', 'B_38=6.0', 'B_38=7.0',
    'B_39', 'B_4', 'B_40', 'B_41', 'B_42', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9',
    # D_* columns
    'D_102', 'D_103', 'D_104', 'D_105', 'D_106', 'D_107', 'D_108', 'D_109', 'D_110', 'D_111',
    'D_112', 'D_113', 'D_114=0.0', 'D_114=1.0', 'D_115', 'D_116=0.0', 'D_116=1.0',
    'D_117=-1.0', 'D_117=1.0', 'D_117=2.0', 'D_117=3.0', 'D_117=4.0', 'D_117=5.0', 'D_117=6.0',
    'D_118', 'D_119', 'D_120=0.0', 'D_120=1.0', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125',
    'D_126=-1.0', 'D_126=0.0', 'D_126=1.0', 'D_127', 'D_128', 'D_129', 'D_130', 'D_131', 'D_132',
    'D_133', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142',
    'D_143', 'D_144', 'D_145',
    'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50',
    'D_51', 'D_52', 'D_53', 'D_54', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62',
    'D_63=CL', 'D_63=CO', 'D_63=CR', 'D_63=XL', 'D_63=XM', 'D_63=XZ',
    'D_64=-1', 'D_64=O', 'D_64=R', 'D_64=U', 'D_65', 'D_66=0.0', 'D_66=1.0',
    'D_68=0.0', 'D_68=1.0', 'D_68=2.0', 'D_68=3.0', 'D_68=4.0', 'D_68=5.0', 'D_68=6.0',
    'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_75', 'D_76', 'D_77', 'D_78', 'D_79',
    'D_80', 'D_81', 'D_82', 'D_83', 'D_84', 'D_86', 'D_87', 'D_88', 'D_89', 'D_91', 'D_92',
    'D_93', 'D_94', 'D_96',
    # P_* columns
    'P_2', 'P_3', 'P_4',
    # R_* columns
    'R_1', 'R_10', 'R_11', 'R_12', 'R_13', 'R_14', 'R_15', 'R_16', 'R_17', 'R_18', 'R_19',
    'R_2', 'R_20', 'R_21', 'R_22', 'R_23', 'R_24', 'R_25', 'R_26', 'R_27', 'R_28',
    'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8', 'R_9',
    # S_* columns, then identifier and span columns
    'S_11', 'S_12', 'S_13', 'S_15', 'S_16', 'S_17', 'S_18', 'S_19', 'S_20', 'S_22', 'S_23',
    'S_24', 'S_25', 'S_26', 'S_27', 'S_2=max', 'S_2=min', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9',
    'customer_ID', 'days',
]

# Per-level columns of the categorical variables ('<name>=<level>').
cat_feats = [
    'B_30=0.0', 'B_30=1.0', 'B_30=2.0', 'B_31=0', 'B_31=1',
    'B_38=1.0', 'B_38=2.0', 'B_38=3.0', 'B_38=4.0', 'B_38=5.0', 'B_38=6.0', 'B_38=7.0',
    'D_114=0.0', 'D_114=1.0', 'D_116=0.0', 'D_116=1.0',
    'D_117=-1.0', 'D_117=1.0', 'D_117=2.0', 'D_117=3.0', 'D_117=4.0', 'D_117=5.0', 'D_117=6.0',
    'D_120=0.0', 'D_120=1.0', 'D_126=-1.0', 'D_126=0.0', 'D_126=1.0',
    'D_63=CL', 'D_63=CO', 'D_63=CR', 'D_63=XL', 'D_63=XM', 'D_63=XZ',
    'D_64=-1', 'D_64=O', 'D_64=R', 'D_64=U', 'D_66=0.0', 'D_66=1.0',
    'D_68=0.0', 'D_68=1.0', 'D_68=2.0', 'D_68=3.0', 'D_68=4.0', 'D_68=5.0', 'D_68=6.0',
]

# First / last statement-date columns; not used as model inputs below.
s2_feats = ['S_2=max', 'S_2=min']

# Numeric features = everything in all_cols that is neither categorical, nor a
# date column, nor the customer identifier. Derived (instead of being kept as
# a third hand-written list) so the groups cannot drift out of sync; the
# all_cols order is preserved, which keeps positional feature indices stable.
_non_float_cols = set(cat_feats) | set(s2_feats) | {'customer_ID'}
assert _non_float_cols <= set(all_cols), "cat/date/id columns missing from all_cols"
float_feats = [col for col in all_cols if col not in _non_float_cols]

# Show the numeric and log-transformed feature names, one list per line.
print(float_feats, log_feats, sep='\n')

['cnt', 'B_1', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_2', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_26', 'B_27', 'B_28', 'B_29', 'B_3', 'B_32', 'B_33', 'B_36', 'B_37', 'B_39', 'B_4', 'B_40', 'B_41', 'B_42', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'D_102', 'D_103', 'D_104', 'D_105', 'D_106', 'D_107', 'D_108', 'D_109', 'D_110', 'D_111', 'D_112', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127', 'D_128', 'D_129', 'D_130', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144', 'D_145', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_65', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_75', 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_81', 'D_82', 'D_83', 'D_84', 'D_86', 'D_87', 'D_88', 'D_89', 'D_91', 'D

In [9]:
# Dimensions (rows, columns) of the loaded frame.
df.shape

(458913, 273)

In [10]:
# Preview the first two rows to eyeball the column layout.
df.head(2)

Unnamed: 0,customer_ID,D_104,D_73,D_77,R_24,D_145,D_123,B_31=0,B_31=1,D_91,B_2,D_135,R_10,S_20,R_21,D_112,D_124,D_63=CL,D_63=CO,D_63=CR,D_63=XL,D_63=XM,D_63=XZ,S_11,D_125,D_46,R_1,R_6,B_3,R_12,D_106,B_18,S_15,S_12,B_6,B_22,D_41,D_103,D_108,B_39,S_16,D_120=0.0,D_120=1.0,S_23,S_2=min,S_2=max,days,S_18,R_2,D_131,R_25,B_5,B_25,D_52,D_53,D_89,D_144,D_78,P_2,D_56,D_119,P_3,B_16,S_19,B_15,D_137,D_122,D_141,D_142,D_43,B_9,D_45,S_24,D_58,S_7,R_22,D_143,B_30=0.0,B_30=1.0,B_30=2.0,B_27,D_66=0.0,D_66=1.0,B_36,D_139,D_118,D_114=0.0,D_114=1.0,B_4,B_24,D_102,S_6,D_136,B_33,D_138,D_105,R_3,B_11,D_47,D_59,B_1,D_60,S_5,B_41,B_14,D_50,R_7,B_8,D_134,D_132,D_127,R_4,D_65,B_23,B_42,D_74,D_79,D_130,B_19,D_68=0.0,D_68=1.0,D_68=2.0,D_68=3.0,D_68=4.0,D_68=5.0,D_68=6.0,S_8,R_17,P_4,D_81,D_128,B_37,R_15,D_70,D_71,B_32,S_9,D_93,R_8,S_22,D_61,B_7,D_86,B_26,B_20,S_13,D_126=-1.0,D_126=0.0,D_126=1.0,D_69,R_16,cnt,D_109,D_64=-1,D_64=O,D_64=R,D_64=U,D_113,R_23,R_13,B_29,D_133,D_49,B_17,D_39,D_87,D_117=-1.0,D_117=1.0,D_117=2.0,D_117=3.0,D_117=4.0,D_117=5.0,D_117=6.0,D_140,S_3,D_62,D_72,R_19,D_80,D_54,B_38=1.0,B_38=2.0,B_38=3.0,B_38=4.0,B_38=5.0,B_38=6.0,B_38=7.0,D_84,D_116=0.0,D_116=1.0,D_75,D_83,D_111,D_121,D_110,R_26,R_20,D_82,B_13,R_18,D_88,S_26,B_40,R_14,B_21,D_48,R_11,S_17,R_28,R_9,D_92,S_27,D_76,D_96,D_115,D_42,R_5,B_10,D_94,R_27,B_12,D_55,D_107,D_44,S_25,D_51,B_28,D_129,log_B_11,log_B_12,log_B_13,log_B_21,log_B_22,log_B_23,log_B_24,log_B_26,log_B_27,log_B_28,log_B_29,log_B_3,log_B_32,log_B_36,log_B_4,log_B_40,log_B_41,log_B_42,log_B_5,log_B_9,log_D_106,log_D_107,log_D_108,log_D_109,log_D_113,log_D_115,log_D_118,log_D_119,log_D_123,log_D_125,log_D_131,log_D_133,log_D_135,log_D_136,log_D_137,log_D_138,log_D_140,log_D_39,log_D_41,log_D_43,log_D_44,log_D_45,log_D_49,log_D_51,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,1.009606,0.163683,0.419295,0.005469,0.005814,0.003103,0.0,13.0,1.197627,1.005086,0.030999,0.004742,0.006292,0.004931,1.004348,0.686641,0.0,0.0,13.0,0.0,0.0,0.0,0.469887,0.006253,0.378074,0.004509,0.006067,0.006456,1.004889,0.19425,0.842565,0.173335,0.247988,0.11351,0.0057,0.005021,1.005188,0.056857,0.213355,0.004974,13.0,0.0,0.135036,2017-03-09,2018-03-13,369,0.005075,0.005754,0.005446,0.005799,0.14665,0.005915,0.204972,0.078497,0.005235,0.005283,0.005218,0.933824,0.158571,0.244733,0.680138,0.004769,0.005955,0.026247,0.017057,0.433732,0.005178,0.382473,0.153943,0.00622,0.725369,0.931956,0.064803,0.098374,0.005002,0.005066,13.0,0.0,0.0,0.004098,0.0,0.0,0.005292,0.003664,0.245514,0.0,13.0,0.040309,0.005135,0.856909,0.004647,0.212771,1.004033,0.16938,0.984966,0.004752,0.00723,0.532874,0.061147,0.012007,0.534817,0.029112,0.006753,0.023142,0.150326,0.005851,0.003585,0.334751,0.185971,1.006123,0.006656,0.004701,0.026177,0.104542,0.039496,0.00417,0.005195,0.005231,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.797145,0.004164,0.00567,0.005834,1.004154,0.012015,0.005171,0.005221,0.214785,0.005084,0.039818,0.004415,0.005595,0.916144,0.225847,0.036624,0.004874,0.004408,0.005731,0.6547,0.0,0.0,13.0,0.005275,0.00474,13,0.006114,0.0,13.0,0.0,0.0,0.005476,0.004593,0.004522,0.029967,0.00476,0.181014,0.680789,0.010704,1.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.005343,0.113215,0.181286,0.004495,0.005231,0.081974,1.006225,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.004945,13.0,0.0,0.036125,0.004892,0.868921,0.711829,0.740351,0.085475,0.005231,0.505637,0.100432,0.006589,0.180269,0.01256,0.0873,0.005879,0.0055,0.240978,0.004784,0.005079,0.004089,0.211189,0.54389,0.850951,0.138271,0.003619,0.247095,0.180571,0.004684,0.27028,0.005909,1.005594,0.125683,0.224432,0.672166,0.004673,0.974669,0.978914,0.050968,1.005537,-4.929449,-2.073995,-2.298275,-5.203047,-5.167329,-3.642872,-5.271716,-5.424415,-5.497164,-2.97656,-5.223177,-5.042712,-5.281577,-5.241639,-3.211186,-2.438409,-4.
997762,-3.590491,-1.919705,-5.079943,-2.28975,-0.39725,-5.240566,-5.097225,-5.207382,-1.397983,-1.404402,-1.407586,-5.775453,-5.07472,-5.212901,-5.347446,-4.912965,-2.413825,-5.17097,-3.796226,-5.232025,-4.537166,-5.294216,-2.376637,-5.365879,-0.321075,-2.241847,-0.021312,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.004932,0.163683,0.223911,0.004027,0.004902,0.006022,0.0,13.0,0.082667,0.991083,0.030999,0.005269,0.005367,0.004984,1.006037,0.140331,0.0,13.0,0.0,0.0,0.0,0.0,0.375552,0.004236,0.452041,0.006246,0.005476,0.005663,1.005214,0.19425,1.004884,0.265484,0.181548,0.20227,0.005374,0.004993,0.005606,0.056857,0.213355,0.004212,8.0,5.0,0.135614,2017-03-01,2018-03-25,389,0.006049,0.006251,0.004214,0.004724,0.035462,0.02409,0.158313,0.078497,0.003807,0.004218,0.003896,0.89982,0.705671,0.430961,0.566665,0.076456,0.005197,0.00556,0.017057,0.290804,0.006007,0.382473,0.144571,0.010298,0.256461,0.910228,0.005146,0.103002,0.003541,0.004824,13.0,0.0,0.0,0.004588,0.0,0.0,0.006088,0.004906,0.433039,0.0,13.0,0.016746,0.004736,0.004578,0.003821,0.212771,1.005387,0.16938,0.358575,0.057673,0.013792,0.392433,0.232975,0.025654,0.32653,0.016785,0.005432,0.014848,0.174954,0.005768,0.004956,0.334751,0.185971,0.080648,0.003229,0.00413,0.013286,0.104542,0.005337,0.004831,0.003449,0.004915,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.411079,0.004766,0.005322,0.004415,1.002631,0.025244,0.006144,0.004143,0.011508,0.00596,0.033809,0.004098,0.006046,0.908578,0.053319,0.028049,0.004769,0.005267,0.004935,0.145258,0.0,0.0,13.0,0.005872,0.042936,13,0.00687,0.0,13.0,0.0,0.0,0.006391,0.004819,0.004322,0.029967,0.004519,0.181014,0.680789,0.215205,1.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006271,0.120578,0.23852,0.00469,0.004496,0.005388,1.006141,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.004301,13.0,0.0,0.006471,0.003974,0.868921,0.535892,0.740351,0.085475,0.003404,0.443825,0.046753,0.004902,0.180269,0.004071,0.022125,0.005675,0.005497,0.048203,0.044768,0.004356,0.005405,0.211189,0.004729,0.133975,0.138271,0.004662,0.439431,0.180571,0.005794,0.298815,0.005067,1.005984,0.025823,0.048069,0.005859,0.004311,0.975606,0.390638,0.017052,0.004173,-4.283689,-3.656474,-3.062866,-5.20347,-5.226271,-4.321062,-5.352521,-5.246314,-5.384415,-4.071506,-5.223177,-5.173726,-5.122607,-5.101498,-4.089598,-3.81
1057,-5.215529,-3.590491,-3.33928,-4.575761,-2.28975,-5.139822,-5.240566,-4.980628,-5.052804,-0.822275,-0.836928,-0.841738,-5.112281,-5.464061,-5.469399,-5.399383,-4.912965,-2.413825,-5.17097,-3.796226,-5.071773,-1.536165,-5.299725,-1.933982,-5.446628,-1.360777,-2.241847,-0.939973,0


In [11]:
# Model input columns: numeric block first, then categorical-level columns,
# then log-transformed columns. Positional feature indices reported later
# refer to this exact ordering.
x_cols = [*float_feats, *cat_feats, *log_feats]
len(x_cols)

269

In [12]:
# Columns that still contain missing values (an empty Series means none).
# The per-column NaN count is computed once and filtered through a callable,
# rather than scanning the whole frame twice.
df.isna().sum().loc[lambda na_counts: na_counts > 0]

Series([], dtype: int64)

In [13]:
# Summarise the frame: index range, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458913 entries, 0 to 458912
Columns: 273 entries, customer_ID to target
dtypes: float32(267), int32(2), int64(1), object(3)
memory usage: 488.4+ MB


In [14]:

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from spFSR import SpFSR


In [15]:
# Separate the model inputs (columns listed in x_cols) from the label column.
x = df.loc[:, x_cols]
y = df.loc[:, 'target']

# Row counts of x and y should agree.
(x.shape, y.shape)

((458913, 269), (458913,))

In [16]:
# Preview the first two rows of the feature matrix.
x.head(2)

Unnamed: 0,cnt,B_1,B_10,B_11,B_12,B_13,B_14,B_15,B_16,B_17,B_18,B_19,B_2,B_20,B_21,B_22,B_23,B_24,B_25,B_26,B_27,B_28,B_29,B_3,B_32,B_33,B_36,B_37,B_39,B_4,B_40,B_41,B_42,B_5,B_6,B_7,B_8,B_9,D_102,D_103,D_104,D_105,D_106,D_107,D_108,D_109,D_110,D_111,D_112,D_113,D_115,D_118,D_119,D_121,D_122,D_123,D_124,D_125,D_127,D_128,D_129,D_130,D_131,D_132,D_133,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,D_39,D_41,D_42,D_43,D_44,D_45,D_46,D_47,D_48,D_49,D_50,D_51,D_52,D_53,D_54,D_55,D_56,D_58,D_59,D_60,D_61,D_62,D_65,D_69,D_70,D_71,D_72,D_73,D_74,D_75,D_76,D_77,D_78,D_79,D_80,D_81,D_82,D_83,D_84,D_86,D_87,D_88,D_89,D_91,D_92,D_93,D_94,D_96,P_2,P_3,P_4,R_1,R_10,R_11,R_12,R_13,R_14,R_15,R_16,R_17,R_18,R_19,R_2,R_20,R_21,R_22,R_23,R_24,R_25,R_26,R_27,R_28,R_3,R_4,R_5,R_6,R_7,R_8,R_9,S_11,S_12,S_13,S_15,S_16,S_17,S_18,S_19,S_20,S_22,S_23,S_24,S_25,S_26,S_27,S_3,S_5,S_6,S_7,S_8,S_9,days,B_30=0.0,B_30=1.0,B_30=2.0,B_31=0,B_31=1,B_38=1.0,B_38=2.0,B_38=3.0,B_38=4.0,B_38=5.0,B_38=6.0,B_38=7.0,D_114=0.0,D_114=1.0,D_116=0.0,D_116=1.0,D_117=-1.0,D_117=1.0,D_117=2.0,D_117=3.0,D_117=4.0,D_117=5.0,D_117=6.0,D_120=0.0,D_120=1.0,D_126=-1.0,D_126=0.0,D_126=1.0,D_63=CL,D_63=CO,D_63=CR,D_63=XL,D_63=XM,D_63=XZ,D_64=-1,D_64=O,D_64=R,D_64=U,D_66=0.0,D_66=1.0,D_68=0.0,D_68=1.0,D_68=2.0,D_68=3.0,D_68=4.0,D_68=5.0,D_68=6.0,log_B_11,log_B_12,log_B_13,log_B_21,log_B_22,log_B_23,log_B_24,log_B_26,log_B_27,log_B_28,log_B_29,log_B_3,log_B_32,log_B_36,log_B_4,log_B_40,log_B_41,log_B_42,log_B_5,log_B_9,log_D_106,log_D_107,log_D_108,log_D_109,log_D_113,log_D_115,log_D_118,log_D_119,log_D_123,log_D_125,log_D_131,log_D_133,log_D_135,log_D_136,log_D_137,log_D_138,log_D_140,log_D_39,log_D_41,log_D_43,log_D_44,log_D_45,log_D_49,log_D_51
0,13,0.012007,0.27028,0.00723,0.125683,0.100432,0.023142,0.026247,0.004769,0.680789,0.842565,0.005231,1.005086,0.005731,0.0055,0.0057,0.026177,0.005135,0.005915,0.004408,0.004098,0.050968,0.029967,0.006456,0.005084,1.004033,0.005292,0.012015,0.213355,0.040309,0.0873,0.006753,0.104542,0.14665,0.11351,0.036624,0.003585,0.00622,0.856909,1.005188,1.009606,0.984966,0.19425,0.672166,0.056857,0.006114,0.740351,0.868921,1.004348,0.005476,0.247095,0.245514,0.244733,0.711829,0.433732,0.003103,0.686641,0.006253,1.006123,1.004154,1.005537,0.005195,0.005446,0.185971,0.00476,0.334751,0.030999,0.212771,0.017057,0.16938,0.003664,0.005343,0.005178,0.382473,0.005066,0.005283,0.005814,0.010704,0.005021,0.180571,0.153943,0.004673,0.725369,0.378074,0.532874,0.240978,0.181014,0.150326,0.978914,0.204972,0.078497,1.006225,0.224432,0.158571,0.064803,0.061147,0.534817,0.225847,0.181286,0.004701,0.005275,0.005221,0.214785,0.004495,0.163683,0.039496,0.036125,0.138271,0.419295,0.005218,0.00417,0.081974,0.005834,0.505637,0.004892,0.004945,0.004874,1.0,0.180269,0.005235,1.197627,0.54389,0.004415,0.005909,0.003619,0.933824,0.680138,0.00567,0.004509,0.004742,0.004784,1.004889,0.004522,0.005879,0.005171,0.00474,0.004164,0.006589,0.005231,0.005754,0.005231,0.004931,0.005002,0.004593,0.005469,0.005799,0.085475,1.005594,0.004089,0.004752,0.006656,0.004684,0.006067,0.005851,0.005595,0.211189,0.469887,0.247988,0.6547,0.173335,0.004974,0.005079,0.005075,0.005955,0.006292,0.916144,0.135036,0.931956,0.974669,0.01256,0.850951,0.113215,0.029112,0.004647,0.098374,0.797145,0.039818,369,13.0,0.0,0.0,0.0,13.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,0.0,13.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,-4.929449,-2.073995,-2.298275,-5.203047,-5.167329,-3.642872,-5.271716,-5.424415,-5.497164,-2.97656,-5.223177,-5.042712,-5.281577,-5.241639,-3.211186,-2.438409,-4.997762,-3.590491,-1.919705,-5.079943,-2.28975,-0.39725,-5.240566,-5.0972
25,-5.207382,-1.397983,-1.404402,-1.407586,-5.775453,-5.07472,-5.212901,-5.347446,-4.912965,-2.413825,-5.17097,-3.796226,-5.232025,-4.537166,-5.294216,-2.376637,-5.365879,-0.321075,-2.241847,-0.021312
1,13,0.025654,0.298815,0.013792,0.025823,0.046753,0.014848,0.00556,0.076456,0.680789,1.004884,0.004915,0.991083,0.004935,0.005497,0.005374,0.013286,0.004736,0.02409,0.005267,0.004588,0.017052,0.029967,0.005663,0.00596,1.005387,0.006088,0.025244,0.213355,0.016746,0.022125,0.005432,0.104542,0.035462,0.20227,0.028049,0.004956,0.010298,0.004578,0.005606,0.004932,0.358575,0.19425,0.005859,0.056857,0.00687,0.740351,0.868921,1.006037,0.006391,0.439431,0.433039,0.430961,0.535892,0.290804,0.006022,0.140331,0.004236,0.080648,1.002631,0.004173,0.003449,0.004214,0.185971,0.004519,0.334751,0.030999,0.212771,0.017057,0.16938,0.004906,0.006271,0.006007,0.382473,0.004824,0.004218,0.004902,0.215205,0.004993,0.180571,0.144571,0.004311,0.256461,0.452041,0.392433,0.048203,0.181014,0.174954,0.390638,0.158313,0.078497,1.006141,0.048069,0.705671,0.005146,0.232975,0.32653,0.053319,0.23852,0.00413,0.005872,0.004143,0.011508,0.00469,0.163683,0.005337,0.006471,0.138271,0.223911,0.003896,0.004831,0.005388,0.004415,0.443825,0.003974,0.004301,0.004769,1.0,0.180269,0.003807,0.082667,0.004729,0.004098,0.005067,0.004662,0.89982,0.566665,0.005322,0.006246,0.005269,0.044768,1.005214,0.004322,0.005675,0.006144,0.042936,0.004766,0.004902,0.004496,0.006251,0.003404,0.004984,0.003541,0.004819,0.004027,0.004724,0.085475,1.005984,0.005405,0.057673,0.003229,0.005794,0.005476,0.005768,0.006046,0.211189,0.375552,0.181548,0.145258,0.265484,0.004212,0.004356,0.006049,0.005197,0.005367,0.908578,0.135614,0.910228,0.975606,0.004071,0.133975,0.120578,0.016785,0.003821,0.103002,0.411079,0.033809,389,13.0,0.0,0.0,0.0,13.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,5.0,0.0,0.0,13.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,-4.283689,-3.656474,-3.062866,-5.20347,-5.226271,-4.321062,-5.352521,-5.246314,-5.384415,-4.071506,-5.223177,-5.173726,-5.122607,-5.101498,-4.089598,-3.811057,-5.215529,-3.590491,-3.33928,-4.575761,-2.28975,-5.139822,-5.240566
,-4.980628,-5.052804,-0.822275,-0.836928,-0.841738,-5.112281,-5.464061,-5.469399,-5.399383,-4.912965,-2.413825,-5.17097,-3.796226,-5.071773,-1.536165,-5.299725,-1.933982,-5.446628,-1.360777,-2.241847,-0.939973


In [17]:
# Preview the first two target labels.
y.head(2)

0    0
1    0
Name: target, dtype: int64

In [None]:
import gc

# x and y were extracted above, so the name `df` is no longer needed: drop it
# and run a collection pass (the cell displays gc.collect()'s return value).
del df
gc.collect()

In [21]:
# Row counts corresponding to 35%, 50%, 75% and 95% of the feature matrix,
# printed with thousands separators and no decimals.
for pct in (0.35, 0.5, 0.75, 0.95):
    print(f'{pct*x.shape[0]:,.0f}')

160,620
229,456
344,185
435,967


In [22]:
# SpFSR feature selection with a single decision tree as the wrapped model,
# scored by ROC AUC in classification mode (pred_type='c').
# (Regression alternative tried earlier: pred_type='r', scoring='r2',
#  wrapper=DecisionTreeRegressor(random_state=1).)
sp_engine = SpFSR(x.values, y.values, pred_type='c', scoring='roc_auc',
                  wrapper=DecisionTreeClassifier(random_state=1))

# Row budgets passed as n_samples_max, one SpFSR run per value.
# NOTE(review): 32000 sits oddly between 200000 and 450000 -- possibly meant
# to be 320000. Left unchanged so the code matches the recorded run output.
sample_sizes = [200000, 32000, 450000]

# Outcome of every run, keyed by sample size. Runs are expensive (the recorded
# log shows ~285 minutes for the largest budget), so results are retained here
# instead of being overwritten on each iteration and surviving only as text.
sel_feats_by_n = {}

for n_samples in sample_sizes:
    print(n_samples, '='*100)

    # num_features=0: no fixed subset size is requested (the recorded log shows
    # the number of selected features varying from iteration to iteration).
    # iter_max=25 caps the search; progress is logged every 5 iterations.
    sp_run = sp_engine.run(num_features=0, n_samples_max=n_samples, print_freq=5, iter_max=25)

    sp_results = sp_run.results
    selected_idx = sp_results.get('selected_features')

    print('Best value:', sp_results.get('selected_ft_score_mean'))
    print('Indices of selected features: ', selected_idx)
    print('Importance of selected features: ', sp_results.get('selected_ft_importance').round(3))
    print('Total iterations for the optimal feature set:', sp_results.get('total_iter_for_opt'))

    # Map positional indices back to column names. Indexing the Python list
    # directly keeps the names as plain str (the previous np.array round-trip
    # produced numpy string scalars, whose repr depends on the numpy version).
    sel_feats = [x_cols[i] for i in selected_idx]
    print(sel_feats)

    sel_feats_by_n[n_samples] = {
        'score': sp_results.get('selected_ft_score_mean'),
        'features': sel_feats,
    }



SpFSR-INFO: Wrapper: DecisionTreeClassifier(random_state=1)
SpFSR-INFO: Hot start: True
SpFSR-INFO: Hot start range: 0.2
SpFSR-INFO: Feature weighting: False
SpFSR-INFO: Scoring metric: roc_auc
SpFSR-INFO: Number of jobs: 1
SpFSR-INFO: Number of observations in the dataset: 458913
SpFSR-INFO: Number of observations used: 200000
SpFSR-INFO: Number of features available: 269
SpFSR-INFO: Number of features to select: 0
SpFSR-INFO: iter_no: 0, num_ft: 6, value: 0.746, st_dev: 0.002, best: 0.746 @ iter_no 0
SpFSR-INFO: iter_no: 5, num_ft: 25, value: 0.773, st_dev: 0.002, best: 0.773 @ iter_no 4
SpFSR-INFO: iter_no: 10, num_ft: 28, value: 0.775, st_dev: 0.002, best: 0.775 @ iter_no 7
SpFSR-INFO: iter_no: 15, num_ft: 26, value: 0.775, st_dev: 0.002, best: 0.776 @ iter_no 12
SpFSR-INFO: iter_no: 20, num_ft: 29, value: 0.777, st_dev: 0.002, best: 0.777 @ iter_no 19
SpFSR-INFO: ===> iter_no: 24, same feature stall limit reached, initializing search...
SpFSR-INFO: iter_no: 25, num_ft: 6, value: 0

Best value: 0.777
Indices of selected features:  [244, 9, 98, 12, 80, 125, 37, 126, 150, 128, 175, 84, 99, 35, 85, 33, 59, 88, 1, 159, 25, 106, 3, 174, 265, 36, 16, 77, 171, 178]
Importance of selected features:  [0.341 0.289 0.26  0.256 0.207 0.186 0.153 0.145 0.144 0.139 0.136 0.128
 0.125 0.109 0.099 0.097 0.094 0.072 0.053 0.051 0.045 0.045 0.044 0.039
 0.037 0.028 0.023 0.01  0.003 0.   ]
Total iterations for the optimal feature set: 19
['log_B_9', 'B_17', 'D_62', 'B_2', 'D_43', 'P_2', 'B_9', 'P_3', 'R_4', 'R_1', 'S_8', 'D_47', 'D_65', 'B_7', 'D_48', 'B_5', 'D_128', 'D_51', 'B_1', 'S_15', 'B_33', 'D_75', 'B_11', 'S_7', 'log_D_44', 'B_8', 'B_23', 'D_39', 'S_3', 'B_30=0.0']


SpFSR-INFO: Wrapper: DecisionTreeClassifier(random_state=1)
SpFSR-INFO: Hot start: True
SpFSR-INFO: Hot start range: 0.2
SpFSR-INFO: Feature weighting: False
SpFSR-INFO: Scoring metric: roc_auc
SpFSR-INFO: Number of jobs: 1
SpFSR-INFO: Number of observations in the dataset: 458913
SpFSR-INFO: Number of observations used: 32000
SpFSR-INFO: Number of features available: 269
SpFSR-INFO: Number of features to select: 0
SpFSR-INFO: iter_no: 0, num_ft: 4, value: 0.737, st_dev: 0.005, best: 0.737 @ iter_no 0
SpFSR-INFO: iter_no: 5, num_ft: 7, value: 0.756, st_dev: 0.005, best: 0.756 @ iter_no 5
SpFSR-INFO: iter_no: 10, num_ft: 19, value: 0.769, st_dev: 0.006, best: 0.769 @ iter_no 9
SpFSR-INFO: iter_no: 15, num_ft: 22, value: 0.772, st_dev: 0.006, best: 0.772 @ iter_no 15
SpFSR-INFO: iter_no: 20, num_ft: 19, value: 0.771, st_dev: 0.005, best: 0.775 @ iter_no 16
SpFSR-INFO: iter_no: 25, num_ft: 27, value: 0.773, st_dev: 0.005, best: 0.775 @ iter_no 16
SpFSR-INFO: SpFSR completed in 12.77 minut

Best value: 0.775
Indices of selected features:  [12, 175, 264, 128, 126, 229, 129, 125, 153, 149, 263, 171, 79, 19, 3, 13, 29, 265, 266, 77]
Importance of selected features:  [0.357 0.301 0.243 0.201 0.197 0.148 0.144 0.126 0.122 0.121 0.104 0.089
 0.089 0.086 0.075 0.069 0.06  0.029 0.012 0.008]
Total iterations for the optimal feature set: 16
['B_2', 'S_8', 'log_D_43', 'R_1', 'P_3', 'log_B_22', 'R_10', 'P_2', 'R_7', 'R_3', 'log_D_41', 'S_3', 'D_42', 'B_26', 'B_11', 'B_20', 'B_4', 'log_D_44', 'log_D_45', 'D_39']


SpFSR-INFO: Wrapper: DecisionTreeClassifier(random_state=1)
SpFSR-INFO: Hot start: True
SpFSR-INFO: Hot start range: 0.2
SpFSR-INFO: Feature weighting: False
SpFSR-INFO: Scoring metric: roc_auc
SpFSR-INFO: Number of jobs: 1
SpFSR-INFO: Number of observations in the dataset: 458913
SpFSR-INFO: Number of observations used: 450000
SpFSR-INFO: Number of features available: 269
SpFSR-INFO: Number of features to select: 0
SpFSR-INFO: iter_no: 0, num_ft: 4, value: 0.738, st_dev: 0.001, best: 0.738 @ iter_no 0
SpFSR-INFO: iter_no: 5, num_ft: 6, value: 0.751, st_dev: 0.001, best: 0.751 @ iter_no 4
SpFSR-INFO: iter_no: 10, num_ft: 14, value: 0.766, st_dev: 0.001, best: 0.766 @ iter_no 10
SpFSR-INFO: iter_no: 15, num_ft: 23, value: 0.777, st_dev: 0.002, best: 0.777 @ iter_no 15
SpFSR-INFO: iter_no: 20, num_ft: 24, value: 0.78, st_dev: 0.001, best: 0.78 @ iter_no 16
SpFSR-INFO: iter_no: 25, num_ft: 26, value: 0.78, st_dev: 0.002, best: 0.781 @ iter_no 22
SpFSR-INFO: SpFSR completed in 284.93 minut

Best value: 0.781
Indices of selected features:  [37, 125, 15, 128, 149, 171, 35, 85, 152, 266, 77, 79, 239, 99, 126, 130, 12, 27, 80, 1, 10, 268, 84, 115, 81]
Importance of selected features:  [0.219 0.155 0.12  0.119 0.113 0.112 0.106 0.097 0.094 0.091 0.082 0.067
 0.063 0.059 0.049 0.046 0.043 0.037 0.032 0.031 0.026 0.021 0.02  0.014
 0.01 ]
Total iterations for the optimal feature set: 22
['B_9', 'P_2', 'B_22', 'R_1', 'R_3', 'S_3', 'B_7', 'D_48', 'R_6', 'log_D_45', 'D_39', 'D_42', 'log_B_4', 'D_65', 'P_3', 'R_11', 'B_2', 'B_37', 'D_43', 'B_1', 'B_18', 'log_D_51', 'D_47', 'D_84', 'D_44']
