In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
# Passed to save_feature() as list_ignore further down in the notebook.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one of these
# substrings ...
# NOTE(review): the bare 'C' matches every path containing an uppercase C,
# not only the C1..C14 files -- presumably this is also what pulls in the
# ProductCD file that later cells read; confirm before narrowing it.
# Previously tried (disabled) include keys: '_DT-', 'ProductCD_t', 'D',
# 'card', 'addr', 'domain', 'uid'.
PATH_INCLUDE_KEYS = (COLUMN_DT, COLUMN_ID, COLUMN_TARGET, 'C')
# ... and none of these substrings.
PATH_EXCLUDE_KEYS = ('fill', 'bin', 'uid', '129')


def select_feature_paths(paths):
    """Return the subset of ``paths`` to load, in sorted order.

    A path is kept when it contains at least one substring from
    PATH_INCLUDE_KEYS and no substring from PATH_EXCLUDE_KEYS. The same rule
    is applied to the train and the test files so the two selections cannot
    drift apart.

    The result is sorted because glob() returns files in a
    filesystem-dependent order; the pairwise ratio features built later are
    asymmetric (f1 / (f2 + 1)), so an unstable input order could change which
    features get generated from one run to the next.
    """
    return sorted(
        path for path in paths
        if any(key in path for key in PATH_INCLUDE_KEYS)
        and not any(key in path for key in PATH_EXCLUDE_KEYS)
    )


train_paths = select_feature_paths(glob('../feature/eda_base/*_train.gz'))
test_paths = select_feature_paths(glob('../feature/eda_base/*_test.gz'))

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
# Train rows come first; train_length is the boundary used later to split
# the combined frame back into train and test.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
train_length = df_train.shape[0]
del df_train, df_test
gc.collect()

21

In [6]:
#========================================================================
# Pairwise ratio features between the C columns
# (the pairwise diff variant is currently disabled)
#========================================================================
from itertools import combinations

# Columns generated by this notebook (e.g. 'C1-C2__ratio') also start with
# 'C'. They are excluded here so that re-running this cell does not build
# ratios of ratios; on a fresh kernel the selection is unchanged.
DERIVED_MARKERS = ('__diff', '__ratio')
cols_C = [
    col for col in data.columns
    if col.startswith('C')
    and not any(marker in col for marker in DERIVED_MARKERS)
]

# Disabled experiment, kept as a note: pairs were restricted to groups of
# similar C columns taken from data[cols_C].corr() -- a "C1 group"
# (correlation with C1 > 0.9), a "C13 group" (C13, C5, C9), and C3 which
# belonged to no group. The current version uses every pair instead.

combi_C = list(combinations(cols_C, 2))
for (f1, f2) in tqdm(combi_C):
    # The +1 keeps the denominator non-zero where data[f2] is 0.
    # Disabled variant: data[f'{f1}-{f2}__diff'] = data[f1] - data[f2]
    data[f'{f1}-{f2}__ratio'] = data[f1] / (data[f2] + 1)

100%|██████████| 91/91 [00:01<00:00, 90.54it/s]


In [11]:
#========================================================================
# C diff/ratio features split per ProductCD value
#========================================================================
PCD_MARKER = '__ProductCD-'

# Only the base diff/ratio columns are split. Columns produced by this cell
# contain PCD_MARKER and are skipped, so re-running the cell does not create
# '...__ProductCD-X__ProductCD-Y' columns.
cols_feature = [
    col for col in data.columns
    if (col.count('__diff') or col.count('__ratio'))
    and PCD_MARKER not in col
]
cols_pcd = data['ProductCD'].unique()

# One boolean row mask per ProductCD value, computed once instead of twice
# per (feature, ProductCD) pair inside the loop.
pcd_masks = [(pcd, data['ProductCD'].isin([pcd])) for pcd in cols_pcd]

for col in tqdm(cols_feature):
    for pcd, mask in pcd_masks:
        feature_name = f'{col}{PCD_MARKER}{pcd}'
        # Keep the value on rows of this ProductCD, NaN everywhere else.
        # The float64 cast matches the dtype of the previous
        # "fill with np.nan, then assign via .loc" implementation.
        data[feature_name] = data[col].where(mask).astype('float64')

100%|██████████| 91/91 [06:59<00:00,  6.10s/it]


In [12]:
#========================================================================
# Save the generated __diff / __ratio features for train and test
#========================================================================
prefix = '507'
dir_save = 'check_trush'
cols_save = [
    name for name in data.columns
    if '__diff' in name or '__ratio' in name
]

# Rows before train_length are the training split, the rest the test split.
train = data.iloc[:train_length][cols_save]
test  = data.iloc[train_length:][cols_save]

# Train is written first, then test; each call receives a fresh column
# selection of the split rather than the split object itself.
for split_frame, is_train_split in ((train, True), (test, False)):
    save_feature(
        split_frame[cols_save],
        prefix,
        dir_save,
        is_train=is_train_split,
        auto_type=True,
        list_ignore=COLUMNS_IGNORE,
    )

(590540,) | C1-C10__ratio
(590540,) | C1-C11__ratio
(590540,) | C1-C12__ratio
(590540,) | C1-C13__ratio
(590540,) | C1-C14__ratio
(590540,) | C1-C2__ratio
(590540,) | C1-C3__ratio
(590540,) | C1-C4__ratio
(590540,) | C1-C5__ratio
(590540,) | C1-C6__ratio
(590540,) | C1-C7__ratio
(590540,) | C1-C8__ratio
(590540,) | C1-C9__ratio
(590540,) | C10-C11__ratio
(590540,) | C10-C12__ratio
(590540,) | C10-C13__ratio
(590540,) | C10-C14__ratio
(590540,) | C10-C2__ratio
(590540,) | C10-C3__ratio
(590540,) | C10-C4__ratio
(590540,) | C10-C5__ratio
(590540,) | C10-C6__ratio
(590540,) | C10-C7__ratio
(590540,) | C10-C8__ratio
(590540,) | C10-C9__ratio
(590540,) | C11-C12__ratio
(590540,) | C11-C13__ratio
(590540,) | C11-C14__ratio
(590540,) | C11-C2__ratio
(590540,) | C11-C3__ratio
(590540,) | C11-C4__ratio
(590540,) | C11-C5__ratio
(590540,) | C11-C6__ratio
(590540,) | C11-C7__ratio
(590540,) | C11-C8__ratio
(590540,) | C11-C9__ratio
(590540,) | C12-C13__ratio
(590540,) | C12-C14__ratio
(590540,) |

In [23]:
# Display the (rows, columns) shape of the training split `train`.
train.shape

(590540, 600)