In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

train_paths = glob('../feature/eda_base/*_train.gz')
test_paths  = glob('../feature/eda_base/*_test.gz')

train_paths = [path for path in train_paths 
               if (path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count(COLUMN_TARGET)
#                or path.count('_DT-')
#                or path.count('ProductCD_t')
               or path.count('D')
               or path.count('C')
#                or path.count('card')
#                or path.count('addr')
#                or path.count('domain')
#                or path.count('uid')
               )
               and not path.count('fill')
               and not path.count('bin')
               and not path.count('129')
              ]
test_paths = [path for path in test_paths 
               if (path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count(COLUMN_TARGET)
#                or path.count('_DT-')
#                or path.count('ProductCD_t')
               or path.count('D')
               or path.count('C')
#                or path.count('card')
#                or path.count('addr')
#                or path.count('domain')
#                or path.count('uid')
               )
               and not path.count('fill')
               and not path.count('bin')
               and not path.count('129')
              ]

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
train_length = df_train.shape[0]
del df_train, df_test
gc.collect()

21

In [5]:
#========================================================================
# 
#========================================================================

cols_C = sorted([col for col in data.columns if col.startswith('C')])
cols_D = sorted([col for col in data.columns if col.startswith('D')])

for C in tqdm(cols_C):
    for D in cols_D:
        data[f'{C}-{D}__ratio'] = data[C] / (data[D]+10)

100%|██████████| 14/14 [00:11<00:00,  1.17it/s]


In [10]:
#========================================================================
# ProductCDあたりのC
#========================================================================
cols_feature = [col for col in data.columns if col.count('__ratio') and not col.count('ProductCD-')]
cols_pcd = data['ProductCD'].unique()

# for col in tqdm(cols_feature):
for col in tqdm(cols_feature[159:]):
    for pcd in cols_pcd:
        feature_name = f'{col}__ProductCD-{pcd}'
        data[feature_name] = np.nan
        data.loc[data['ProductCD'].isin([pcd]), feature_name] = data.loc[data['ProductCD'].isin([pcd]), col]


  0%|          | 0/51 [00:00<?, ?it/s][A
  2%|▏         | 1/51 [00:02<01:42,  2.06s/it][A
  4%|▍         | 2/51 [00:11<03:22,  4.14s/it][A
  6%|▌         | 3/51 [00:22<05:02,  6.29s/it][A
  8%|▊         | 4/51 [00:34<06:13,  7.95s/it][A
 10%|▉         | 5/51 [00:45<06:54,  9.01s/it][A
 12%|█▏        | 6/51 [00:57<07:23,  9.87s/it][A
 14%|█▎        | 7/51 [01:09<07:43, 10.54s/it][A
 16%|█▌        | 8/51 [01:22<08:01, 11.20s/it][A
 18%|█▊        | 9/51 [01:34<08:07, 11.60s/it][A
 20%|█▉        | 10/51 [01:47<08:05, 11.85s/it][A
 22%|██▏       | 11/51 [02:01<08:25, 12.63s/it][A
 24%|██▎       | 12/51 [02:16<08:31, 13.11s/it][A
 25%|██▌       | 13/51 [02:30<08:30, 13.45s/it][A
 27%|██▋       | 14/51 [02:45<08:39, 14.03s/it][A
 29%|██▉       | 15/51 [03:01<08:39, 14.44s/it][A
 31%|███▏      | 16/51 [03:15<08:27, 14.51s/it][A
 33%|███▎      | 17/51 [03:29<08:10, 14.44s/it][A
 35%|███▌      | 18/51 [03:44<07:57, 14.48s/it][A
 37%|███▋      | 19/51 [03:58<07:40, 14.40s/it]

In [18]:
#========================================================================
# FE Aggregation User ID & TimeSeries Date
#========================================================================
prefix = '508'
dir_save = 'check_trush'
cols_save = [col for col in data.columns if col.count('__ratio')]

train = data.iloc[:train_length]
test  = data.iloc[train_length:]
cols = [
# 'C1-D15__ratio__ProductCD-W',
# 'C11-D15__ratio__ProductCD-W',
# 'C13-D15__ratio',
# 'C14-D15__ratio__ProductCD-W',
# 'C2-D1__ratio',
'C6-D10__ratio',
'C6-D1__ratio',
'C6-D1__ratio__ProductCD-W',
]

# for col in cols_save:
for col in cols:
    save_feature(train[[col]], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(test[[col]],  prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | C6-D10__ratio
(506691,) | C6-D10__ratio
(590540,) | C6-D1__ratio
(506691,) | C6-D1__ratio
(590540,) | C6-D1__ratio__ProductCD-W
(506691,) | C6-D1__ratio__ProductCD-W


(590540, 1348)