In [2]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

train_paths = glob('../feature/eda_base/*_train.gz')
test_paths = glob('../feature/eda_base/*_test.gz')

train_paths = [path for path in train_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count(COLUMN_TARGET)
               or path.count('M')
               or path.count('Product')
              ]
test_paths = [path for path in test_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count(COLUMN_TARGET)
               or path.count('M')
               or path.count('Product')
              ]

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
if COLUMN_ID in data.columns:
    data.set_index(COLUMN_ID, inplace=True)

base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test], axis=0)

cols_num = get_numeric_features(data, COLUMNS_IGNORE)
cols_num = [col for col in cols_num if col.count('V') or col.count('D')]
data[cols_num] = data[cols_num].astype('float32')

  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiproce

In [30]:
cols_M = [col for col in data.columns if col.startswith('M')]
vals_P = data['ProductCD'].value_counts().index
#========================================================================
# ProductCDあたりのM
#========================================================================
cols_M_P = []
for col in tqdm(cols_M):
    for pcd in vals_P:
        feature_name = f'{col}__ProductCD-{pcd}'
        data[feature_name] = -1
        data.loc[data['ProductCD'].isin([pcd]), feature_name] = data.loc[data['ProductCD'].isin([pcd]), col]


  0%|          | 0/54 [00:00<?, ?it/s][A
100%|██████████| 54/54 [00:00<00:00, 34494.73it/s][A

In [None]:
for col in cols_M:
    for val in vals_P:
        tmp = data[data['ProductCD']==val]
        print(val)
        display(tmp[col].value_counts())
        display(tmp.groupby(col)[COLUMN_TARGET].mean())

In [27]:
#========================================================================
# FE Categorical Encoding 
#========================================================================

cols_categorical = get_categorical_features(data, ignore_list=COLUMNS_IGNORE)
# df_cat = data[cols_categorical].copy()
for col in tqdm(cols_categorical):
    num = df_cat[col].value_counts().shape[0]
    if not num>1:
        continue
    df_cat[col].fillna('#', inplace=True)
    df_cat[col] = df_cat[col].astype('str')
    cols_cat = [col]
    if num>15:
        cnt_feature = get_cnt_feature(df_cat[col].to_frame(), cols_cat)
        label_feature = get_label_feature(df_cat[col].to_frame(), cols_cat)
        df_cat = df_cat.join(cnt_feature).join(label_feature)
    elif num>2:
        cnt_feature = get_cnt_feature(df_cat[col].to_frame(), cols_cat)
        label_feature = get_label_feature(df_cat[col].to_frame(), cols_cat)
        dummie_feature = get_dummie_feature(df_cat[col].to_frame(), cols_cat)
        df_cat = df_cat.join(cnt_feature).join(label_feature).join(dummie_feature)
    elif num<=2:
        label_feature = get_label_feature(df_cat[col].to_frame(), cols_cat)
        df_cat = df_cat.join(label_feature)
    else:
        print(col)
    df_cat.drop(col, axis=1, inplace=True)


  0%|          | 0/45 [00:00<?, ?it/s][A
  2%|▏         | 1/45 [00:07<05:42,  7.77s/it][A
  4%|▍         | 2/45 [00:07<03:55,  5.48s/it][A
  7%|▋         | 3/45 [00:08<02:42,  3.87s/it][A
  9%|▉         | 4/45 [00:08<01:52,  2.74s/it][A
 11%|█         | 5/45 [00:08<01:18,  1.95s/it][A
 13%|█▎        | 6/45 [00:16<02:35,  4.00s/it][A
 16%|█▌        | 7/45 [00:17<01:47,  2.83s/it][A
 18%|█▊        | 8/45 [00:17<01:14,  2.02s/it][A
 20%|██        | 9/45 [00:17<00:52,  1.45s/it][A
 22%|██▏       | 10/45 [00:17<00:36,  1.05s/it][A
 24%|██▍       | 11/45 [00:26<01:59,  3.50s/it][A
 27%|██▋       | 12/45 [00:26<01:22,  2.49s/it][A
 29%|██▉       | 13/45 [00:26<00:56,  1.78s/it][A
 31%|███       | 14/45 [00:27<00:39,  1.28s/it][A
 33%|███▎      | 15/45 [00:27<00:27,  1.08it/s][A
 36%|███▌      | 16/45 [00:35<01:35,  3.30s/it][A
 38%|███▊      | 17/45 [00:44<02:19,  4.99s/it][A
 40%|████      | 18/45 [00:45<01:35,  3.53s/it][A
 42%|████▏     | 19/45 [00:45<01:05,  2.50s/it]

In [34]:
data_cat = data.join(df_cat, how='left')
cols_feature = [col for col in data_cat.columns if col.count('M') and (
    col.count('cnt') or 
    col.count('dummie') or 
    col.count('label')
)]

In [35]:
#========================================================================
# Save
#========================================================================
prefix = '518'
dir_save = 'valid'

train = data_cat.iloc[:len(base_train)]
test  = data_cat.iloc[len(base_train):]
save_feature(train[cols_feature], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
save_feature(test[ cols_feature], prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | label__M1
(590540,) | label__M2
(590540,) | label__M3
(590540,) | cnt__M4
(590540,) | label__M4
(590540,) | M4_#_dummie
(590540,) | M4_M0_dummie
(590540,) | M4_M1_dummie
(590540,) | M4_M2_dummie
(590540,) | label__M5
(590540,) | label__M6
(590540,) | label__M7
(590540,) | label__M8
(590540,) | label__M9
(590540,) | cnt__M1__ProductCD-W
(590540,) | label__M1__ProductCD-W
(590540,) | M1__ProductCD-W_#_dummie
(590540,) | M1__ProductCD-W_-1_dummie
(590540,) | M1__ProductCD-W_F_dummie
(590540,) | M1__ProductCD-W_T_dummie
(590540,) | cnt__M2__ProductCD-W
(590540,) | label__M2__ProductCD-W
(590540,) | M2__ProductCD-W_#_dummie
(590540,) | M2__ProductCD-W_-1_dummie
(590540,) | M2__ProductCD-W_F_dummie
(590540,) | M2__ProductCD-W_T_dummie
(590540,) | cnt__M3__ProductCD-W
(590540,) | label__M3__ProductCD-W
(590540,) | M3__ProductCD-W_#_dummie
(590540,) | M3__ProductCD-W_-1_dummie
(590540,) | M3__ProductCD-W_F_dummie
(590540,) | M3__ProductCD-W_T_dummie
(590540,) | cnt__M4__ProductCD-W