In [2]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Columns with a special role; they are passed as the ignore list to the
# feature helpers used further down.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is kept when its path contains at least one include token...
# NOTE(review): the bare 'M' matches ANY path containing a capital M (for
# example a '..._DT-M_...' file), not only M1..M9 -- confirm this is intended.
# ('_DT-' used to be listed here as well but was disabled by the author.)
PATH_INCLUDE_TOKENS = (COLUMN_DT, COLUMN_ID, COLUMN_TARGET, 'ProductCD_t', 'M')
# ...and none of the exclude tokens.
PATH_EXCLUDE_TOKENS = ('fill', 'bin', 'uid', '129')


def select_feature_paths(paths):
    """Return the paths that pass the include/exclude substring filter.

    Args:
        paths: iterable of path strings (the whole path is inspected, not
            only the file name).

    Returns:
        list of the paths, in input order, that contain at least one of
        PATH_INCLUDE_TOKENS and none of PATH_EXCLUDE_TOKENS.
    """
    return [
        path for path in paths
        if any(token in path for token in PATH_INCLUDE_TOKENS)
        and not any(token in path for token in PATH_EXCLUDE_TOKENS)
    ]


# The same filter is applied to both splits so train and test always end up
# with the same selection rule.
train_paths = select_feature_paths(glob('../feature/eda_base/*_train.gz'))
test_paths = select_feature_paths(glob('../feature/eda_base/*_test.gz'))

# Load the selected feature files for each split, then stack them row-wise
# (train rows first) under a fresh 0..n-1 index.
df_train, df_test = parallel_load_data(train_paths), parallel_load_data(test_paths)
# Remember where the train rows end inside the stacked frame.
train_length = len(df_train)
data = pd.concat([df_train, df_test], ignore_index=True, axis=0)
# The per-split frames are no longer needed once stacked; free them.
del df_train
del df_test
gc.collect()


# Read the base train/test tables, index each by COLUMN_ID, and stack them
# row-wise with the train rows first.
base_train, base_test = [
    read_pkl_gzip(base_path).set_index(COLUMN_ID)
    for base_path in ('../input/base_train.gz', '../input/base_test.gz')
]
base = pd.concat([base_train, base_test], axis=0)

# Among the numeric columns reported by the helper (COLUMNS_IGNORE excluded),
# keep those whose name contains a capital 'V' or 'D' and store them as float32.
cols_num = [
    name for name in get_numeric_features(data, COLUMNS_IGNORE)
    if 'V' in name or 'D' in name
]
data[cols_num] = data[cols_num].astype('float32')

In [7]:
# Source columns: everything whose name starts with 'M'. Columns produced by
# this cell (they contain '__ProductCD-') are skipped so that re-running the
# cell does not derive features from its own output.
cols_M = [col for col in data.columns
          if col.startswith('M') and '__ProductCD-' not in col]
# Distinct ProductCD values present in the data (value_counts order).
vals_P = data['ProductCD'].value_counts().index
#========================================================================
# M per ProductCD: for every (M column, ProductCD value) pair, create a
# column that keeps the M value on rows of that product and holds -1 on
# all other rows.
#========================================================================
# One boolean row mask per ProductCD value, computed once instead of twice
# per (column, product) pair.
masks_P = {pcd: data['ProductCD'].isin([pcd]) for pcd in vals_P}
# Names of the generated columns, in creation order.
cols_M_P = []
for col in tqdm(cols_M):
    for pcd in vals_P:
        feature_name = f'{col}__ProductCD-{pcd}'
        # `where` keeps the original value (NaN included) where the mask is
        # True and writes -1 elsewhere, in a single step; this avoids first
        # creating an int column and then assigning strings into it.
        data[feature_name] = data[col].where(masks_P[pcd], -1)
        cols_M_P.append(feature_name)

100%|██████████| 9/9 [01:38<00:00, 13.60s/it]


In [9]:
#========================================================================
# FE Categorical Encoding 
#========================================================================

# Cardinality thresholds that decide which encodings a column receives.
MAX_DUMMIE_CARDINALITY = 15   # above this: no one-hot columns
MIN_COUNT_CARDINALITY = 2     # at or below this: label encoding only

cols_categorical = get_categorical_features(data, ignore_list=COLUMNS_IGNORE)
# Work on a copy so the raw categorical columns in `data` stay untouched.
df_cat = data[cols_categorical].copy()
for col in tqdm(cols_categorical):
    # Number of distinct non-null values (value_counts drops NaN by default),
    # measured BEFORE missing values are replaced by '#'.
    num = df_cat[col].value_counts().shape[0]
    if not num > 1:
        # Columns with at most one distinct value are skipped; they stay in
        # df_cat unencoded.
        continue
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form does not update df_cat when
    # pandas Copy-on-Write is active.
    df_cat[col] = df_cat[col].fillna('#').astype('str')
    target_cols = [col]

    # Pick the encoders by cardinality. Order matters: it fixes the order of
    # the appended columns (count, label, dummie).
    # NOTE(review): the helpers come from func.ml_utils; judging by their
    # names they produce count, label and one-hot encodings -- confirm there.
    if num > MAX_DUMMIE_CARDINALITY:
        encoders = [get_cnt_feature, get_label_feature]
    elif num > MIN_COUNT_CARDINALITY:
        encoders = [get_cnt_feature, get_label_feature, get_dummie_feature]
    else:
        encoders = [get_label_feature]

    # Each helper receives its own single-column frame, as before.
    encoded = [encoder(df_cat[col].to_frame(), target_cols) for encoder in encoders]
    for feature in encoded:
        df_cat = df_cat.join(feature)
    # The raw column is replaced by its encodings.
    df_cat.drop(col, axis=1, inplace=True)

100%|██████████| 55/55 [01:46<00:00,  1.94s/it]


In [18]:
# Encoded columns in df_cat: any name containing 'label', 'dummie' or 'cnt_'.
cols_cat = [
    name for name in df_cat.columns
    if 'label' in name or 'dummie' in name or 'cnt_' in name
]

['M9__ProductCD-W']

In [19]:
# Put the encoded columns next to the original data (column-wise concat).
data_cat = pd.concat([data, df_cat[cols_cat]], axis=1)
# Features to export: names containing a capital 'M' together with one of the
# encoding markers 'cnt', 'dummie' or 'label'.
cols_feature = [
    name for name in data_cat.columns
    if 'M' in name and any(tag in name for tag in ('cnt', 'dummie', 'label'))
]

In [20]:
# Preview the columns that are about to be saved. The previous `cols_cats`
# is not defined anywhere in this notebook and raises NameError on a fresh
# kernel; `cols_feature` is the list built in the preceding cell.
data_cat[cols_feature].head()

Unnamed: 0,label__M1,label__M2,label__M3,cnt__M4,label__M4,M4_#_dummie,M4_M0_dummie,M4_M1_dummie,M4_M2_dummie,label__M5,...,M8__ProductCD-W_#_dummie,M8__ProductCD-W_-1_dummie,M8__ProductCD-W_F_dummie,M8__ProductCD-W_T_dummie,cnt__M9__ProductCD-W,label__M9__ProductCD-W,M9__ProductCD-W_#_dummie,M9__ProductCD-W_-1_dummie,M9__ProductCD-W_F_dummie,M9__ProductCD-W_T_dummie
0,2,2,2,122947,3,0,0,0,1,1,...,1,0,0,0,284682,0,1,0,0,0
1,0,0,0,357789,1,0,1,0,0,2,...,1,0,0,0,284682,0,1,0,0,0
2,2,2,2,357789,1,0,1,0,0,1,...,0,0,1,0,74040,2,0,0,1,0
3,0,0,0,357789,1,0,1,0,0,2,...,1,0,0,0,284682,0,1,0,0,0
4,0,0,0,519189,0,1,0,0,0,0,...,0,1,0,0,296574,1,0,1,0,0


In [21]:
#========================================================================
# Save
#========================================================================
# Prefix and output directory handed to save_feature.
# NOTE(review): 'check_trush' looks like a typo of 'check_trash'; it is a
# runtime path, so it is left unchanged here.
prefix = '518'
dir_save = 'check_trush'

# Split at the boundary recorded when the train and test frames were
# concatenated (train_length), rather than at len(base_train), which comes
# from a different file and is only correct if the two happen to match.
train = data_cat.iloc[:train_length]
test = data_cat.iloc[train_length:]
save_feature(train[cols_feature], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
save_feature(test[cols_feature], prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | label__M1
(590540,) | label__M2
(590540,) | label__M3
(590540,) | cnt__M4
(590540,) | label__M4
(590540,) | M4_#_dummie
(590540,) | M4_M0_dummie
(590540,) | M4_M1_dummie
(590540,) | M4_M2_dummie
(590540,) | label__M5
(590540,) | label__M6
(590540,) | label__M7
(590540,) | label__M8
(590540,) | label__M9
(590540,) | cnt____DT-M
(590540,) | label____DT-M
(590540,) | __DT-M_2017-12_dummie
(590540,) | __DT-M_2018-1_dummie
(590540,) | __DT-M_2018-10_dummie
(590540,) | __DT-M_2018-11_dummie
(590540,) | __DT-M_2018-12_dummie
(590540,) | __DT-M_2018-2_dummie
(590540,) | __DT-M_2018-3_dummie
(590540,) | __DT-M_2018-4_dummie
(590540,) | __DT-M_2018-5_dummie
(590540,) | __DT-M_2018-7_dummie
(590540,) | __DT-M_2018-8_dummie
(590540,) | __DT-M_2018-9_dummie
(590540,) | cnt__M1__ProductCD-W
(590540,) | label__M1__ProductCD-W
(590540,) | M1__ProductCD-W_#_dummie
(590540,) | M1__ProductCD-W_-1_dummie
(590540,) | M1__ProductCD-W_F_dummie
(590540,) | M1__ProductCD-W_T_dummie
(590540,) | cnt_