In [26]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET]

train_paths = glob('../feature/eda_base/*_train.gz')
test_paths = glob('../feature/eda_base/*_test.gz')

train_paths = [path for path in train_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count('time_zone')
               or path.count('hour')
               or path.count('C')
               or path.count('D')
               or path.count('Product')
              ]
test_paths = [path for path in test_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count('time_zone')
               or path.count('hour')
               or path.count('C')
               or path.count('D')
               or path.count('Product')
              ]

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0)
cols_num = get_numeric_features(data, COLUMNS_IGNORE)
cols_num = [col for col in cols_num if col.count('C') or col.count('D')]
data[cols_num] = data[cols_num].astype('float32')

df_same = pd.read_csv('../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv')
df_same.set_index(COLUMN_ID, inplace=True)
data['user_id_card_addr_pemail_M'] = df_same['predicted_user_id']
data['user_id_card_addr_pemail_M'].isnull().sum()

1097231

In [22]:
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
df_train['datetime'] = df_train['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x) - datetime.timedelta(seconds = 14400) ))
df_test['datetime'] = df_test['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x) - datetime.timedelta(seconds = 14400) ))
df_train['datetime'].fillna(datetime.date(2020, 1, 1), inplace=True)
df_test['datetime'].fillna(datetime.date(2020, 1, 1), inplace=True)
df_train['date'] = df_train['datetime'].map(lambda x: x.date())
df_test['date']  =  df_test['datetime'].map(lambda x: x.date())

In [23]:
#========================================================================
# ProductCDあたりのC
#========================================================================
cols_C = [col for col in df_train.columns if col.startswith('C')]
cols_pcd = df_train['ProductCD'].unique()

for col in tqdm(cols_C):
    for pcd in cols_pcd:
        feature_name = f'{col}__ProductCD-{pcd}'
        df_train[feature_name] = -1
        df_test[feature_name] = -1
        df_train.loc[df_train['ProductCD'].isin([pcd]), feature_name] = df_train.loc[df_train['ProductCD'].isin([pcd]), col]
        df_test.loc[df_test['ProductCD'].isin([pcd]), feature_name] = df_test.loc[df_test['ProductCD'].isin([pcd]), col]


  0%|          | 0/14 [00:00<?, ?it/s][A
  7%|▋         | 1/14 [00:02<00:35,  2.73s/it][A
 14%|█▍        | 2/14 [00:04<00:30,  2.54s/it][A
 21%|██▏       | 3/14 [00:07<00:26,  2.45s/it][A
 29%|██▊       | 4/14 [00:08<00:22,  2.22s/it][A
 36%|███▌      | 5/14 [00:10<00:19,  2.12s/it][A
 43%|████▎     | 6/14 [00:12<00:16,  2.09s/it][A
 50%|█████     | 7/14 [00:14<00:14,  2.08s/it][A
 57%|█████▋    | 8/14 [00:17<00:13,  2.17s/it][A
 64%|██████▍   | 9/14 [00:19<00:11,  2.38s/it][A
 71%|███████▏  | 10/14 [00:22<00:09,  2.45s/it][A
 79%|███████▊  | 11/14 [00:25<00:07,  2.56s/it][A
 86%|████████▌ | 12/14 [00:28<00:05,  2.69s/it][A
 93%|█████████▎| 13/14 [00:31<00:02,  2.85s/it][A
100%|██████████| 14/14 [00:34<00:00,  2.90s/it][A
[A

In [28]:
#========================================================================
# Save Feature
#========================================================================
dir_save = 'org_use'
cols_feature = [col for col in df_train.columns if col.count('__Pro')]
save_feature(df_train[cols_feature], '502', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
save_feature(df_test[cols_feature], '502', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | C12__ProductCD-W
(590540,) | C12__ProductCD-H
(590540,) | C12__ProductCD-C
(590540,) | C12__ProductCD-S
(590540,) | C12__ProductCD-R
(590540,) | C1__ProductCD-W
(590540,) | C1__ProductCD-H
(590540,) | C1__ProductCD-C
(590540,) | C1__ProductCD-S
(590540,) | C1__ProductCD-R
(590540,) | C6__ProductCD-W
(590540,) | C6__ProductCD-H
(590540,) | C6__ProductCD-C
(590540,) | C6__ProductCD-S
(590540,) | C6__ProductCD-R
(590540,) | C14__ProductCD-W
(590540,) | C14__ProductCD-H
(590540,) | C14__ProductCD-C
(590540,) | C14__ProductCD-S
(590540,) | C14__ProductCD-R
(590540,) | C13__ProductCD-W
(590540,) | C13__ProductCD-H
(590540,) | C13__ProductCD-C
(590540,) | C13__ProductCD-S
(590540,) | C13__ProductCD-R
(590540,) | C3__ProductCD-W
(590540,) | C3__ProductCD-H
(590540,) | C3__ProductCD-C
(590540,) | C3__ProductCD-S
(590540,) | C3__ProductCD-R
(590540,) | C9__ProductCD-W
(590540,) | C9__ProductCD-H
(590540,) | C9__ProductCD-C
(590540,) | C9__ProductCD-S
(590540,) | C9__ProductCD-R
(5905

In [30]:
df_train.drop(cols_feature, axis=1, inplace=True)
df_test.drop(cols_feature, axis=1, inplace=True)

In [36]:
#========================================================================
# FE Categorical Encoding 
#========================================================================

cols_C = [col for col in df_train.columns if col.startswith('C')]
df_cat = data[cols_C].copy()

for col in tqdm(cols_C):
    num = df_cat[col].value_counts().shape[0]
    df_cat[col].fillna(-1, inplace=True)
    tmp_cols = [col]
    if num>15:
        cnt_feature = get_cnt_feature(df_cat[col].to_frame(), tmp_cols)
        label_feature = get_label_feature(df_cat[col].to_frame(), tmp_cols)
        df_cat = df_cat.join(cnt_feature).join(label_feature)
    elif num>2:
        cnt_feature = get_cnt_feature(df_cat[col].to_frame(), tmp_cols)
        label_feature = get_label_feature(df_cat[col].to_frame(), tmp_cols)
        dummie_feature = get_dummie_feature(df_cat[col].to_frame(), tmp_cols)
        df_cat = df_cat.join(cnt_feature).join(label_feature).join(dummie_feature)
    elif num<=2:
        label_feature = get_label_feature(df_cat[col].to_frame(), tmp_cols)
        df_cat = df_cat.join(label_feature)
    else:
        print(col)
    df_cat.drop(col, axis=1, inplace=True)

HBox(children=(IntProgress(value=0, max=14), HTML(value='')))

Exception ignored in: <function tqdm.__del__ at 0x7fdac6cd7840>
Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/tqdm/_tqdm.py", line 889, in __del__
    self.close()
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/tqdm/_tqdm.py", line 1095, in close
    self._decr_instances(self)
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/tqdm/_tqdm.py", line 454, in _decr_instances
    cls.monitor.exit()
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/tqdm/_monitor.py", line 52, in exit
    self.join()
  File "/home/yryrgogo/anaconda3/lib/python3.7/threading.py", line 1029, in join
    raise RuntimeError("cannot join current thread")
RuntimeError: cannot join current thread





MemoryError: 

In [35]:
sorted([col for col in data.columns if not col.count('C')])

['D1',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'TransactionDT',
 'TransactionID',
 'hour',
 'time_zone',
 'user_id_card_addr_pemail_M']

In [None]:
#========================================================================
# Userあたりの集計, 差分, 変化
#========================================================================
cols_C = [col for col in df_train.columns if col.startswith('C')]
cols_pcd = df_train['ProductCD'].unique()

for col in tqdm(cols_C):
    for pcd in cols_pcd:
        feature_name = f'{col}__ProductCD-{pcd}'
        df_train[feature_name] = -1
        df_test[feature_name] = -1
        df_train.loc[df_train['ProductCD'].isin([pcd]), feature_name] = df_train.loc[df_train['ProductCD'].isin([pcd]), col]
        df_test.loc[df_test['ProductCD'].isin([pcd]), feature_name] = df_test.loc[df_test['ProductCD'].isin([pcd]), col]