In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
# Column-name constants shared by the rest of the notebook.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one of these
# substrings ...
PATH_KEEP_TOKENS = (
    COLUMN_DT,
    COLUMN_ID,
    COLUMN_TARGET,
    'D',
    'uid_',
    'uid2_t',
    'uid3_t',
    'card',
    'addr',
    'domain',
    'TransactionAmt',
    'Product',
)
# ... and none of these.
PATH_DROP_TOKENS = ('fill', 'bin', '129', '130')
# Currently disabled keep-tokens: 'V', 'C', ('526' together with 'mean'),
# 'uid4_t', 'uid5_t'.


def select_feature_paths(paths):
    """Return the entries of `paths` that pass the keep/drop substring filter.

    The check is a plain substring test on the whole path string (directories
    included), and the input order is preserved.
    """
    selected = []
    for candidate in paths:
        if not any(token in candidate for token in PATH_KEEP_TOKENS):
            continue
        if any(token in candidate for token in PATH_DROP_TOKENS):
            continue
        selected.append(candidate)
    return selected


# Glob patterns are expanded in this order: eda_base, org_use, raw_use.
train_patterns = [
    '../feature/eda_base/*_train.gz',
    '../feature/org_use/526*_train.gz',
    '../feature/raw_use/ker__uid*_train.gz',
]
test_patterns = [
    '../feature/eda_base/*_test.gz',
    '../feature/org_use/526*_test.gz',
    '../feature/raw_use/ker__uid*_test.gz',
]

train_paths = select_feature_paths(
    [found for pattern in train_patterns for found in glob(pattern)]
)
test_paths = select_feature_paths(
    [found for pattern in test_patterns for found in glob(pattern)]
)

# Load the selected feature files for each split.
# NOTE(review): parallel_load_data is a project helper; it is assumed to
# return one DataFrame per call with one row per transaction - confirm.
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)

# Number of train rows: rows [0, train_length) of `data` are train, the
# remainder are test.
train_length = len(df_train)

# Stack train on top of test with a fresh 0..n-1 index.
data = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [3]:
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# TransactionDT is used as a number of seconds counted from START_DATE; the
# result is then shifted back by 14400 s (4 hours).
# NOTE(review): the 4 h shift looks like a time-zone correction - confirm.
# Vectorised timedelta arithmetic replaces the former row-wise `.apply`, which
# also raised on a missing TransactionDT before the fill below could run.
data['datetime'] = (
    pd.Timestamp(startdate)
    + pd.to_timedelta(data['TransactionDT'], unit='s')
    - pd.Timedelta(seconds=14400)
)

# Placeholder for rows whose timestamp could not be computed.  Assign the
# result back instead of calling `fillna(..., inplace=True)` on a column
# selection (a no-op under pandas copy-on-write), and fill with a Timestamp
# rather than a `datetime.date` so the `.dt` accessors below keep working.
data['datetime'] = data['datetime'].fillna(pd.Timestamp(2020, 1, 1))
data['date'] = data['datetime'].dt.date
data['hour'] = data['datetime'].dt.hour

# Regist_date = transaction date moved back by D1 days, stored as a string.
# NOTE(review): D1 is presumably "days since registration" and 999999 a
# missing-value sentinel - confirm.  A NaN D1 also fails the `<` comparison,
# so both cases fall back to an offset of 0 (Regist_date == date).
list_regist = []
for d, diff in tqdm(data[['date', 'D1']].values):
    offset = -1 * diff if diff < 999999 else 0
    list_regist.append(str(date_add_days(d, offset)))

data['Regist_date'] = list_regist

100%|██████████| 1097231/1097231 [00:04<00:00, 236339.14it/s]


In [4]:
# Strip the 'ker__' marker from every column name.
data.columns = data.columns.map(lambda name: name.replace('ker__', ''))


def _cols_starting_with(head):
    """Return the column names of `data` that start with `head`, in frame order."""
    return [name for name in data.columns if name.startswith(head)]


# Column groups used to build the aggregation keys further down.
# (Disabled alternatives: V*/C* groups and a hand-picked D subset
# ['D1', 'D3', 'D6', 'D14', 'D15'].)
cols_D = _cols_starting_with('D')
cols_card = _cols_starting_with('card')
cols_addr = _cols_starting_with('addr')
cols_uid = ['uid', 'uid2', 'uid3']

col_pd = 'ProductCD'

In [5]:
# Arguments forwarded to save_feature for every feature written below.
prefix = '531'
dir_save = 'valid_use'

# Column whose non-null values are counted inside each key group.
feature = 'datetime'
# Amount column appended to some of the grouping keys.
col_amt = 'TransactionAmt'

# Every column that serves as the leading grouping key: card*, addr*, uid*.
cols_ugr = [*cols_card, *cols_addr, *cols_uid]

def parallel_agg(df, base_key):
    """Count rows per `base_key` combination and save the count as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the key column(s) plus the global `feature` column; the
        per-group count is computed on it.
    base_key : str or list of str
        Grouping column name(s).  A single name is treated as a one-element
        list.

    Returns
    -------
    None
        The result is handed to `save_feature` once for the train rows and
        once for the test rows.

    Notes
    -----
    Reads the notebook globals `data`, `df_train`, `feature`, `prefix`,
    `dir_save` and `COLUMNS_IGNORE`.  The rows the counts are attached to come
    from the global `data`, not from `df`.
    NOTE(review): referencing the full `data` / `df_train` frames from a
    function dispatched through joblib may ship them to every worker - worth
    checking whether `df` and `train_length` would suffice.
    """
    # Normalise to a list so selection, groupby and merge all operate on a
    # DataFrame (a bare string would select a Series, which has no .merge).
    keys = list(base_key) if isinstance(base_key, (list, tuple)) else [base_key]
    fname = '-'.join(keys)
    count_col = f'{fname}_Transaction_count'

    # Split the key columns back into the train / test row ranges.
    n_train = len(df_train)
    base_train = data[keys].iloc[:n_train]
    base_test = data[keys].iloc[n_train:]

    # Non-null `feature` values per key combination.  `.count().to_frame()`
    # replaces the dict-renaming `.agg({name: 'count'})`, which pandas >= 1.0
    # rejects with SpecificationError.
    df_agg = df.groupby(keys)[feature].count().to_frame(count_col)

    # Left merge keeps every base row; key combinations absent from `df_agg`
    # (e.g. those containing NaN) get NaN.
    base_train_agg = base_train.merge(df_agg, how='left', on=keys)
    base_test_agg = base_test.merge(df_agg, how='left', on=keys)

    cols_save = [count_col]

    save_feature(base_train_agg[cols_save], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(base_test_agg[cols_save],  prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)
    

# Number of joblib workers used for each batch of aggregations.
N_JOBS = 60

# Last grouping column tried with every date combination.
KEY_TAILS = ('hour', 'D6', 'D8', 'D13', 'D14', 'D15')
# Date columns placed between ProductCD and the tail column.
KEY_DATE_SETS = (['Regist_date'], ['date'], ['Regist_date', 'date'])


def build_key_lists(base_key):
    """Return the 37 grouping-key lists generated for one leading column.

    Each list has the shape
    [base_key, ProductCD, <date columns>, <tail column>, (TransactionAmt)].
    For every entry of KEY_DATE_SETS all tails are emitted first without and
    then with the amount column.  'D1' is used as a tail only for the plain
    ['date'] set without the amount column.
    """
    key_lists = []
    for date_cols in KEY_DATE_SETS:
        for amt_cols in ([], [col_amt]):
            tails = list(KEY_TAILS)
            if date_cols == ['date'] and not amt_cols:
                # The only combination that also groups by D1.
                tails.insert(1, 'D1')
            for tail in tails:
                key_lists.append([base_key, col_pd, *date_cols, tail, *amt_cols])
    return key_lists


# Wrap the list itself (not an enumerate object) so tqdm knows the total and
# can draw a proper progress bar.
for base_key in tqdm(cols_ugr):

    list_base_key = build_key_lists(base_key)

    # One task per key list; each task receives only the key columns plus the
    # counted `feature` column.  parallel_agg returns None, so `list_p` is a
    # list of None values.
    list_p = Parallel(N_JOBS)(
        [delayed(parallel_agg)(data[list_key + [feature]], list_key) for list_key in list_base_key]
    )

11it [57:56, 323.05s/it]


['c', 'a', 'r', 'd', '1', 'ProductCD', 'date', 'hour']