In [30]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
# Names of the id, time-offset and target columns used throughout the notebook.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
# Passed as `list_ignore` to save_feature further down.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# Glob templates for the feature files; {split} is replaced by 'train' or 'test'.
FEATURE_GLOB_TEMPLATES = (
    '../feature/eda_base/*_{split}.gz',
    '../feature/org_use/526*_{split}.gz',
    '../feature/raw_use/ker__uid*_{split}.gz',
)

# A path is kept when it contains at least one of these substrings
# (or both '526' and 'mean', see is_selected_feature_path) ...
PATH_INCLUDE_TOKENS = (
    COLUMN_DT, COLUMN_ID, COLUMN_TARGET,
    'V', 'C', 'D',
    'uid_', 'uid2_t', 'uid3_t',  # 'uid4_t' / 'uid5_t' are currently switched off
    'card', 'addr', 'domain', 'TransactionAmt', 'Product',
)
# ... and none of these.
PATH_EXCLUDE_TOKENS = ('fill', 'bin', '129', '130')


def is_selected_feature_path(path):
    """Return True when `path` passes the include / exclude substring filter.

    The substrings are matched against the whole path string (directories
    included), exactly as the previous inline filter did.
    """
    included = (
        any(token in path for token in PATH_INCLUDE_TOKENS)
        or ('526' in path and 'mean' in path)
    )
    excluded = any(token in path for token in PATH_EXCLUDE_TOKENS)
    return included and not excluded


def collect_feature_paths(split):
    """List the selected feature files for one split ('train' or 'test').

    Each glob result is sorted because glob returns matches in arbitrary
    order; sorting keeps the load order stable between runs while keeping
    the eda_base -> org_use -> raw_use grouping.
    """
    paths = []
    for template in FEATURE_GLOB_TEMPLATES:
        paths += sorted(glob(template.format(split=split)))
    return [path for path in paths if is_selected_feature_path(path)]


# One shared filter for both splits, so train and test cannot drift apart.
train_paths = collect_feature_paths('train')
test_paths = collect_feature_paths('test')

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
# Train rows first, test rows after; train_length marks the boundary.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
train_length = df_train.shape[0]

In [32]:
# Calendar date that TransactionDT == 0 is mapped to (before the shift below).
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# Fixed shift subtracted from every timestamp: 14400 s = 4 h.
# NOTE(review): presumably a timezone correction - confirm.
DT_SHIFT_SECONDS = 14400
# D1 values at or above this bound get a zero-day offset (see loop below).
D1_UPPER_BOUND = 999999

# TransactionDT is treated as a number of seconds elapsed since START_DATE.
data['datetime'] = data['TransactionDT'].apply(
    lambda x: startdate
    + datetime.timedelta(seconds=x)
    - datetime.timedelta(seconds=DT_SHIFT_SECONDS)
)
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is a chained in-place update and is a no-op under
# pandas Copy-on-Write. The sentinel is a datetime (not a date) so that the
# `.date()` call on the next line works on filled rows as well.
data['datetime'] = data['datetime'].fillna(datetime.datetime(2020, 1, 1))
data['date'] = data['datetime'].map(lambda x: x.date())

# Regist_date = transaction date shifted back by D1 days.
# NaN fails the `<` comparison, so missing D1 also falls back to a 0-day shift.
list_regist = [
    str(date_add_days(d, -1 * diff if diff < D1_UPPER_BOUND else 0))
    for d, diff in tqdm(data[['date', 'D1']].values)
]

data['Regist_date'] = list_regist

100%|██████████| 1097231/1097231 [00:04<00:00, 239846.95it/s]


In [46]:
# Per-transaction d3 / d8 "progress" values computed elsewhere, keyed by TransactionID.
prog = read_pkl_gzip('../output/0920_ieee__d3_d8_progress_ProductCD.gz').set_index(COLUMN_ID)

# Index `data` by TransactionID only once, so re-running this cell does not
# raise KeyError on a column that has already become the index.
if data.index.name != COLUMN_ID:
    data = data.set_index(COLUMN_ID)

# Column assignment aligns on the shared TransactionID index; ids missing from
# `prog` become NaN and are then replaced by the string '0'.
for progress_col in ('d3_progress', 'd8_progress'):
    data[progress_col] = prog[progress_col]
    data[progress_col] = data[progress_col].fillna('0').astype('str')

In [64]:
# V-prefixed columns whose name contains 'mean', plus the V258 column.
cols_V = [name for name in data.columns if name.startswith('V') and 'mean' in name]
cols_V = cols_V + ['V258']

# Every column whose name starts with 'C'.
cols_C = [name for name in data.columns if name.startswith('C')]

# Hand-picked D columns (rather than every column starting with 'D').
cols_D = ['D1', 'D6', 'D14', 'D15']

In [None]:
#========================================================================
# FE Aggregation User ID & TimeSeries Date
#========================================================================

# Goal: make it possible to aggregate per user over sliced time periods.

# Both values are forwarded to save_feature() when the aggregates are stored.
prefix = '529'          # second positional argument of save_feature
dir_save = 'valid_use'  # third positional argument of save_feature


def parallel_agg(df, base_key, feature):
    """Compute the per-group mean and standard deviation of one feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the `base_key` and `feature` columns.
    base_key : str
        Column to group by.
    feature : str
        Numeric column to summarise within each group.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct `base_key` values, with the two columns
        `{base_key}_agg_{feature}_mean` and `{base_key}_agg_{feature}_std`.
        The std is NaN for groups that contain a single row.
    """
    # Passing a {new_name: func} dict to SeriesGroupBy.agg is rejected by
    # pandas >= 1.0 ("nested renamer is not supported"), so aggregate with a
    # plain list and rename the resulting columns explicitly.
    result = df.groupby(base_key)[feature].agg(['mean', 'std'])
    result.columns = [f'{base_key}_agg_{feature}_{stat}' for stat in result.columns]
    return result
    
# Grouping keys; each key yields its own batch of aggregate features.
cols_ugr = ['d3_progress', 'd8_progress']

# Numeric columns summarised (mean / std) within every group.
cols_feature = ['TransactionAmt'] + cols_V + cols_C + cols_D

# Number of workers handed to joblib.Parallel.
N_JOBS = 60

for col in cols_feature:
    data[col] = data[col].astype('float')

# `data` holds the train rows first and the test rows after them.
n_train = len(df_train)

for base_key in tqdm(cols_ugr):

    base_train = data[[base_key]].iloc[:n_train]
    base_test = data[[base_key]].iloc[n_train:]

    # One task per feature; every task returns a frame indexed by base_key.
    list_p = Parallel(n_jobs=N_JOBS)(
        delayed(parallel_agg)(data[[base_key, feature]], base_key, feature)
        for feature in cols_feature
    )

    # Join the per-feature results side by side on the shared group index.
    df_agg = pd.concat(list_p, axis=1)
    df_agg.reset_index(inplace=True)

    # Left join so that every train / test row is kept.
    base_train_agg = base_train.merge(df_agg, how='left', on=base_key)
    base_test_agg = base_test.merge(df_agg, how='left', on=base_key)

    # Keep only the generated aggregate columns (drops the key column itself).
    cols_save = [col for col in base_train_agg.columns if col.count('agg')]

    save_feature(base_train_agg[cols_save], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(base_test_agg[cols_save], prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

In [67]:
# Preview the aggregate table left over from the final loop iteration;
# show a bounded slice rather than the whole frame.
df_agg.head(10)

Unnamed: 0,d8_progress,d8_progress_agg_TransactionAmt_mean,d8_progress_agg_TransactionAmt_std
0,0,136.146623,245.863088
1,1,500.000000,
2,2,125.000000,35.355339
3,3,125.000000,35.355339
4,4,200.000000,
5,5,225.000000,176.776695
6,6,166.666667,57.735027
7,7,90.000000,22.360680
8,8,183.333333,28.867513
9,9,50.000000,
