In [2]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [10]:
# Column roles used throughout the notebook.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one of these substrings.
# NOTE(review): 'C', 'D' and 'V' are single letters matched anywhere in the path,
# so they also match e.g. 'TransactionDT' or 'ProductCD' -- confirm this is intended.
path_tokens = (
    COLUMN_DT, COLUMN_ID,
    'C', 'D', 'V',
    'card', 'addr', 'domain', 'Product',
)

train_paths = [
    path for path in glob('../feature/eda_base/*_train.gz')
    if any(token in path for token in path_tokens)
]
test_paths = [
    path for path in glob('../feature/eda_base/*_test.gz')
    if any(token in path for token in path_tokens)
]

# Stack train on top of test; later cells split them again by len(df_train).
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
if COLUMN_ID in data.columns:
    data = data.set_index(COLUMN_ID)

# Base tables, indexed by transaction id and stacked in the same train/test order.
base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test], axis=0)

In [11]:
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# 14400 s (= 4 h) is subtracted from every timestamp.
# NOTE(review): presumably a timezone shift -- confirm.
OFFSET_SECONDS = 14400

# TransactionDT is treated as seconds elapsed since START_DATE.
# Vectorised conversion: a missing TransactionDT becomes NaT instead of raising
# inside datetime.timedelta, which is what the row-wise apply did.
data['datetime'] = (
    pd.Timestamp(startdate)
    + pd.to_timedelta(data['TransactionDT'], unit='s')
    - pd.Timedelta(seconds=OFFSET_SECONDS)
)

# Assign the result back instead of calling fillna(inplace=True) on a column
# selection (chained assignment), and fill with a Timestamp so the column
# keeps its datetime64 dtype.
data['datetime'] = data['datetime'].fillna(pd.Timestamp(2020, 1, 1))

In [12]:
#========================================================================
# Feature
#========================================================================
# Column groups selected by name.
cols_C = [col for col in data.columns if col.startswith('C')]
cols_D = [col for col in data.columns if col.startswith('D')]
cols_V = ['V317', 'V45', 'V87', 'V314', 'V258', 'V282', 'V243', 'V201']
cols_F = [col for col in data.columns if col.count('diff') or col.count('ratio')]

# Raw domain columns; the derived '*_prefix' columns are excluded so that
# re-running this cell does not pick them up as inputs.
list_domain = [
    col for col in data.columns
    if col.count('domain') and not col.endswith('_prefix')
]

# For the first two domain columns: fill missing values with '#' and keep the
# part before the first '.' as '<column>_prefix'.
# Each prefix is derived from its OWN column (previously the second prefix was
# computed from the first domain column by mistake).
for col_domain in list_domain[:2]:
    data[col_domain] = data[col_domain].fillna('#')
    data[col_domain + '_prefix'] = data[col_domain].str.split('.').str[0]

# From here on list_domain refers to the derived prefix columns.
list_domain = [col for col in data.columns if col.count('prefix')]
list_card = [col for col in data.columns if col.count('card')]
list_addr = [col for col in data.columns if col.count('addr')]

In [13]:
#========================================================================
# Per-ProductCD copies of the C / D / V / diff-ratio features
#========================================================================
cols_feature = cols_C + cols_D + cols_V + cols_F
cols_pcd = data['ProductCD'].unique()

# The row mask of a product code does not depend on the feature, so build each
# mask once instead of twice per (feature, product) pair.
pcd_masks = [(pcd, data['ProductCD'].isin([pcd])) for pcd in cols_pcd]

# Collect the new columns first and attach them with a single concat; inserting
# them one by one into `data` gets slower with every added column.
# Keyed by name so a feature listed twice in cols_feature yields one column.
pcd_columns = {}
for col in tqdm(cols_feature):
    for pcd, mask in pcd_masks:
        feature_name = f'{col}__ProductCD-{pcd}'
        # Value of `col` on rows of this product code, NaN everywhere else.
        pcd_columns[feature_name] = data[col].where(mask).rename(feature_name)

# Drop same-named columns left by a previous run so the cell can be re-run.
data = pd.concat(
    [data.drop(columns=list(pcd_columns), errors='ignore'), *pcd_columns.values()],
    axis=1,
)


  0%|          | 0/39 [00:00<?, ?it/s][A
  3%|▎         | 1/39 [00:02<01:15,  2.00s/it][A
  5%|▌         | 2/39 [00:03<01:08,  1.85s/it][A
  8%|▊         | 3/39 [00:05<01:04,  1.78s/it][A
 10%|█         | 4/39 [00:06<01:00,  1.74s/it][A
 13%|█▎        | 5/39 [00:08<00:59,  1.74s/it][A
 15%|█▌        | 6/39 [00:10<00:58,  1.76s/it][A
 18%|█▊        | 7/39 [00:12<00:56,  1.78s/it][A
 21%|██        | 8/39 [00:14<00:56,  1.84s/it][A
 23%|██▎       | 9/39 [00:16<00:56,  1.87s/it][A
 26%|██▌       | 10/39 [00:18<00:55,  1.91s/it][A
 28%|██▊       | 11/39 [00:20<00:55,  1.98s/it][A
 31%|███       | 12/39 [00:22<00:54,  2.02s/it][A
 33%|███▎      | 13/39 [00:24<00:54,  2.09s/it][A
 36%|███▌      | 14/39 [00:26<00:53,  2.14s/it][A
 38%|███▊      | 15/39 [00:29<00:52,  2.20s/it][A
 41%|████      | 16/39 [00:31<00:52,  2.27s/it][A
 44%|████▎     | 17/39 [00:34<00:50,  2.31s/it][A
 46%|████▌     | 18/39 [00:36<00:49,  2.38s/it][A
 49%|████▊     | 19/39 [00:39<00:48,  2.44s/it]

In [46]:
#========================================================================
# Compute the deltas of each feature for combinations of several categories
#========================================================================
from itertools import combinations

# Settings read as module-level globals by parallel_agg.
feature = 'datetime'     # column whose lagged / lead differences are computed
dir_save = 'valid_use'   # forwarded to save_feature
prefix = 523             # forwarded to save_feature
length = len(df_train)   # the first `length` rows of `data` are the train rows

# debug
# base_key = ['card1', 'card2']
# df = data[list(base_key) + [feature]]
# for i in range(1):

def parallel_agg(df, base_key):
    """Save day-deltas between each row's ``feature`` and its group neighbours.

    Rows with a non-null ``feature`` are sorted by it and grouped by
    ``base_key``. For every row the 1st..5th previous and 1st..5th next values
    of ``feature`` inside its group are looked up, and these differences are
    computed (in whole days):

    * ``{keys}_past_{feature}_shift_p{k}_diff``   : current - k-th previous, k=1..5
    * ``{keys}_past_shift_p1_shift_p{k}_diff``    : previous - k-th previous, k=2..5
    * ``{keys}_future_{feature}_shift_m{k}_diff`` : current - k-th next,     k=1..5
    * ``{keys}_future_shift_m1_shift_m{k}_diff``  : next - k-th next,        k=2..5

    where ``{keys}`` is ``'-'.join(base_key)``. The result is split into the
    first ``length`` rows (train) and the remainder (test), and each part is
    handed to ``save_feature``.

    Reads the module-level names ``feature``, ``length``, ``prefix``,
    ``dir_save`` and ``COLUMNS_IGNORE``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``base_key`` columns and the ``feature`` column, in
        train-then-test row order.
    base_key : list of str
        Columns that define the groups.

    Returns
    -------
    None
        The features are written through ``save_feature``.
    """
    fname = '-'.join(base_key)
    base = df[base_key + [feature]].copy()

    # Filter and sort in one expression so we never sort a slice of `df` in place.
    ordered = df.loc[df[feature].notnull()].sort_values(by=feature)
    # Build the groupby once and reuse it for all ten shifts.
    grouped = ordered.groupby(base_key)[feature]

    # shift_p{k}: k-th previous value in the group; shift_m{k}: k-th next value.
    shifted = [grouped.shift(k).rename(f'shift_p{k}') for k in (5, 4, 3, 2, 1)]
    shifted += [grouped.shift(-k).rename(f'shift_m{k}') for k in (1, 2, 3, 4, 5)]
    # Index-aligned join: rows dropped above (null feature) get NaT shifts.
    base = base.join(pd.concat(shifted, axis=1, ignore_index=False))

    b0 = feature
    cols_save = []
    # Past deltas first, then future deltas. Future deltas are
    # "current - later" and therefore non-positive.
    for direction, tag in (('past', 'p'), ('future', 'm')):
        nearest = f'shift_{tag}1'
        # Current value against the 1st..5th neighbour.
        for k in range(1, 6):
            other = f'shift_{tag}{k}'
            name = f'{fname}_{direction}_{b0}_{other}_diff'
            base[name] = base[b0] - base[other]
            cols_save.append(name)
        # Nearest neighbour against the 2nd..5th neighbour.
        for k in range(2, 6):
            other = f'shift_{tag}{k}'
            name = f'{fname}_{direction}_{nearest}_{other}_diff'
            base[name] = base[nearest] - base[other]
            cols_save.append(name)

    # Convert timedeltas to whole days, per split, building new frames rather
    # than assigning into iloc slices of `base`.
    # NOTE(review): `.days` floors, so a negative gap shorter than one day becomes
    # -1 while a positive one becomes 0 -- confirm this asymmetry is acceptable.
    deltas = base[cols_save]
    train = deltas.iloc[:length].apply(lambda s: s.dt.days)
    test = deltas.iloc[length:].apply(lambda s: s.dt.days)

    save_feature(train[cols_save], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(test[cols_save],  prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)
    

N_JOBS = 60                       # number of joblib workers
CARD_COMBO_SIZES = (2, 3, 4, 5)   # how many card columns are combined into one key


def _run_parallel_agg(list_base_key):
    """Run parallel_agg once per key set in ``list_base_key`` using joblib.

    Each task receives only the key columns plus the ``feature`` column of
    ``data``. Returns the list of task results (parallel_agg returns None).
    """
    return Parallel(N_JOBS)([
        delayed(parallel_agg)(data[list(base_key) + [feature]], list(base_key))
        for base_key in list_base_key
    ])


# 1) Keys made of card columns only.
for n_cards in CARD_COMBO_SIZES:
    combi_card = list(combinations(list_card, n_cards))
    list_base_key = combi_card
    _run_parallel_agg(list_base_key)

# 2) Keys made of one domain-prefix column plus a combination of card columns.
for n_cards in CARD_COMBO_SIZES:
    combi_card = list(combinations(list_card, n_cards))
    list_base_key = [
        [domain] + list(card)
        for domain in list_domain
        for card in combi_card
    ]
    _run_parallel_agg(list_base_key)

[None, None, None, None, None, None, None, None, None, None, None, None]

In [47]:
# NOTE(review): leftover scratch cell -- the bare literal only echoes 1 as the cell output.
1

1