In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days, date_add_times
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
# Column-name constants shared by the rest of the notebook.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one of these
# substrings.
# NOTE(review): 'C' and 'V' match ANY path containing that capital letter,
# not only the C*/V* feature files -- confirm this breadth is intended.
PATH_KEYWORDS = (
    COLUMN_DT,
    COLUMN_ID,
    'time_zone',
    'hour',
    'Product',
    'card',
    'addr',
    'domain',
    'C',
    'V',
)


def _select_feature_paths(pattern):
    """Return the paths matching ``pattern`` that contain any of PATH_KEYWORDS."""
    return [
        path for path in glob(pattern)
        if any(keyword in path for keyword in PATH_KEYWORDS)
    ]


train_paths = _select_feature_paths('../feature/eda_base/*_train.gz')
test_paths = _select_feature_paths('../feature/eda_base/*_test.gz')

# Load train and test features and stack them (train rows first) so that
# positional slicing with `length_train` can separate them again later.
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
if COLUMN_ID in data.columns:
    data = data.set_index(COLUMN_ID)

# The per-split frames are no longer needed once they are concatenated.
del df_train, df_test
gc.collect()

# Base frames indexed by transaction id; `length_train` is the number of
# training rows and is used to split `data` positionally.
base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
length_train = base_train.shape[0]
base = pd.concat([base_train, base_test], axis=0)

In [4]:
#========================================================================
# date
#========================================================================
# TransactionDT is treated as a number of seconds elapsed since START_DATE;
# the result is then shifted back by 14400 seconds (4 hours).
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
TIME_SHIFT = datetime.timedelta(seconds=14400)

# Vectorized conversion instead of a per-row Python lambda.
data['datetime'] = startdate + pd.to_timedelta(data['TransactionDT'], unit='s') - TIME_SHIFT
# Assign the filled column back: a chained `fillna(..., inplace=True)` on a
# column selection is not guaranteed to modify `data`.
data['datetime'] = data['datetime'].fillna(pd.Timestamp(2020, 1, 1))
data['date'] = data['datetime'].dt.date

# String keys "<date>-<time_zone>" and "<date>-<hour>". Built by zipping the
# columns, which avoids a row-wise apply and positional indexing (`x[0]`) on a
# label-indexed Series.
data['date-zone'] = [f'{day}-{zone}' for day, zone in zip(data['date'], data['time_zone'])]
data['date-hour'] = [f'{day}-{hour}' for day, hour in zip(data['date'], data['hour'])]

In [7]:
#========================================================================
# C and V features per ProductCD
#========================================================================
# Marker embedded in every column generated by this cell. Source columns are
# selected only among columns WITHOUT the marker, so re-running the cell does
# not feed its own outputs (e.g. 'C1__ProductCD-W') back in as inputs.
PCD_MARKER = '__ProductCD-'

cols_C = [col for col in data.columns
          if col.startswith('C') and PCD_MARKER not in col]
cols_V = ['V317', 'V45', 'V87', 'V314', 'V258', 'V282', 'V243']
cols_507 = [col for col in data.columns
            if col.startswith('507') and PCD_MARKER not in col]
cols_feature = cols_C + cols_V + cols_507
cols_pcd = data['ProductCD'].unique()

# One boolean row mask per ProductCD value, computed once instead of twice per
# (column, ProductCD) pair. `isin` is kept (rather than `==`) so a missing
# ProductCD value, if present, still matches its own rows.
pcd_masks = [(pcd, data['ProductCD'].isin([pcd])) for pcd in cols_pcd]

for col in tqdm(cols_feature):
    for pcd, is_pcd in pcd_masks:
        feature_name = f'{col}__ProductCD-{pcd}'
        # -1 everywhere, then the original value on rows of this ProductCD.
        data[feature_name] = -1
        data.loc[is_pcd, feature_name] = data.loc[is_pcd, col]

100%|██████████| 23/23 [00:47<00:00,  2.40s/it]


In [10]:
# Columns whose name contains 'domain'. As before, only the first two are
# processed.
list_domain = [col for col in data.columns if col.count('domain')]
for col_domain in list_domain[:2]:
    # Missing values become '#' so the string split below works on every row.
    # The filled column is assigned back instead of using a chained in-place
    # fillna, which is not guaranteed to modify `data`.
    data[col_domain] = data[col_domain].fillna('#')
    # Prefix = text before the first '.'. Each prefix is derived from ITS OWN
    # column; previously the second prefix column was built from the first
    # domain column by mistake.
    data[col_domain + '_prefix'] = data[col_domain].map(lambda value: value.split('.')[0])

# Column groups used as aggregation keys further down the notebook.
list_domain = [col for col in data.columns if col.count('prefix')]
list_card = [col for col in data.columns if col.count('card')]
list_addr = [col for col in data.columns if col.count('addr')]
list_date = [col for col in data.columns if col.count('date') and not col.count('datetime')]

In [11]:
# Every column whose name contains the 'ProductCD-' marker.
cols_feature_P = [name for name in data.columns if 'ProductCD-' in name]
# Disabled alternative selection, kept for reference:
# cols_feature_noP = [col for col in data.columns if not col.count('ProductCD') and not col.count('DT') and (col.count('C') or col.count('V')) ]

In [None]:
from itertools import combinations 
# Make it possible to aggregate per user over delimited time periods.
# `prefix` and `dir_save` are forwarded to save_feature() by parallel_agg below.
prefix = '606'
dir_save = 'valid'
# Overrides the earlier `cols_feature` list: only the per-ProductCD columns are aggregated.
cols_feature = cols_feature_P

        
def parallel_agg(df, base_key, feature):
    """Aggregate ``feature`` per ``base_key`` group and save the result as features.

    For every group of ``base_key`` the sum, mean and standard deviation of
    ``feature`` are computed over ALL rows of ``df`` (train and test together),
    then mapped back onto each row. The first ``length_train`` rows are treated
    as the training part and the remaining rows as the test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``base_key`` column(s) and ``feature``.
    base_key : str or list of str
        Column name(s) to group by.
    feature : str
        Name of the column to aggregate.

    Returns
    -------
    None
        Results are written through ``save_feature``. Nothing is saved when the
        first aggregated column has at most one distinct non-null value in
        either the train or the test part.

    Notes
    -----
    Reads the notebook-level names ``length_train``, ``prefix``, ``dir_save``
    and ``COLUMNS_IGNORE``.
    """
    if isinstance(base_key, list):
        fname = '-'.join(base_key)
        key_cols = base_key
    else:
        fname = base_key
        key_cols = [base_key]

    # One distinct output name per statistic. The previous dict used the
    # '<...>_sum' key twice, so the sum was dropped and the mean was stored
    # under the '_sum' name.
    stat_names = {stat: f'{fname}_{feature}_{stat}' for stat in ('sum', 'mean', 'std')}
    cols_save = list(stat_names.values())

    # List-of-functions aggregation followed by a rename works across pandas
    # versions; dict-based renaming on a single column is rejected by
    # pandas >= 1.0.
    result = (
        df.groupby(base_key)[feature]
        .agg(list(stat_names))
        .rename(columns=stat_names)
        .reset_index()
    )

    # A left merge keeps the row order of the left frame, so the aggregated
    # values stay aligned with the original train / test rows.
    agg_train = df.iloc[:length_train][key_cols].merge(result, on=base_key, how="left")
    agg_test = df.iloc[length_train:][key_cols].merge(result, on=base_key, how="left")

    # Skip features that are constant (or entirely missing) in either part.
    if agg_train[cols_save[0]].nunique() > 1 and agg_test[cols_save[0]].nunique() > 1:
        save_feature(agg_train[cols_save], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
        save_feature(agg_test[cols_save],  prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)
    
    
#========================================================================
# Aggregation drivers
#========================================================================
N_JOBS = 60                    # worker count passed to joblib.Parallel
CARD_COMBO_SIZES = (3, 4, 5)   # sizes of the card-column combinations used as keys

# NOTE(review): the "Card and addr" section keys on 'addr1', while the
# "Domain and addr" and "Card and Domain and Addr" sections key on 'addr'.
# If `data` has no 'addr' column those sections would raise KeyError after
# hours of compute, so fall back to 'addr1' in that case -- confirm which
# column was intended.
ADDR_KEY = 'addr' if 'addr' in data.columns else 'addr1'


def run_parallel_agg(base_key):
    """Run ``parallel_agg`` in parallel for every feature in ``cols_feature``.

    ``base_key`` is the list of column names to group by. Each task receives
    only the key columns plus the single feature column it aggregates.
    """
    Parallel(N_JOBS)(
        [delayed(parallel_agg)(
            data[base_key + [feature]], base_key, feature
        ) for feature in cols_feature])


def run_card_combinations(extra_keys_list):
    """Aggregate over every 3-, 4- and 5-column combination of ``list_card``.

    ``extra_keys_list`` is a list of key lists; each one is appended in turn to
    every card combination to form a group key. Pass ``[[]]`` to group by the
    card columns alone. One progress bar is shown per combination size.
    """
    for size in CARD_COMBO_SIZES:
        for combi in tqdm(list(combinations(list_card, size))):
            for extra_keys in extra_keys_list:
                run_parallel_agg(list(combi) + extra_keys)


#========================================================================
# Only Card
#========================================================================
run_card_combinations([[]])

#========================================================================
# Card and Domain
#========================================================================
run_card_combinations([[domain] for domain in list_domain])

#========================================================================
# Card and addr
#========================================================================
run_card_combinations([['addr1']])

#========================================================================
# Domain and addr
#========================================================================
for domain in tqdm(list_domain):
    run_parallel_agg([ADDR_KEY, domain])

#========================================================================
# Card and Domain and Addr
#========================================================================
run_card_combinations([[domain, ADDR_KEY] for domain in list_domain])

100%|██████████| 20/20 [05:26<00:00, 17.68s/it]
100%|██████████| 15/15 [05:06<00:00, 21.76s/it]
100%|██████████| 6/6 [02:29<00:00, 24.73s/it]
 85%|████████▌ | 17/20 [33:34<05:58, 119.34s/it]