In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel
from itertools import combinations

In [5]:
# ========================================================================
# Configuration and data loading
# ========================================================================
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its file name contains any of these substrings.
# NOTE(review): 'C' and 'D' are plain substring matches, so they also select
# names such as 'ProductCD' -- confirm that this breadth is intended.
FEATURE_NAME_KEYS = (COLUMN_DT, COLUMN_ID, 'C', 'D', 'card', 'addr', 'domain', 'Product')


def select_feature_paths(paths, keys=FEATURE_NAME_KEYS):
    """Return, sorted, the paths whose file name contains at least one key.

    Only the file name is inspected, so the selection does not depend on the
    directory the notebook is run from. Sorting makes the load order (and with
    it the column order of the loaded frame) independent of the order in which
    ``glob`` happens to list the directory.
    """
    return sorted(
        path for path in paths
        if any(key in os.path.basename(path) for key in keys)
    )


train_paths = select_feature_paths(glob('../feature/eda_base/*_train.gz'))
test_paths = select_feature_paths(glob('../feature/eda_base/*_test.gz'))

# Train rows come first and test rows second; later cells split `data`
# positionally at len(df_train).
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
if COLUMN_ID in data.columns:
    data = data.set_index(COLUMN_ID)

base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test], axis=0)

In [18]:
# ========================================================================
# Calendar date as an integer (yyyymmdd)
# ========================================================================
START_DATE = '2017-12-01'   # timestamp that the TransactionDT second offsets are added to
DT_SHIFT_SECONDS = 14400    # 4 hours, subtracted from every timestamp
                            # (presumably a timezone correction -- TODO confirm)

if 'date' not in data.columns:
    # No earlier cell creates 'date', so on a fresh kernel the lookup below
    # would raise KeyError. Rebuild it from the raw second offsets, following
    # the derivation that used to live here as commented-out code.
    elapsed = pd.to_timedelta(data[COLUMN_DT], unit='s')
    transaction_time = elapsed + pd.Timestamp(START_DATE) - pd.Timedelta(seconds=DT_SHIFT_SECONDS)
    # Missing offsets get a sentinel date so the integer conversion below cannot fail.
    transaction_time = transaction_time.fillna(pd.Timestamp('2020-01-01'))
    data['date'] = transaction_time.dt.date

# '2017-12-01' -> 20171201
data['yyyymmdd'] = data['date'].map(lambda x: int(str(x).replace('-', '')))

In [7]:
#========================================================================
# Column groups and domain prefixes
#========================================================================
cols_C = [col for col in data.columns if col.startswith('C')]
cols_D = [col for col in data.columns if col.startswith('D')]

# The derived '*_prefix' columns also contain 'domain'; skip them so that
# re-running this cell does not build prefixes of prefixes.
list_domain = [col for col in data.columns
               if col.count('domain') and not col.endswith('_prefix')]

for col_domain in list_domain:
    # Assign the filled column back instead of filling a selection in place,
    # which is chained assignment and is not guaranteed to reach `data`.
    data[col_domain] = data[col_domain].fillna('#')
    # Each prefix is taken from its own column: the text before the first '.'.
    data[col_domain + '_prefix'] = data[col_domain].map(lambda x: x.split('.')[0])

In [5]:
#========================================================================
# C features per ProductCD
#========================================================================
# cols_feature = [col for col in data.columns if col.count('__ratio')]
# cols_pcd = data['ProductCD'].unique()

# for col in tqdm(cols_feature):
#     for pcd in cols_pcd:
#         feature_name = f'{col}__ProductCD-{pcd}'
#         data[feature_name] = np.nan
#         data.loc[data['ProductCD'].isin([pcd]), feature_name] = data.loc[data['ProductCD'].isin([pcd]), col]

100%|██████████| 126/126 [07:29<00:00,  6.40s/it]


In [39]:
# Settings shared by the first/last aggregation below.
dir_save = 'valid_use'     # forwarded to save_feature
prefix = 522               # forwarded to save_feature
length = len(df_train)     # the first `length` rows of `data` are the train rows

# Columns to aggregate.
# Earlier alternatives: feature = 'yyyymmdd', list_feature = ['C1', 'V201']
list_feature = ['D1', 'D3', 'C14']

def parallel_agg(df, base_key, target_col=None):
    """Save the first and last sorted value of one column per key group.

    Rows whose target column is null are dropped and the remainder is sorted
    by the target column. The first and last value within every ``base_key``
    group are then joined back onto all rows of ``df``. The two resulting
    columns are split at the notebook-level ``length`` into a train part and a
    test part, and each part is handed to ``save_feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``base_key`` columns and the target column.
    base_key : list of str
        Columns to group by.
    target_col : str, optional
        Column to aggregate. When omitted, the notebook-level ``feature``
        variable is used, which is what two-argument callers rely on.

    Returns
    -------
    None
        The result is written through ``save_feature``.
    """
    if target_col is None:
        target_col = feature  # notebook-level loop variable
    fname = '-'.join(base_key)
    col_first = f'{fname}_first_{target_col}'
    col_last = f'{fname}_last_{target_col}'

    # Filtering and sorting both return new frames, so nothing is modified in
    # place on a slice of the caller's data.
    ordered = df[df[target_col].notnull()].sort_values(by=target_col)
    df_fl = (
        ordered.groupby(base_key)[target_col]
        .agg(['first', 'last'])
        .rename(columns={'first': col_first, 'last': col_last})
        .reset_index()
    )

    # df_fl has one row per key, so the left join yields exactly one output row
    # per input row, in input order; the positional split below depends on that.
    joined = df[base_key].merge(df_fl, how='left', on=base_key)

    # Name the two output columns explicitly so a key column that happens to
    # contain '_first' or '_last' is never saved by accident.
    cols_save = [col_first, col_last]
    train = joined.iloc[:length][cols_save]
    test = joined.iloc[length:][cols_save]

    print(train.head())
    save_feature(train, prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(test, prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

N_JOBS = 60  # worker count handed to every joblib.Parallel call


def build_base_keys(list_card, list_domain):
    """Return every group-key combination, in the order it is aggregated.

    The result holds all 3- and 4-column card combinations, followed by each
    domain column paired with every 2-, 3- and 4-column card combination.
    Every entry is a list of column names.
    """
    keys = []
    for size in (3, 4):
        keys.extend(list(combo) for combo in combinations(list_card, size))
    for size in (2, 3, 4):
        for domain in list_domain:
            for combo in combinations(list_card, size):
                keys.append([domain] + list(combo))
    return keys


# The column groups do not change while features are being saved, so they are
# computed once instead of once per feature.
list_domain = [col for col in data.columns if col.count('prefix')]
list_card = [col for col in data.columns if col.count('card')]
list_addr = [col for col in data.columns if col.count('addr')]  # not used in this cell
list_base_key = build_base_keys(list_card, list_domain)

# `feature` has to stay a notebook-level name: parallel_agg reads it as a
# global when it is called with two arguments.
for feature in list_feature:
    # A generator lets joblib slice the sub-frames as it dispatches them
    # instead of materialising every one of them up front.
    Parallel(N_JOBS)(
        delayed(parallel_agg)(data[base_key + [feature]], base_key)
        for base_key in list_base_key
    )