In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from itertools import combinations
from tqdm import tqdm
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from kaggle_utils import reduce_mem_usage, move_feature
from joblib import Parallel, delayed

In [5]:
# Names of the id / timestamp / target columns used throughout the notebook.
COLUMN_ID, COLUMN_DT, COLUMN_TARGET = 'TransactionID', 'TransactionDT', 'isFraud'

# Columns handed to save_feature() as ``list_ignore``.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET] + ['is_train', 'date']

def filter_feature(path):
    """Return True when ``path`` names a feature file this notebook should load.

    A path is accepted if it contains any of the markers ``'fill__'``,
    ``'bin__'`` or the target column name (``COLUMN_TARGET``); otherwise
    False is returned.
    """
    wanted_markers = ('fill__', 'bin__', COLUMN_TARGET)
    return any(marker in path for marker in wanted_markers)


# Load the train-side feature files accepted by filter_feature().
paths_train = list(filter(filter_feature, glob('../feature/eda_base/*_train.gz')))
df_train = parallel_load_data(paths_train)

# Same selection for the test-side feature files.
paths_test = list(filter(filter_feature, glob('../feature/eda_base/*_test.gz')))
df_test = parallel_load_data(paths_test)

# Attach the pre-computed fold labels to the training frame under COLUMN_GROUP.
# NOTE(review): presumably one label per training row (the file name suggests
# a month-based GroupKFold assignment) -- confirm against the producer.
COLUMN_GROUP = 'DT-M'
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group

In [3]:
"""
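How should the "judge" rules be built? I will use what bear shared, but also think about it myself.
My view is that it is a product of conditions. But then it may be doing the same thing as a tree?
Multiply together the fraud probability of each category measured on a different period. Also set thresholds and count.
Aren't the C columns counts of exactly such suspicious triggers?
"Did a recent transaction sharing the same category turn out to be fraud?" cannot be a feature, because it is unavailable on the private set.
Maybe look at Stripe and similar sources.

1. First, run target encoding (TE) of card, addr, domain, device, Amt, (D, C, V, ) per Product.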
"""

# NOTE(review): this cell reads raw columns such as 'TransactionAmt'; the
# loading cell only keeps files matching 'fill__' / 'bin__' / target, and the
# execution counts are out of order -- confirm these columns exist on a fresh
# Restart & Run All.
def _add_cents_and_domain_prefix(df):
    """Add ``cents`` and ``<domain>_prefix`` columns to ``df`` (modified in place).

    * ``cents`` is the fractional part of ``TransactionAmt`` rounded to 2 decimals.
    * For the first two columns whose name contains ``'domain'``, missing
      values are replaced by ``'#'`` and ``<column>_prefix`` is set to the text
      before the first ``'.'`` of that same column.

    The previous code built the second column's prefix from the *first* domain
    column (copy-paste slip), so both ``*_prefix`` features were identical.

    Returns the list of column names containing ``'domain'`` as found before
    the prefix columns were added.
    """
    amount = df['TransactionAmt']
    df['cents'] = np.round(amount - np.floor(amount), 2)

    domain_cols = [col for col in df.columns if col.count('domain')]
    for domain_col in domain_cols[:2]:
        # Assign back instead of a chained ``fillna(inplace=True)``, which may
        # operate on a temporary copy and leave the frame unchanged.
        df[domain_col] = df[domain_col].fillna('#')
        df[domain_col + '_prefix'] = df[domain_col].apply(lambda x: x.split('.')[0])
    return domain_cols


list_domain = _add_cents_and_domain_prefix(df_train)
list_domain = _add_cents_and_domain_prefix(df_test)

# Lookup from an email-domain string to a coarse provider label.
emails = {
    'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum',
    'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft',
    'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
    'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft',
    'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink',
    'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other',
    'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other',
    'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft',
    'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo',
    'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other',
    'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other',
    'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft',
    'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo',
    'anonymous.com': 'anonymous', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att',
    'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft',
    'mail.com': 'anonymous', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink',
    'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple',
    'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo',
    'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other',
    'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple',
}

# Domain suffix tokens that are relabelled as 'us' when the suffix feature is built.
us_emails = ['gmail', 'net', 'edu']

# For each email-domain column, derive on both frames:
#   <col>_bin    -- provider label looked up in ``emails`` (unmapped values become NaN)
#   <col>_suffix -- text after the last '.', with tokens listed in ``us_emails`` replaced by 'us'
for c in ['P_emaildomain', 'R_emaildomain']:
    for frame in (df_train, df_test):
        frame[c + '_bin'] = frame[c].map(emails)
        # str() makes missing values yield the literal suffix 'nan'.
        last_token = frame[c].map(lambda x: str(x).split('.')[-1])
        frame[c + '_suffix'] = last_token.map(lambda x: 'us' if str(x) in us_emails else x)


# Group the training columns into families by substring of the column name.
list_domain = [
    col for col in df_train.columns
    if 'prefix' in col or '_suffix' in col or '_bin' in col
]
list_card = [col for col in df_train.columns if 'card' in col]
list_addr = [col for col in df_train.columns if 'addr' in col]
list_amt = ['cents', 'TransactionAmt']

# Base columns for the combination features: product code first, then each
# family in sorted order, then the amount columns.
list_single = [
    'ProductCD',
    *sorted(list_card),
    *sorted(list_addr),
    *sorted(list_domain),
    *list_amt,
]

In [10]:
class _SizedCombinations:
    """Lazy, re-iterable, sized view of all ``size``-element combinations of ``items``.

    ``itertools.combinations`` returns a one-shot iterator: once a loop has
    walked it, iterating again yields nothing, so re-running the cell that
    consumes ``list_combi`` would silently do no work. This wrapper creates a
    fresh iterator on every ``iter()`` call and exposes ``len()`` so progress
    bars can show a total, without materialising the combinations in memory.
    """

    def __init__(self, items, size):
        # Snapshot the items so later reassignment of the source list has no effect.
        self._items = tuple(items)
        self._size = size

    def __iter__(self):
        return combinations(self._items, self._size)

    def __len__(self):
        # Binomial coefficient C(n, size) via an exact integer running product.
        n_items = len(self._items)
        total = 1
        for k in range(1, self._size + 1):
            total = total * (n_items - self._size + k) // k
        return total


# One entry per combination size, from pairs up to the full column set.
list_combi = [_SizedCombinations(list_single, size) for size in range(2, len(list_single) + 1)]

In [7]:
#========================================================================
# Target Encoding
# Each month's encoding is computed with that month's rows dropped
# (validation uses GroupKFold on DT-M).
#========================================================================
dir_save = 'create'
cols_DTM = df_train[COLUMN_GROUP].unique().tolist()

# Groups with fewer fitting rows than this fall back to the prior (mean target).
TE_MIN_COUNT = 100

# NOTE: this replaces any ``list_single`` built by an earlier cell.
list_single = [col for col in df_train.columns if col.count('fill__') or col.count('bin__')]

for col in tqdm(list_single):
    # Accept a single column name or a tuple/list of names.
    keys = list(col) if isinstance(col, (tuple, list)) else [col]

    base_train = df_train[keys + [COLUMN_GROUP]]
    base_test = df_test[keys]

    fname = '-'.join(keys)
    feature_name = f'{fname}_fraud_mean'

    fold_maps = []
    test_TE = None
    for dtm in cols_DTM + ['test']:
        # Exclude the validation period; the test encoding uses all training rows.
        if dtm != 'test':
            fit_rows = df_train[df_train[COLUMN_GROUP] != dtm]
        else:
            fit_rows = df_train

        stats = fit_rows.groupby(keys)[COLUMN_TARGET].agg(['mean', 'count'])
        # The prior comes from the fitting rows only, so the held-out month's
        # target never enters its own feature.
        prior = fit_rows[COLUMN_TARGET].mean()
        # Low-support groups get the prior. Previously this fill was written to
        # a concatenated copy while the unfilled map was merged, so it never
        # reached the saved feature.
        te_map = (
            stats['mean']
            .where(stats['count'] >= TE_MIN_COUNT, prior)
            .rename(feature_name)
            .reset_index()
        )

        if dtm != 'test':
            te_map[COLUMN_GROUP] = dtm
            fold_maps.append(te_map)
        else:
            test_TE = te_map

    train_TE = pd.concat(fold_maps, axis=0, ignore_index=True)
    # Left merges keep the row order and row count of the base frames.
    result_train = base_train.merge(train_TE, how='left', on=keys + [COLUMN_GROUP])
    result_test = base_test.merge(test_TE, how='left', on=keys)

    cols_save = [name for name in result_train.columns if name.count('_fraud_')]

    save_feature(result_train[cols_save], '524', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(result_test[cols_save],  '524', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)


  0%|          | 0/124 [00:00<?, ?it/s][A
  1%|          | 1/124 [00:07<15:52,  7.75s/it][A
  2%|▏         | 2/124 [00:15<15:35,  7.67s/it][A

(590540,) | bin__C7_fraud_mean
(506691,) | bin__C7_fraud_mean



  2%|▏         | 3/124 [00:22<15:22,  7.63s/it][A

(590540,) | fill__P_emaildomain_bin_fraud_mean
(506691,) | fill__P_emaildomain_bin_fraud_mean



  3%|▎         | 4/124 [00:32<16:39,  8.33s/it][A

(590540,) | fill__cnt_V87_fraud_mean
(506691,) | fill__cnt_V87_fraud_mean



  4%|▍         | 5/124 [00:41<16:47,  8.46s/it][A

(590540,) | fill__cnt_V78_fraud_mean
(506691,) | fill__cnt_V78_fraud_mean



  5%|▍         | 6/124 [00:49<16:04,  8.17s/it][A

(590540,) | fill__cnt_V165_fraud_mean
(506691,) | fill__cnt_V165_fraud_mean



  6%|▌         | 7/124 [00:57<16:16,  8.35s/it][A

(590540,) | fill__cnt_C5_fraud_mean
(506691,) | fill__cnt_C5_fraud_mean



  6%|▋         | 8/124 [01:06<16:31,  8.55s/it][A

(590540,) | fill__cnt_addr2_fraud_mean
(506691,) | fill__cnt_addr2_fraud_mean



  7%|▋         | 9/124 [01:16<16:50,  8.78s/it][A

(590540,) | fill__cnt_V314_fraud_mean
(506691,) | fill__cnt_V314_fraud_mean



  8%|▊         | 10/124 [01:23<15:51,  8.35s/it][A

(590540,) | fill__cnt_V156_fraud_mean
(506691,) | fill__cnt_V156_fraud_mean



  9%|▉         | 11/124 [01:30<15:16,  8.11s/it][A

(590540,) | fill__M2_fraud_mean
(506691,) | fill__M2_fraud_mean



 10%|▉         | 12/124 [01:38<14:56,  8.00s/it][A

(590540,) | fill__cnt_V128_fraud_mean
(506691,) | fill__cnt_V128_fraud_mean



 10%|█         | 13/124 [01:46<14:32,  7.86s/it][A

(590540,) | fill__addr1_fraud_mean
(506691,) | fill__addr1_fraud_mean



 11%|█▏        | 14/124 [01:54<14:53,  8.12s/it][A

(590540,) | fill__R_emaildomain_bin_fraud_mean
(506691,) | fill__R_emaildomain_bin_fraud_mean



 12%|█▏        | 15/124 [02:03<15:04,  8.30s/it][A

(590540,) | fill__cnt_V127_fraud_mean
(506691,) | fill__cnt_V127_fraud_mean



 13%|█▎        | 16/124 [02:10<14:20,  7.97s/it][A

(590540,) | fill__cnt_V315_fraud_mean
(506691,) | fill__cnt_V315_fraud_mean



 14%|█▎        | 17/124 [02:18<13:52,  7.78s/it][A

(590540,) | bin__TransactionAmt_fraud_mean
(506691,) | bin__TransactionAmt_fraud_mean



 15%|█▍        | 18/124 [02:26<13:56,  7.90s/it][A

(590540,) | fill__cnt_V318_fraud_mean
(506691,) | fill__cnt_V318_fraud_mean



 15%|█▌        | 19/124 [02:34<13:56,  7.97s/it][A

(590540,) | fill__card3_fraud_mean
(506691,) | fill__card3_fraud_mean



 16%|█▌        | 20/124 [02:42<13:43,  7.92s/it][A

(590540,) | fill__card5_fraud_mean
(506691,) | fill__card5_fraud_mean



 17%|█▋        | 21/124 [02:50<13:39,  7.95s/it][A

(590540,) | fill__cnt_V90_fraud_mean
(506691,) | fill__cnt_V90_fraud_mean



 18%|█▊        | 22/124 [02:58<13:24,  7.89s/it][A

(590540,) | bin__C12_fraud_mean
(506691,) | bin__C12_fraud_mean



 19%|█▊        | 23/124 [03:05<13:13,  7.85s/it][A

(590540,) | fill__cnt_V6_fraud_mean
(506691,) | fill__cnt_V6_fraud_mean



 19%|█▉        | 24/124 [03:15<13:47,  8.28s/it][A

(590540,) | fill__cnt_cents_fraud_mean
(506691,) | fill__cnt_cents_fraud_mean



 20%|██        | 25/124 [03:23<13:54,  8.43s/it][A

(590540,) | fill__cnt_V133_fraud_mean
(506691,) | fill__cnt_V133_fraud_mean



 21%|██        | 26/124 [03:32<13:45,  8.43s/it][A

(590540,) | fill__cnt_TransactionAmt_fraud_mean
(506691,) | fill__cnt_TransactionAmt_fraud_mean



 22%|██▏       | 27/124 [03:40<13:24,  8.29s/it][A

(590540,) | fill__cnt_card6_fraud_mean
(506691,) | fill__cnt_card6_fraud_mean



 23%|██▎       | 28/124 [03:48<13:08,  8.22s/it][A

(590540,) | fill__cnt_V38_fraud_mean
(506691,) | fill__cnt_V38_fraud_mean



 23%|██▎       | 29/124 [03:57<13:25,  8.48s/it][A

(590540,) | fill__cnt_V130_fraud_mean
(506691,) | fill__cnt_V130_fraud_mean



 24%|██▍       | 30/124 [04:05<12:59,  8.29s/it][A

(590540,) | fill__card6_fraud_mean
(506691,) | fill__card6_fraud_mean



 25%|██▌       | 31/124 [04:12<12:32,  8.09s/it][A

(590540,) | fill__cnt_V48_fraud_mean
(506691,) | fill__cnt_V48_fraud_mean



 26%|██▌       | 32/124 [04:21<12:32,  8.18s/it][A

(590540,) | bin__Amt_DIV200_fraud_mean
(506691,) | bin__Amt_DIV200_fraud_mean



 27%|██▋       | 33/124 [04:29<12:21,  8.15s/it][A

(590540,) | fill__cnt_V201_fraud_mean
(506691,) | fill__cnt_V201_fraud_mean



 27%|██▋       | 34/124 [04:36<11:46,  7.85s/it][A

(590540,) | fill__cnt_V83_fraud_mean
(506691,) | fill__cnt_V83_fraud_mean



 28%|██▊       | 35/124 [04:43<11:18,  7.62s/it][A

(590540,) | bin__C3_fraud_mean
(506691,) | bin__C3_fraud_mean



 29%|██▉       | 36/124 [04:50<10:54,  7.44s/it][A

(590540,) | fill__cnt_V320_fraud_mean
(506691,) | fill__cnt_V320_fraud_mean



 30%|██▉       | 37/124 [04:57<10:31,  7.26s/it][A

(590540,) | bin__C13_fraud_mean
(506691,) | bin__C13_fraud_mean



 31%|███       | 38/124 [05:06<11:01,  7.69s/it][A

(590540,) | fill__cnt_V4_fraud_mean
(506691,) | fill__cnt_V4_fraud_mean



 31%|███▏      | 39/124 [05:14<11:08,  7.87s/it][A

(590540,) | fill__cnt_V258_fraud_mean
(506691,) | fill__cnt_V258_fraud_mean



 32%|███▏      | 40/124 [05:21<10:43,  7.66s/it][A

(590540,) | bin__C9_fraud_mean
(506691,) | bin__C9_fraud_mean



 33%|███▎      | 41/124 [05:29<10:32,  7.62s/it][A

(590540,) | bin__C14_fraud_mean
(506691,) | bin__C14_fraud_mean



 34%|███▍      | 42/124 [05:38<11:03,  8.10s/it][A

(590540,) | fill__cnt_card3_fraud_mean
(506691,) | fill__cnt_card3_fraud_mean



 35%|███▍      | 43/124 [05:46<10:43,  7.94s/it][A

(590540,) | fill__addr2_fraud_mean
(506691,) | fill__addr2_fraud_mean



 35%|███▌      | 44/124 [05:55<11:07,  8.35s/it][A

(590540,) | fill__M8_fraud_mean
(506691,) | fill__M8_fraud_mean



 36%|███▋      | 45/124 [06:03<10:52,  8.26s/it][A

(590540,) | bin__Amt_DIV50_fraud_mean
(506691,) | bin__Amt_DIV50_fraud_mean



 37%|███▋      | 46/124 [06:12<10:55,  8.41s/it][A

(590540,) | fill__cnt_V29_fraud_mean
(506691,) | fill__cnt_V29_fraud_mean



 38%|███▊      | 47/124 [06:19<10:24,  8.11s/it][A

(590540,) | fill__cnt_V307_fraud_mean
(506691,) | fill__cnt_V307_fraud_mean



 39%|███▊      | 48/124 [06:27<10:11,  8.05s/it][A

(590540,) | fill__cnt_C4_fraud_mean
(506691,) | fill__cnt_C4_fraud_mean



 40%|███▉      | 49/124 [06:36<10:21,  8.28s/it][A

(590540,) | fill__cnt_V53_fraud_mean
(506691,) | fill__cnt_V53_fraud_mean



 40%|████      | 50/124 [06:43<09:59,  8.10s/it][A

(590540,) | fill__cnt_V308_fraud_mean
(506691,) | fill__cnt_V308_fraud_mean



 41%|████      | 51/124 [06:51<09:38,  7.93s/it][A

(590540,) | fill__cnt_V267_fraud_mean
(506691,) | fill__cnt_V267_fraud_mean



 42%|████▏     | 52/124 [06:58<09:20,  7.78s/it][A

(590540,) | bin__Amt_DIV30_fraud_mean
(506691,) | bin__Amt_DIV30_fraud_mean



 43%|████▎     | 53/124 [07:08<09:41,  8.19s/it][A

(590540,) | fill__cnt_addr1_fraud_mean
(506691,) | fill__cnt_addr1_fraud_mean



 44%|████▎     | 54/124 [07:17<09:53,  8.48s/it][A

(590540,) | fill__cnt_V61_fraud_mean
(506691,) | fill__cnt_V61_fraud_mean



 44%|████▍     | 55/124 [07:24<09:27,  8.22s/it][A

(590540,) | fill__cnt_C11_fraud_mean
(506691,) | fill__cnt_C11_fraud_mean



 45%|████▌     | 56/124 [07:32<09:07,  8.06s/it][A

(590540,) | fill__cnt_V2_fraud_mean
(506691,) | fill__cnt_V2_fraud_mean



 46%|████▌     | 57/124 [07:40<08:50,  7.92s/it][A

(590540,) | fill__cnt_V312_fraud_mean
(506691,) | fill__cnt_V312_fraud_mean



 47%|████▋     | 58/124 [07:47<08:39,  7.87s/it][A

(590540,) | fill__cnt_V94_fraud_mean
(506691,) | fill__cnt_V94_fraud_mean



 48%|████▊     | 59/124 [07:55<08:24,  7.76s/it][A

(590540,) | fill__cnt_card1_fraud_mean
(506691,) | fill__cnt_card1_fraud_mean



 48%|████▊     | 60/124 [08:03<08:21,  7.84s/it][A

(590540,) | fill__M7_fraud_mean
(506691,) | fill__M7_fraud_mean



 49%|████▉     | 61/124 [08:11<08:17,  7.89s/it][A

(590540,) | fill__cnt_C13_fraud_mean
(506691,) | fill__cnt_C13_fraud_mean



 50%|█████     | 62/124 [08:20<08:38,  8.37s/it][A

(590540,) | fill__cnt_V3_fraud_mean
(506691,) | fill__cnt_V3_fraud_mean



 51%|█████     | 63/124 [08:28<08:24,  8.27s/it][A

(590540,) | fill__cnt_C1_fraud_mean
(506691,) | fill__cnt_C1_fraud_mean



 52%|█████▏    | 64/124 [08:36<08:12,  8.21s/it][A

(590540,) | fill__cnt_V294_fraud_mean
(506691,) | fill__cnt_V294_fraud_mean



 52%|█████▏    | 65/124 [08:45<08:01,  8.16s/it][A

(590540,) | fill__cnt_C2_fraud_mean
(506691,) | fill__cnt_C2_fraud_mean



 53%|█████▎    | 66/124 [08:53<07:52,  8.14s/it][A

(590540,) | fill__M4_fraud_mean
(506691,) | fill__M4_fraud_mean



 54%|█████▍    | 67/124 [09:01<07:46,  8.18s/it][A

(590540,) | fill__cnt_card5_fraud_mean
(506691,) | fill__cnt_card5_fraud_mean



 55%|█████▍    | 68/124 [09:09<07:34,  8.12s/it][A

(590540,) | fill__cnt_V7_fraud_mean
(506691,) | fill__cnt_V7_fraud_mean



 56%|█████▌    | 69/124 [09:17<07:23,  8.07s/it][A

(590540,) | fill__cnt_V131_fraud_mean
(506691,) | fill__cnt_V131_fraud_mean



 56%|█████▋    | 70/124 [09:26<07:36,  8.46s/it][A

(590540,) | bin__cents_fraud_mean
(506691,) | bin__cents_fraud_mean



 57%|█████▋    | 71/124 [09:34<07:25,  8.40s/it][A

(590540,) | fill__card1_fraud_mean
(506691,) | fill__card1_fraud_mean



 58%|█████▊    | 72/124 [09:44<07:37,  8.79s/it][A

(590540,) | bin__Amt_DIV100_fraud_mean
(506691,) | bin__Amt_DIV100_fraud_mean



 59%|█████▉    | 73/124 [09:52<07:18,  8.60s/it][A

(590540,) | fill__cnt_C10_fraud_mean
(506691,) | fill__cnt_C10_fraud_mean



 60%|█████▉    | 74/124 [10:00<06:59,  8.38s/it][A

(590540,) | fill__cnt_C3_fraud_mean
(506691,) | fill__cnt_C3_fraud_mean



 60%|██████    | 75/124 [10:08<06:46,  8.30s/it][A

(590540,) | fill__cnt_V91_fraud_mean
(506691,) | fill__cnt_V91_fraud_mean



 61%|██████▏   | 76/124 [10:18<06:53,  8.62s/it][A

(590540,) | fill__cnt_V313_fraud_mean
(506691,) | fill__cnt_V313_fraud_mean



 62%|██████▏   | 77/124 [10:26<06:41,  8.55s/it][A

(590540,) | bin__C6_fraud_mean
(506691,) | bin__C6_fraud_mean



 63%|██████▎   | 78/124 [10:34<06:30,  8.48s/it][A

(590540,) | fill__cnt_card4_fraud_mean
(506691,) | fill__cnt_card4_fraud_mean



 64%|██████▎   | 79/124 [10:42<06:15,  8.34s/it][A

(590540,) | fill__cnt_C9_fraud_mean
(506691,) | fill__cnt_C9_fraud_mean



 65%|██████▍   | 80/124 [10:51<06:07,  8.34s/it][A

(590540,) | fill__cnt_V259_fraud_mean
(506691,) | fill__cnt_V259_fraud_mean



 65%|██████▌   | 81/124 [10:59<05:54,  8.24s/it][A

(590540,) | fill__cnt_C7_fraud_mean
(506691,) | fill__cnt_C7_fraud_mean



 66%|██████▌   | 82/124 [11:07<05:42,  8.15s/it][A

(590540,) | fill__M6_fraud_mean
(506691,) | fill__M6_fraud_mean



 67%|██████▋   | 83/124 [11:16<05:49,  8.53s/it][A

(590540,) | fill__cnt_V49_fraud_mean
(506691,) | fill__cnt_V49_fraud_mean



 68%|██████▊   | 84/124 [11:24<05:32,  8.32s/it][A

(590540,) | fill__cnt_V310_fraud_mean
(506691,) | fill__cnt_V310_fraud_mean



 69%|██████▊   | 85/124 [11:33<05:31,  8.49s/it][A

(590540,) | fill__cnt_R_emaildomain_fraud_mean
(506691,) | fill__cnt_R_emaildomain_fraud_mean



 69%|██████▉   | 86/124 [11:41<05:18,  8.37s/it][A

(590540,) | fill__cnt_V187_fraud_mean
(506691,) | fill__cnt_V187_fraud_mean



 70%|███████   | 87/124 [11:49<05:05,  8.26s/it][A

(590540,) | fill__cnt_V5_fraud_mean
(506691,) | fill__cnt_V5_fraud_mean



 71%|███████   | 88/124 [11:58<05:04,  8.45s/it][A

(590540,) | fill__cnt_V45_fraud_mean
(506691,) | fill__cnt_V45_fraud_mean



 72%|███████▏  | 89/124 [12:06<04:56,  8.48s/it][A

(590540,) | fill__R_emaildomain_prefix_fraud_mean
(506691,) | fill__R_emaildomain_prefix_fraud_mean



 73%|███████▎  | 90/124 [12:15<04:48,  8.50s/it][A

(590540,) | bin__C8_fraud_mean
(506691,) | bin__C8_fraud_mean



 73%|███████▎  | 91/124 [12:23<04:37,  8.40s/it][A

(590540,) | fill__card4_fraud_mean
(506691,) | fill__card4_fraud_mean



 74%|███████▍  | 92/124 [12:32<04:30,  8.46s/it][A

(590540,) | fill__cnt_V283_fraud_mean
(506691,) | fill__cnt_V283_fraud_mean



 75%|███████▌  | 93/124 [12:40<04:24,  8.54s/it][A

(590540,) | fill__M3_fraud_mean
(506691,) | fill__M3_fraud_mean



 76%|███████▌  | 94/124 [12:49<04:20,  8.68s/it][A

(590540,) | fill__M1_fraud_mean
(506691,) | fill__M1_fraud_mean



 77%|███████▋  | 95/124 [12:58<04:13,  8.73s/it][A

(590540,) | bin__C1_fraud_mean
(506691,) | bin__C1_fraud_mean



 77%|███████▋  | 96/124 [13:06<03:58,  8.52s/it][A

(590540,) | fill__cnt_V70_fraud_mean
(506691,) | fill__cnt_V70_fraud_mean



 78%|███████▊  | 97/124 [13:14<03:45,  8.36s/it][A

(590540,) | fill__cnt_C12_fraud_mean
(506691,) | fill__cnt_C12_fraud_mean



 79%|███████▉  | 98/124 [13:22<03:35,  8.29s/it][A

(590540,) | fill__card2_fraud_mean
(506691,) | fill__card2_fraud_mean



 80%|███████▉  | 99/124 [13:31<03:33,  8.53s/it][A

(590540,) | fill__cnt_V243_fraud_mean
(506691,) | fill__cnt_V243_fraud_mean



 81%|████████  | 100/124 [13:40<03:21,  8.41s/it][A

(590540,) | fill__M9_fraud_mean
(506691,) | fill__M9_fraud_mean



 81%|████████▏ | 101/124 [13:48<03:13,  8.40s/it][A

(590540,) | bin__C2_fraud_mean
(506691,) | bin__C2_fraud_mean



 82%|████████▏ | 102/124 [13:56<03:02,  8.32s/it][A

(590540,) | fill__R_emaildomain_suffix_fraud_mean
(506691,) | fill__R_emaildomain_suffix_fraud_mean



 83%|████████▎ | 103/124 [14:06<03:04,  8.80s/it][A

(590540,) | fill__cnt_P_emaildomain_fraud_mean
(506691,) | fill__cnt_P_emaildomain_fraud_mean



 84%|████████▍ | 104/124 [14:14<02:51,  8.58s/it][A

(590540,) | fill__cnt_V76_fraud_mean
(506691,) | fill__cnt_V76_fraud_mean



 85%|████████▍ | 105/124 [14:24<02:50,  8.97s/it][A

(590540,) | fill__cnt_V44_fraud_mean
(506691,) | fill__cnt_V44_fraud_mean



 85%|████████▌ | 106/124 [14:33<02:43,  9.11s/it][A

(590540,) | fill__P_emaildomain_suffix_fraud_mean
(506691,) | fill__P_emaildomain_suffix_fraud_mean



 86%|████████▋ | 107/124 [14:43<02:38,  9.35s/it][A

(590540,) | fill__cnt_V62_fraud_mean
(506691,) | fill__cnt_V62_fraud_mean



 87%|████████▋ | 108/124 [14:51<02:22,  8.93s/it][A

(590540,) | fill__cnt_C14_fraud_mean
(506691,) | fill__cnt_C14_fraud_mean



 88%|████████▊ | 109/124 [14:59<02:09,  8.65s/it][A

(590540,) | bin__C10_fraud_mean
(506691,) | bin__C10_fraud_mean



 89%|████████▊ | 110/124 [15:07<01:57,  8.37s/it][A

(590540,) | bin__C5_fraud_mean
(506691,) | bin__C5_fraud_mean



 90%|████████▉ | 111/124 [15:16<01:52,  8.65s/it][A

(590540,) | fill__P_emaildomain_prefix_fraud_mean
(506691,) | fill__P_emaildomain_prefix_fraud_mean



 90%|█████████ | 112/124 [15:25<01:43,  8.60s/it][A

(590540,) | bin__C11_fraud_mean
(506691,) | bin__C11_fraud_mean



 91%|█████████ | 113/124 [15:33<01:31,  8.36s/it][A

(590540,) | bin__C4_fraud_mean
(506691,) | bin__C4_fraud_mean



 92%|█████████▏| 114/124 [15:40<01:22,  8.23s/it][A

(590540,) | fill__cnt_V282_fraud_mean
(506691,) | fill__cnt_V282_fraud_mean



 93%|█████████▎| 115/124 [15:48<01:12,  8.11s/it][A

(590540,) | fill__cnt_V317_fraud_mean
(506691,) | fill__cnt_V317_fraud_mean



 94%|█████████▎| 116/124 [15:56<01:03,  7.99s/it][A

(590540,) | fill__cnt_V281_fraud_mean
(506691,) | fill__cnt_V281_fraud_mean



 94%|█████████▍| 117/124 [16:04<00:55,  7.96s/it][A

(590540,) | fill__cnt_C8_fraud_mean
(506691,) | fill__cnt_C8_fraud_mean



 95%|█████████▌| 118/124 [16:13<00:50,  8.34s/it][A

(590540,) | fill__cnt_C6_fraud_mean
(506691,) | fill__cnt_C6_fraud_mean



 96%|█████████▌| 119/124 [16:21<00:41,  8.22s/it][A

(590540,) | fill__cnt_card2_fraud_mean
(506691,) | fill__cnt_card2_fraud_mean



 97%|█████████▋| 120/124 [16:30<00:34,  8.51s/it][A

(590540,) | fill__cnt_V265_fraud_mean
(506691,) | fill__cnt_V265_fraud_mean



 98%|█████████▊| 121/124 [16:38<00:24,  8.28s/it][A

(590540,) | fill__cnt_V54_fraud_mean
(506691,) | fill__cnt_V54_fraud_mean



 98%|█████████▊| 122/124 [16:47<00:17,  8.62s/it][A

(590540,) | fill__cnt_V37_fraud_mean
(506691,) | fill__cnt_V37_fraud_mean



 99%|█████████▉| 123/124 [16:55<00:08,  8.39s/it][A

(590540,) | fill__cnt_V67_fraud_mean
(506691,) | fill__cnt_V67_fraud_mean



100%|██████████| 124/124 [17:04<00:00,  8.39s/it][A
[A

In [None]:
#========================================================================
# Target Encoding (column combinations)
# Each month's encoding is computed with that month's rows dropped
# (validation uses GroupKFold on DT-M).
#========================================================================
# Progress note -- last feature written by a previous run:
#   ProductCD-card1-card4-card5-R_emaildomain_suffix_fraud_mean

dir_save = 'create'
cols_DTM = df_train[COLUMN_GROUP].unique().tolist()

# Groups with fewer fitting rows than this fall back to the prior (mean target).
TE_MIN_COUNT = 100

for combi in list_combi:
    for col in tqdm(combi):
        # Accept a single column name or a tuple/list of names.
        keys = list(col) if isinstance(col, (tuple, list)) else [col]

        # Fill missing key values so those rows form their own group
        # (groupby drops NaN keys by default). As before, this updates
        # df_train / df_test themselves; assigning back replaces the chained
        # ``fillna(inplace=True)``, which may act on a temporary copy.
        for key in keys:
            filler = -999 if pd.api.types.is_numeric_dtype(df_train[key]) else '#'
            df_train[key] = df_train[key].fillna(filler)
            df_test[key] = df_test[key].fillna(filler)

        base_train = df_train[keys + [COLUMN_GROUP]]
        base_test = df_test[keys]

        fname = '-'.join(keys)
        feature_name = f'{fname}_fraud_mean'

        fold_maps = []
        test_TE = None
        for dtm in cols_DTM + ['test']:
            # Exclude the validation period; the test encoding uses all training rows.
            if dtm != 'test':
                fit_rows = df_train[df_train[COLUMN_GROUP] != dtm]
            else:
                fit_rows = df_train

            stats = fit_rows.groupby(keys)[COLUMN_TARGET].agg(['mean', 'count'])
            # The prior comes from the fitting rows only, so the held-out
            # month's target never enters its own feature.
            prior = fit_rows[COLUMN_TARGET].mean()
            # Low-support groups get the prior. Previously this fill was
            # written to a concatenated copy while the unfilled map was
            # merged, so it never reached the saved feature.
            te_map = (
                stats['mean']
                .where(stats['count'] >= TE_MIN_COUNT, prior)
                .rename(feature_name)
                .reset_index()
            )

            if dtm != 'test':
                te_map[COLUMN_GROUP] = dtm
                fold_maps.append(te_map)
            else:
                test_TE = te_map

        train_TE = pd.concat(fold_maps, axis=0, ignore_index=True)
        # Left merges keep the row order and row count of the base frames.
        result_train = base_train.merge(train_TE, how='left', on=keys + [COLUMN_GROUP])
        result_test = base_test.merge(test_TE, how='left', on=keys)

        cols_save = [name for name in result_train.columns if name.count('_fraud_')]

        save_feature(result_train[cols_save], f'524__combi{len(keys)}', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
        save_feature(result_test[cols_save],  f'524__combi{len(keys)}', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)