In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from itertools import combinations
from tqdm import tqdm
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from kaggle_utils import reduce_mem_usage, move_feature
from joblib import Parallel, delayed

In [3]:
# Column-name constants shared by the cells below.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'is_train', 'date']


def filter_feature(path):
    """Return True when ``path`` names a feature file that should be loaded.

    A path is kept if it contains any of the substrings 'fill_' or 'bin',
    the ID column name, or the target column name; otherwise False.
    """
    wanted_tokens = ('fill_', 'bin', COLUMN_ID, COLUMN_TARGET)
    return any(token in path for token in wanted_tokens)


# Load the whitelisted per-feature files of each split into one frame per split.
paths_train = list(filter(filter_feature, glob('../feature/eda_base/*_train.gz')))
df_train = parallel_load_data(paths_train)

paths_test = list(filter(filter_feature, glob('../feature/eda_base/*_test.gz')))
df_test = parallel_load_data(paths_test)

# Attach the precomputed grouping column to the training rows.
# NOTE(review): the file name suggests a month-based GroupKFold key - confirm.
COLUMN_GROUP = 'DT-M'
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group

In [7]:
# Inspect how many rows fall into each level of the binned V87 feature.
bin_v87 = df_train['bin__V87']
bin_v87.value_counts()

1.0     463708
#        89164
2.0      30832
3.0       3761
4.0       1033
0.0        694
5.0        480
6.0        205
7.0        148
8.0        147
10.0        80
9.0         75
11.0        54
12.0        34
19.0        30
13.0        25
14.0        14
15.0        14
16.0        10
17.0         8
22.0         7
18.0         5
20.0         2
21.0         2
30.0         1
24.0         1
23.0         1
26.0         1
27.0         1
28.0         1
29.0         1
25.0         1
Name: bin__V87, dtype: int64

In [3]:
"""
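How should the "Judge" rules be built? I will use what bear shared, but I also want to think about it myself.
My view is that it is a product of conditions. But then, is that just the same thing a tree does?
Multiply together the fraud probabilities of each category computed on a different period. Also, set a threshold and count.
Aren't the C columns counts of exactly that kind of suspicious trigger?
"Did a recent transaction containing the same category turn out to be fraud?" cannot be a feature, because it cannot be used on the Private set.
Maybe look at Stripe and the like.

1. First, do target encoding (TE) of card, addr, domain, device, Amt, (D, C, V, ...) per Product.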
"""

def _add_cents_and_domain_prefix(df):
    """Add ``cents`` and ``<domain>_prefix`` columns to ``df`` in place.

    * ``cents`` is the fractional part of ``TransactionAmt`` rounded to 2 digits.
    * For the first two columns whose name contains 'domain', missing values are
      replaced by '#' and ``<column>_prefix`` is set to the text before the
      first '.' of that same column.

    Returns the list of columns whose name contains 'domain', collected before
    any prefix column is added by this call.
    """
    df['cents'] = np.round(df['TransactionAmt'] - np.floor(df['TransactionAmt']), 2)

    domain_cols = [col for col in df.columns if col.count('domain')]
    for col in domain_cols[:2]:
        # Assign back instead of `df[col].fillna(..., inplace=True)`: the chained
        # in-place form does not modify `df` when pandas Copy-on-Write is active.
        df[col] = df[col].fillna('#')
        # Each prefix is derived from its own source column (previously the second
        # prefix column was built from the first domain column by mistake).
        df[col + '_prefix'] = df[col].apply(lambda x: x.split('.')[0])
    return domain_cols


# Apply the same feature engineering to both splits.
list_domain = _add_cents_and_domain_prefix(df_train)
list_domain = _add_cents_and_domain_prefix(df_test)

# Lookup table from a raw e-mail domain string to a coarse provider group
# ('google', 'microsoft', 'yahoo', 'other', ...). It is applied with Series.map
# in the loop below, so domains that are not listed here map to NaN.
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other',
          'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
          'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft',
          'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 
          'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other',
          'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo',
          'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo',
          'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo',
          'anonymous.com': 'anonymous', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo',
          'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'anonymous', 'bellsouth.net': 'other',
          'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple',
          'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other',
          'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
# Final domain tokens (text after the last '.') that the loop below rewrites to 'us'.
us_emails = ['gmail', 'net', 'edu']

def _domain_suffix(value):
    """Return the text after the last '.', or 'us' when that text is listed in ``us_emails``."""
    tail = str(value).split('.')[-1]
    return 'us' if tail in us_emails else tail


# For each e-mail domain column, add a provider-group column (`_bin`) and a
# suffix column (`_suffix`) to the training split first, then to the test split.
for c in ['P_emaildomain', 'R_emaildomain']:
    for frame in (df_train, df_test):
        frame[c + '_bin'] = frame[c].map(emails)
        frame[c + '_suffix'] = frame[c].map(_domain_suffix)


# Column groups, selected by substring of the training column names.
_domain_tags = ('prefix', '_suffix', '_bin')
list_domain = [col for col in df_train.columns if any(tag in col for tag in _domain_tags)]
list_card = [col for col in df_train.columns if 'card' in col]
list_addr = [col for col in df_train.columns if 'addr' in col]
list_amt = ['cents', 'TransactionAmt']

# Ordered list of single columns: product, cards, addresses, domain-derived, amounts.
list_single = [
    'ProductCD',
    *sorted(list_card),
    *sorted(list_addr),
    *sorted(list_domain),
    *list_amt,
]

In [10]:
# One itertools.combinations iterator per subset size, from pairs up to all columns.
# NOTE: these are lazy iterators, so each one can be consumed only once.
list_combi = [combinations(list_single, size) for size in range(2, len(list_single) + 1)]

In [None]:
#========================================================================
# Target Encoding (single columns)
# The encoding of each month is computed with that month's rows dropped,
# because the data is split with a group k-fold on DT-M.
#========================================================================
dir_save = 'create'
cols_DTM = df_train[COLUMN_GROUP].unique().tolist()


def _oof_fraud_mean(train, test, keys, prefix, dir_save, months, min_count=100):
    """Compute and save an out-of-fold fraud-rate encoding for ``keys``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the key columns; ``train`` must also hold COLUMN_GROUP
        and COLUMN_TARGET. Missing values in the key columns are filled in
        place (-999 for int/float columns, '#' otherwise), because groupby
        drops NaN keys.
    keys : str, list or tuple
        Column name(s) to group by.
    prefix : str
        Passed as the second positional argument of ``save_feature``.
    dir_save : str
        Passed as the third positional argument of ``save_feature``.
    months : list
        Values of COLUMN_GROUP; one encoding is fitted per value, using only
        the rows whose COLUMN_GROUP differs from it.
    min_count : int
        Groups with fewer fitting rows than this get the overall fraud rate of
        the fitting rows instead of their own mean.

    Notes
    -----
    Test rows are encoded from all training rows. Key combinations that do
    not occur in the fitting rows are left as NaN by the left merge.
    """
    keys = list(keys) if isinstance(keys, (list, tuple)) else [keys]

    for key in keys:
        is_number = (pd.api.types.is_integer_dtype(train[key])
                     or pd.api.types.is_float_dtype(train[key]))
        sentinel = -999 if is_number else '#'
        # Assign back rather than fillna(inplace=True) on a selected column,
        # which is a no-op on the frame under pandas Copy-on-Write.
        train[key] = train[key].fillna(sentinel)
        test[key] = test[key].fillna(sentinel)

    feature_name = '-'.join(keys) + '_fraud_mean'
    # Only the columns needed for fitting, so per-month filtering stays cheap.
    labelled = train[keys + [COLUMN_GROUP, COLUMN_TARGET]]

    def encode(fit_rows):
        # Per-group mean and row count in one pass; list-style agg avoids the
        # dict "renamer" form that pandas >= 1.0 rejects on a SeriesGroupBy.
        stats = fit_rows.groupby(keys)[COLUMN_TARGET].agg(['mean', 'count'])
        # Prior comes from the fitting rows only, so the held-out month does not leak.
        prior = fit_rows[COLUMN_TARGET].mean()
        # Small groups fall back to the prior; the result of this step is what
        # gets merged, so the threshold actually takes effect.
        smoothed = stats['mean'].where(stats['count'] >= min_count, prior)
        return smoothed.rename(feature_name).reset_index()

    folds = []
    for month in months:
        # Exclude the validation period when fitting this month's encoding.
        fold = encode(labelled[labelled[COLUMN_GROUP] != month])
        fold[COLUMN_GROUP] = month
        folds.append(fold)
    train_te = pd.concat(folds, axis=0, ignore_index=True)

    result_train = train[keys + [COLUMN_GROUP]].merge(train_te, how='left', on=keys + [COLUMN_GROUP])
    result_test = test[keys].merge(encode(labelled), how='left', on=keys)

    cols_save = [name for name in result_train.columns if name.count('_fraud_')]

    save_feature(result_train[cols_save], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(result_test[cols_save], prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)


for col in tqdm(list_single):
    _oof_fraud_mean(df_train, df_test, col, '524', dir_save, cols_DTM)

In [None]:
#========================================================================
# Target Encoding (column combinations)
# The encoding of each month is computed with that month's rows dropped,
# because the data is split with a group k-fold on DT-M.
#========================================================================
dir_save = 'create'
cols_DTM = df_train[COLUMN_GROUP].unique().tolist()


def _oof_fraud_mean(train, test, keys, prefix, dir_save, months, min_count=100):
    """Compute and save an out-of-fold fraud-rate encoding for ``keys``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the key columns; ``train`` must also hold COLUMN_GROUP
        and COLUMN_TARGET. Missing values in the key columns are filled in
        place (-999 for int/float columns, '#' otherwise), because groupby
        drops NaN keys.
    keys : str, list or tuple
        Column name(s) to group by.
    prefix : str
        Passed as the second positional argument of ``save_feature``.
    dir_save : str
        Passed as the third positional argument of ``save_feature``.
    months : list
        Values of COLUMN_GROUP; one encoding is fitted per value, using only
        the rows whose COLUMN_GROUP differs from it.
    min_count : int
        Groups with fewer fitting rows than this get the overall fraud rate of
        the fitting rows instead of their own mean.

    Notes
    -----
    Test rows are encoded from all training rows. Key combinations that do
    not occur in the fitting rows are left as NaN by the left merge.
    """
    keys = list(keys) if isinstance(keys, (list, tuple)) else [keys]

    for key in keys:
        is_number = (pd.api.types.is_integer_dtype(train[key])
                     or pd.api.types.is_float_dtype(train[key]))
        sentinel = -999 if is_number else '#'
        # Assign back rather than fillna(inplace=True) on a selected column,
        # which is a no-op on the frame under pandas Copy-on-Write.
        train[key] = train[key].fillna(sentinel)
        test[key] = test[key].fillna(sentinel)

    feature_name = '-'.join(keys) + '_fraud_mean'
    # Only the columns needed for fitting, so per-month filtering stays cheap.
    labelled = train[keys + [COLUMN_GROUP, COLUMN_TARGET]]

    def encode(fit_rows):
        # Per-group mean and row count in one pass; list-style agg avoids the
        # dict "renamer" form that pandas >= 1.0 rejects on a SeriesGroupBy.
        stats = fit_rows.groupby(keys)[COLUMN_TARGET].agg(['mean', 'count'])
        # Prior comes from the fitting rows only, so the held-out month does not leak.
        prior = fit_rows[COLUMN_TARGET].mean()
        # Small groups fall back to the prior; the result of this step is what
        # gets merged, so the threshold actually takes effect.
        smoothed = stats['mean'].where(stats['count'] >= min_count, prior)
        return smoothed.rename(feature_name).reset_index()

    folds = []
    for month in months:
        # Exclude the validation period when fitting this month's encoding.
        fold = encode(labelled[labelled[COLUMN_GROUP] != month])
        fold[COLUMN_GROUP] = month
        folds.append(fold)
    train_te = pd.concat(folds, axis=0, ignore_index=True)

    result_train = train[keys + [COLUMN_GROUP]].merge(train_te, how='left', on=keys + [COLUMN_GROUP])
    result_test = test[keys].merge(encode(labelled), how='left', on=keys)

    cols_save = [name for name in result_train.columns if name.count('_fraud_')]

    save_feature(result_train[cols_save], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(result_test[cols_save], prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)


# list_combi holds one combinations iterator per subset size; the saved
# feature prefix records how many columns were combined.
for combi in list_combi:
    for col in tqdm(combi):
        _oof_fraud_mean(df_train, df_test, col, f'524__combi{len(col)}', dir_save, cols_DTM)