In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET]

train_paths = glob('../feature/eda_base/*_train.gz')
test_paths = glob('../feature/eda_base/*_test.gz')

train_paths = [path for path in train_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count('time_zone')
               or path.count('hour')
               or path.count('C')
               or path.count('D')
               or path.count('Product')
              ]
test_paths = [path for path in test_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count('time_zone')
               or path.count('hour')
               or path.count('C')
               or path.count('D')
               or path.count('Product')
              ]

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
if COLUMN_ID in data.columns:
    data.set_index(COLUMN_ID, inplace=True)

base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test], axis=0)

cols_num = get_numeric_features(data, COLUMNS_IGNORE)
cols_num = [col for col in cols_num if col.count('C') or col.count('D')]
data[cols_num] = data[cols_num].astype('float32')

df_user_id_ca = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr.csv').set_index(COLUMN_ID)
df_user_id_cap = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr_pemail.csv').set_index(COLUMN_ID)
df_user_id_capm = pd.read_csv('../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv').set_index(COLUMN_ID)

data['user_id_card_addr'] = df_user_id_ca['predicted_user_id']
data['user_id_card_addr_pemail'] = df_user_id_cap['predicted_user_id']
data['user_id_card_addr_pemail_M'] = df_user_id_capm['predicted_user_id']

In [3]:
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
df_train['datetime'] = df_train['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x) - datetime.timedelta(seconds = 14400) ))
df_test['datetime'] = df_test['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x) - datetime.timedelta(seconds = 14400) ))
df_train['datetime'].fillna(datetime.date(2020, 1, 1), inplace=True)
df_test['datetime'].fillna(datetime.date(2020, 1, 1), inplace=True)
df_train['date'] = df_train['datetime'].map(lambda x: x.date())
df_test['date']  =  df_test['datetime'].map(lambda x: x.date())

In [4]:
#========================================================================
# ProductCDあたりのC
#========================================================================
cols_C = [col for col in data.columns if col.startswith('C')]
cols_pcd = data['ProductCD'].unique()

for col in tqdm(cols_C):
    for pcd in cols_pcd:
        feature_name = f'{col}__ProductCD-{pcd}'
        data[feature_name] = -1
        data.loc[data['ProductCD'].isin([pcd]), feature_name] = data.loc[data['ProductCD'].isin([pcd]), col]

100%|██████████| 14/14 [00:09<00:00,  1.16it/s]


In [28]:
#========================================================================
# Save Feature
#========================================================================
dir_save = 'org_use'
cols_feature = [col for col in df_train.columns if col.count('__Pro')]
save_feature(df_train[cols_feature], '502', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
save_feature(df_test[cols_feature], '502', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | C12__ProductCD-W
(590540,) | C12__ProductCD-H
(590540,) | C12__ProductCD-C
(590540,) | C12__ProductCD-S
(590540,) | C12__ProductCD-R
(590540,) | C1__ProductCD-W
(590540,) | C1__ProductCD-H
(590540,) | C1__ProductCD-C
(590540,) | C1__ProductCD-S
(590540,) | C1__ProductCD-R
(590540,) | C6__ProductCD-W
(590540,) | C6__ProductCD-H
(590540,) | C6__ProductCD-C
(590540,) | C6__ProductCD-S
(590540,) | C6__ProductCD-R
(590540,) | C14__ProductCD-W
(590540,) | C14__ProductCD-H
(590540,) | C14__ProductCD-C
(590540,) | C14__ProductCD-S
(590540,) | C14__ProductCD-R
(590540,) | C13__ProductCD-W
(590540,) | C13__ProductCD-H
(590540,) | C13__ProductCD-C
(590540,) | C13__ProductCD-S
(590540,) | C13__ProductCD-R
(590540,) | C3__ProductCD-W
(590540,) | C3__ProductCD-H
(590540,) | C3__ProductCD-C
(590540,) | C3__ProductCD-S
(590540,) | C3__ProductCD-R
(590540,) | C9__ProductCD-W
(590540,) | C9__ProductCD-H
(590540,) | C9__ProductCD-C
(590540,) | C9__ProductCD-S
(590540,) | C9__ProductCD-R
(5905

In [30]:
df_train.drop(cols_feature, axis=1, inplace=True)
df_test.drop(cols_feature, axis=1, inplace=True)

In [5]:
#========================================================================
# FE Categorical Encoding 
#========================================================================

cols_C = [col for col in data.columns if col.startswith('C') and col.count('Product')]
df_cat = data[cols_C].copy()

for col in tqdm(cols_C):
    num = df_cat[col].value_counts().shape[0]
    df_cat[col].fillna(-1, inplace=True)
    tmp_cols = [col]
    if num>15:
        cnt_feature = get_cnt_feature(df_cat[col].to_frame(), tmp_cols)
#         label_feature = get_label_feature(df_cat[col].to_frame(), tmp_cols)
        df_cat = df_cat.join(cnt_feature)
#         df_cat = df_cat.join(label_feature)
    elif num>2:
        cnt_feature = get_cnt_feature(df_cat[col].to_frame(), tmp_cols)
#         label_feature = get_label_feature(df_cat[col].to_frame(), tmp_cols)
        dummie_feature = get_dummie_feature(df_cat[col].to_frame(), tmp_cols)
        df_cat = df_cat.join(cnt_feature)
        df_cat = df_cat.join(dummie_feature)
#         df_cat = df_cat.join(label_feature)
    elif num<=2:
#         label_feature = get_label_feature(df_cat[col].to_frame(), tmp_cols)
#         df_cat = df_cat.join(label_feature)
        pass
    else:
        print(col)
    df_cat.drop(col, axis=1, inplace=True)

100%|██████████| 70/70 [01:07<00:00,  1.62it/s]


In [7]:
data_cat = data.join(df_cat)
data_cat.head()

Unnamed: 0_level_0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,C6__ProductCD-S_2.0_dummie,C6__ProductCD-S_4.0_dummie,C6__ProductCD-S_5.0_dummie,cnt__C6__ProductCD-R,cnt__C7__ProductCD-C,cnt__C8__ProductCD-H,cnt__C8__ProductCD-C,cnt__C8__ProductCD-S,cnt__C8__ProductCD-R,cnt__C9__ProductCD-W
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,1023885,959446,1034837,959446,1074185,1023885,420354
2987001,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,1023885,959446,1034837,959446,1074185,1023885,44990
2987002,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,1023885,959446,1034837,959446,1074185,1023885,420354
2987003,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,0,0,0,1023885,959446,1034837,959446,1074185,1023885,420354
2987004,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,1023885,959446,55290,959446,1074185,1023885,296574


In [30]:
#========================================================================
# Save Feature
#========================================================================
dir_save = 'org_use'
tmp_train = df_cat.iloc[:len(df_train)]
tmp_test = df_cat.iloc[len(df_train):]

save_feature(tmp_train, '502', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
save_feature(tmp_test, '502', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | cnt__C12
(590540,) | label__C12
(590540,) | cnt__C1
(590540,) | label__C1
(590540,) | cnt__C6
(590540,) | label__C6
(590540,) | cnt__C14
(590540,) | label__C14
(590540,) | cnt__C13
(590540,) | label__C13
(590540,) | cnt__C3
(590540,) | label__C3
(590540,) | cnt__C9
(590540,) | label__C9
(590540,) | cnt__C7
(590540,) | label__C7
(590540,) | cnt__C4
(590540,) | label__C4
(590540,) | cnt__C11
(590540,) | label__C11
(590540,) | cnt__C2
(590540,) | label__C2
(590540,) | cnt__C8
(590540,) | label__C8
(590540,) | cnt__C10
(590540,) | label__C10
(590540,) | cnt__C5
(590540,) | label__C5
(506691,) | cnt__C12
(506691,) | label__C12
(506691,) | cnt__C1
(506691,) | label__C1
(506691,) | cnt__C6
(506691,) | label__C6
(506691,) | cnt__C14
(506691,) | label__C14
(506691,) | cnt__C13
(506691,) | label__C13
(506691,) | cnt__C3
(506691,) | label__C3
(506691,) | cnt__C9
(506691,) | label__C9
(506691,) | cnt__C7
(506691,) | label__C7
(506691,) | cnt__C4
(506691,) | label__C4
(506691,) | cnt__C

In [31]:
del df_cat, tmp_train, tmp_test
gc.collect()

80

In [32]:
sorted([col for col in data.columns if not col.count('C')])

['D1',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'TransactionDT',
 'TransactionID',
 'date',
 'datetime',
 'hour',
 'time_zone',
 'user_id_card_addr_pemail_M']

In [8]:
#========================================================================
# FE Aggregation User ID
#========================================================================
#========================================================================
# predicted_user_idを使った集計がある場合、どのuser_idがわかるようにする
#========================================================================
# df_feat = data.copy()
df_feat = data_cat.copy()

train_idx = base_train.index
test_idx = base_test.index
dir_save = 'org_use'
cols_C = [col for col in df_feat.columns if col.startswith('C') and not col.count('dummie') and not col.count('label')]

list_key = [
    'user_id_card_addr',
    'user_id_card_addr_pemail',
    'user_id_card_addr_pemail_M',
]

def get_new_columns(name, aggs):
    return [name + '_' + k + '_' + agg for k in aggs.keys() for agg in aggs[k]]


#========================================================================
# Parallel
#========================================================================
if len(cols_C)>60:
    n_jobs = 60
else:
    n_jobs = len(cols_C)
arg_list = get_parallel_arg_list(n_jobs=n_jobs, arg_list=cols_C)
arg_df_list = []
for arg_cols_num in tqdm(arg_list):
    use_cols = list(set(list_key + arg_cols_num))
    tmp = df_feat[use_cols]
    print(tmp.shape)
    arg_df_list.append(tmp)

 17%|█▋        | 10/60 [00:00<00:01, 43.12it/s]

(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)


 30%|███       | 18/60 [00:00<00:01, 37.27it/s]

(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)


 43%|████▎     | 26/60 [00:00<00:00, 35.24it/s]

(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)


 57%|█████▋    | 34/60 [00:00<00:00, 32.79it/s]

(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)


 70%|███████   | 42/60 [00:01<00:00, 31.78it/s]

(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)


 83%|████████▎ | 50/60 [00:01<00:00, 31.13it/s]

(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)


 97%|█████████▋| 58/60 [00:01<00:00, 30.30it/s]

(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)
(1097231, 4)


100%|██████████| 60/60 [00:02<00:00, 29.68it/s]

(1097231, 28)





In [13]:
# for df, agg_cols in zip(arg_df_list, arg_list):
def parallel_agg(df, agg_cols):
    error_keys = []
    error_cols = []
    for col in agg_cols:
        aggs = {}
#         aggs[col] = ['mean', 'max', 'min', 'std']
        aggs[col] = ['max', 'min']
        
        for key in list_key:
            
            tmp_base_train = base_train.join(df[key])
            tmp_base_test = base_test.join(df[key])
            
            try:
                base_agg = df[key].to_frame()
            except AttributeError:
                error_keys.append(key)
                error_cols += agg_cols
            
            df_agg = df.groupby(key).agg(aggs)
            df_agg.columns = get_new_columns(key+'_', aggs)
            max_col = [col for col in df_agg.columns if col.count('_max')][0]
            min_col = [col for col in df_agg.columns if col.count('_min')][0]
            df_agg[max_col+'_min_diff'] = df_agg[max_col] - df_agg[min_col]
            
            df_agg.drop([max_col, min_col], axis=1, inplace=True)
            
            df_agg.reset_index(inplace=True)
            
            base_train_agg = tmp_base_train.merge(df_agg, on=key, how='left')
            base_test_agg = tmp_base_test.merge(df_agg, on=key, how='left')
            
            del df_agg, tmp_base_train, tmp_base_test
            gc.collect()
            
            print(base_train_agg.shape, base_test_agg.shape)
            cols_feature = [
                col for col in base_train_agg.columns 
                if col not in COLUMNS_IGNORE and col != key and col != 'D1']
            save_feature(base_train_agg[cols_feature], '502', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
            save_feature(base_test_agg[cols_feature],  '502', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)
            
            del base_train_agg, base_test_agg
            gc.collect()

In [14]:
err = Parallel(n_jobs)([delayed(parallel_agg)(d, ac) for d, ac in zip(arg_df_list, arg_list)])

In [None]:
df_feat.head()