In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
# ------------------------------------------------------------------------
# Configuration and data loading
# ------------------------------------------------------------------------
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one of these
# substrings. The match is a plain, case-sensitive substring test on the whole
# path, so 'C' also selects any other file whose path contains a capital C.
# 'D' is currently disabled; add it to the tuple to load the D features too.
FEATURE_PATH_KEYWORDS = (COLUMN_DT, COLUMN_ID, 'time_zone', 'hour', 'C', 'Product')


def select_feature_paths(paths, keywords=FEATURE_PATH_KEYWORDS):
    """Return the entries of ``paths`` that contain at least one of ``keywords``.

    Parameters
    ----------
    paths : iterable of str
        Candidate file paths.
    keywords : iterable of str
        Substrings to look for; one hit is enough to keep a path.

    Returns
    -------
    list of str
        The kept paths, in their original order.
    """
    return [path for path in paths if any(key in path for key in keywords)]


# The same filter is applied to both splits so train and test stay in sync.
train_paths = select_feature_paths(glob('../feature/eda_base/*_train.gz'))
test_paths = select_feature_paths(glob('../feature/eda_base/*_test.gz'))

# Train rows come first, test rows second.
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
if COLUMN_ID in data.columns:
    data = data.set_index(COLUMN_ID)
# NOTE(review): the user-id columns assigned at the end of this cell are aligned
# on the index. If COLUMN_ID was not among the loaded columns, `data` keeps the
# positional index from the concat above - confirm that alignment is intended.

base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test], axis=0)

# Cast the numeric columns whose name contains 'C' or 'D' to float32.
cols_num = get_numeric_features(data, COLUMNS_IGNORE)
cols_num = [col for col in cols_num if 'C' in col or 'D' in col]
data[cols_num] = data[cols_num].astype('float32')

# Predicted user ids, one CSV per matching rule, each indexed by COLUMN_ID.
df_user_id_ca = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr.csv').set_index(COLUMN_ID)
df_user_id_cap = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr_pemail.csv').set_index(COLUMN_ID)
df_user_id_capm = pd.read_csv('../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv').set_index(COLUMN_ID)
df_user_id_bear = pd.read_csv('../output/same_user_pattern/20190901_user_ids_share.csv').set_index(COLUMN_ID)

# Column assignment aligns each Series with `data` on the index.
data['user_id_card_addr'] = df_user_id_ca['predicted_user_id']
data['user_id_card_addr_pemail'] = df_user_id_cap['predicted_user_id']
data['user_id_card_addr_pemail_M'] = df_user_id_capm['predicted_user_id']
data['user_id_bear'] = df_user_id_bear['predicted_user_id']

In [3]:
#========================================================================
# Pairwise diff and ratio between the C features
# (idea: group similar C features together)
#========================================================================
from itertools import combinations

# Raw C columns only. The features derived below also start with 'C'
# (e.g. 'C1-C2__diff'), so they are excluded here; otherwise re-running this
# cell would build pairs of already-derived features.
cols_C = [
    col for col in data.columns
    if col.startswith('C') and '__diff' not in col and '__ratio' not in col
]

# Disabled experiment: restrict the pairs to correlated groups of C features.
# df_corr = data[cols_C].corr()
# c1_group = df_corr[df_corr['C1']>0.9].index
# list_remain_group = list(set(df_corr.columns) - set(c1_group))
# df_corr[list_remain_group].loc[list_remain_group]
# c13_group = ['C13', 'C5', 'C9']
# C3 has no group

# combi_c1_group = combinations(c1_group, 2)
# for (f1, f2) in tqdm(list(combi_c1_group)):
#     data[f'{f1}-{f2}__diff'] = data[f1] - data[f2]
#     data[f'{f1}-{f2}__ratio'] = data[f1] / (data[f2]+1)

combi_C = list(combinations(cols_C, 2))

# Collect the new columns first and attach them with one concat; inserting
# them into `data` one by one fragments the frame.
pair_features = {}
for (f1, f2) in tqdm(combi_C):
    pair_features[f'{f1}-{f2}__diff'] = data[f1] - data[f2]
    # The denominator is shifted by 1 so a zero value does not divide by zero.
    pair_features[f'{f1}-{f2}__ratio'] = data[f1] / (data[f2] + 1)

if pair_features:
    data = pd.concat(
        [
            # Drop columns left over from a previous run so a re-run replaces
            # them instead of duplicating them.
            data.drop(columns=list(pair_features), errors='ignore'),
            pd.DataFrame(pair_features),
        ],
        axis=1,
    )

100%|██████████| 91/91 [00:01<00:00, 65.63it/s]


In [None]:
#========================================================================
# C pair features split by ProductCD
#========================================================================
# Source columns are the diff / ratio features. Columns already produced by this
# cell (their names contain '__ProductCD-') are excluded so that re-running the
# cell does not split already-split features again.
cols_feature = [
    col for col in data.columns
    if ('__diff' in col or '__ratio' in col) and '__ProductCD-' not in col
]
cols_pcd = data['ProductCD'].unique()

# One boolean row mask per ProductCD value, computed once instead of twice per
# (feature, ProductCD) pair.
pcd_masks = [(pcd, data['ProductCD'].isin([pcd])) for pcd in cols_pcd]

# Each new column keeps the source value on rows of that ProductCD and is NaN
# everywhere else. `where` keeps the source column's dtype, whereas pre-filling
# a column with np.nan would create a float64 column.
pcd_features = {}
for col in tqdm(cols_feature):
    for pcd, mask in pcd_masks:
        pcd_features[f'{col}__ProductCD-{pcd}'] = data[col].where(mask)

# Attach all new columns with one concat; inserting them one at a time
# fragments the frame.
if pcd_features:
    data = pd.concat(
        [
            # Drop columns left over from a previous run so a re-run replaces
            # them instead of duplicating them.
            data.drop(columns=list(pcd_features), errors='ignore'),
            pd.DataFrame(pcd_features),
        ],
        axis=1,
    )

 64%|██████▎   | 116/182 [06:58<07:19,  6.65s/it]

In [None]:
#========================================================================
# Save the diff / ratio features for the train and test splits
#========================================================================
prefix = '507'
dir_save = 'valid'

# Every derived column whose name contains '__diff' or '__ratio'.
cols_save = [col for col in data.columns if '__diff' in col or '__ratio' in col]

# `data` was built by concatenating the train rows followed by the test rows,
# so the first len(base_train) rows are taken as the train split.
# NOTE(review): assumes the loaded train features have exactly as many rows as
# base_train - confirm.
train = data.iloc[:len(base_train)]
test = data.iloc[len(base_train):]

save_feature(train[cols_save], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
save_feature(test[cols_save],  prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

In [23]:
# Display the (n_rows, n_columns) shape of the train split as a quick sanity check.
train.shape

(590540, 600)