In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
# Key column names of the dataset and the columns excluded from feature selection.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one of these substrings.
_PATH_KEYWORDS = (
    COLUMN_DT,
    COLUMN_ID,
    COLUMN_TARGET,
    'time_zone',
    'hour',
    'Amt',
    'D',
    'Product',
)


def _select_feature_paths(pattern):
    """Glob `pattern` and keep only the paths that contain one of `_PATH_KEYWORDS`."""
    return [
        candidate
        for candidate in glob(pattern)
        if any(keyword in candidate for keyword in _PATH_KEYWORDS)
    ]


train_paths = _select_feature_paths('../feature/eda_base/*_train.gz')
test_paths = _select_feature_paths('../feature/eda_base/*_test.gz')

# Load both splits and stack them (train rows first) into one working frame.
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
if COLUMN_ID in data.columns:
    data = data.set_index(COLUMN_ID)

# Base tables, indexed by transaction id.
base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test], axis=0)

# Cast the numeric columns whose name contains 'C' or 'D' to float32.
cols_num = [
    name
    for name in get_numeric_features(data, COLUMNS_IGNORE)
    if 'C' in name or 'D' in name
]
data[cols_num] = data[cols_num].astype('float32')

In [4]:
# Convert the TransactionDT offset (seconds) into calendar timestamps and dates.
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# Origin shifted back by 14400 s (4 hours).
# NOTE(review): presumably a timezone adjustment — confirm.
shifted_origin = startdate - datetime.timedelta(seconds=14400)

# Vectorized conversion: one timedelta array instead of a Python call per row.
data['datetime'] = pd.to_timedelta(data['TransactionDT'], unit='s') + shifted_origin

# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, and fill with a Timestamp so the column stays datetime-typed
# (a datetime.date filler has no .date() method for the next step).
data['datetime'] = data['datetime'].fillna(pd.Timestamp('2020-01-01'))
data['date'] = data['datetime'].dt.date

In [3]:
#========================================================================
# Amt features per ProductCD
#========================================================================
# Fractional part of the amount, rounded to 2 decimals.
data['cents'] = np.round(data['TransactionAmt'] - np.floor(data['TransactionAmt']), 2)

# Remainder of the truncated integer amount for each modulus.
# The list order determines the order in which the columns are created.
AMT_MODULI = [200, 150, 100, 90, 80, 70, 60, 50, 40, 30, 25,
              20, 15, 10, 9, 8, 7, 6, 5, 4, 3, 2]
amt_int = data['TransactionAmt'].astype('int64')  # truncates toward zero, like int(x)
for modulus in AMT_MODULI:
    data[f'Amt_x{modulus}'] = amt_int % modulus

# Base columns only: skip the '<col>__ProductCD-<pcd>' columns this cell creates,
# so re-running the cell does not derive features from its own output.
PCD_MARKER = '__ProductCD-'
cols_Amt = [col for col in data.columns
            if ('Amt' in col or col == 'cents') and PCD_MARKER not in col]
cols_pcd = data['ProductCD'].unique()

# One boolean row mask per ProductCD value, computed once and reused for every column.
pcd_masks = [(pcd, data['ProductCD'].isin([pcd])) for pcd in cols_pcd]

for col in tqdm(cols_Amt):
    for pcd, is_pcd in pcd_masks:
        feature_name = f'{col}{PCD_MARKER}{pcd}'
        # Keep the value on rows of this ProductCD and use -1 elsewhere; `where`
        # keeps the source column's dtype instead of writing floats into an int column.
        data[feature_name] = data[col].where(is_pcd, -1)

100%|██████████| 24/24 [00:35<00:00,  1.92s/it]


In [7]:
# Row count of the training split; `data` holds the train rows first, then test.
train_length = len(df_train)

# Disabled exploratory check: for each Amt/cent column, show the 5 values with
# the highest mean isFraud over the training rows.
# cols_save = [col for col in data.columns if col.count('Amt') or col.count('cent')]
# for col in cols_save:
#     print(col)
#     display(data.iloc[:train_length].groupby(col)['isFraud'].mean().sort_values(ascending=False).head(5))

In [9]:
#========================================================================
# Hand the Amt / cent columns of each split to save_feature
#========================================================================

dir_save = 'valid_use'
cols_save = [name for name in data.columns if 'Amt' in name or 'cent' in name]

# Train rows come first in `data`, so a positional split separates the two sets.
train_length = len(df_train)
base_train, base_test = data.iloc[:train_length], data.iloc[train_length:]

# Same call for both splits; only the frame and the is_train flag differ.
for split_frame, split_is_train in ((base_train, True), (base_test, False)):
    save_feature(split_frame[cols_save], '517', dir_save, is_train=split_is_train,
                 auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | TransactionAmt
(590540,) | cents
(590540,) | Amt_x200
(590540,) | Amt_x150
(590540,) | Amt_x100
(590540,) | Amt_x90
(590540,) | Amt_x80
(590540,) | Amt_x70
(590540,) | Amt_x60
(590540,) | Amt_x50
(590540,) | Amt_x40
(590540,) | Amt_x30
(590540,) | Amt_x25
(590540,) | Amt_x20
(590540,) | Amt_x15
(590540,) | Amt_x10
(590540,) | Amt_x9
(590540,) | Amt_x8
(590540,) | Amt_x7
(590540,) | Amt_x6
(590540,) | Amt_x5
(590540,) | Amt_x4
(590540,) | Amt_x3
(590540,) | Amt_x2
(590540,) | TransactionAmt__ProductCD-W
(590540,) | TransactionAmt__ProductCD-H
(590540,) | TransactionAmt__ProductCD-C
(590540,) | TransactionAmt__ProductCD-S
(590540,) | TransactionAmt__ProductCD-R
(590540,) | cents__ProductCD-W
(590540,) | cents__ProductCD-H
(590540,) | cents__ProductCD-C
(590540,) | cents__ProductCD-S
(590540,) | cents__ProductCD-R
(590540,) | Amt_x200__ProductCD-W
(590540,) | Amt_x200__ProductCD-H
(590540,) | Amt_x200__ProductCD-C
(590540,) | Amt_x200__ProductCD-S
(590540,) | Amt_x200__Product