In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
# Names of the key / time / target columns used throughout the notebook.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
# Passed as `list_ignore` to save_feature() when the new features are written.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is kept when its path contains at least one of these tokens ...
PATH_TOKENS_REQUIRED = (COLUMN_DT, COLUMN_ID, COLUMN_TARGET)
# ... and none of these.
PATH_TOKENS_EXCLUDED = ('fill', 'bin', '129', '130')
# Tokens that were tried as additional "required" tokens in earlier runs and
# are currently switched off:
#   'V', 'C', 'D', 'uid_', 'uid2_t', 'uid3_t', 'uid4_t', 'uid5_t',
#   'card', 'addr', 'domain', 'TransactionAmt', 'Product',
#   and the combination ('526' together with 'mean').


def select_feature_paths(paths, required=PATH_TOKENS_REQUIRED, excluded=PATH_TOKENS_EXCLUDED):
    """Return the feature-file paths that should be loaded.

    A path is selected when it contains at least one `required` token and
    none of the `excluded` tokens. Matching is a plain substring test on the
    whole path (directory part included), and input order is preserved.

    Parameters
    ----------
    paths : iterable of str
        Candidate file paths.
    required : iterable of str
        Tokens of which at least one must occur in the path.
    excluded : iterable of str
        Tokens that must not occur in the path.

    Returns
    -------
    list of str
    """
    return [
        path for path in paths
        if any(token in path for token in required)
        and not any(token in path for token in excluded)
    ]


# Candidate files come from three feature directories; train and test files
# are told apart by their file-name suffix.
train_paths = glob('../feature/eda_base/*_train.gz')
test_paths = glob('../feature/eda_base/*_test.gz')
train_paths += glob('../feature/org_use/526*_train.gz')
test_paths  += glob('../feature/org_use/526*_test.gz')
train_paths += glob('../feature/raw_use/ker__uid*_train.gz')
test_paths  += glob('../feature/raw_use/ker__uid*_test.gz')

# Same rule for both splits, so train and test cannot drift apart.
train_paths = select_feature_paths(train_paths)
test_paths = select_feature_paths(test_paths)

# Load the selected feature files for each split (train first, then test).
df_train, df_test = (parallel_load_data(paths) for paths in (train_paths, test_paths))

# Train rows are stacked on top of test rows under a fresh 0..n-1 index, so
# the first `train_length` rows of `data` are the train rows.
train_length = len(df_train)
data = pd.concat((df_train, df_test), axis=0, ignore_index=True)

Process ForkPoolWorker-55:
Process ForkPoolWorker-66:
Process ForkPoolWorker-15:


In [4]:
# Reference date that TransactionDT (elapsed seconds) is counted from.
START_DATE = '2017-12-01'
# Every timestamp is shifted back by this many seconds (4 hours).
DT_OFFSET_SECONDS = 14400
# Placeholder used for rows whose TransactionDT is missing. A Timestamp keeps
# the column datetime64, so the `.dt` accessor below works.
MISSING_DATETIME = pd.Timestamp('2020-01-01')
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# Vectorised: seconds -> timedelta -> absolute timestamp. A missing
# TransactionDT becomes NaT here (instead of raising inside a row-wise
# lambda) and is then replaced by the placeholder.
elapsed = pd.to_timedelta(data[COLUMN_DT] - DT_OFFSET_SECONDS, unit='s')
data['datetime'] = (pd.Timestamp(startdate) + elapsed).fillna(MISSING_DATETIME)
# Calendar date (datetime.date objects) of each transaction.
data['date'] = data['datetime'].dt.date

In [5]:
col_bear = 'user_id_bear'
# Strip the 'ker__' prefix from the loaded feature names.
data.columns = [col.replace('ker__', '') for col in data.columns]

# Index by TransactionID so the predicted user ids from the CSV are attached
# to the matching transactions (assignment aligns on the index).
data.set_index(COLUMN_ID, inplace=True)
df_user_id_bear = pd.read_csv('../output/same_user_pattern/20190901_user_ids_share.csv').set_index(COLUMN_ID)
data[col_bear] = df_user_id_bear['predicted_user_id']

max_id = data[col_bear].max()

# Transactions without a predicted user each get their own new id, numbered
# consecutively after the largest existing id. If nothing matched at all,
# max_id is NaN, so start from 0 rather than propagate NaN into the int cast.
is_unassigned = data[col_bear].isnull().to_numpy()
first_new_id = 0 if pd.isnull(max_id) else max_id + 1
# A positional boolean mask touches exactly the missing rows, even if index
# labels were to repeat.
data.loc[is_unassigned, col_bear] = np.arange(is_unassigned.sum()) + first_new_id
data.reset_index(inplace=True)
data[col_bear] = data[col_bear].astype('int')
data[col_bear].head()

0    241400
1    241401
2     44469
3    237055
4    241402
Name: user_id_bear, dtype: int64

In [23]:
# Order rows chronologically so that per-user diffs are "time since the
# previous transaction(s)". The sort must be stable: the default quicksort may
# permute rows that share a TransactionDT, and later cells split and save rows
# purely by position.
data.sort_values(by=COLUMN_DT, kind='mergesort', inplace=True)

# Seconds elapsed since the same user's 1st / 2nd / 3rd previous transaction.
# Rows without enough history within their user group are NaN.
dt_by_user = data.groupby(col_bear)[COLUMN_DT]
df_dt_diff = dt_by_user.diff(1)
df_dt_diff_2 = dt_by_user.diff(2)
df_dt_diff_3 = dt_by_user.diff(3)

In [25]:
# Raw arrays of the lag-1/2/3 time differences. They are written positionally,
# so `data` must still be in the row order the diffs were computed in.
lag_1, lag_2, lag_3 = (diff.values for diff in (df_dt_diff, df_dt_diff_2, df_dt_diff_3))

# Plain time gaps.
data[f'{col_bear}_dt_diff_1'] = lag_1
data[f'{col_bear}_dt_diff_2'] = lag_2
data[f'{col_bear}_dt_diff_3'] = lag_3

# Pairwise ratios between the gaps.
data[f'{col_bear}_dt_diff_1_2_ratio'] = lag_1 / lag_2
data[f'{col_bear}_dt_diff_2_3_ratio'] = lag_2 / lag_3
data[f'{col_bear}_dt_diff_1_3_ratio'] = lag_1 / lag_3

# Ratio of the two consecutive ratios.
ratio_1_2 = data[f'{col_bear}_dt_diff_1_2_ratio']
ratio_2_3 = data[f'{col_bear}_dt_diff_2_3_ratio']
data[f'{col_bear}_dt_diff_1_2_ratio_div_2_3_ratio'] = ratio_1_2 / ratio_2_3

In [26]:
# Every column whose name contains 'dt_diff', in column order.
cols_feature = [name for name in data.columns if 'dt_diff' in name]

In [31]:
# 'avg_x' / 'avg_y' only appear as leftovers of repeated merges in earlier
# experiments; on a fresh run they do not exist, so a missing column must not
# raise.
data = data.drop(columns=['avg_x', 'avg_y'], errors='ignore')

In [35]:
#========================================================================
# FE Aggregation User ID & TimeSeries Date
#========================================================================

# Output directory name and file prefix handed to save_feature(). They were
# previously defined only inside the commented-out block below, which left
# them undefined on a fresh kernel.
dir_save = 'valid_use'
prefix = '704'

# Disabled experiment, kept for reference: ratio of each dt_diff feature to
# its per-user mean.
# (Original note: make it possible to aggregate per user over a sliced period.)
# NOTE(review): the stale cell output still lists '__ratio_from_avg' columns;
# they are only produced if this loop is re-enabled.
#
# base_key = col_bear
#
# for i, col in tqdm(enumerate(cols_feature)):
# #     data[col] = data[col].astype('float')
#     avg = data.groupby(base_key)[col].mean().to_frame(f'avg_{i}')
#     data = data.merge(avg, how='left', on=base_key)
#     data[f"{col}__ratio_from_avg"] = data[col] / data[f'avg_{i}']+1

# `data` was re-sorted by time in an earlier cell, but the split below is
# positional. Restoring index order first puts the rows back in load order
# (train rows first), so each saved column lines up row-for-row.
# NOTE(review): assumes the index still reflects the row order produced by the
# train/test concat — confirm if cells that rebuild the index are added.
data_in_load_order = data.sort_index()
base_train = data_in_load_order.iloc[:len(df_train)]
base_test  = data_in_load_order.iloc[len(df_train):]

# Only the time-difference features are written out.
cols_save = [col for col in base_train.columns if col.count('dt_diff')]

save_feature(base_train[cols_save], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
save_feature(base_test[cols_save],  prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | user_id_bear_dt_diff_1
(590540,) | user_id_bear_dt_diff_2
(590540,) | user_id_bear_dt_diff_3
(590540,) | user_id_bear_dt_diff_1_2_ratio
(590540,) | user_id_bear_dt_diff_2_3_ratio
(590540,) | user_id_bear_dt_diff_1_3_ratio
(590540,) | user_id_bear_dt_diff_1_2_ratio_div_2_3_ratio
(590540,) | user_id_bear_dt_diff_1__ratio_from_avg
(590540,) | user_id_bear_dt_diff_2__ratio_from_avg
(590540,) | user_id_bear_dt_diff_3__ratio_from_avg
(590540,) | user_id_bear_dt_diff_1_2_ratio__ratio_from_avg
(590540,) | user_id_bear_dt_diff_2_3_ratio__ratio_from_avg
(590540,) | user_id_bear_dt_diff_1_3_ratio__ratio_from_avg
(590540,) | user_id_bear_dt_diff_1_2_ratio_div_2_3_ratio__ratio_from_avg
(506691,) | user_id_bear_dt_diff_1
(506691,) | user_id_bear_dt_diff_2
(506691,) | user_id_bear_dt_diff_3
(506691,) | user_id_bear_dt_diff_1_2_ratio
(506691,) | user_id_bear_dt_diff_2_3_ratio
(506691,) | user_id_bear_dt_diff_1_3_ratio
(506691,) | user_id_bear_dt_diff_1_2_ratio_div_2_3_ratio
(506691,) | use

[]