In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# Glob patterns of the candidate feature files; `{}` is replaced by
# 'train' or 'test'. The order here is the order of the resulting path lists.
FEATURE_GLOB_PATTERNS = (
    '../feature/eda_base/*_{}.gz',
    '../feature/org_use/526*_{}.gz',
    '../feature/raw_use/ker__uid*_{}.gz',
)

# A path is kept when it contains at least one of these substrings (or both
# '526' and 'mean', see `is_selected_feature_path`).
# NOTE(review): 'V', 'C' and 'D' are matched as plain substrings of the whole
# path, so any file with one of those capital letters anywhere in its path is
# selected too — confirm this is intended.
# Currently disabled groups: 'uid_', 'uid2_t', 'uid3_t', 'uid4_t', 'uid5_t',
# 'card', 'addr', 'domain', 'Product'.
PATH_INCLUDE_TOKENS = (
    COLUMN_DT,
    COLUMN_ID,
    COLUMN_TARGET,
    'V',
    'C',
    'D',
    'TransactionAmt',
)

# A path containing any of these substrings is always dropped.
PATH_EXCLUDE_TOKENS = ('fill', 'bin', '129', '130')


def is_selected_feature_path(path):
    """Return True when the feature file at `path` should be loaded.

    The decision is made purely on substrings of `path`: it must contain one
    of PATH_INCLUDE_TOKENS, or both '526' and 'mean', and it must contain
    none of PATH_EXCLUDE_TOKENS.

    Args:
        path: File path (string) of a candidate feature file.

    Returns:
        bool: whether the path passes the include and exclude rules.
    """
    wanted = (
        any(token in path for token in PATH_INCLUDE_TOKENS)
        or ('526' in path and 'mean' in path)
    )
    unwanted = any(token in path for token in PATH_EXCLUDE_TOKENS)
    return wanted and not unwanted


def collect_feature_paths(suffix):
    """List the selected feature files for one split.

    Args:
        suffix: Split name inserted into FEATURE_GLOB_PATTERNS
            ('train' or 'test').

    Returns:
        list of str: matching paths, in pattern order then glob order,
        restricted to those accepted by `is_selected_feature_path`.
    """
    return [
        path
        for pattern in FEATURE_GLOB_PATTERNS
        for path in glob(pattern.format(suffix))
        if is_selected_feature_path(path)
    ]


# Train and test go through the same selection rule so they cannot drift apart.
train_paths = collect_feature_paths('train')
test_paths = collect_feature_paths('test')

# Load both splits from their feature files, remember how many rows belong to
# train, then stack train on top of test with a fresh 0..n-1 index.
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
train_length = df_train.shape[0]
data = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [3]:
START_DATE = '2017-12-01'
# Seconds subtracted from every timestamp (14400 s = 4 h).
# NOTE(review): presumably a timezone adjustment — confirm.
DT_SHIFT_SECONDS = 14400
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# TransactionDT is treated as an offset in seconds from START_DATE.
# Vectorised arithmetic replaces a per-row Python lambda and turns a missing
# offset into NaT instead of raising inside datetime.timedelta.
data['datetime'] = (
    pd.Timestamp(startdate)
    + pd.to_timedelta(data[COLUMN_DT], unit='s')
    - pd.Timedelta(seconds=DT_SHIFT_SECONDS)
)
# Pin missing timestamps to a sentinel. The result is assigned back (no
# chained inplace call on a column selection), and the fill value is a
# Timestamp so the column keeps a datetime dtype for `.dt.date` below.
data['datetime'] = data['datetime'].fillna(pd.Timestamp(2020, 1, 1))
data['date'] = data['datetime'].dt.date

# Derive a per-row 'Regist_date' string from the transaction date and D1.
# NOTE(review): date_add_days is a project helper; presumably it shifts the
# date by the given number of days — confirm.
list_regist = []
for day, d1 in tqdm(data[['date', 'D1']].values):
    # A NaN D1 compares False here, so rows without D1 (and values at or
    # above 999999) use a zero offset.
    offset_days = -1 * d1 if d1 < 999999 else 0
    list_regist.append(str(date_add_days(day, offset_days)))

data['Regist_date'] = list_regist

100%|██████████| 1097231/1097231 [00:04<00:00, 272867.36it/s]


In [5]:
col_bear = 'user_id_bear'
data.columns = [col.replace('ker__', '') for col in data.columns]

# Read the id mapping before touching `data`, so a missing or unreadable file
# does not leave `data` indexed by TransactionID.
df_user_id_bear = pd.read_csv('../output/same_user_pattern/20190901_user_ids_share.csv').set_index(COLUMN_ID)

# Index by transaction id so the assignment below aligns on TransactionID.
data.set_index(COLUMN_ID, inplace=True)
data[col_bear] = df_user_id_bear['predicted_user_id']

max_id = data[col_bear].max()

# Rows without a predicted user id each get a fresh id, numbered consecutively
# after the largest existing one. One boolean mask serves both the row
# selection and the count, instead of filtering the whole frame twice.
is_unmatched = data[col_bear].isnull()
data.loc[is_unmatched, col_bear] = np.arange(is_unmatched.sum()) + max_id + 1

data.reset_index(inplace=True)
data[col_bear] = data[col_bear].astype('int')
data[col_bear].head()

0    241400
1    241401
2     44469
3    237055
4    241402
Name: user_id_bear, dtype: int64

In [6]:
# Group the loaded columns by the first letter of their name.
# Only 'V' columns whose name also contains 'mean' are kept in cols_V.
cols_V = [name for name in data.columns if name.startswith('V') and 'mean' in name]
cols_C = [name for name in data.columns if name.startswith('C')]
cols_D = [name for name in data.columns if name.startswith('D')]

In [13]:
#========================================================================
# FE Aggregation User ID & TimeSeries Date
#========================================================================

# Goal: per-user aggregation that can be restricted to a time window.
# The code below aggregates over all rows of `data` for each user id.
dir_save = 'valid_use'
prefix = '703'


def parallel_agg(df, base_key, feature):
    """Aggregate one feature per group of `base_key`.

    Args:
        df: DataFrame holding at least the columns `base_key` and `feature`.
        base_key: Name of the column to group by.
        feature: Name of the numeric column to aggregate.

    Returns:
        DataFrame indexed by `base_key` with the columns
        '{base_key}_agg_{feature}_mean', '..._skew' and '..._max',
        in that order.
    """
    # 'std' and 'min' are deliberately not computed.
    stats = ['mean', 'skew', 'max']
    # A list of stats plus an explicit rename is used because passing a
    # renaming dict to SeriesGroupBy.agg is rejected by pandas >= 1.0.
    result = df.groupby(base_key)[feature].agg(stats)
    result.columns = [f'{base_key}_agg_{feature}_{stat}' for stat in stats]
    return result


cols_feature = ['TransactionAmt'] + cols_V + cols_C + cols_D

# The aggregations are computed on float columns.
for col in cols_feature:
    data[col] = data[col].astype('float')

base_key = col_bear

# `data` is train stacked on top of test, so the first len(df_train) rows
# are the train split.
base_train = data[[base_key]].iloc[:len(df_train)]
base_test = data[[base_key]].iloc[len(df_train):]

# One task per feature, but never more workers than CPUs (and at least one).
n_jobs = max(1, min(len(cols_feature), os.cpu_count() or 1))
list_p = Parallel(n_jobs=n_jobs)(
    delayed(parallel_agg)(data[[base_key, feature]], base_key, feature)
    for feature in cols_feature
)

# Every partial result is indexed by the user id; join them side by side and
# turn the id back into a column for the merges below.
df_agg = pd.concat(list_p, axis=1).reset_index()

# Left merges keep one output row per input row, in the original row order.
base_train_agg = base_train.merge(df_agg, how='left', on=base_key)
base_test_agg = base_test.merge(df_agg, how='left', on=base_key)

# Save only the aggregate columns, not the grouping key itself.
cols_save = [col for col in base_train_agg.columns if 'bear' in col and 'agg' in col]

save_feature(base_train_agg[cols_save], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
save_feature(base_test_agg[cols_save],  prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | user_id_bear_agg_TransactionAmt_mean
(590540,) | user_id_bear_agg_TransactionAmt_skew
(590540,) | user_id_bear_agg_TransactionAmt_max
(590540,) | user_id_bear_agg_C1_mean
(590540,) | user_id_bear_agg_C1_skew
(590540,) | user_id_bear_agg_C1_max
(590540,) | user_id_bear_agg_C10_mean
(590540,) | user_id_bear_agg_C10_skew
(590540,) | user_id_bear_agg_C10_max
(590540,) | user_id_bear_agg_C11_mean
(590540,) | user_id_bear_agg_C11_skew
(590540,) | user_id_bear_agg_C11_max
(590540,) | user_id_bear_agg_C12_mean
(590540,) | user_id_bear_agg_C12_skew
(590540,) | user_id_bear_agg_C12_max
(590540,) | user_id_bear_agg_C13_mean
(590540,) | user_id_bear_agg_C13_skew
(590540,) | user_id_bear_agg_C13_max
(590540,) | user_id_bear_agg_C14_mean
(590540,) | user_id_bear_agg_C14_skew
(590540,) | user_id_bear_agg_C14_max
(590540,) | user_id_bear_agg_C2_mean
(590540,) | user_id_bear_agg_C2_skew
(590540,) | user_id_bear_agg_C2_max
(590540,) | user_id_bear_agg_C3_mean
(590540,) | user_id_bear_agg_C3_

[]