In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

train_paths = glob('../feature/eda_base/*_train.gz')
test_paths = glob('../feature/eda_base/*_test.gz')

train_paths = [path for path in train_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count('time_zone')
               or path.count('hour')
               or path.count('Product')
              ]
test_paths = [path for path in test_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count('time_zone')
               or path.count('hour')
               or path.count('Product')
              ]

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
if COLUMN_ID in data.columns:
    data.set_index(COLUMN_ID, inplace=True)

base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test], axis=0)

df_user_id_ca = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr.csv').set_index(COLUMN_ID)
df_user_id_cap = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr_pemail.csv').set_index(COLUMN_ID)
df_user_id_capm = pd.read_csv('../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv').set_index(COLUMN_ID)
df_user_id_bear = pd.read_csv('../output/same_user_pattern/20190901_user_ids_share.csv').set_index(COLUMN_ID)

data['user_id_card_addr'] = df_user_id_ca['predicted_user_id']
data['user_id_card_addr_pemail'] = df_user_id_cap['predicted_user_id']
data['user_id_card_addr_pemail_M'] = df_user_id_capm['predicted_user_id']
data['user_id_bear'] = df_user_id_bear['predicted_user_id']

In [3]:
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

data['datetime'] = data['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x) - datetime.timedelta(seconds = 14400) ))
data['datetime'].fillna(datetime.date(2020, 1, 1), inplace=True)
data['date'] = data['datetime'].map(lambda x: x.date())

In [5]:
#========================================================================
# FE Aggregation User ID & TimeSeries Date
#========================================================================

def parallel_agg(df, base_key, n_day, feature, name_feature):
    list_term_df = []
    for end_date in tqdm(list_end_date):
        start_date = date_add_days(end_date, n_day*-1)
        tmp_user = df[df['date']==end_date][[base_key]].drop_duplicates()
        
        if n_day<0:
            tmp = df[(end_date <= df.date) & (df.date <= start_date)]
        else:
            tmp = df[(start_date <= df.date) & (df.date <= end_date)]

        result = tmp.groupby(base_key, as_index=False)[feature].agg({
            f'{base_key}-{name_feature}_day{n_day}_{feature}_count': 'count',
        })

        tmp_user = tmp_user.merge(result, on=base_key, how='inner')
        tmp_user['date'] = end_date
        list_term_df.append(tmp_user)
    df_agg = pd.concat(list_term_df, axis=0)
    df_agg.set_index([base_key, 'date'], inplace=True)
    return df_agg
    

# User別に期間を切って集計できる様にする
dir_save = 'valid'
feature = 'date'
base_key = 'ProductCD'

list_base_key = [col for col in data.columns if col.count('user_id')]
list_end_date = sorted(data['date'].unique())
list_base_date = list_end_date

# numpy.intだとエラーでる、、
# tmp_n_day = np.array([1, 3, 5, 7, 10, 14, 21, 28, 31, 62, 93, 124, 180])
# list_n_day = np.append(tmp_n_day, np.array([1, 3, 5, 7, 10, 14, 21, 28, 31, 62, 93, 124, 180])*-1).tolist()
list_n_day = (np.array([1, 3, 5, 7, 10, 14, 21, 28, 31, 62, 93, 124])*-1).tolist()
list_product = data['ProductCD'].unique()

base_train = data[[base_key, 'date']].iloc[:len(df_train)]
base_test  = data[[base_key, 'date']].iloc[len(df_train):]

for product in list_product.tolist() + ['all']:
    name_feature = product
    
    if product != 'all':
        df = data[data['ProductCD']==product]
    else:
        df = data.copy()

    list_p = Parallel(len(list_n_day))([delayed(parallel_agg)(df[[base_key, 'date']], base_key, n_day, feature, name_feature) for n_day in list_n_day])

    df_agg = pd.concat(list_p, axis=1)
    df_agg.reset_index(inplace=True)
    df_agg['date'] = df_agg['date'].map(lambda x: x.date())

    base_train_agg = base_train.merge(df_agg, how='left', on=[base_key, 'date'])
    base_test_agg = base_test.merge(df_agg, how='left', on=[base_key, 'date'])

    cols_save = [col for col in base_train_agg.columns if col.count(f'{name_feature}_day')]

    save_feature(base_train_agg[cols_save], '601', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(base_test_agg[cols_save],  '601', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | ProductCD-W_day-1_date_count
(590540,) | ProductCD-W_day-3_date_count
(590540,) | ProductCD-W_day-5_date_count
(590540,) | ProductCD-W_day-7_date_count
(590540,) | ProductCD-W_day-10_date_count
(590540,) | ProductCD-W_day-14_date_count
(590540,) | ProductCD-W_day-21_date_count
(590540,) | ProductCD-W_day-28_date_count
(590540,) | ProductCD-W_day-31_date_count
(590540,) | ProductCD-W_day-62_date_count
(590540,) | ProductCD-W_day-93_date_count
(590540,) | ProductCD-W_day-124_date_count
(506691,) | ProductCD-W_day-1_date_count
(506691,) | ProductCD-W_day-3_date_count
(506691,) | ProductCD-W_day-5_date_count
(506691,) | ProductCD-W_day-7_date_count
(506691,) | ProductCD-W_day-10_date_count
(506691,) | ProductCD-W_day-14_date_count
(506691,) | ProductCD-W_day-21_date_count
(506691,) | ProductCD-W_day-28_date_count
(506691,) | ProductCD-W_day-31_date_count
(506691,) | ProductCD-W_day-62_date_count
(506691,) | ProductCD-W_day-93_date_count
(506691,) | ProductCD-W_day-124_date_coun

In [60]:
#========================================================================
# 期間を絞らず当日のみのカウント
#========================================================================
base_key = 'ProductCD'
feature = 'date'
base_train = data[[base_key, 'date']].iloc[:len(df_train)]
base_test  = data[[base_key, 'date']].iloc[len(df_train):]

for product in list_product.tolist() + ['all']:
    if product != 'all':
        df = data[data['ProductCD']==product]
        cols_key = [base_key, 'date']
    else:
        df = data.copy()
        cols_key = ['date']
        
    df_agg = df.groupby(cols_key, as_index=False)['date'].agg({
        f'{base_key}-{product}_today_Transaction_count': 'count',
    })
    
    df_agg.reset_index(inplace=True)

    base_train_agg = base_train.merge(df_agg, how='left', on=cols_key)
    base_test_agg = base_test.merge(df_agg, how='left', on=cols_key)

    cols_save = [col for col in base_train_agg.columns if col.count(f'today')]

    save_feature(base_train_agg[cols_save], '601', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(base_test_agg[cols_save],  '601', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | ProductCD-W_today_Transaction_count
(506691,) | ProductCD-W_today_Transaction_count
(590540,) | ProductCD-H_today_Transaction_count
(506691,) | ProductCD-H_today_Transaction_count
(590540,) | ProductCD-C_today_Transaction_count
(506691,) | ProductCD-C_today_Transaction_count
(590540,) | ProductCD-S_today_Transaction_count
(506691,) | ProductCD-S_today_Transaction_count
(590540,) | ProductCD-R_today_Transaction_count
(506691,) | ProductCD-R_today_Transaction_count
(590540,) | ProductCD-all_today_Transaction_count
(506691,) | ProductCD-all_today_Transaction_count


In [61]:
base_train_agg.head()

Unnamed: 0,ProductCD,date,index,ProductCD-all_today_Transaction_count
0,W,2017-12-01,0,743
1,W,2017-12-01,0,743
2,W,2017-12-01,0,743
3,W,2017-12-01,0,743
4,H,2017-12-01,0,743
