In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [None]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

train_paths = glob('../feature/eda_base/*_train.gz')
test_paths = glob('../feature/eda_base/*_test.gz')

train_paths = [path for path in train_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count('time_zone')
               or path.count('hour')
               or path.count('C')
#                or path.count('D')
               or path.count('Product')
              ]
test_paths = [path for path in test_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count('time_zone')
               or path.count('hour')
               or path.count('C')
#                or path.count('D')
               or path.count('Product')
              ]

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
if COLUMN_ID in data.columns:
    data.set_index(COLUMN_ID, inplace=True)

base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test], axis=0)

cols_num = get_numeric_features(data, COLUMNS_IGNORE)
cols_num = [col for col in cols_num if col.count('C') or col.count('D')]
data[cols_num] = data[cols_num].astype('float32')

df_user_id_ca = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr.csv').set_index(COLUMN_ID)
df_user_id_cap = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr_pemail.csv').set_index(COLUMN_ID)
df_user_id_capm = pd.read_csv('../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv').set_index(COLUMN_ID)
df_user_id_bear = pd.read_csv('../output/same_user_pattern/20190901_user_ids_share.csv').set_index(COLUMN_ID)

data['user_id_card_addr'] = df_user_id_ca['predicted_user_id']
data['user_id_card_addr_pemail'] = df_user_id_cap['predicted_user_id']
data['user_id_card_addr_pemail_M'] = df_user_id_capm['predicted_user_id']
data['user_id_bear'] = df_user_id_bear['predicted_user_id']

In [3]:
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

data['datetime'] = data['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x) - datetime.timedelta(seconds = 14400) ))
data['datetime'].fillna(datetime.date(2020, 1, 1), inplace=True)
data['date'] = data['datetime'].map(lambda x: x.date())

In [4]:
#========================================================================
# ProductCDあたりのC
#========================================================================
cols_C = [col for col in data.columns if col.startswith('C')]
cols_pcd = data['ProductCD'].unique()

for col in tqdm(cols_C):
    for pcd in cols_pcd:
        feature_name = f'{col}__ProductCD-{pcd}'
        data[feature_name] = -1
        data.loc[data['ProductCD'].isin([pcd]), feature_name] = data.loc[data['ProductCD'].isin([pcd]), col]

100%|██████████| 14/14 [00:17<00:00,  1.16s/it]


In [5]:
# sorted([col for col in data.columns if not col.count('C')])
cols_C = sorted([col for col in data.columns if col.count('C') and col not in COLUMNS_IGNORE
#                  and not col.count('Product')
                ])
# cols_D = ['D3', 'D5', 'D6', 'D7', 'D8', 'D9', 'D11', 'D12', 'D13', 'D14']
# cols_feature = cols_C + cols_D
# cols_feature = cols_feature[:3]
len(cols_C)

84

In [22]:
#========================================================================
# FE Aggregation User ID & TimeSeries Date
#========================================================================
prefix = '506'

def parallel_agg(df, base_key, feature):
    df_last = df.groupby(base_key)[feature].last()
    df_first = df.groupby(base_key)[feature].first()
    df[f'{base_key}__{feature}__diff_last-first'] = (df_last - df_first)
    df[f'{base_key}__{feature}__ratio_last-first'] = (df_last / df_first+10)
    
    return df[[f'{base_key}__{feature}__diff_last-first',
                 f'{base_key}__{feature}__ratio_last-first']]

# User別に期間を切って集計できる様にする
dir_save = 'valid'
list_base_key = [col for col in df.columns if col.count('user_id')]
data.sort_values(by='datetime', inplace=True)

for base_key in list_base_key:
    if base_key in data.columns:
        data.set_index(base_key, inplace=True)
        
    list_p = Parallel(60)([delayed(parallel_agg)(data[[feature]], base_key, feature) for feature in cols_C])
    df_agg = pd.concat(list_p, axis=1)
    train = df_agg.iloc[:len(df_train)]
    test  = df_agg.iloc[len(df_train):]

    cols_save = [col for col in train.columns if col.count('__diff') or col.count('__ratio')]
    save_feature(train[cols_save], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(test[cols_save],  prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)
    
    data.reset_index(inplace=True)

(590540,) | user_id_card_addr_pemail__C1__diff_last-first__C1__diff_last-first
(590540,) | user_id_card_addr_pemail__C1__diff_last-first__C1__ratio_last-first
(590540,) | user_id_card_addr_pemail__C1__diff_last-first__C10__diff_last-first
(590540,) | user_id_card_addr_pemail__C1__diff_last-first__C10__ratio_last-first
(590540,) | user_id_card_addr_pemail__C1__diff_last-first__C10__ProductCD-C__diff_last-first
(590540,) | user_id_card_addr_pemail__C1__diff_last-first__C10__ProductCD-C__ratio_last-first
(590540,) | user_id_card_addr_pemail__C1__diff_last-first__C10__ProductCD-H__diff_last-first
(590540,) | user_id_card_addr_pemail__C1__diff_last-first__C10__ProductCD-H__ratio_last-first
(590540,) | user_id_card_addr_pemail__C1__diff_last-first__C10__ProductCD-R__diff_last-first
(590540,) | user_id_card_addr_pemail__C1__diff_last-first__C10__ProductCD-R__ratio_last-first
(590540,) | user_id_card_addr_pemail__C1__diff_last-first__C10__ProductCD-S__diff_last-first
(590540,) | user_id_card_a

MemoryError: 

In [23]:
train.head()

Unnamed: 0_level_0,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C1__diff_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C1__ratio_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C10__diff_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C10__ratio_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C10__ProductCD-C__diff_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C10__ProductCD-C__ratio_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C10__ProductCD-H__diff_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C10__ProductCD-H__ratio_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C10__ProductCD-R__diff_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C10__ProductCD-R__ratio_last-first,...,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C9__ProductCD-C__diff_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C9__ProductCD-C__ratio_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C9__ProductCD-H__diff_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C9__ProductCD-H__ratio_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C9__ProductCD-R__diff_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C9__ProductCD-R__ratio_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C9__ProductCD-S__diff_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C9__ProductCD-S__ratio_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C9__ProductCD-W__diff_last-first,user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first__C9__ProductCD-W__ratio_last-first
user_id_card_addr_pemail__C10__ProductCD-H__ratio_last-first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.0,2.0,1.0,inf,2.0,0.0,0.0,2.0,0.0,2.0,...,1.0,1.0,0.0,2.0,0.0,2.0,0.0,2.0,-2.0,0.0
1.0,0.0,2.0,1.0,inf,2.0,0.0,0.0,2.0,0.0,2.0,...,1.0,1.0,0.0,2.0,0.0,2.0,0.0,2.0,-2.0,0.0
1.0,0.0,2.0,1.0,inf,2.0,0.0,0.0,2.0,0.0,2.0,...,1.0,1.0,0.0,2.0,0.0,2.0,0.0,2.0,-2.0,0.0
1.0,0.0,2.0,1.0,inf,2.0,0.0,0.0,2.0,0.0,2.0,...,1.0,1.0,0.0,2.0,0.0,2.0,0.0,2.0,-2.0,0.0
1.0,0.0,2.0,1.0,inf,2.0,0.0,0.0,2.0,0.0,2.0,...,1.0,1.0,0.0,2.0,0.0,2.0,0.0,2.0,-2.0,0.0
