In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
# Column names for the row identifier, the transaction time offset and the label.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is selected when any of these substrings occurs in its path.
# Candidates that were switched off in the original filter and remain off:
# 'V', 'C', ('526' together with 'mean'), 'uid_', 'uid5_t', 'card', 'addr'.
FEATURE_KEYWORDS = [
    COLUMN_DT,
    COLUMN_ID,
    COLUMN_TARGET,
    'fill__M',
    'D1_',
    'uid2_t',
    'uid3_t',
    'uid4_t',
    'domain',
    'Product',
    'hour',
    'zone',
    '501',
]
# A selected file is dropped again when any of these substrings occurs in its path.
# ('fill' was a further exclusion candidate that is switched off.)
FEATURE_EXCLUDES = ['bin', '129', '130']


def is_selected_feature(path):
    """Return True when the feature file at `path` should be loaded.

    The same rule is applied to the train and the test file lists so both
    splits end up with the same feature set.  Previously the two filters
    differed: in the train filter the '501' clause sat outside the
    parenthesised group and therefore bypassed the 'bin'/'129'/'130'
    exclusions, while in the test filter it did not.  Here '501' is an
    ordinary keyword and the exclusions always apply.

    The 'TransactionAmt' prefix is tested against the file name rather than
    the whole path; the glob patterns below always yield paths beginning
    with '../feature/', so a prefix test on the full path could never match.
    """
    file_name = os.path.basename(path)
    wanted = (any(keyword in path for keyword in FEATURE_KEYWORDS)
              or file_name.startswith('TransactionAmt'))
    unwanted = any(keyword in path for keyword in FEATURE_EXCLUDES)
    return wanted and not unwanted


train_paths = glob('../feature/eda_base/*_train.gz')
test_paths = glob('../feature/eda_base/*_test.gz')
# Optional extra source, currently disabled:
# train_paths += glob('../feature/org_use/526*_train.gz')
# test_paths  += glob('../feature/org_use/526*_test.gz')
train_paths += glob('../feature/raw_use/ker__uid*_train.gz')
test_paths  += glob('../feature/raw_use/ker__uid*_test.gz')

train_paths = [path for path in train_paths if is_selected_feature(path)]
test_paths = [path for path in test_paths if is_selected_feature(path)]

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
# Train rows come first; later cells split the combined frame back by position.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
train_length = df_train.shape[0]

In [3]:
# Reference date that TransactionDT (an offset in seconds) is counted from.
START_DATE = '2017-12-01'
# Constant shift removed from every timestamp: 14400 s = 4 h.
# NOTE(review): presumably a time-zone adjustment - confirm.
TIME_SHIFT_SECONDS = 14400
# D1 values at or above this bound are not used as a day offset.
# NOTE(review): presumably a fill value for missing D1 - confirm.
D1_UPPER_BOUND = 999999
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# Vectorised form of startdate + timedelta(seconds=x) - timedelta(seconds=14400).
# A missing TransactionDT becomes NaT here, whereas the row-wise
# datetime.timedelta(seconds=nan) call raises before any fill can happen.
data['datetime'] = pd.Timestamp(startdate) + pd.to_timedelta(
    data['TransactionDT'] - TIME_SHIFT_SECONDS, unit='s')
# Assign the filled column back: an in-place fillna on a column selection is
# chained assignment and is not guaranteed to write through to `data`.
# The placeholder is a Timestamp so the column keeps its datetime dtype.
data['datetime'] = data['datetime'].fillna(pd.Timestamp(2020, 1, 1))
data['date'] = data['datetime'].dt.date

list_regist = []
for d, diff in tqdm(data[['date', 'D1']].values):
    # A NaN D1 compares False against the bound and therefore also gets offset 0.
    day_offset = -1 * diff if diff < D1_UPPER_BOUND else 0
    list_regist.append(str(date_add_days(d, day_offset)))

data['Regist_date'] = list_regist

100%|██████████| 1097231/1097231 [00:04<00:00, 243685.55it/s]


In [4]:
# Column families, picked by a case-sensitive substring test on the column name.
cols_uid = [name for name in data.columns if 'uid' in name]
cols_M = [name for name in data.columns if 'M' in name]
cols_domain = [name for name in data.columns if 'domain' in name]
cols_501 = [name for name in data.columns if '501' in name]
# Fixed lists that do not depend on which columns were loaded.
cols_time = ['hour', 'time_zone']
# The variable name `cos_pd` is kept as-is so existing references keep working.
cos_pd = ['ProductCD']

In [5]:
# For every uid column, compute per (uid, Regist_date) group the share of rows
# that fall into each ProductCD value and save one feature column per value.
list_base_key = cols_uid
col_cat = 'ProductCD'
dir_save = 'valid_use'

for base_key in list_base_key:

    # Build a new key list each time; extending a list entry of list_base_key
    # in place would append 'Regist_date' again on every re-run of the cell.
    base_key = (list(base_key) if isinstance(base_key, list) else [base_key]) + ['Regist_date']
    name_base = '-'.join(base_key)

    base = data[base_key]

    # Non-null col_cat rows per group.  reset_index(name=...) is used instead of
    # agg({'new_name': 'count'}): dict renaming on a SeriesGroupBy raises
    # SpecificationError since pandas 1.0.
    all_cnt = data.groupby(base_key)[col_cat].count().reset_index(name='count')
    # Rows per group and category value.
    part_cnt = data.groupby(base_key + [col_cat])[col_cat].count().reset_index(name='count_cat')

    tmp = part_cnt.merge(all_cnt, how='inner', on=base_key)
    tmp['cat_ratio'] = tmp['count_cat'] / tmp['count']

    # One row per group, one column per category value; a value that never
    # occurs in a group stays NaN.
    result = tmp.pivot_table(index=base_key, columns=col_cat, values='cat_ratio')
    cols_save = [f"cnt_ratio-{col_cat}-{cat}-{name_base}".replace('.', '_') for cat in result.columns]
    result.columns = cols_save
    df_feat = base.merge(result.reset_index(), how='left', on=base_key)
    # The positional train/test split below relies on the merge keeping every row.
    assert len(df_feat) == len(data), 'merge changed the number of rows'

    base_train = df_feat.iloc[:df_train.shape[0]]
    base_test  = df_feat.iloc[df_train.shape[0]:]

    save_feature(base_train[cols_save], '533', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(base_test[cols_save],  '533', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | cnt_ratio-ProductCD-C-ker__uid2-Regist_date
(590540,) | cnt_ratio-ProductCD-H-ker__uid2-Regist_date
(590540,) | cnt_ratio-ProductCD-R-ker__uid2-Regist_date
(590540,) | cnt_ratio-ProductCD-S-ker__uid2-Regist_date
(590540,) | cnt_ratio-ProductCD-W-ker__uid2-Regist_date
(506691,) | cnt_ratio-ProductCD-C-ker__uid2-Regist_date
(506691,) | cnt_ratio-ProductCD-H-ker__uid2-Regist_date
(506691,) | cnt_ratio-ProductCD-R-ker__uid2-Regist_date
(506691,) | cnt_ratio-ProductCD-S-ker__uid2-Regist_date
(506691,) | cnt_ratio-ProductCD-W-ker__uid2-Regist_date
(590540,) | cnt_ratio-ProductCD-C-ker__uid3-Regist_date
(590540,) | cnt_ratio-ProductCD-H-ker__uid3-Regist_date
(590540,) | cnt_ratio-ProductCD-R-ker__uid3-Regist_date
(590540,) | cnt_ratio-ProductCD-S-ker__uid3-Regist_date
(590540,) | cnt_ratio-ProductCD-W-ker__uid3-Regist_date
(506691,) | cnt_ratio-ProductCD-C-ker__uid3-Regist_date
(506691,) | cnt_ratio-ProductCD-H-ker__uid3-Regist_date
(506691,) | cnt_ratio-ProductCD-R-ker__uid3-Regi

In [6]:
# For every uid column and every column in cols_M, compute per
# (uid, Regist_date) group the share of rows that fall into each value of that
# column and save one feature column per value.
list_base_key = cols_uid
cat_list = cols_M
dir_save = 'valid_use'

for base_key in list_base_key:

    # Build a new key list each time; extending a list entry of list_base_key
    # in place would append 'Regist_date' again on every re-run of the cell.
    base_key = (list(base_key) if isinstance(base_key, list) else [base_key]) + ['Regist_date']
    name_base = '-'.join(base_key)

    base = data[base_key]

    for col_cat in cat_list:

        # Non-null col_cat rows per group.  reset_index(name=...) is used instead
        # of agg({'new_name': 'count'}): dict renaming on a SeriesGroupBy raises
        # SpecificationError since pandas 1.0.
        all_cnt = data.groupby(base_key)[col_cat].count().reset_index(name='count')
        # Rows per group and category value.
        part_cnt = data.groupby(base_key + [col_cat])[col_cat].count().reset_index(name='count_cat')

        tmp = part_cnt.merge(all_cnt, how='inner', on=base_key)
        tmp['cat_ratio'] = tmp['count_cat'] / tmp['count']

        # One row per group, one column per category value; a value that never
        # occurs in a group stays NaN.
        result = tmp.pivot_table(index=base_key, columns=col_cat, values='cat_ratio')
        cols_save = [f"cnt_ratio-{col_cat}-{cat}-{name_base}".replace('.', '_') for cat in result.columns]
        result.columns = cols_save
        df_feat = base.merge(result.reset_index(), how='left', on=base_key)
        # The positional train/test split below relies on the merge keeping every row.
        assert len(df_feat) == len(data), 'merge changed the number of rows'

        base_train = df_feat.iloc[:df_train.shape[0]]
        base_test  = df_feat.iloc[df_train.shape[0]:]

        save_feature(base_train[cols_save], '533', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
        save_feature(base_test[cols_save],  '533', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | cnt_ratio-fill__M1-#-ker__uid2-Regist_date
(590540,) | cnt_ratio-fill__M1-F-ker__uid2-Regist_date
(590540,) | cnt_ratio-fill__M1-T-ker__uid2-Regist_date
(506691,) | cnt_ratio-fill__M1-#-ker__uid2-Regist_date
(506691,) | cnt_ratio-fill__M1-F-ker__uid2-Regist_date
(506691,) | cnt_ratio-fill__M1-T-ker__uid2-Regist_date
(590540,) | cnt_ratio-fill__M2-#-ker__uid2-Regist_date
(590540,) | cnt_ratio-fill__M2-F-ker__uid2-Regist_date
(590540,) | cnt_ratio-fill__M2-T-ker__uid2-Regist_date
(506691,) | cnt_ratio-fill__M2-#-ker__uid2-Regist_date
(506691,) | cnt_ratio-fill__M2-F-ker__uid2-Regist_date
(506691,) | cnt_ratio-fill__M2-T-ker__uid2-Regist_date
(590540,) | cnt_ratio-fill__M3-#-ker__uid2-Regist_date
(590540,) | cnt_ratio-fill__M3-F-ker__uid2-Regist_date
(590540,) | cnt_ratio-fill__M3-T-ker__uid2-Regist_date
(506691,) | cnt_ratio-fill__M3-#-ker__uid2-Regist_date
(506691,) | cnt_ratio-fill__M3-F-ker__uid2-Regist_date
(506691,) | cnt_ratio-fill__M3-T-ker__uid2-Regist_date
(590540,) 

In [None]:
# For every uid column and every column in cols_domain, compute per
# (uid, Regist_date) group the share of rows that fall into each value of that
# column and save one feature column per value.
list_base_key = cols_uid
cat_list = cols_domain
dir_save = 'valid_use'

for base_key in list_base_key:

    # Build a new key list each time; extending a list entry of list_base_key
    # in place would append 'Regist_date' again on every re-run of the cell.
    base_key = (list(base_key) if isinstance(base_key, list) else [base_key]) + ['Regist_date']
    name_base = '-'.join(base_key)

    base = data[base_key]

    for col_cat in cat_list:

        # Non-null col_cat rows per group.  reset_index(name=...) is used instead
        # of agg({'new_name': 'count'}): dict renaming on a SeriesGroupBy raises
        # SpecificationError since pandas 1.0.
        all_cnt = data.groupby(base_key)[col_cat].count().reset_index(name='count')
        # Rows per group and category value.
        part_cnt = data.groupby(base_key + [col_cat])[col_cat].count().reset_index(name='count_cat')

        tmp = part_cnt.merge(all_cnt, how='inner', on=base_key)
        tmp['cat_ratio'] = tmp['count_cat'] / tmp['count']

        # One row per group, one column per category value; a value that never
        # occurs in a group stays NaN.
        result = tmp.pivot_table(index=base_key, columns=col_cat, values='cat_ratio')
        cols_save = [f"cnt_ratio-{col_cat}-{cat}-{name_base}".replace('.', '_') for cat in result.columns]
        result.columns = cols_save
        df_feat = base.merge(result.reset_index(), how='left', on=base_key)
        # The positional train/test split below relies on the merge keeping every row.
        assert len(df_feat) == len(data), 'merge changed the number of rows'

        base_train = df_feat.iloc[:df_train.shape[0]]
        base_test  = df_feat.iloc[df_train.shape[0]:]

        save_feature(base_train[cols_save], '533', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
        save_feature(base_test[cols_save],  '533', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | cnt_ratio-P_emaildomain-aim_com-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-anonymous_com-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-aol_com-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-att_net-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-bellsouth_net-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-cableone_net-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-centurylink_net-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-cfl_rr_com-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-charter_net-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-comcast_net-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-cox_net-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-earthlink_net-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-embarqmail_com-ker__uid2-Regist_date
(590540,) | cnt_ratio-P_emaildomain-frontier_com-ker__uid2-Regist_date
(590540,) | cnt_ratio-

In [None]:
# For every uid column and every column in cols_time, compute per
# (uid, Regist_date) group the share of rows that fall into each value of that
# column and save one feature column per value.
list_base_key = cols_uid
cat_list = cols_time
dir_save = 'valid_use'

for base_key in list_base_key:

    # Build a new key list each time; extending a list entry of list_base_key
    # in place would append 'Regist_date' again on every re-run of the cell.
    base_key = (list(base_key) if isinstance(base_key, list) else [base_key]) + ['Regist_date']
    name_base = '-'.join(base_key)

    base = data[base_key]

    for col_cat in cat_list:

        # Non-null col_cat rows per group.  reset_index(name=...) is used instead
        # of agg({'new_name': 'count'}): dict renaming on a SeriesGroupBy raises
        # SpecificationError since pandas 1.0.
        all_cnt = data.groupby(base_key)[col_cat].count().reset_index(name='count')
        # Rows per group and category value.
        part_cnt = data.groupby(base_key + [col_cat])[col_cat].count().reset_index(name='count_cat')

        tmp = part_cnt.merge(all_cnt, how='inner', on=base_key)
        tmp['cat_ratio'] = tmp['count_cat'] / tmp['count']

        # One row per group, one column per category value; a value that never
        # occurs in a group stays NaN.
        result = tmp.pivot_table(index=base_key, columns=col_cat, values='cat_ratio')
        # Any '.' in the rendered value (e.g. from a float) is replaced with '_'.
        cols_save = [f"cnt_ratio-{col_cat}-{cat}-{name_base}".replace('.', '_') for cat in result.columns]
        result.columns = cols_save
        df_feat = base.merge(result.reset_index(), how='left', on=base_key)
        # The positional train/test split below relies on the merge keeping every row.
        assert len(df_feat) == len(data), 'merge changed the number of rows'

        base_train = df_feat.iloc[:df_train.shape[0]]
        base_test  = df_feat.iloc[df_train.shape[0]:]

        save_feature(base_train[cols_save], '533', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
        save_feature(base_test[cols_save],  '533', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

In [None]:
# For every uid column and every column in cols_501, compute per
# (uid, Regist_date) group the share of rows that fall into each value of that
# column and save one feature column per value.
list_base_key = cols_uid
cat_list = cols_501
dir_save = 'valid_use'

for base_key in list_base_key:

    # Build a new key list each time; extending a list entry of list_base_key
    # in place would append 'Regist_date' again on every re-run of the cell.
    base_key = (list(base_key) if isinstance(base_key, list) else [base_key]) + ['Regist_date']
    name_base = '-'.join(base_key)

    base = data[base_key]

    for col_cat in cat_list:

        # Non-null col_cat rows per group.  reset_index(name=...) is used instead
        # of agg({'new_name': 'count'}): dict renaming on a SeriesGroupBy raises
        # SpecificationError since pandas 1.0.
        all_cnt = data.groupby(base_key)[col_cat].count().reset_index(name='count')
        # Rows per group and category value.
        part_cnt = data.groupby(base_key + [col_cat])[col_cat].count().reset_index(name='count_cat')

        tmp = part_cnt.merge(all_cnt, how='inner', on=base_key)
        tmp['cat_ratio'] = tmp['count_cat'] / tmp['count']

        # One row per group, one column per category value; a value that never
        # occurs in a group stays NaN.
        result = tmp.pivot_table(index=base_key, columns=col_cat, values='cat_ratio')
        # Any '.' in the rendered value (e.g. from a float) is replaced with '_'.
        cols_save = [f"cnt_ratio-{col_cat}-{cat}-{name_base}".replace('.', '_') for cat in result.columns]
        result.columns = cols_save
        df_feat = base.merge(result.reset_index(), how='left', on=base_key)
        # The positional train/test split below relies on the merge keeping every row.
        assert len(df_feat) == len(data), 'merge changed the number of rows'

        base_train = df_feat.iloc[:df_train.shape[0]]
        base_test  = df_feat.iloc[df_train.shape[0]:]

        save_feature(base_train[cols_save], '533', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
        save_feature(base_test[cols_save],  '533', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)