In [2]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days, date_add_times
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature, get_factorize_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel
from itertools import combinations

In [3]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one INCLUDE token
# and none of the EXCLUDE tokens. Matching is a plain substring test on the
# whole path.
# NOTE(review): 'D' and 'C' are bare substrings, so they also match names such
# as 'TransactionDT' or 'ProductCD' -- confirm this breadth is intended.
PATH_TOKENS_INCLUDE = (
    COLUMN_DT,
    COLUMN_ID,
    COLUMN_TARGET,
    'ProductCD_t',
    'D',
    'C',
    'card',
    'addr',
    'domain',
)
PATH_TOKENS_EXCLUDE = ('fill', 'bin', '129')


def is_target_feature_path(path):
    """Return True when ``path`` should be loaded for this notebook.

    Args:
        path: path of a feature file as returned by ``glob``.

    Returns:
        bool: True if ``path`` contains any token of ``PATH_TOKENS_INCLUDE``
        and no token of ``PATH_TOKENS_EXCLUDE``.
    """
    wanted = any(token in path for token in PATH_TOKENS_INCLUDE)
    unwanted = any(token in path for token in PATH_TOKENS_EXCLUDE)
    return wanted and not unwanted


# glob() returns files in an OS-dependent order; sorting keeps the load order
# reproducible between runs and machines.
train_paths = sorted(filter(is_target_feature_path, glob('../feature/eda_base/*_train.gz')))
test_paths = sorted(filter(is_target_feature_path, glob('../feature/eda_base/*_test.gz')))

# Load both splits, remember where the train rows end, then stack them
# (train first, test second) into one frame with a fresh 0..n-1 index.
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
train_length = len(df_train)
data = pd.concat((df_train, df_test), ignore_index=True)

# The per-split frames are no longer needed once `data` exists.
del df_train, df_test
gc.collect()

21

In [4]:
# TransactionDT is interpreted as a number of seconds elapsed since `startdate`.
startdate = datetime.datetime(2017,12,1)
data['datetime'] = pd.Timestamp(startdate) + pd.to_timedelta(data['TransactionDT'], unit='s')
data['year'] = data['datetime'].dt.year
# Month 6 is relabelled as month 5, i.e. June rows are grouped with May.
# NOTE(review): presumably because June is only partially covered -- confirm.
data['month'] = data['datetime'].dt.month.replace(6, 5)
data['week_no'] = data['datetime'].map(lambda ts: ts.isocalendar()[1])

# Month key: calendar year + (remapped) month, e.g. '2018-3'.
data['DT-M'] = data['year'].astype(str) + '-' + data['month'].astype(str)
# Week key: ISO year + ISO week, e.g. '2018-7'. The ISO week must be paired
# with the ISO year, not the calendar year: 2018-12-31 belongs to ISO week 1 of
# 2019, and labelling it '2018-1' would merge it with the first week of
# January 2018.
data['DT-W'] = data['datetime'].map(lambda ts: '{}-{}'.format(*ts.isocalendar()[:2]))

In [11]:
#========================================================================
# Count regularization by DT-M
#========================================================================
prefix = '609'
dir_save = 'valid_use'

col_dtm = 'DT-M'
list_dtm = sorted(data[col_dtm].unique())
# One task exists per DT-M value, but starting more workers than CPUs only
# multiplies memory use; an empty list would give n_jobs == 0, which joblib
# rejects, so keep at least one worker.
n_jobs = max(1, min(len(list_dtm), os.cpu_count() or 1))

# Raw C* columns only; derived '*cnt*' columns are skipped so the cell can be re-run.
cols_C = [col for col in data.columns if col.startswith('C') and not col.count('cnt')]
cols_feature = cols_C

# Build the whole-dataset counts first: '{col}_cnt' holds, for every row, how
# often that row's value of `col` occurs in `data`.
for col in cols_feature:
    cnt_map = data[col].value_counts().to_dict()
    data[f"{col}_cnt"] = data[col].map(cnt_map)
    
    
def parallel_dtm(df, col, dtm):
    """Compute and save the count features of ``col`` restricted to one ``DT-M`` value.

    Two columns are added to ``df`` in place:

    * ``DT-M_{dtm}_{col}_cnt``: for every row, how often that row's ``col``
      value occurs among the rows whose ``DT-M`` equals ``dtm`` (NaN when the
      value does not occur there).
    * ``DT-M_{dtm}_{col}_cnt_ratio_global``: that count divided by ``{col}_cnt``.

    Both columns are then split at the global ``train_length`` and handed to
    ``save_feature`` with the global ``prefix`` and ``dir_save``.

    Args:
        df: frame containing the ``col_dtm``, ``col`` and ``{col}_cnt`` columns.
            Assumed to have the same row order as the global ``data``
            (train rows first) -- the split relies on it.
        col: name of the column whose values are counted.
        dtm: the ``DT-M`` value the local count is restricted to.
    """
    name_cnt = f"DT-M_{dtm}_{col}_cnt"
    name_ratio = f"{name_cnt}_ratio_global"

    # local count
    local_cnt_map = df.loc[df[col_dtm] == dtm, col].value_counts().to_dict()
    df[name_cnt] = df[col].map(local_cnt_map)
    df[name_ratio] = df[name_cnt] / df[f"{col}_cnt"]

    # Save from `df`, which holds the columns just computed. Selecting them
    # from the global `data` finds nothing when `df` is a slice or a worker-side
    # copy, so the features would never be written.
    cols_save = [name_cnt, name_ratio]
    train = df.iloc[:train_length][cols_save]
    test = df.iloc[train_length:][cols_save]
    save_feature(train, prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(test,  prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)
    
    
# For every feature column, run one task per DT-M value in parallel.
for col in tqdm(cols_feature):
    # The three columns a task needs do not depend on the DT-M value, so slice
    # once per feature instead of once per task.
    df_col = data[[col_dtm, col, f"{col}_cnt"]]
    # A generator lets joblib pull tasks as it dispatches them instead of
    # materialising every task up front.
    Parallel(n_jobs)(delayed(parallel_dtm)(df_col, col, dtm) for dtm in list_dtm)

SystemExit: 

In [30]:
# Write out every DT-M count feature currently present in `data`
# (column names containing both '_cnt' and 'DT-M'), split into train and test
# rows at `train_length`, then remove those columns from `data`.
prefix = '609'
dir_save = 'valid_use'
cols_save = [name for name in data.columns if '_cnt' in name and 'DT-M' in name]

train = data[cols_save].iloc[:train_length]
test = data[cols_save].iloc[train_length:]
save_feature(train, prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
save_feature(test,  prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

# The saved columns are not kept in memory.
data.drop(columns=cols_save, inplace=True)
del train, test
gc.collect()

2084

In [None]:
#========================================================================
# Count regularization by DT-W
#========================================================================
prefix = '609'
dir_save = 'valid_use'

col_dtw = 'DT-W'
list_dtw = sorted(data[col_dtw].unique())
# One task exists per DT-W value, but starting more workers than CPUs only
# multiplies memory use; an empty list would give n_jobs == 0, which joblib
# rejects, so keep at least one worker.
n_jobs = max(1, min(len(list_dtw), os.cpu_count() or 1))

# Raw C* columns only; derived '*cnt*' columns are skipped so the cell can be re-run.
cols_C = [col for col in data.columns if col.startswith('C') and not col.count('cnt')]
cols_feature = cols_C

# Build the whole-dataset counts first: '{col}_cnt' holds, for every row, how
# often that row's value of `col` occurs in `data`.
for col in cols_feature:
    cnt_map = data[col].value_counts().to_dict()
    data[f"{col}_cnt"] = data[col].map(cnt_map)
    
    
def parallel_dtw(df, col, dtw):
    """Compute and save the count features of ``col`` restricted to one ``DT-W`` value.

    Two columns are added to ``df`` in place:

    * ``DT-W_{dtw}_{col}_cnt``: for every row, how often that row's ``col``
      value occurs among the rows whose ``DT-W`` equals ``dtw`` (NaN when the
      value does not occur there).
    * ``DT-W_{dtw}_{col}_cnt_ratio_global``: that count divided by ``{col}_cnt``.

    Both columns are then split at the global ``train_length`` and handed to
    ``save_feature`` with the global ``prefix`` and ``dir_save``.

    Args:
        df: frame containing the ``col_dtw``, ``col`` and ``{col}_cnt`` columns.
            Assumed to have the same row order as the global ``data``
            (train rows first) -- the split relies on it.
        col: name of the column whose values are counted.
        dtw: the ``DT-W`` value the local count is restricted to.
    """
    name_cnt = f"DT-W_{dtw}_{col}_cnt"
    name_ratio = f"{name_cnt}_ratio_global"

    # local count
    local_cnt_map = df.loc[df[col_dtw] == dtw, col].value_counts().to_dict()
    df[name_cnt] = df[col].map(local_cnt_map)
    df[name_ratio] = df[name_cnt] / df[f"{col}_cnt"]

    # Save from `df`, which holds the columns just computed. Selecting them
    # from the global `data` finds nothing when `df` is a slice or a worker-side
    # copy, so the features would never be written.
    cols_save = [name_cnt, name_ratio]
    train = df.iloc[:train_length][cols_save]
    test = df.iloc[train_length:][cols_save]
    save_feature(train, prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(test,  prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)
    
    
# For every feature column, run one task per DT-W value in parallel.
for col in tqdm(cols_feature):
    # The three columns a task needs do not depend on the DT-W value, so slice
    # once per feature instead of once per task.
    df_col = data[[col_dtw, col, f"{col}_cnt"]]
    # A generator lets joblib pull tasks as it dispatches them instead of
    # materialising every task up front.
    Parallel(n_jobs)(delayed(parallel_dtw)(df_col, col, dtw) for dtw in list_dtw)

  0%|          | 0/14 [00:00<?, ?it/s]Exception ignored in: <function tqdm.__del__ at 0x7fee98f9a8c8>
Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/tqdm/_tqdm.py", line 888, in __del__
    def __del__(self):
KeyboardInterrupt
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

KeyboardInterrupt
KeyboardInterrupt
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/queues.py", line 351, in get
    with self._rlock:
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/queues.py", line 351, in get
    with self._rlock:
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/pool.py", lin