In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days, date_add_times
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature, get_factorize_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel
from itertools import combinations

In [2]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one of these
# substrings ...
# NOTE(review): 'C' and 'V' are matched anywhere in the full path string, not
# just at the start of the column name, so they are very permissive.
_PATH_INCLUDE = (COLUMN_DT, COLUMN_ID, COLUMN_TARGET, 'ProductCD_t', 'uid', 'C', 'V')
# ... and none of these.
_PATH_EXCLUDE = ('bin_', 'fill_', '129')


def _is_selected(path):
    """Return True when `path` passes the include/exclude substring filter."""
    wanted = any(token in path for token in _PATH_INCLUDE)
    banned = any(token in path for token in _PATH_EXCLUDE)
    return wanted and not banned


# The same filter is applied to both splits so they expose the same features.
train_paths = [path for path in glob('../feature/eda_base/*_train.gz') if _is_selected(path)]
test_paths = [path for path in glob('../feature/eda_base/*_test.gz') if _is_selected(path)]

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
train_length = df_train.shape[0]
# Grouping keys for the aggregations below: every loaded column with 'uid' in its name.
cols_uid = [col for col in df_train.columns if 'uid' in col]

Process ForkPoolWorker-85:
Process ForkPoolWorker-92:
Process ForkPoolWorker-170:
Process ForkPoolWorker-191:
Process ForkPoolWorker-185:
Process ForkPoolWorker-186:
Process ForkPoolWorker-156:
Process ForkPoolWorker-192:
Process ForkPoolWorker-22:
Process ForkPoolWorker-184:
Process ForkPoolWorker-68:
Process ForkPoolWorker-54:
Process ForkPoolWorker-158:
Process ForkPoolWorker-59:
Process ForkPoolWorker-65:
Process ForkPoolWorker-96:
Process ForkPoolWorker-79:
Process ForkPoolWorker-94:
Process ForkPoolWorker-168:
Process ForkPoolWorker-148:
Process ForkPoolWorker-66:
Process ForkPoolWorker-137:
Process ForkPoolWorker-139:
Process ForkPoolWorker-167:
Process ForkPoolWorker-130:
Process ForkPoolWorker-8:
Process ForkPoolWorker-141:
Process ForkPoolWorker-67:
Process ForkPoolWorker-174:
Process ForkPoolWorker-89:
Process ForkPoolWorker-10:
Process ForkPoolWorker-45:
Process ForkPoolWorker-72:
Process ForkPoolWorker-26:
Process ForkPoolWorker-36:
Process ForkPoolWorker-58:
Process ForkP

In [18]:
def parallel_uid_agg(tmp_train, tmp_test, uid, cols_feature, list_agg):
    """Save per-uid aggregate features computed over train and test combined.

    For every column in ``cols_feature`` and every aggregation in ``list_agg``
    the column is grouped by ``uid`` over the concatenation of ``tmp_train``
    and ``tmp_test``. The per-group statistic is mapped back onto each row of
    both frames and written with ``save_feature`` as a one-column frame named
    ``<uid with '130__' removed>_<feature>_<agg>``.

    Parameters
    ----------
    tmp_train, tmp_test : pandas.DataFrame
        Frames containing the ``uid`` column and every column listed in
        ``cols_feature``. They are only read; no columns are added to them.
    uid : str
        Name of the grouping column.
    cols_feature : list of str
        Columns to aggregate.
    list_agg : list of str
        Aggregation names passed to ``groupby(...).agg``; each is also used as
        the suffix of the saved column name.

    Returns
    -------
    None
        Results are only persisted through ``save_feature``.

    Notes
    -----
    Reads the module-level names ``prefix``, ``dir_save`` and
    ``COLUMNS_IGNORE`` at call time, so they must be defined before this
    function is invoked.
    """
    uid_label = uid.replace('130__', '')

    for feature in tqdm(cols_feature):
        # The stacked frame and its grouping depend only on (uid, feature), so
        # build them once per feature instead of once per aggregation.
        combined = pd.concat([tmp_train[[uid, feature]], tmp_test[[uid, feature]]])
        grouped = combined.groupby([uid])[feature]

        for agg_type in list_agg:
            new_col_name = uid_label + '_' + feature + '_' + agg_type

            # uid value -> aggregated statistic of `feature` for that uid.
            uid_to_stat = grouped.agg(agg_type).to_dict()

            # Build standalone one-column frames rather than inserting the
            # result into tmp_train / tmp_test: the inserted columns were never
            # read again and only inflated the frames on every iteration.
            train_feature = tmp_train[uid].map(uid_to_stat).to_frame(new_col_name)
            test_feature = tmp_test[uid].map(uid_to_stat).to_frame(new_col_name)

            save_feature(train_feature, prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
            save_feature(test_feature, prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

In [14]:
# Feature families are picked by the first letter of the column name.
cols_C, cols_V = (
    [name for name in df_train.columns if name.startswith(letter)]
    for letter in ('C', 'V')
)

In [20]:
def make_uid_feature(cols_feature, list_agg):
    """Run ``parallel_uid_agg`` for every uid column, one joblib worker per uid.

    Parameters
    ----------
    cols_feature : list of str
        Columns to aggregate; must be a list because it is concatenated with
        ``[uid]`` to slice the frames handed to each worker.
    list_agg : list of str
        Aggregation names forwarded to ``parallel_uid_agg``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``cols_uid`` is empty, i.e. there is no uid column to group by.

    Notes
    -----
    Reads the module-level ``df_train``, ``df_test`` and ``cols_uid``. Each
    worker receives only its own uid column plus ``cols_feature``.
    """
    if not cols_uid:
        # Parallel(n_jobs=0) would fail with an unrelated-looking joblib error;
        # report the actual cause instead.
        raise ValueError(
            "cols_uid is empty: no column containing 'uid' was loaded, "
            "so there is nothing to aggregate by"
        )

    jobs = (
        delayed(parallel_uid_agg)(
            df_train[[uid] + cols_feature],
            df_test[[uid] + cols_feature],
            uid,
            cols_feature,
            list_agg,
        )
        for uid in cols_uid
    )
    Parallel(n_jobs=len(cols_uid))(jobs)

In [None]:
# Run settings. `prefix` and `dir_save` are read as module globals inside
# parallel_uid_agg, so they must be assigned before the call below.
prefix, dir_save = '612', 'create'
list_agg = ['mean', 'std']

# Only the V family is built in this run. For the C family call
# make_uid_feature(cols_C, list_agg) instead; a D-family run would also need
# a cols_D list, which is not defined in the cells shown here.
make_uid_feature(cols_V, list_agg)