In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
# Load a saved feature-importance table and collect the names of its entries
# that contain 'V', with the 'raw__' marker removed from each name.
feim = read_pkl_gzip('../output/feature_importances/20190905_1024__CV0-9434494228779833__feature155.gz')
v_labels = [label for label in feim.index if label.count('V')]
cols_V = [label.replace('raw__', '') for label in feim.loc[v_labels].index]

In [3]:
# Key column names. COLUMNS_IGNORE is the list handed to get_numeric_features
# below so these columns are not picked up as numeric features.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

train_paths = glob('../feature/eda_base/*_train.gz')
test_paths = glob('../feature/eda_base/*_test.gz')

# Only feature files whose path contains at least one of these substrings are
# loaded; the same filter applies to the train and the test file lists.
path_keywords = (COLUMN_DT, COLUMN_ID, 'time_zone', 'hour', 'Product', 'V')
train_paths = [p for p in train_paths if any(kw in p for kw in path_keywords)]
test_paths = [p for p in test_paths if any(kw in p for kw in path_keywords)]

# Stack train rows first, then test rows, under a fresh 0..n-1 index; if the
# ID column was loaded it then becomes the index.
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
if COLUMN_ID in data.columns:
    data.set_index(COLUMN_ID, inplace=True)

# Base frames, indexed by the ID column.
base_train, base_test = (
    read_pkl_gzip(path).set_index(COLUMN_ID)
    for path in ('../input/base_train.gz', '../input/base_test.gz')
)
base = pd.concat([base_train, base_test], axis=0)

# Store the numeric V columns as float32.
cols_num = [col for col in get_numeric_features(data, COLUMNS_IGNORE) if 'V' in col]
data[cols_num] = data[cols_num].astype('float32')

In [4]:
# Drop every column containing 'V' that is not listed in cols_V, then release
# the per-split frames that were already concatenated into `data`.
all_V = [name for name in data.columns if 'V' in name]
drop_V = list(set(all_V).difference(cols_V))
data.drop(columns=drop_V, inplace=True)
del df_train, df_test
gc.collect()

57

In [5]:
#========================================================================
# V features split per ProductCD
#========================================================================
# Use only the source V columns. The derived '<V>__ProductCD-<code>' columns
# also contain 'V', so without this filter a re-run of the cell would derive
# new features from its own output.
cols_V = [col for col in data.columns
          if col.count('V') and not col.count('__ProductCD-')]
cols_pcd = data['ProductCD'].unique()

# The row mask depends only on the product code, so build it once per code
# instead of twice for every (column, code) pair.
pcd_masks = {pcd: data['ProductCD'].isin([pcd]) for pcd in cols_pcd}

for col in tqdm(cols_V):
    for pcd, is_pcd in pcd_masks.items():
        feature_name = f'{col}__ProductCD-{pcd}'
        # Keep the V value on rows of this product code and put -1 on all
        # other rows. where() builds the column in one step, so no float
        # values are written into an integer -1 column (which relied on
        # implicit upcasting); the new column keeps the dtype of `col`.
        data[feature_name] = data[col].where(is_pcd, -1)

100%|██████████| 47/47 [01:48<00:00,  5.27s/it]


In [6]:
# Columns to save: names that contain both 'V' and 'Product' and are not in
# COLUMNS_IGNORE, sorted alphabetically.
cols_feature = sorted(
    name for name in data.columns
    if name not in COLUMNS_IGNORE and 'V' in name and 'Product' in name
)

In [7]:
#========================================================================
# Save the selected feature columns for train and test
#========================================================================
dir_save = 'valid'

# `data` was concatenated as train rows followed by test rows, and the split
# below is purely positional (first len(base_train) rows = train). Verify the
# row counts agree so a mismatch fails loudly instead of silently assigning
# rows to the wrong split.
n_train = len(base_train)
assert len(data) == n_train + len(base_test), (
    f'data has {len(data)} rows, expected '
    f'{n_train} (base_train) + {len(base_test)} (base_test)'
)

train = data.iloc[:n_train]
test  = data.iloc[n_train:]

save_feature(train[cols_feature], '511', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
save_feature(test[cols_feature],  '511', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | V127__ProductCD-C
(590540,) | V127__ProductCD-H
(590540,) | V127__ProductCD-R
(590540,) | V127__ProductCD-S
(590540,) | V127__ProductCD-W
(590540,) | V128__ProductCD-C
(590540,) | V128__ProductCD-H
(590540,) | V128__ProductCD-R
(590540,) | V128__ProductCD-S
(590540,) | V128__ProductCD-W
(590540,) | V130__ProductCD-C
(590540,) | V130__ProductCD-H
(590540,) | V130__ProductCD-R
(590540,) | V130__ProductCD-S
(590540,) | V130__ProductCD-W
(590540,) | V131__ProductCD-C
(590540,) | V131__ProductCD-H
(590540,) | V131__ProductCD-R
(590540,) | V131__ProductCD-S
(590540,) | V131__ProductCD-W
(590540,) | V133__ProductCD-C
(590540,) | V133__ProductCD-H
(590540,) | V133__ProductCD-R
(590540,) | V133__ProductCD-S
(590540,) | V133__ProductCD-W
(590540,) | V156__ProductCD-C
(590540,) | V156__ProductCD-H
(590540,) | V156__ProductCD-R
(590540,) | V156__ProductCD-S
(590540,) | V156__ProductCD-W
(590540,) | V165__ProductCD-C
(590540,) | V165__ProductCD-H
(590540,) | V165__ProductCD-R
(590540,) 

In [13]:
def parallel_agg(df, agg_cols):
    """Save the per-group (max - min) range of each column in ``agg_cols``.

    For every column in ``agg_cols`` and every grouping key in ``list_key``,
    ``df`` is grouped by the key, the column's max and min are aggregated, and
    their difference is kept as ``<max column name>_min_diff`` (the max and
    min columns themselves are dropped). The key is joined onto ``base_train``
    and ``base_test`` by index, the range column is left-merged on the key,
    and the result is passed to ``save_feature`` with prefix '502'.

    Reads these names from the enclosing namespace: ``list_key``,
    ``base_train``, ``base_test``, ``get_new_columns``, ``save_feature``,
    ``dir_save`` and ``COLUMNS_IGNORE``.
    NOTE(review): ``list_key`` and ``get_new_columns`` are not defined in the
    cells visible here -- confirm they exist before this function is called.

    Args:
        df: DataFrame holding both the grouping keys and the columns to
            aggregate; its index is used to join the keys onto the base frames.
        agg_cols: iterable of column names in ``df`` to aggregate.

    Returns:
        tuple ``(error_keys, error_cols)``: two lists of equal length where
        ``error_keys[i]`` / ``error_cols[i]`` name a (key, column) pair that
        was skipped because ``df[key]`` could not be used as a single key.
    """
    error_keys = []
    error_cols = []
    for agg_col in agg_cols:
        aggs = {agg_col: ['max', 'min']}

        for key in list_key:
            # Probe that df[key] supports to_frame(). Presumably this fails
            # when df[key] is not a single Series (e.g. a duplicated column
            # name) -- TODO confirm. Record the pair and move on to the next
            # key instead of falling through into a groupby that cannot work.
            try:
                df[key].to_frame()
            except AttributeError:
                error_keys.append(key)
                error_cols.append(agg_col)
                continue

            df_agg = df.groupby(key).agg(aggs)
            df_agg.columns = get_new_columns(key+'_', aggs)
            max_col = [name for name in df_agg.columns if name.count('_max')][0]
            min_col = [name for name in df_agg.columns if name.count('_min')][0]
            # Only the range (max - min) is kept as a feature.
            df_agg[max_col+'_min_diff'] = df_agg[max_col] - df_agg[min_col]
            df_agg.drop([max_col, min_col], axis=1, inplace=True)
            df_agg.reset_index(inplace=True)

            # Attach the key to the base rows by index, then bring in the
            # per-key range with a left merge so every base row is kept.
            base_train_agg = base_train.join(df[key]).merge(df_agg, on=key, how='left')
            base_test_agg = base_test.join(df[key]).merge(df_agg, on=key, how='left')

            del df_agg
            gc.collect()

            print(base_train_agg.shape, base_test_agg.shape)
            cols_feature = [
                name for name in base_train_agg.columns
                if name not in COLUMNS_IGNORE and name != key and name != 'D1']
            save_feature(base_train_agg[cols_feature], '502', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
            save_feature(base_test_agg[cols_feature],  '502', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

            del base_train_agg, base_test_agg
            gc.collect()

    return error_keys, error_cols

In [14]:
# Run parallel_agg once per (frame, column list) pair on n_jobs joblib
# workers; `err` collects whatever each call returns, in input order.
err = Parallel(n_jobs=n_jobs)(
    delayed(parallel_agg)(chunk, chunk_cols)
    for chunk, chunk_cols in zip(arg_df_list, arg_list)
)

In [None]:
# Preview the first rows of df_feat.
# NOTE(review): df_feat is not assigned in any cell visible here -- confirm it
# is defined before this cell runs on a fresh kernel.
df_feat.head()