In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days, date_add_times
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
# Column roles referenced throughout the notebook.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its file name contains any of these substrings.
_BASE_FEATURE_KEYWORDS = (COLUMN_DT, COLUMN_ID, COLUMN_TARGET, 'time_zone', 'fill_', 'bin_')


def _select_base_feature_paths(paths):
    """Return the subset of ``paths`` that should be loaded as base features.

    A path is kept when its file name contains one of
    ``_BASE_FEATURE_KEYWORDS`` or starts with ``'Product'``.

    Args:
        paths: iterable of feature-file paths (as returned by ``glob``).

    Returns:
        list of the selected paths, in their original order.
    """
    selected = []
    for path in paths:
        # Match on the file name, not the whole path: globbed paths begin with
        # the directory ('../feature/eda_base/...'), so a prefix test on the
        # full path can never match the 'Product*' files.
        filename = os.path.basename(path)
        if filename.startswith('Product') or any(key in filename for key in _BASE_FEATURE_KEYWORDS):
            selected.append(path)
    return selected


# The same selection rule is applied to both splits so their columns stay aligned.
train_paths = _select_base_feature_paths(glob('../feature/eda_base/*_train.gz'))
test_paths = _select_base_feature_paths(glob('../feature/eda_base/*_test.gz'))

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)

In [4]:
# Preview the first five rows of the loaded training frame.
df_train.head(n=5)

Unnamed: 0,bin__V315,fill__M5,bin__C7,fill__P_emaildomain_bin,TransactionID,bin__V67,bin__V87,fill__M2,fill__addr1,fill__R_emaildomain_bin,...,bin__V48,bin__C4,bin__V128,bin__V62,bin__V156,bin__V78,bin__V29,bin__V90,bin__V282,bin__V131
0,0,F,0.0,#,2987000,1,1,T,315,#,...,#,0.0,0,1,#,1,0,0,1,0
1,0,T,0.0,google,2987001,1,1,#,325,#,...,0,0.0,0,1,#,1,0,0,1,0
2,0,F,0.0,microsoft,2987002,1,1,T,330,#,...,0,0.0,0,1,#,1,0,0,1,0
3,0,T,0.0,yahoo,2987003,1,1,#,476,#,...,0,0.0,925,1,#,1,0,0,0,135
4,0,#,0.0,google,2987004,#,#,#,420,#,...,#,0.0,0,#,0,#,#,#,1,0


In [3]:
# Every loaded column except the id / datetime / target / ProductCD columns.
use_cols = [name for name in df_train.columns if name not in COLUMNS_IGNORE]
print(len(use_cols))

# Group the usable columns by substring of the column name.
# 'card' / 'addr' columns are split by whether the name also contains 'cnt'.
cols_card = sorted(name for name in use_cols if 'card' in name and 'cnt' not in name)
cols_addr = sorted(name for name in use_cols if 'addr' in name and 'cnt' not in name)
cols_cnt_card = sorted(name for name in use_cols if 'card' in name and 'cnt' in name)
cols_cnt_addr = sorted(name for name in use_cols if 'addr' in name and 'cnt' in name)

# NOTE: this group scans all of df_train.columns (not use_cols) and keeps the
# frame's column order rather than sorting.
cols_domain = [
    name for name in df_train.columns
    if 'prefix' in name or '_suffix' in name or '_bin' in name
]

# 'Prod' is excluded from the 'C' group so that names such as 'ProductCD' do not match.
cols_C = sorted(name for name in use_cols if 'C' in name and 'Prod' not in name)
cols_V = sorted(name for name in use_cols if 'V' in name)
cols_M = sorted(name for name in use_cols if 'M' in name)
cols_Amt = sorted(name for name in use_cols if 'Amt' in name or 'cents' in name)
cols_time = ['time_zone']

0