In [2]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days, date_add_times
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature, get_factorize_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel
from itertools import combinations

In [13]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one of these substrings.
# NOTE(review): 'D' is a bare capital letter, so it matches every path that contains
# a capital D anywhere (not only columns whose name starts with D) -- confirm this
# breadth is intended.
PATH_KEYWORDS = (
    COLUMN_DT,
    COLUMN_ID,
    COLUMN_TARGET,
    'ProductCD_t',
    'D',
    'card',
    'addr',
    'domain',
    'bin_',
    'fill__cnt_V282',
    'fill__cnt_V258',
    'fill__cnt_V243',
    'fill__cnt_V2_',
    'fill__cnt_V187_',
    'fill__cnt_V201_',
)
# Any path containing this substring is skipped even if a keyword matches.
PATH_EXCLUDE = '129'


def select_feature_paths(paths):
    """Return the sorted subset of `paths` that this notebook needs.

    A path is kept when it contains at least one entry of PATH_KEYWORDS and does
    not contain PATH_EXCLUDE. The same rule is applied to train and test files so
    the two selections cannot drift apart. The result is sorted because glob()
    returns matches in arbitrary order, which would otherwise make the load order
    depend on the file system.
    """
    return sorted(
        path for path in paths
        if any(keyword in path for keyword in PATH_KEYWORDS)
        and PATH_EXCLUDE not in path
    )


train_paths = select_feature_paths(glob('../feature/eda_base/*_train.gz'))
test_paths = select_feature_paths(glob('../feature/eda_base/*_test.gz'))

# Load the selected feature files (train first, then test) and remember how many
# rows the train frame has.
df_train, df_test = [parallel_load_data(paths) for paths in (train_paths, test_paths)]
train_length = len(df_train)

In [14]:
########################### Card columns "outliers"
def _count_train_overlap(col):
    """Return (train rows whose `col` value is absent from test, train rows whose value is present)."""
    seen_in_test = df_train[col].isin(df_test[col])
    return int((~seen_in_test).sum()), int(seen_in_test.sum())


def _keep_shared_values(col):
    """Replace `col` with NaN wherever the value does not occur in the other frame.

    Train is masked first; the test mask is then computed against the
    already-masked train column, so the statement order matters.
    """
    train_seen_in_test = df_train[col].isin(df_test[col])
    df_train[col] = np.where(train_seen_in_test, df_train[col], np.nan)
    test_seen_in_train = df_test[col].isin(df_train[col])
    df_test[col] = np.where(test_seen_in_train, df_test[col], np.nan)


# card1: besides the train/test overlap rule, values occurring at most twice
# across train and test combined are also replaced by NaN.
for col in ['card1']:
    card_counts = pd.concat([df_train[col], df_test[col]]).value_counts()
    valid_card_std = card_counts.values.std()

    invalid_cards = card_counts[card_counts <= 2]
    print('Rare cards', len(invalid_cards))

    valid_card = list(card_counts[card_counts > 2].index)

    n_absent, n_present = _count_train_overlap(col)
    print('No intersection in Train', n_absent)
    print('Intersection in Train', n_present)

    _keep_shared_values(col)

    # Train first, then test: drop the rare values counted above.
    for frame in (df_train, df_test):
        frame[col] = np.where(frame[col].isin(valid_card), frame[col], np.nan)
    print('#'*20)

# card2..card6: only the train/test overlap rule is applied.
for col in ['card2', 'card3', 'card4', 'card5', 'card6']:
    n_absent, n_present = _count_train_overlap(col)
    print('No intersection in Train', col, n_absent)
    print('Intersection in Train', col, n_present)

    _keep_shared_values(col)
    print('#'*20)

Rare cards 5993
No intersection in Train 10396
Intersection in Train 580144
####################
No intersection in Train card2 5012
Intersection in Train card2 585528
####################
No intersection in Train card3 47
Intersection in Train card3 590493
####################
No intersection in Train card4 0
Intersection in Train card4 590540
####################
No intersection in Train card5 7279
Intersection in Train card5 583261
####################
No intersection in Train card6 30
Intersection in Train card6 590510
####################


In [15]:
########################### Client Virtual ID
# Build pseudo client identifiers by joining card / address / e-mail columns
# as strings with '_'. Each identifier extends an earlier one, so the recipes
# must be applied in the listed order.
# These values are highly client-specific: use them as aggregation keys and
# keep them out of the final feature set.
UID_RECIPES = [
    ('uid',  ['card1', 'card2']),
    ('uid2', ['uid', 'card3', 'card5']),
    ('uid3', ['uid2', 'addr1', 'addr2']),
    ('uid4', ['uid3', 'P_emaildomain']),
    ('uid5', ['uid3', 'R_emaildomain']),
]

for frame in (df_train, df_test):
    for uid_name, part_cols in UID_RECIPES:
        uid_key = frame[part_cols[0]].astype(str)
        for part_col in part_cols[1:]:
            uid_key = uid_key + '_' + frame[part_col].astype(str)
        frame[uid_name] = uid_key

In [16]:
# Sorted names of all train columns that contain 'bin_' or 'fill__'; shown as the cell output.
cols_bin = sorted(name for name in df_train.columns if 'bin_' in name or 'fill__' in name)
cols_bin

['bin__Amt_DIV100',
 'bin__Amt_DIV200',
 'bin__Amt_DIV30',
 'bin__Amt_DIV50',
 'bin__C1',
 'bin__C10',
 'bin__C11',
 'bin__C12',
 'bin__C13',
 'bin__C14',
 'bin__C2',
 'bin__C3',
 'bin__C4',
 'bin__C5',
 'bin__C6',
 'bin__C7',
 'bin__C8',
 'bin__C9',
 'bin__TransactionAmt',
 'bin__cents',
 'fill__P_emaildomain_bin',
 'fill__P_emaildomain_prefix',
 'fill__P_emaildomain_suffix',
 'fill__R_emaildomain_bin',
 'fill__R_emaildomain_prefix',
 'fill__R_emaildomain_suffix',
 'fill__addr1',
 'fill__addr2',
 'fill__card1',
 'fill__card2',
 'fill__card3',
 'fill__card4',
 'fill__card5',
 'fill__card6',
 'fill__cnt_P_emaildomain',
 'fill__cnt_R_emaildomain',
 'fill__cnt_V187',
 'fill__cnt_V2',
 'fill__cnt_V201',
 'fill__cnt_V243',
 'fill__cnt_V258',
 'fill__cnt_V282',
 'fill__cnt_addr1',
 'fill__cnt_addr2',
 'fill__cnt_card1',
 'fill__cnt_card2',
 'fill__cnt_card3',
 'fill__cnt_card4',
 'fill__cnt_card5',
 'fill__cnt_card6']

In [None]:
# Short tag used in the new column name -> source column holding the value.
UID_TAG_COLUMNS = {
    'C1': 'bin__C1',
    'C6': 'bin__C6',
    'C11': 'bin__C11',
    'C12': 'bin__C12',
    'C14': 'bin__C14',
    'V187': 'fill__cnt_V187',
    'V258': 'fill__cnt_V258',
}

# Every combination appended to a base uid, in the order the columns are created.
UID_TAG_COMBOS = [
    ('C1',),
    ('C6',),
    ('C1', 'C6'),
    ('C1', 'C12'),
    ('C6', 'C11'),
    ('C6', 'C12'),
    ('C6', 'C14'),
    ('C11', 'C14'),
    ('C12', 'C14'),
    ('C6', 'V187'),
    ('C14', 'V187'),
    ('C6', 'V258'),
    ('C14', 'V258'),
]


def add_uid_combos(frame, base_col):
    """Add one '<base_col>_<tag>[_<tag>]' key column to `frame` per entry of UID_TAG_COMBOS.

    Each key is the base uid followed by the tagged source columns, all cast to
    str and joined with '_'. Every component is separated: the previous version
    concatenated the two bin values of the C*_C* keys directly, which makes the
    key ambiguous whenever the values differ in length (e.g. '1' + '12' and
    '11' + '2' both produced '112').

    `frame` is modified in place; nothing is returned.
    """
    base_key = frame[base_col].astype(str)
    for tags in UID_TAG_COMBOS:
        combo_key = base_key
        for tag in tags:
            combo_key = combo_key + '_' + frame[UID_TAG_COLUMNS[tag]].astype(str)
        frame['_'.join((base_col,) + tags)] = combo_key


for frame in (df_train, df_test):
    add_uid_combos(frame, 'uid3')

    # new_uid4: uid3 extended with the R_emaildomain prefix column.
    frame['new_uid4'] = frame['uid3'].astype(str) + '_' + frame['fill__R_emaildomain_prefix'].astype(str)
    add_uid_combos(frame, 'new_uid4')

In [22]:
#========================================================================
# Build uids from the count-encoded (fill__cnt_*) columns
#========================================================================
# Each identifier extends an earlier one, so the recipes must run in order.
# cnt_uid4 is built from the R_emaildomain prefix for BOTH frames. Previously
# df_train['cnt_uid4'] was assigned twice (prefix, then overwritten with the
# suffix) and df_test['cnt_uid4'] was never created in this cell, so selecting
# the train uid columns from df_test depended on a later cell having been run.
CNT_UID_RECIPES = [
    ('cnt_uid',  ['fill__cnt_card1', 'fill__cnt_card2']),
    ('cnt_uid2', ['cnt_uid', 'fill__cnt_card3', 'fill__cnt_card5']),
    ('cnt_uid3', ['cnt_uid2', 'fill__cnt_addr1', 'fill__cnt_addr2']),
    ('cnt_uid4', ['cnt_uid3', 'fill__R_emaildomain_prefix']),
]

for frame in (df_train, df_test):
    for uid_name, part_cols in CNT_UID_RECIPES:
        uid_key = frame[part_cols[0]].astype(str)
        for part_col in part_cols[1:]:
            uid_key = uid_key + '_' + frame[part_col].astype(str)
        frame[uid_name] = uid_key

In [None]:
# Short tag used in the new column name -> source column holding the value.
CNT_UID_TAG_COLUMNS = {
    'C1': 'bin__C1',
    'C6': 'bin__C6',
    'C11': 'bin__C11',
    'C12': 'bin__C12',
    'C14': 'bin__C14',
    'V187': 'fill__cnt_V187',
    'V258': 'fill__cnt_V258',
}

# Every combination appended to a base cnt_uid, in the order the columns are created.
CNT_UID_TAG_COMBOS = [
    ('C1',),
    ('C6',),
    ('C1', 'C6'),
    ('C1', 'C12'),
    ('C6', 'C11'),
    ('C6', 'C12'),
    ('C6', 'C14'),
    ('C11', 'C14'),
    ('C12', 'C14'),
    ('C6', 'V187'),
    ('C14', 'V187'),
    ('C6', 'V258'),
    ('C14', 'V258'),
]


def add_cnt_uid_combos(frame, base_col):
    """Add one '<base_col>_<tag>[_<tag>]' key column to `frame` per entry of CNT_UID_TAG_COMBOS.

    Each key is the base uid followed by the tagged source columns, all cast to
    str and joined with '_'. Every component is separated: the previous version
    concatenated the two bin values of the C*_C* keys directly, which makes the
    key ambiguous whenever the values differ in length (e.g. '1' + '12' and
    '11' + '2' both produced '112').

    `frame` is modified in place; nothing is returned.
    """
    base_key = frame[base_col].astype(str)
    for tags in CNT_UID_TAG_COMBOS:
        combo_key = base_key
        for tag in tags:
            combo_key = combo_key + '_' + frame[CNT_UID_TAG_COLUMNS[tag]].astype(str)
        frame['_'.join((base_col,) + tags)] = combo_key


for frame in (df_train, df_test):
    add_cnt_uid_combos(frame, 'cnt_uid3')

    # cnt_uid4: cnt_uid3 extended with the R_emaildomain prefix column.
    frame['cnt_uid4'] = frame['cnt_uid3'].astype(str) + '_' + frame['fill__R_emaildomain_prefix'].astype(str)
    add_cnt_uid_combos(frame, 'cnt_uid4')

In [None]:
list_agg = ['mean', 'std']

# Every train column whose name contains 'uid', minus the five base identifiers.
# list.remove raises ValueError if one of the base identifiers is missing.
cols_uid = [name for name in df_train.columns if 'uid' in name]
for base_uid in ('uid', 'uid2', 'uid3', 'uid4', 'uid5'):
    cols_uid.remove(base_uid)

In [None]:
# Persist the derived uid columns for train and then test via save_feature,
# using the same prefix, target directory and ignore list for both.
prefix = '130'
dir_save = 'eda_base'
cols_save = cols_uid
for frame, is_train in ((df_train, True), (df_test, False)):
    save_feature(frame[cols_save], prefix, dir_save, is_train=is_train, auto_type=False, list_ignore=COLUMNS_IGNORE)