In [2]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [3]:
# Key columns of the transaction table; COLUMNS_IGNORE is handed to the
# feature helpers so these columns are never treated as model features.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one KEEP token
# and none of the DROP tokens.
PATH_KEEP_TOKENS = (COLUMN_DT, COLUMN_ID, COLUMN_TARGET, 'ProductCD_t', 'domain')
PATH_DROP_TOKENS = ('fill', 'bin', 'uid', '129')


def keep_feature_path(path):
    """Return True when `path` points at a feature file this notebook needs.

    The check is a plain substring test against the *whole* path (directory
    part included), exactly like the `path.count(...)` chains it replaces.
    Using one predicate for both splits guarantees that train and test are
    always filtered by the same rule.

    Args:
        path: file path as returned by `glob`.

    Returns:
        bool: True if the path contains any token of PATH_KEEP_TOKENS and
        no token of PATH_DROP_TOKENS.
    """
    wanted = any(token in path for token in PATH_KEEP_TOKENS)
    rejected = any(token in path for token in PATH_DROP_TOKENS)
    return wanted and not rejected


train_paths = [path for path in glob('../feature/eda_base/*_train.gz')
               if keep_feature_path(path)]
test_paths = [path for path in glob('../feature/eda_base/*_test.gz')
              if keep_feature_path(path)]

# Load the selected feature files for each split and stack them row-wise,
# train rows first. The train row count is recorded before the per-split
# frames are released so the combined frame can be split again later.
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
train_length = len(df_train)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
del df_train
del df_test
gc.collect()

# Base tables, indexed by transaction id.
base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test], axis=0)

# Downcast to float32 every numeric column whose name contains an
# upper-case 'V' or 'D'.
cols_num = [
    col for col in get_numeric_features(data, COLUMNS_IGNORE)
    if 'V' in col or 'D' in col
]
data[cols_num] = data[cols_num].astype('float32')

In [4]:
# Provider group -> raw e-mail domains assigned to that group.
domains_by_provider = {
    'google': ['gmail', 'gmail.com'],
    'att': ['att.net', 'prodigy.net.mx', 'sbcglobal.net'],
    'spectrum': ['twc.com', 'charter.net'],
    'microsoft': [
        'hotmail.co.uk', 'live.com', 'hotmail.de', 'hotmail.com', 'hotmail.fr',
        'outlook.es', 'live.fr', 'msn.com', 'outlook.com', 'hotmail.es',
        'live.com.mx',
    ],
    'yahoo': [
        'yahoo.com.mx', 'yahoo.fr', 'yahoo.es', 'yahoo.co.jp', 'yahoo.de',
        'verizon.net', 'frontier.com', 'rocketmail.com', 'frontiernet.net',
        'ymail.com', 'yahoo.co.uk', 'yahoo.com',
    ],
    'aol': ['aim.com', 'aol.com'],
    'centurylink': ['centurylink.net', 'q.com', 'embarqmail.com'],
    'apple': ['me.com', 'mac.com', 'icloud.com'],
    'anonymous': ['anonymous.com', 'mail.com'],
    'other': [
        'scranton.edu', 'optonline.net', 'comcast.net', 'earthlink.net',
        'gmx.de', 'web.de', 'cfl.rr.com', 'protonmail.com', 'windstream.net',
        'servicios-ta.com', 'netzero.net', 'suddenlink.net', 'roadrunner.com',
        'sc.rr.com', 'bellsouth.net', 'cableone.net', 'netzero.com', 'ptd.net',
        'cox.net', 'juno.com',
    ],
}

# Flat lookup: raw e-mail domain -> provider group.
emails = {
    domain: provider
    for provider, domains in domains_by_provider.items()
    for domain in domains
}

# Domain suffixes that get collapsed into the single label 'us'.
us_emails = ['gmail', 'net', 'edu']

# For each e-mail domain column, derive a provider-group column and a
# last-label ("suffix") column.
for c in ('P_emaildomain', 'R_emaildomain'):
    raw_domain = data[c]

    # Domains missing from `emails` (including NaN) map to NaN here.
    data[c + '_bin'] = raw_domain.map(emails)

    # str() turns a missing value into the text 'nan', so the suffix column
    # never contains NaN. Suffixes listed in `us_emails` collapse to 'us'.
    last_label = raw_domain.map(lambda value: str(value).rsplit('.', 1)[-1])
    data[c + '_suffix'] = last_label.map(
        lambda label: 'us' if str(label) in us_emails else label
    )

In [5]:
p = 'P_emaildomain'
r = 'R_emaildomain'
# Sentinel written in place of a missing e-mail domain.
uknown = 'email_not_provided'

def setDomain(df):
    """Fill missing e-mail domains and add match/prefix columns.

    Works on `df` in place and also returns it. Reads the module-level
    names `p`, `r` and `uknown`.

    Columns written:
        p, r            -- missing values replaced by `uknown`
        'email_check'   -- 1 where both domains are equal and provided, else 0
        p + '_prefix',
        r + '_prefix'   -- text before the first '.' of each domain

    Args:
        df: DataFrame holding the columns named by `p` and `r`.

    Returns:
        The same DataFrame object that was passed in.
    """
    for domain_col in (p, r):
        df[domain_col] = df[domain_col].fillna(uknown)

    # A pair of "not provided" values is equal but must not count as a match.
    provided = df[p] != uknown
    identical = df[p] == df[r]
    df['email_check'] = np.where(identical & provided, 1, 0)

    for domain_col in (p, r):
        df[domain_col + '_prefix'] = df[domain_col].map(
            lambda domain: domain.partition('.')[0]
        )

    return df
    
data = setDomain(data)

# setDomain() has replaced every missing domain with the `uknown` sentinel,
# so "was missing" is recovered by comparing against that sentinel. The
# constant is used instead of repeating the literal so the flag cannot drift
# from the fill value, and the comparison is vectorised instead of a
# per-row lambda. The result is an int64 0/1 column, as before.
for domain_col in (p, r):
    data[f'{domain_col}__isnull'] = (data[domain_col] == uknown).astype('int64')

In [6]:
#========================================================================
# FE Categorical Encoding 
#========================================================================
# Each categorical column is replaced by encoded columns; which encoders run
# depends on the number of distinct non-missing values (`num`):
#   num > 15      -> cnt + label
#   2 < num <= 15 -> cnt + label + dummie
#   num <= 2      -> label only
# NOTE(review): the encoders come from func.ml_utils; presumably count,
# label and one-hot encoding respectively -- confirm there.

cols_categorical = get_categorical_features(data, ignore_list=COLUMNS_IGNORE)
df_cat = data[cols_categorical].copy()
for col in tqdm(cols_categorical):
    # Cardinality is taken before NaN is filled, so '#' is not counted.
    num = df_cat[col].nunique()

    # Assign the filled column back explicitly: an in-place fillna on a
    # column selection does not update the frame under pandas Copy-on-Write.
    df_cat[col] = df_cat[col].fillna('#')

    cols_cat = [col]
    if num > 2:
        # A fresh single-column frame is passed to every encoder call.
        features = [
            get_cnt_feature(df_cat[col].to_frame(), cols_cat),
            get_label_feature(df_cat[col].to_frame(), cols_cat),
        ]
        if num <= 15:
            features.append(get_dummie_feature(df_cat[col].to_frame(), cols_cat))
    else:
        features = [get_label_feature(df_cat[col].to_frame(), cols_cat)]

    for feature in features:
        df_cat = df_cat.join(feature)

    # Only the encoded columns are kept; the raw column is dropped.
    df_cat = df_cat.drop(columns=col)

100%|██████████| 8/8 [00:13<00:00,  1.45s/it]


In [7]:
# Attach the encoded categorical columns to the main frame (index-aligned).
data_cat = data.join(df_cat, how='left')

# Keep every column other than the two raw domain columns whose name
# contains 'domain' together with at least one of these tags.
feature_tags = ('cnt', 'dummie', 'label', 'null')
cols_feature = [
    col
    for col in data_cat.columns
    if col not in (p, r)
    and 'domain' in col
    and any(tag in col for tag in feature_tags)
]

In [8]:
#========================================================================
# Save
#========================================================================
prefix = '516'
dir_save = 'check_trush'

# `data_cat` has the rows of `data`, which was built by stacking the train
# frame on top of the test frame. `train_length` is the row count of that
# train frame, so it is the boundary to split on; the size of the separately
# loaded `base_train` table is not tied to `data`.
train = data_cat.iloc[:train_length]
test = data_cat.iloc[train_length:]

save_feature(train[cols_feature], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
save_feature(test[cols_feature], prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | P_emaildomain__isnull
(590540,) | R_emaildomain__isnull
(590540,) | cnt__P_emaildomain
(590540,) | label__P_emaildomain
(590540,) | cnt__R_emaildomain
(590540,) | label__R_emaildomain
(590540,) | cnt__P_emaildomain_bin
(590540,) | label__P_emaildomain_bin
(590540,) | P_emaildomain_bin_#_dummie
(590540,) | P_emaildomain_bin_anonymous_dummie
(590540,) | P_emaildomain_bin_aol_dummie
(590540,) | P_emaildomain_bin_apple_dummie
(590540,) | P_emaildomain_bin_att_dummie
(590540,) | P_emaildomain_bin_centurylink_dummie
(590540,) | P_emaildomain_bin_google_dummie
(590540,) | P_emaildomain_bin_microsoft_dummie
(590540,) | P_emaildomain_bin_other_dummie
(590540,) | P_emaildomain_bin_spectrum_dummie
(590540,) | P_emaildomain_bin_yahoo_dummie
(590540,) | cnt__P_emaildomain_suffix
(590540,) | label__P_emaildomain_suffix
(590540,) | P_emaildomain_suffix_com_dummie
(590540,) | P_emaildomain_suffix_nan_dummie
(590540,) | P_emaildomain_suffix_uk_dummie
(590540,) | P_emaildomain_suffix_us_dumm