In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days, date_add_times
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
# Column roles shared by every cell below.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one wanted token
# and none of the skipped tokens (substring match on the whole path).
_wanted_tokens = (
    COLUMN_DT, COLUMN_ID, COLUMN_TARGET,
    'time_zone', 'hour', 'Product', 'card', 'addr', 'domain',
    'C', 'V', 'M', 'Amt',
)
_skipped_tokens = ('bin_', 'fill_')

train_paths = [
    path for path in glob('../feature/eda_base/*_train.gz')
    if any(token in path for token in _wanted_tokens)
    and not any(token in path for token in _skipped_tokens)
]
test_paths = [
    path for path in glob('../feature/eda_base/*_test.gz')
    if any(token in path for token in _wanted_tokens)
    and not any(token in path for token in _skipped_tokens)
]

# Load both splits, remember where the train rows end, then stack them into a
# single frame (train rows first) so later cells can transform both at once.
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
train_length = df_train.shape[0]
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# The per-split frames are no longer needed once `data` exists.
del df_train, df_test
gc.collect()

21

In [7]:
# Columns eligible for processing. Computed before any derived column is added,
# so the engineered columns created below are not part of `use_cols`.
use_cols = [col for col in data.columns if col not in COLUMNS_IGNORE]

# Fractional part of the transaction amount, rounded to 2 decimals.
data['cents'] = np.round(data['TransactionAmt'] - np.floor(data['TransactionAmt']), 2)

# Email-domain prefix: the text before the first dot; '#' marks a missing domain.
# Each prefix is derived from its OWN source column (the previous version built
# the second column's prefix from the first column by mistake).
list_domain = [col for col in data.columns if col.count('domain')]
for domain_col in list_domain[:2]:
    # Assign the result back instead of a chained `inplace=True` fill, which
    # does not reliably write through to `data` in recent pandas versions.
    data[domain_col] = data[domain_col].fillna('#')
    data[domain_col + '_prefix'] = data[domain_col].apply(lambda x: x.split('.')[0])


# Lookup table from an email domain to a coarser provider group.
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other',
          'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
          'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft',
          'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 
          'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other',
          'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo',
          'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo',
          'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo',
          'anonymous.com': 'anonymous', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo',
          'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'anonymous', 'bellsouth.net': 'other',
          'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple',
          'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other',
          'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
# Suffixes that are collapsed into the single label 'us'.
us_emails = ['gmail', 'net', 'edu']

for c in ['P_emaildomain', 'R_emaildomain']:
    # Provider group; domains absent from `emails` (including '#') become NaN.
    data[c + '_bin'] = data[c].map(emails)
    # Text after the last dot, with the `us_emails` suffixes merged into 'us'.
    data[c + '_suffix'] = data[c].map(lambda x: str(x).split('.')[-1])
    data[c + '_suffix'] = data[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')


# Column groups consumed by the cells below. All but `cols_domain` are drawn
# from `use_cols`; `cols_domain` lists the derived email columns created above.
cols_card   = sorted([col for col in use_cols if col.count('card')])
cols_addr   = sorted([col for col in use_cols if col.count('addr')])
cols_domain = [col for col in data.columns if col.count('prefix') or col.count('_suffix') or col.count('_bin')]
cols_C      = sorted([col for col in use_cols if col.count('C') and not col.count('Prod')])
cols_V      = sorted([col for col in use_cols if col.count('V')])
cols_M      = sorted([col for col in use_cols if col.count('M')])
cols_time = ['time_zone']

In [4]:
#========================================================================
# Card: fill missing values in the card columns and save each column
#========================================================================
is_viz = [True, False][1]   # index 0 -> inspect only, index 1 -> fill and save
prefix = 'fill'
dir_save = 'eda_base'
# Full option key: the abbreviated 'max_rows' can match several options.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of missing values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


for col in cols_card:

    if is_viz:
        # Row count and mean target per card value. A list of functions plus a
        # rename replaces the dict-renaming form of `agg`, which newer pandas
        # rejects.
        agg = (
            data.groupby(col)[COLUMN_TARGET]
            .agg(['count', 'mean'])
            .rename(columns={'count': 'cnt'})
        )
        print(col, agg.shape)

        # For every group size: max and mean of the per-value target mean.
        agg2 = agg.groupby('cnt')['mean'].agg(['max', 'mean'])
        display(agg2)
        # Inspection mode looks at the first column only; `break` ends the loop
        # without raising SystemExit inside the notebook.
        break

    data_type = str(data[col].dtype)

    if cnt_null(data, col) != 0:
        if data_type.count('int') or data_type.count('float'):
            # -100 is the sentinel for a missing numeric value.
            # NOTE(review): int16 assumes every value fits in that range — confirm per column.
            data[col] = data[col].fillna(-100).astype('int16')
        else:
            data[col] = data[col].fillna('#Ca')

    # Split back into the train / test row ranges and hand each one-column
    # frame to the project helper `save_feature`.
    df_train = data.iloc[:train_length]
    df_test = data.iloc[train_length:]
    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

0
(590540,) | card1
(506691,) | card1
17587
(590540,) | card2
(506691,) | card2
4567
(590540,) | card3
(506691,) | card3
4663
(590540,) | card4
(506691,) | card4
8806
(590540,) | card5
(506691,) | card5
4578
(590540,) | card6
(506691,) | card6


In [9]:
#========================================================================
# Count encoding for the card, addr and email-domain columns
#========================================================================
is_viz = [True, False][1]   # index 0 -> inspect only, index 1 -> fill and save
prefix = 'fill'
dir_save = 'eda_base'
# Full option key: the abbreviated 'max_rows' can match several options.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of missing values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


# Source columns, flattened so that one `break` stops the whole run.
# Alternative set used previously: [cols_V, cols_C, ['TransactionAmt', 'cents']]
count_source_cols = [
    source_col
    for cols_list in [cols_card, cols_addr, cols_domain]
    for source_col in cols_list
]

for tmp_col in count_source_cols:

    # Number of rows (train + test) sharing the same value; NaN is counted too.
    col = f'cnt_{tmp_col}'
    data[col] = data[tmp_col].map(data[tmp_col].value_counts(dropna=False))

    if is_viz:
        # Row count and mean target per count value. A list of functions plus a
        # rename replaces the dict-renaming form of `agg`, which newer pandas
        # rejects.
        agg = (
            data.groupby(col)[COLUMN_TARGET]
            .agg(['count', 'mean'])
            .rename(columns={'count': 'cnt'})
        )
        print(col, agg.shape)

        agg2 = agg.groupby('cnt')['mean'].agg(['max', 'mean'])
        display(agg2)
        # Inspection mode looks at the first column only; `break` ends the loop
        # without raising SystemExit inside the notebook.
        break

    data_type = str(data[col].dtype)

    if cnt_null(data, col) != 0:
        if data_type.count('int') or data_type.count('float'):
            # -100 marks a value whose count could not be looked up. int32 is
            # used because a count can be as large as the number of rows,
            # which does not fit into int16.
            data[col] = data[col].fillna(-100).astype('int32')
        else:
            data[col] = data[col].fillna('#cntCa')

    # Split back into the train / test row ranges and hand each one-column
    # frame to the project helper `save_feature`.
    df_train = data.iloc[:train_length]
    df_test = data.iloc[train_length:]
    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

0
(590540,) | cnt_card1
(506691,) | cnt_card1
0
(590540,) | cnt_card2
(506691,) | cnt_card2
0
(590540,) | cnt_card3
(506691,) | cnt_card3
0
(590540,) | cnt_card4
(506691,) | cnt_card4
0
(590540,) | cnt_card5
(506691,) | cnt_card5
0
(590540,) | cnt_card6
(506691,) | cnt_card6
0
(590540,) | cnt_addr1
(506691,) | cnt_addr1
0
(590540,) | cnt_addr2
(506691,) | cnt_addr2


In [6]:
#========================================================================
# Addr: fill missing values in the address columns and save each column
#========================================================================
is_viz = [True, False][1]   # index 0 -> inspect only, index 1 -> fill and save
prefix = 'fill'
dir_save = 'eda_base'
# Full option key: the abbreviated 'max_rows' can match several options.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of missing values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


for col in cols_addr:

    if is_viz:
        # Show the value distribution and move on without modifying `data`.
        counts = data[col].value_counts()
        print(col, counts.shape)
        display(counts)
        continue

    data_type = str(data[col].dtype)

    if cnt_null(data, col) != 0:
        if data_type.count('int') or data_type.count('float'):
            # -100 is the sentinel for a missing numeric value. The result is
            # assigned back rather than filled through a chained `inplace=True`.
            # NOTE(review): int16 assumes every value fits in that range — confirm per column.
            data[col] = data[col].fillna(-100).astype('int16')
        else:
            data[col] = data[col].fillna('#')

    # Split back into the train / test row ranges and hand each one-column
    # frame to the project helper `save_feature`.
    df_train = data.iloc[:train_length]
    df_test = data.iloc[train_length:]
    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

131315
(590540,) | addr1
(506691,) | addr1
131315
(590540,) | addr2
(506691,) | addr2


In [7]:
#========================================================================
# Domain: fill missing values in the derived email columns and save them
#========================================================================
is_viz = [True, False][1]   # index 0 -> inspect only, index 1 -> fill and save
prefix = 'fill'
dir_save = 'eda_base'
# Full option key: the abbreviated 'max_rows' can match several options.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of missing values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


for col in cols_domain:

    if is_viz:
        # Show the value distribution and move on without modifying `data`.
        counts = data[col].value_counts()
        print(col, counts.shape)
        display(counts)
        continue

    data_type = str(data[col].dtype)

    if cnt_null(data, col) != 0:
        if data_type.count('int') or data_type.count('float'):
            # -100 is the sentinel for a missing numeric value. The result is
            # assigned back rather than filled through a chained `inplace=True`.
            data[col] = data[col].fillna(-100).astype('float32')
        else:
            # '#' is the label for a missing non-numeric value.
            data[col] = data[col].fillna('#')

    # Split back into the train / test row ranges and hand each one-column
    # frame to the project helper `save_feature`.
    df_train = data.iloc[:train_length]
    df_test = data.iloc[train_length:]
    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

0
(590540,) | P_emaildomain_prefix
(506691,) | P_emaildomain_prefix
0
(590540,) | R_emaildomain_prefix
(506691,) | R_emaildomain_prefix
163648
(590540,) | P_emaildomain_bin
(506691,) | P_emaildomain_bin
0
(590540,) | P_emaildomain_suffix
(506691,) | P_emaildomain_suffix
824070
(590540,) | R_emaildomain_bin
(506691,) | R_emaildomain_bin
0
(590540,) | R_emaildomain_suffix
(506691,) | R_emaildomain_suffix


In [23]:
#========================================================================
# C: quantile-bin the C columns and save each column
#========================================================================
is_viz = [True, False][1]   # index 0 -> inspect only, index 1 -> bin and save
prefix = 'bin'
dir_save = 'eda_base'
# Full option key: the abbreviated 'max_rows' can match several options.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of missing values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


for col in cols_C:

    if is_viz:
        # Show the value distribution and move on without modifying `data`.
        counts = data[col].value_counts()
        print(col, counts.shape)
        display(counts)
        continue

    data_type = str(data[col].dtype)

    # Only numeric columns are binned; a column that is already binned (or is
    # otherwise non-numeric) is saved as it is.
    if data_type.count('int') or data_type.count('float'):
        # -99999 is the sentinel for a missing value. The result is assigned
        # back rather than filled through a chained `inplace=True`.
        filled = data[col].fillna(-99999).astype('float32')
        # Up to 100 quantile bins; duplicate bin edges are merged.
        data[col] = pd.qcut(filled, q=100, duplicates='drop')

    # Split back into the train / test row ranges and hand each one-column
    # frame to the project helper `save_feature`.
    df_train = data.iloc[:train_length]
    df_test = data.iloc[train_length:]
    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

(590540,) | C1
(506691,) | C1
(590540,) | C10
(506691,) | C10
(590540,) | C11
(506691,) | C11
(590540,) | C12
(506691,) | C12
(590540,) | C13
(506691,) | C13
(590540,) | C14
(506691,) | C14
(590540,) | C2
(506691,) | C2
(590540,) | C3
(506691,) | C3
(590540,) | C4
(506691,) | C4
(590540,) | C5
(506691,) | C5
(590540,) | C6
(506691,) | C6
(590540,) | C7
(506691,) | C7
(590540,) | C8
(506691,) | C8
(590540,) | C9
(506691,) | C9


In [25]:
#========================================================================
# V: quantile-bin the V columns and save each column
#========================================================================
is_viz = [True, False][0]   # index 0 -> inspect only, index 1 -> bin and save
prefix = 'bin'
dir_save = 'eda_base'
# Full option key: the abbreviated 'max_rows' can match several options.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of missing values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


for col in cols_V:

    if is_viz:
        # Compare the range over all rows with the range over the test rows.
        # The test rows are sliced from `data` here so the cell does not depend
        # on a `df_test` left behind by another cell.
        test_values = data[col].iloc[train_length:]
        print(col, data[col].min(), test_values.min())
        print(col, data[col].max(), test_values.max())
        continue

    # -99999 is the sentinel for a missing value. The result is assigned back
    # rather than filled through a chained `inplace=True`.
    # NOTE(review): the int32 cast drops any fractional part before binning,
    # unlike the float32 cast used for the C columns — confirm this is intended.
    filled = data[col].fillna(-99999).astype('int32')
    # Up to 100 quantile bins; duplicate bin edges are merged.
    data[col] = pd.qcut(filled, q=100, duplicates='drop')

    # Split back into the train / test row ranges and hand each one-column
    # frame to the project helper `save_feature`.
    df_train = data.iloc[:train_length]
    df_test = data.iloc[train_length:]
    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

V127 0.0 0.0
V127 544500.0 544500.0
V128 0.0 0.0
V128 519038.5 519038.5
V130 0.0 0.0
V130 167200.0 167200.0
V131 0.0 0.0
V131 167200.0 167200.0
V133 0.0 0.0
V133 519038.5 519038.5
V156 0.0 0.0
V156 24.0 11.0
V165 0.0 0.0
V165 928882.0 928882.0
V187 0.0 0.0
V187 218.0 158.0
V2 0.0 0.0
V2 11.0 11.0
V201 0.0 0.0
V201 55.0 47.0
V243 0.0 0.0
V243 57.0 54.0
V258 0.0 0.0
V258 269.0 269.0
V259 0.0 0.0
V259 285.0 131.0
V265 0.0 0.0
V265 1065496.5 1065496.5
V267 0.0 0.0
V267 64800.0 64800.0
V281 0.0 0.0
V281 30.0 30.0
V282 0.0 0.0
V282 63.0 63.0
V283 0.0 0.0
V283 68.0 68.0
V29 0.0 0.0
V29 5.0 4.0
V294 0.0 0.0
V294 1286.0 246.0
V3 0.0 0.0
V3 11.0 11.0
V306 0.0 0.0
V306 718740.0 718740.0
V307 0.0 0.0
V307 958320.0 958320.0
V308 0.0 0.0
V308 718740.0 718740.0
V310 0.0 0.0
V310 167200.0 167200.0
V312 0.0 0.0
V312 167200.0 167200.0
V313 0.0 0.0
V313 4817.47021484375 4727.9599609375
V314 0.0 0.0
V314 7539.75 7539.75
V315 0.0 0.0
V315 4817.47021484375 4727.9599609375
V317 0.0 0.0
V317 958320.0 958320.0

In [30]:
#========================================================================
# M: fill missing values in the M columns and save each column
#========================================================================
is_viz = [True, False][1]   # index 0 -> inspect only, index 1 -> fill and save
prefix = 'fill'
dir_save = 'eda_base'
# Full option key: the abbreviated 'max_rows' can match several options.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of missing values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


for col in cols_M:

    if is_viz:
        # Show the value distribution and move on without modifying `data`.
        counts = data[col].value_counts()
        print(col, counts.shape)
        display(counts)
        continue

    # '#' is the label for a missing value. The result is assigned back rather
    # than filled through a chained `inplace=True`.
    data[col] = data[col].fillna('#')

    # Split back into the train / test row ranges and hand each one-column
    # frame to the project helper `save_feature`.
    df_train = data.iloc[:train_length]
    df_test = data.iloc[train_length:]
    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

In [36]:
#========================================================================
# Amt: quantile-bin the transaction amount (in cents) and save it
#========================================================================
is_viz = [True, False][1]   # index 0 -> inspect only, index 1 -> bin and save
prefix = 'bin'
dir_save = 'eda_base'
# Full option key: the abbreviated 'max_rows' can match several options.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of missing values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


cols_Amt = ['TransactionAmt']

for col in cols_Amt:

    # Amount scaled by 100, held in a local series. The raw column in `data`
    # is deliberately left untouched: overwriting it made every re-run scale
    # (or fail on) already-processed values and broke later cells that expect
    # the raw amount.
    amt_cents = (data[col] * 100).astype('float32')

    if is_viz:
        # Range over all rows vs. the test rows, then the value distribution.
        print(col, amt_cents.min(), amt_cents.iloc[train_length:].min())
        counts = amt_cents.value_counts()
        print(col, counts.shape)
        display(counts)
        continue

    # -100 is the sentinel for a missing value; up to 100 quantile bins with
    # duplicate edges merged. The frame keeps the original column name so the
    # saved feature is named as before.
    binned = pd.qcut(amt_cents.fillna(-100), q=100, duplicates='drop').to_frame(name=col)

    # Still refreshed because other cells read these names.
    df_train = data.iloc[:train_length]
    df_test = data.iloc[train_length:]
    save_feature(binned.iloc[:train_length], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(binned.iloc[train_length:], prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

(590540,) | TransactionAmt
(506691,) | TransactionAmt


In [3]:
#========================================================================
# Amt_Dev: remainders of the transaction amount (in cents)
#========================================================================
is_viz = [True, False][1]   # index 0 -> inspect only, index 1 -> save
prefix = 'bin'
dir_save = 'eda_base'
# Full option key: the abbreviated 'max_rows' can match several options.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of missing values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


col = 'TransactionAmt'
# Amount in whole cents, held in a local series so the raw column in `data`
# stays untouched and the cell can be re-run.
#  * round() first: the float product can land just below the intended integer,
#    and a plain integer cast would truncate it to one cent less.
#  * int64 instead of int16: int16 cannot represent more than 32767 cents.
amt_cents = (data[col] * 100).round().astype('int64')

# Feature name -> modulus. The names say "DIV" but the value is a remainder.
amt_moduli = {
    'Amt_DIV200': 200,
    'Amt_DIV100': 100,
    'Amt_DIV50': 50,
    'Amt_DIV30': 30,
}
for feature_name, modulus in amt_moduli.items():
    data[feature_name] = amt_cents % modulus

cols_Amt_Dev = [
    'Amt_DIV200',
    'Amt_DIV100',
    'Amt_DIV50',
    'Amt_DIV30',
]

for col in cols_Amt_Dev:

    if is_viz:
        # Slice from `data` here: a `df_train` / `df_test` left over from
        # another cell would not contain the columns created above.
        train_values = data[col].iloc[:train_length]
        test_values = data[col].iloc[train_length:]
        print(col, train_values.min(), test_values.min())
        counts = train_values.value_counts()
        print(col, counts.shape)
        display(counts)
        continue

    # Kept from the original flow; assigned back rather than filled in place.
    data[col] = data[col].fillna(-100)

    # Split back into the train / test row ranges and hand each one-column
    # frame to the project helper `save_feature`.
    df_train = data.iloc[:train_length]
    df_test = data.iloc[train_length:]
    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

(590540,) | Amt_DIV200
(506691,) | Amt_DIV200
(590540,) | Amt_DIV100
(506691,) | Amt_DIV100
(590540,) | Amt_DIV50
(506691,) | Amt_DIV50
(590540,) | Amt_DIV30
(506691,) | Amt_DIV30


In [40]:
#========================================================================
# cents: save the fractional-amount feature
#========================================================================
prefix = 'bin'
dir_save = 'eda_base'
col = 'cents'
# Slice the train / test row ranges from `data` here instead of relying on
# whatever `df_train` / `df_test` the most recently executed cell left behind.
df_train = data.iloc[:train_length]
df_test = data.iloc[train_length:]
save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

(590540,) | cents
(506691,) | cents
