In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days, date_add_times
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [15]:
# Key columns of the transaction table.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
# Columns that are never treated as features.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one of these
# substrings.  Every family used by the cells below is enabled so that the
# notebook runs top to bottom on a fresh kernel; remove entries here (in this
# single place) to load less data while experimenting.
FEATURE_KEYWORDS = [
    COLUMN_DT,
    COLUMN_ID,
    COLUMN_TARGET,
    'time_zone',
    'hour',
    'Product',
    'card',
    'addr',
    'domain',
    'C',
    'V',
    'M',
    'Amt',
]


def select_paths(paths, keywords=FEATURE_KEYWORDS):
    """Return the paths that contain at least one of ``keywords`` (substring match)."""
    return [path for path in paths if any(keyword in path for keyword in keywords)]


train_paths = select_paths(glob('../feature/eda_base/*_train.gz'))
test_paths = select_paths(glob('../feature/eda_base/*_test.gz'))

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)

In [3]:
# Feature columns available right after loading (before any derived column is added).
use_cols = [col for col in df_train.columns if col not in COLUMNS_IGNORE]

# Raw e-mail domain columns.  They are named explicitly so the result does not
# depend on column order and so re-running the cell does not pick up the derived
# '*_prefix' / '*_bin' / '*_suffix' columns (which also contain 'domain').
list_domain = ['P_emaildomain', 'R_emaildomain']

# Provider grouping for each known e-mail domain.
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other',
          'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
          'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft',
          'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 
          'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other',
          'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo',
          'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo',
          'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo',
          'anonymous.com': 'anonymous', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo',
          'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'anonymous', 'bellsouth.net': 'other',
          'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple',
          'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other',
          'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
# Suffixes that are collapsed into the single label 'us'.
us_emails = ['gmail', 'net', 'edu']

# Apply exactly the same transformations to train and test.
for frame in (df_train, df_test):
    # Fractional part of the amount, rounded to two decimals.
    frame['cents'] = np.round(frame['TransactionAmt'] - np.floor(frame['TransactionAmt']), 2)

    for c in list_domain:
        # Missing domains become the placeholder '#'.
        frame[c] = frame[c].fillna('#')
        # Text before the first dot.  Each prefix is taken from its OWN column
        # (previously both prefixes were computed from the first domain column).
        frame[c + '_prefix'] = frame[c].apply(lambda x: x.split('.')[0])
        # Provider group; domains absent from `emails` (including '#') become NaN.
        frame[c + '_bin'] = frame[c].map(emails)
        # Text after the last dot, with the entries of `us_emails` merged into 'us'.
        frame[c + '_suffix'] = frame[c].map(lambda x: str(x).split('.')[-1])
        frame[c + '_suffix'] = frame[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')


# Column groups consumed by the cells below.
cols_card   = sorted([col for col in use_cols if col.count('card')])
cols_addr   = sorted([col for col in use_cols if col.count('addr')])
cols_domain = [col for col in df_train.columns if col.count('prefix') or col.count('_suffix') or col.count('_bin')]
cols_C      = sorted([col for col in use_cols if col.count('C') and not col.count('Prod')])
cols_V      = sorted([col for col in use_cols if col.count('V')])
cols_M      = sorted([col for col in use_cols if col.count('M')])
cols_time = ['time_zone']

In [4]:
#========================================================================
# Card
#========================================================================
is_viz = False  # set to True to inspect target statistics instead of saving features
prefix = 'fill'
dir_save = 'eda_base'
# Fully-qualified key: the bare 'max_rows' pattern matches several options on recent pandas.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of null values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


for col in cols_card:

    if is_viz:
        # Per category value: number of rows ('cnt') and mean of the target ('mean').
        agg = (df_train.groupby(col)[COLUMN_TARGET]
               .agg(['count', 'mean'])
               .rename(columns={'count': 'cnt'}))
        print(col, agg.shape)

        # For each row count: max and mean of the per-value target means.
        agg2 = agg.groupby('cnt')['mean'].agg(['max', 'mean'])
        display(agg2)
        continue

    for frame in (df_train, df_test):
        if cnt_null(frame, col) == 0:
            continue
        # Decide on the actual dtype of this frame's column: numeric columns get
        # the sentinel -100, everything else the placeholder '#'.
        if pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = frame[col].fillna(-100).astype('float32')
        else:
            frame[col] = frame[col].fillna('#')

    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

0
0
(590540,) | card1
(506691,) | card1
8933
8654
(590540,) | card2
(506691,) | card2
1565
3002
(590540,) | card3
(506691,) | card3
1577
3086
(590540,) | card4
(506691,) | card4
4259
4547
(590540,) | card5
(506691,) | card5
1571
3007
(590540,) | card6
(506691,) | card6


In [5]:
#========================================================================
# Cnt Card
#========================================================================
is_viz = False  # set to True to inspect target statistics instead of saving features
prefix = 'fill'
dir_save = 'eda_base'
# Fully-qualified key: the bare 'max_rows' pattern matches several options on recent pandas.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of null values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


for cols_list in [cols_card, cols_addr]:

    for tmp_col in cols_list:

        # Frequency encoding: each value is replaced by the number of times it
        # occurs in its own frame (train and test are counted separately).
        # Missing values have no count and stay NaN.
        col = f'cnt_{tmp_col}'
        df_train[col] = df_train[tmp_col].map(df_train[tmp_col].value_counts())
        df_test[col] = df_test[tmp_col].map(df_test[tmp_col].value_counts())

        if is_viz:
            # Per count value: number of rows ('cnt') and mean of the target ('mean').
            agg = (df_train.groupby(col)[COLUMN_TARGET]
                   .agg(['count', 'mean'])
                   .rename(columns={'count': 'cnt'}))
            print(col, agg.shape)

            # For each row count: max and mean of the per-value target means.
            agg2 = agg.groupby('cnt')['mean'].agg(['max', 'mean'])
            display(agg2)
            continue

        for frame in (df_train, df_test):
            if cnt_null(frame, col) == 0:
                continue
            # Counts are always numeric.  int32 is used because a count can be as
            # large as the number of rows, which does not fit in int16.
            frame[col] = frame[col].fillna(-100).astype('int32')

        save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
        save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

0
0
(590540,) | cnt_card1
(506691,) | cnt_card1
0
0
(590540,) | cnt_card2
(506691,) | cnt_card2
0
0
(590540,) | cnt_card3
(506691,) | cnt_card3
0
0
(590540,) | cnt_card4
(506691,) | cnt_card4
0
0
(590540,) | cnt_card5
(506691,) | cnt_card5
0
0
(590540,) | cnt_card6
(506691,) | cnt_card6
65706
65609
(590540,) | cnt_addr1
(506691,) | cnt_addr1
65706
65609
(590540,) | cnt_addr2
(506691,) | cnt_addr2


In [6]:
#========================================================================
# Addr
#========================================================================
is_viz = False  # set to True to inspect value counts instead of saving features
prefix = 'fill'
dir_save = 'eda_base'
# Fully-qualified key: the bare 'max_rows' pattern matches several options on recent pandas.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of null values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


for col in cols_addr:

    if is_viz:
        counts = df_train[col].value_counts()
        print(col, counts.shape)
        display(counts)
        continue

    for frame in (df_train, df_test):
        if cnt_null(frame, col) == 0:
            continue
        # Decide on the actual dtype of this frame's column: numeric columns get
        # the sentinel -100, everything else the placeholder '#'.
        if pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = frame[col].fillna(-100).astype('float32')
        else:
            frame[col] = frame[col].fillna('#')

    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

65706
65609
(590540,) | addr1
(506691,) | addr1
65706
65609
(590540,) | addr2
(506691,) | addr2


In [8]:
#========================================================================
# domain
#========================================================================
is_viz = False  # set to True to inspect value counts instead of saving features
prefix = 'fill'
dir_save = 'eda_base'
# Fully-qualified key: the bare 'max_rows' pattern matches several options on recent pandas.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of null values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


for col in cols_domain:

    if is_viz:
        counts = df_train[col].value_counts()
        print(col, counts.shape)
        display(counts)
        continue

    for frame in (df_train, df_test):
        if cnt_null(frame, col) == 0:
            continue
        # Decide on the actual dtype of this frame's column: numeric columns get
        # the sentinel -100, everything else the placeholder '#'.
        if pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = frame[col].fillna(-100).astype('float32')
        else:
            frame[col] = frame[col].fillna('#')

    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

0
0
(590540,) | R_emaildomain_prefix
(506691,) | R_emaildomain_prefix
0
0
(590540,) | P_emaildomain_prefix
(506691,) | P_emaildomain_prefix
94456
69192
(590540,) | P_emaildomain_bin
(506691,) | P_emaildomain_bin
0
0
(590540,) | P_emaildomain_suffix
(506691,) | P_emaildomain_suffix
453249
370821
(590540,) | R_emaildomain_bin
(506691,) | R_emaildomain_bin
0
0
(590540,) | R_emaildomain_suffix
(506691,) | R_emaildomain_suffix


In [9]:
#========================================================================
# C
#========================================================================
is_viz = False  # set to True to inspect value counts instead of saving features
prefix = 'bin'
dir_save = 'eda_base'
# Fully-qualified key: the bare 'max_rows' pattern matches several options on recent pandas.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of null values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


# Lower edges of the coarse buckets applied to values >= 20.
COUNT_BIN_EDGES = [20, 30, 40, 50, 80, 100, 200, 400, 800, 1600, 3200]


def bin_count(x):
    """Keep values below 20 unchanged; map larger values to the lower edge of their bucket."""
    if x < COUNT_BIN_EDGES[0]:
        return x
    for edge in reversed(COUNT_BIN_EDGES):
        if x >= edge:
            return edge
    # Only reachable for values that compare False with everything (NaN).
    return COUNT_BIN_EDGES[-1]


for col in cols_C:

    if is_viz:
        counts = df_train[col].value_counts()
        print(col, df_train[col].min(), df_test[col].min())
        print(col, counts.shape)
        display(counts)
        continue

    # Same treatment for train and test: fill, cast, then bin.  The cast comes
    # after the fill because NaN cannot be converted to an integer.
    for frame in (df_train, df_test):
        if pd.api.types.is_numeric_dtype(frame[col]):
            # NOTE(review): int16 truncates fractional values and cannot hold
            # inf - confirm this is acceptable for the derived ratio columns.
            frame[col] = frame[col].fillna(-100).astype('int16').map(bin_count)
        else:
            frame[col] = frame[col].fillna('#')

    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

(590540,) | 507__C1-C14__diff
(506691,) | 507__C1-C14__diff
(590540,) | 507__C1-C14__ratio
(506691,) | 507__C1-C14__ratio
(590540,) | C1
(506691,) | C1
(590540,) | C10
(506691,) | C10
(590540,) | C11
(506691,) | C11
(590540,) | C12
(506691,) | C12
(590540,) | C13
(506691,) | C13
(590540,) | C14
(506691,) | C14
(590540,) | C2
(506691,) | C2
(590540,) | C3
(506691,) | C3
(590540,) | C4
(506691,) | C4
(590540,) | C5
(506691,) | C5
(590540,) | C6
(506691,) | C6
(590540,) | C7
(506691,) | C7
(590540,) | C8
(506691,) | C8
(590540,) | C9
(506691,) | C9


In [21]:
#========================================================================
# V
#========================================================================
is_viz = False  # set to True to inspect value counts instead of saving features
prefix = 'bin'
dir_save = 'eda_base'
# Fully-qualified key: the bare 'max_rows' pattern matches several options on recent pandas.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of null values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


# Subset of V columns processed by this cell (overrides any earlier cols_V).
cols_V      = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V294']
def relax_data(df_train, df_test, col):
    """Collapse values of ``col`` whose frequency is rare or differs between train and test.

    A value is collapsed to 0 when
      * it occurs fewer than ``len(df_train) / 10000`` times in train, or
      * its train count, rescaled by ``len(df_test) / len(df_train)``, is less
        than a third of, or more than three times, its test count.
    The surviving values (together with 0) are then re-encoded as consecutive
    integer codes in sorted order.  Missing values are left as NaN.

    Both frames are modified in place (``col`` is overwritten) and returned.
    """
    n_train, n_test = len(df_train), len(df_test)

    # One row per distinct value, aligned on the value itself; a value that is
    # absent from one of the frames gets a count of 0 there.  Aligning on the
    # index avoids depending on the column names produced by
    # value_counts().reset_index(), which differ between pandas versions.
    counts = pd.concat(
        [
            df_train[col].value_counts().rename('train'),
            df_test[col].value_counts().rename('test'),
        ],
        axis=1,
        sort=False,
    ).fillna(0)

    factor = n_test / n_train
    scaled_train = factor * counts['train']
    remove = (
        (counts['train'] < n_train / 10000)
        | (scaled_train < counts['test'] / 3)
        | (scaled_train > 3 * counts['test'])
    )

    # Keep the original value where it survives, otherwise collapse to 0.
    kept = counts.index.to_series().where(~remove, 0)
    codes, _ = pd.factorize(kept, sort=True)
    mapping = dict(zip(counts.index, codes))

    df_train[col] = df_train[col].map(mapping)
    df_test[col] = df_test[col].map(mapping)
    return df_train, df_test

# Collapse rare / drifting values of V294 before it is binned with the other columns.
df_train, df_test = relax_data(df_train, df_test, 'V294')

# Lower edges of the coarse buckets applied to values >= 20.
COUNT_BIN_EDGES = [20, 30, 40, 50, 80, 100, 200, 400, 800, 1600, 3200]


def bin_count(x):
    """Keep values below 20 unchanged; map larger values to the lower edge of their bucket."""
    if x < COUNT_BIN_EDGES[0]:
        return x
    for edge in reversed(COUNT_BIN_EDGES):
        if x >= edge:
            return edge
    # Only reachable for values that compare False with everything (NaN).
    return COUNT_BIN_EDGES[-1]


for col in cols_V:

    if is_viz:
        print(col, df_train[col].min(), df_test[col].min())
        print(col, df_train[col].value_counts().shape)
        continue

    # Same treatment for train and test: fill, cast, then bin.  The cast comes
    # after the fill because NaN cannot be converted to an integer.
    for frame in (df_train, df_test):
        if pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = frame[col].fillna(-100).astype('int16').map(bin_count)
        else:
            frame[col] = frame[col].fillna('#')

    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

(590540,) | V1
(506691,) | V1
(590540,) | V2
(506691,) | V2
(590540,) | V3
(506691,) | V3
(590540,) | V4
(506691,) | V4
(590540,) | V5
(506691,) | V5
(590540,) | V6
(506691,) | V6
(590540,) | V7
(506691,) | V7


In [11]:
#========================================================================
# M
#========================================================================
is_viz = False  # set to True to inspect value counts instead of saving features
prefix = 'fill'
dir_save = 'eda_base'
# Fully-qualified key: the bare 'max_rows' pattern matches several options on recent pandas.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of null values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


for col in cols_M:

    if is_viz:
        counts = df_train[col].value_counts()
        print(col, counts.shape)
        display(counts)
        continue

    for frame in (df_train, df_test):
        if cnt_null(frame, col) == 0:
            continue
        # Decide on the actual dtype of this frame's column: numeric columns get
        # the sentinel -100, everything else the placeholder '#'.
        if pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = frame[col].fillna(-100).astype('float32')
        else:
            frame[col] = frame[col].fillna('#')

    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

271100
176639
(590540,) | M1
(506691,) | M1
271100
176639
(590540,) | M2
(506691,) | M2
271100
176639
(590540,) | M3
(506691,) | M3
281444
237745
(590540,) | M4
(506691,) | M4
350482
309632
(590540,) | M5
(506691,) | M5
169360
158939
(590540,) | M6
(506691,) | M6
346265
235018
(590540,) | M7
(506691,) | M7
346252
235004
(590540,) | M8
(506691,) | M8
346252
235004
(590540,) | M9
(506691,) | M9


In [12]:
#========================================================================
# Amt
#========================================================================
is_viz = False  # set to True to inspect value counts instead of saving features
prefix = 'bin'
dir_save = 'eda_base'
# Fully-qualified key: the bare 'max_rows' pattern matches several options on recent pandas.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of null values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


# Lower edges (in cents) of the amount buckets.
AMT_BIN_EDGES = [0, 30000, 50000, 60000, 80000, 100000, 150000, 200000, 300000,
                 500000, 800000, 1000000, 1500000, 2000000]


def bin_amount(x):
    """Keep negative values unchanged; map the rest to the lower edge of their bucket."""
    if x < AMT_BIN_EDGES[0]:
        return x
    for edge in reversed(AMT_BIN_EDGES):
        if x >= edge:
            return edge
    # Only reachable for values that compare False with everything (NaN).
    return AMT_BIN_EDGES[-1]


cols_Amt = ['TransactionAmt']

for col in cols_Amt:

    # Convert the amount to integer cents.
    #  * round before casting: a plain cast truncates (0.29 * 100 -> 28);
    #  * int32: amounts in cents exceed the int16 range (the buckets go up to 2,000,000);
    #  * only float columns are converted, so re-running the cell does not
    #    multiply by 100 a second time.
    for frame in (df_train, df_test):
        if pd.api.types.is_float_dtype(frame[col]):
            frame[col] = (frame[col] * 100).round().fillna(-100).astype('int32')

    if is_viz:
        counts = df_train[col].value_counts()
        print(col, df_train[col].min(), df_test[col].min())
        print(col, counts.shape)
        display(counts)
        continue

    # The bucketed values are written to a copy: df_train / df_test keep the
    # amount in cents, which the following cell takes the remainders of.
    for frame, is_train in ((df_train, True), (df_test, False)):
        binned = frame[[col]].copy()
        binned[col] = frame[col].map(bin_amount)
        save_feature(binned, prefix, dir_save, is_train=is_train, auto_type=False, list_ignore=COLUMNS_IGNORE)

(590540,) | TransactionAmt
(506691,) | TransactionAmt


In [13]:
#========================================================================
# Amt_Dev
#========================================================================
is_viz = False  # set to True to inspect value counts instead of saving features
prefix = 'bin'
dir_save = 'eda_base'
# Fully-qualified key: the bare 'max_rows' pattern matches several options on recent pandas.
pd.set_option('display.max_rows', 300)


def cnt_null(df, col):
    """Print and return the number of null values in ``df[col]``."""
    n_null = df[col].isnull().sum()
    print(n_null)
    return n_null


col = 'TransactionAmt'

# Remainder of the amount for each divisor, computed identically on train and test.
cols_Amt_Dev = []
for divisor in (200, 100, 50, 30):
    name = f'Amt_DIV{divisor}'
    df_train[name] = df_train[col] % divisor
    df_test[name] = df_test[col] % divisor
    cols_Amt_Dev.append(name)

for col in cols_Amt_Dev:

    if is_viz:
        counts = df_train[col].value_counts()
        print(col, df_train[col].min(), df_test[col].min())
        print(col, counts.shape)
        display(counts)
        continue

    # Fill first, then cast: NaN cannot be converted to an integer.  Remainders
    # are smaller than the divisor (at most 199), so int16 is sufficient.
    for frame in (df_train, df_test):
        if pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = frame[col].fillna(-100).astype('int16')
        else:
            frame[col] = frame[col].fillna('#')

    save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
    save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

(590540,) | Amt_DIV200
(506691,) | Amt_DIV200
(590540,) | Amt_DIV100
(506691,) | Amt_DIV100
(590540,) | Amt_DIV50
(506691,) | Amt_DIV50
(590540,) | Amt_DIV30
(506691,) | Amt_DIV30


In [14]:
#========================================================================
# cents
#========================================================================
prefix = 'bin'
dir_save = 'eda_base'
col = 'cents'
# Convert the fractional amount (0.00 - 0.99) to an integer number of cents.
#  * round before casting: a plain cast truncates (0.29 * 100 -> 28);
#  * only float columns are converted, so re-running the cell does not
#    multiply by 100 again and overflow int8.
for frame in (df_train, df_test):
    if pd.api.types.is_float_dtype(frame[col]):
        frame[col] = np.round(frame[col] * 100).astype('int8')
save_feature(df_train[[col]], prefix, dir_save, is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
save_feature(df_test[[col]],  prefix, dir_save, is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE)

(590540,) | cents
(506691,) | cents
