In [1]:
%load_ext autoreload
%autoreload 2
import gc
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import timer, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename, logger_func
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import shutil

# Build the notebook-wide logger only when the name is not bound yet, so that
# re-running this cell does not call logger_func() a second time.
if 'logger' not in globals():
    logger = logger_func()
    

2019-09-14 19:16:01,269 func.utils 347 [INFO]    [logger_func] start 


In [2]:
#========================================================================
# Column names, and the columns that are excluded from the model inputs
#========================================================================
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

#========================================================================
# Load the base feature set
#========================================================================
# glob() returns paths in arbitrary, filesystem-dependent order. Both
# directories are sorted so the list handed to the loader is the same on
# every machine and every run.
paths_train = sorted(glob('../feature/raw_use/*_train.gz'))
paths_train += sorted(glob('../feature/org_use/*_train.gz'))
# paths_train += sorted(glob('../feature/valid_use/*_train.gz'))

df_train = parallel_load_data(paths_train)

# Attach the group column that the validation cell splits on
# (it is compared against labels such as '2018-5' there).
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group

#========================================================================
# Negative Down Sampling
#========================================================================
# Keep every row whose target equals 1 and a random `frac` share of the rest.
frac = 0.2
seed = 1208
np.random.seed(seed)
is_positive = df_train[COLUMN_TARGET] == 1
df_pos = df_train[is_positive]
df_neg = df_train[~is_positive]
del df_train, is_positive
gc.collect()
print(df_pos.shape, df_neg.shape)
# random_state pins the draw to `seed` directly, so the sample no longer
# depends on how much of the global NumPy stream was consumed beforehand.
df_neg = df_neg.sample(int(df_neg.shape[0] * frac), random_state=seed)
df_train = pd.concat([df_pos, df_neg], axis=0)
print(df_train.shape)

(20663, 154) (569877, 154)
(134638, 154)


In [3]:
#========================================================================
# Add each candidate feature group to the base features and watch how the
# score changes. Only feature groups that improve on the base score go on
# to further validation.
#========================================================================
np.random.seed(1208)

save_file_path = '../output/913_ieee__valid_single_feature.csv'

# Start with base set to True.
start_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')[:14]
is_base = True
is_result = True
is_write = False
to_dir = '../feature/check_trush/'

def get_tree_importance(estimator, use_cols, importance_type="gain"):
    """Return the estimator's feature importances as a two-column table.

    Args:
        estimator: object exposing ``feature_importance(importance_type=...)``.
        use_cols: feature names, in the same order as the importances
            returned by the estimator.
        importance_type: forwarded unchanged to ``feature_importance``.

    Returns:
        DataFrame with a ``feature`` column and a float32 ``importance``
        column, one row per entry of ``use_cols``.
    """
    scores = estimator.feature_importance(importance_type=importance_type)
    # Stack names and scores as two rows, then flip so each feature is a row;
    # after the transpose the columns are labelled 0 and 1.
    table = (
        pd.DataFrame([np.array(use_cols), scores])
        .transpose()
        .rename(columns={0: 'feature', 1: 'importance'})
        .astype({'importance': 'float32'})
    )
    return table


#========================================================================
# Evaluate every candidate feature on top of the base features:
# train one LightGBM model per fold, then log the validation AUC and
# append it to save_file_path.
#========================================================================
# valid_paths_train = sorted(glob('../feature/valid/*_train.gz'))
valid_paths_train = sorted(glob('../feature/valid_use/*_train.gz'))

# Validation month of each fold; every other month listed in all_months
# forms that fold's training set.
fold_map = {
    0: '2018-5',
    1: '2018-4',
    2: '2018-3',
}
all_months = ['2017-12', '2018-1', '2018-2', '2018-3', '2018-4', '2018-5']

# Identical for every feature and fold, so defined once outside the loops.
# (n_jobs values 64 and 16 were the alternatives tried in the original cell.)
params = {
    'n_jobs': 32,
    'seed': 1208,
    'metric': 'auc',
    'objective': 'binary',
    'num_leaves': 2**7-1,
    'max_depth': -1,
    'subsample': 0.9,
    'subsample_freq': 1,
    'colsample_bytree' : 1.0,
    'lambda_l1' : 0.1,
    'lambda_l2' : 1.0,
    'learning_rate' : 0.1,
}
early_stopping_rounds = 20
num_boost_round = 500

for path in valid_paths_train:

    # Align the stored feature with the rows that survived down-sampling and
    # add it to a private copy, so df_train itself never gains the column.
    feature_name = get_filename(path)
    feature = pd.Series(read_pkl_gzip(path)).loc[df_train.index]
    tmp_train = df_train.copy()
    tmp_train[feature_name] = feature

    use_cols = [col for col in tmp_train.columns if col not in COLUMNS_IGNORE]

    for fold, valid_month in sorted(fold_map.items()):
        with timer('  * Make Dataset'):
            train_months = [month for month in all_months if month != valid_month]
            fold_groups = tmp_train[COLUMN_GROUP]
            train = tmp_train[fold_groups.isin(train_months)]
            test = tmp_train[fold_groups == valid_month]

            # COLUMN_TARGET is listed in COLUMNS_IGNORE, so selecting use_cols
            # already leaves it out; no in-place drop on the slices is needed.
            x_train = train[use_cols]
            y_train = train[COLUMN_TARGET]
            x_valid = test[use_cols]
            y_valid = test[COLUMN_TARGET]

        # Per-fold timestamp, truncated to 13 characters, written to the CSV.
        # This rebinds the notebook-level start_time, as the original cell did.
        start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:13]

        #========================================================================
        # Fitting
        #========================================================================
        lgb_train = lgb.Dataset(data=x_train, label=y_train)
        lgb_valid = lgb.Dataset(data=x_valid, label=y_valid)

        with timer("  * Train & Validation"):
            estimator = lgb.train(
                params = dict(params),  # pass a copy so the shared dict stays untouched
                train_set = lgb_train,
                valid_sets = lgb_valid,
                early_stopping_rounds = early_stopping_rounds,
                num_boost_round = num_boost_round,
                verbose_eval = 200
            )

            oof_pred = estimator.predict(x_valid)
            score = roc_auc_score(y_valid, oof_pred)
            logger.info(f"  * {feature_name} Fold{fold} {fold_map[fold]}:{score}")

            with open(save_file_path, 'a') as f:
                line = f'{start_time},{fold_map[fold]},{feature_name},{score}\n'
                f.write(line)

    #========================================================================
    # PostProcess
    #========================================================================
    # Move the evaluated feature file out of the candidate directory so it is
    # not picked up by the glob above on the next run.
    # NOTE(review): to_dir is defined earlier in this cell but is not used
    # here; the file goes to '../'. Confirm which destination is intended.
    try:
        shutil.move(path, '../')
    except FileNotFoundError:
        print(feature_name)

[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:16:18,321 func.utils 125 [INFO]    [<module>]   * 501__device_os__id_12_linux-NotFound_dummie_train Fold0 2018-5:0.9343934514603925 


Early stopping, best iteration is:
[165]	valid_0's auc: 0.934393
[  * Train & Validation] done in 11 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.947629


2019-09-14 19:16:29,562 func.utils 125 [INFO]    [<module>]   * 501__device_os__id_12_linux-NotFound_dummie_train Fold1 2018-4:0.948371014454868 


Early stopping, best iteration is:
[249]	valid_0's auc: 0.948371
[  * Train & Validation] done in 11 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:16:38,435 func.utils 125 [INFO]    [<module>]   * 501__device_os__id_12_linux-NotFound_dummie_train Fold2 2018-3:0.9355253671785929 


Early stopping, best iteration is:
[166]	valid_0's auc: 0.935525
[  * Train & Validation] done in 8 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:16:47,179 func.utils 125 [INFO]    [<module>]   * 501__device_os__id_37_other_device_os-T_dummie_train Fold0 2018-5:0.9343934514603925 


Early stopping, best iteration is:
[165]	valid_0's auc: 0.934393
[  * Train & Validation] done in 8 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.947629


2019-09-14 19:16:59,018 func.utils 125 [INFO]    [<module>]   * 501__device_os__id_37_other_device_os-T_dummie_train Fold1 2018-4:0.948371014454868 


Early stopping, best iteration is:
[249]	valid_0's auc: 0.948371
[  * Train & Validation] done in 11 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:17:06,813 func.utils 125 [INFO]    [<module>]   * 501__device_os__id_37_other_device_os-T_dummie_train Fold2 2018-3:0.9352601953408406 


Early stopping, best iteration is:
[153]	valid_0's auc: 0.93526
[  * Train & Validation] done in 7 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:17:15,606 func.utils 125 [INFO]    [<module>]   * 523__P_emaildomain_prefix-card2-card3-card4_future_datetime_shift_m3_diff_train Fold0 2018-5:0.9339802166176068 


Early stopping, best iteration is:
[154]	valid_0's auc: 0.93398
[  * Train & Validation] done in 8 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:17:26,038 func.utils 125 [INFO]    [<module>]   * 523__P_emaildomain_prefix-card2-card3-card4_future_datetime_shift_m3_diff_train Fold1 2018-4:0.9486968398832806 


[200]	valid_0's auc: 0.948342
Early stopping, best iteration is:
[181]	valid_0's auc: 0.948697
[  * Train & Validation] done in 10 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.93806


2019-09-14 19:17:36,072 func.utils 125 [INFO]    [<module>]   * 523__P_emaildomain_prefix-card2-card3-card4_future_datetime_shift_m3_diff_train Fold2 2018-3:0.9382631136663395 


Early stopping, best iteration is:
[205]	valid_0's auc: 0.938263
[  * Train & Validation] done in 9 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:17:45,675 func.utils 125 [INFO]    [<module>]   * 523__R_emaildomain_prefix-card1-card4-card5_past_datetime_shift_p3_diff_train Fold0 2018-5:0.9332930748873093 


Early stopping, best iteration is:
[155]	valid_0's auc: 0.933293
[  * Train & Validation] done in 9 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:17:57,908 func.utils 125 [INFO]    [<module>]   * 523__R_emaildomain_prefix-card1-card4-card5_past_datetime_shift_p3_diff_train Fold1 2018-4:0.9458490859918157 


Early stopping, best iteration is:
[164]	valid_0's auc: 0.945849
[  * Train & Validation] done in 12 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.938445
Early stopping, best iteration is:
[215]	valid_0's auc: 0.938931


2019-09-14 19:18:09,736 func.utils 125 [INFO]    [<module>]   * 523__R_emaildomain_prefix-card1-card4-card5_past_datetime_shift_p3_diff_train Fold2 2018-3:0.938931269979657 


[  * Train & Validation] done in 11 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:18:17,718 func.utils 125 [INFO]    [<module>]   * 606__card1-card2-card3-P_emaildomain_prefix_507__C1-C14__diff__ProductCD-S_std_train Fold0 2018-5:0.9343256032856815 


Early stopping, best iteration is:
[126]	valid_0's auc: 0.934326
[  * Train & Validation] done in 7 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:18:27,043 func.utils 125 [INFO]    [<module>]   * 606__card1-card2-card3-P_emaildomain_prefix_507__C1-C14__diff__ProductCD-S_std_train Fold1 2018-4:0.9471458088942478 


Early stopping, best iteration is:
[178]	valid_0's auc: 0.947146
[  * Train & Validation] done in 9 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:18:35,604 func.utils 125 [INFO]    [<module>]   * 606__card1-card2-card3-P_emaildomain_prefix_507__C1-C14__diff__ProductCD-S_std_train Fold2 2018-3:0.9371298238233721 


Early stopping, best iteration is:
[129]	valid_0's auc: 0.93713
[  * Train & Validation] done in 8 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:18:42,400 func.utils 125 [INFO]    [<module>]   * 606__card1-card5-card6_C8__ProductCD-S_std_train Fold0 2018-5:0.9318105765561894 


Early stopping, best iteration is:
[94]	valid_0's auc: 0.931811
[  * Train & Validation] done in 6 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:18:51,182 func.utils 125 [INFO]    [<module>]   * 606__card1-card5-card6_C8__ProductCD-S_std_train Fold1 2018-4:0.9450070417242585 


Early stopping, best iteration is:
[140]	valid_0's auc: 0.945007
[  * Train & Validation] done in 8 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.938162


2019-09-14 19:19:01,418 func.utils 125 [INFO]    [<module>]   * 606__card1-card5-card6_C8__ProductCD-S_std_train Fold2 2018-3:0.9384744560147786 


Early stopping, best iteration is:
[213]	valid_0's auc: 0.938474
[  * Train & Validation] done in 10 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:19:10,383 func.utils 125 [INFO]    [<module>]   * 606__card2-card4-card5_C4__ProductCD-C_std_train Fold0 2018-5:0.9346120985761696 


Early stopping, best iteration is:
[156]	valid_0's auc: 0.934612
[  * Train & Validation] done in 8 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:19:18,321 func.utils 125 [INFO]    [<module>]   * 606__card2-card4-card5_C4__ProductCD-C_std_train Fold1 2018-4:0.947220789914813 


Early stopping, best iteration is:
[149]	valid_0's auc: 0.947221
[  * Train & Validation] done in 7 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.937086


2019-09-14 19:19:29,893 func.utils 125 [INFO]    [<module>]   * 606__card2-card4-card5_C4__ProductCD-C_std_train Fold2 2018-3:0.9371694274920082 


Early stopping, best iteration is:
[187]	valid_0's auc: 0.937169
[  * Train & Validation] done in 11 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:19:38,808 func.utils 125 [INFO]    [<module>]   * 606__card2-card4-card5_C8__ProductCD-C_std_train Fold0 2018-5:0.9336792647422859 


Early stopping, best iteration is:
[155]	valid_0's auc: 0.933679
[  * Train & Validation] done in 8 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:19:47,854 func.utils 125 [INFO]    [<module>]   * 606__card2-card4-card5_C8__ProductCD-C_std_train Fold1 2018-4:0.9451382258340332 


Early stopping, best iteration is:
[159]	valid_0's auc: 0.945138
[  * Train & Validation] done in 8 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 19:19:54,976 func.utils 125 [INFO]    [<module>]   * 606__card2-card4-card5_C8__ProductCD-C_std_train Fold2 2018-3:0.936414982685144 


Early stopping, best iteration is:
[144]	valid_0's auc: 0.936415
[  * Train & Validation] done in 7 s
