In [1]:
%load_ext autoreload
%autoreload 2
import gc
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import timer, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename, logger_func
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import shutil

# Create the notebook-wide logger only once: with %autoreload active this cell
# is often re-run, and an already-bound `logger` must be reused, not replaced.
if 'logger' not in globals():
    logger = logger_func()
    

2019-09-14 22:40:39,463 func.utils 347 [INFO]    [logger_func] start 


In [2]:
# Column roles used throughout the notebook.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
# Columns that are never passed to the model as features.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

# glob() returns paths in arbitrary, filesystem-dependent order, so both
# directories are sorted to make the load order reproducible across machines
# and runs (presumably the column order of df_train follows it — confirm
# against parallel_load_data).
paths_train = sorted(glob('../feature/raw_use/*_train.gz'))
paths_train += sorted(glob('../feature/org_use/*_train.gz'))
# paths_train += sorted(glob('../feature/valid_use/*_train.gz'))

df_train = parallel_load_data(paths_train)

# Attach the pre-computed group label used later to split the folds.
# NOTE(review): read_pkl_gzip presumably unpickles — only load trusted files.
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group

#========================================================================
# Negative Down Sampling
#   Keep every positive row and a `frac` share of the negative rows.
#========================================================================
frac = 0.2
seed = 1208
np.random.seed(seed)  # kept for any later code that relies on the global RNG

is_positive = df_train[COLUMN_TARGET] == 1
df_pos = df_train[is_positive]
df_neg = df_train[~is_positive]
del df_train, is_positive
gc.collect()
print(df_pos.shape, df_neg.shape)

# random_state is passed explicitly so the sample depends only on `seed`,
# not on whatever has consumed the global NumPy RNG state beforehand.
n_neg_keep = int(df_neg.shape[0] * frac)
df_neg = df_neg.sample(n=n_neg_keep, random_state=seed)
df_train = pd.concat([df_pos, df_neg], axis=0)
print(df_train.shape)

(20663, 121) (569877, 121)
(134638, 121)


In [3]:
#========================================================================
# Add each candidate feature group to the base features and watch how the
# score changes. Only feature groups that improve on the base are taken
# forward to further validation.
#========================================================================
np.random.seed(1208)

# Per-fold scores are appended to this CSV.
save_file_path = '../output/913_ieee__valid_single_feature.csv'

# Run timestamp, truncated to 14 characters.
start_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")[:14]

# Run switches. Start with base set to True.
is_base = True
is_result = True
is_write = False
to_dir = '../feature/check_trush/'

def get_tree_importance(estimator, use_cols, importance_type="gain"):
    """Return the estimator's feature importances as a two-column DataFrame.

    Args:
        estimator: object exposing ``feature_importance(importance_type=...)``.
        use_cols: feature names, in the same order as the importances.
        importance_type: forwarded unchanged to ``feature_importance``.

    Returns:
        DataFrame with columns ``feature`` and ``importance``; ``importance``
        is cast to float32.
    """
    raw_scores = estimator.feature_importance(importance_type=importance_type)
    # Stack names and scores as two rows, then flip so each feature is a row.
    stacked = pd.DataFrame([np.array(use_cols), raw_scores])
    table = stacked.transpose()
    table.columns = ['feature', 'importance']
    return table.astype({'importance': 'float32'})


# Candidate feature files. Each one is evaluated on its own: it is added to
# the base frame `df_train`, and a model is trained once per fold.
# valid_paths_train = sorted(glob('../feature/valid/*_train.gz'))
valid_paths_train = sorted(glob('../feature/valid_use/*_train.gz'))

# COLUMN_GROUP values taking part in the folds. For each fold, the month named
# in `fold_map` is held out for validation and the remaining listed months are
# used for training.
group_months = ['2017-12', '2018-1', '2018-2', '2018-3', '2018-4', '2018-5']
fold_map = {
    0: '2018-5',
    1: '2018-4',
    2: '2018-3',
}

# Training settings are identical for every feature and fold, so they are
# built once instead of on every iteration.
params = {
    'n_jobs': 32,
    'seed': 1208,
    'metric': 'auc',
    'objective': 'binary',
    'num_leaves': 2**7-1,
    'max_depth': -1,
    'subsample': 0.9,
    'subsample_freq': 1,
    'colsample_bytree' : 1.0,
    'lambda_l1' : 0.1,
    'lambda_l2' : 1.0,
    'learning_rate' : 0.1,
}
early_stopping_rounds = 20
num_boost_round = 500
metric = 'auc'
params['metric'] = metric

score_map = {}

for path in valid_paths_train:

    feature_name = get_filename(path)
    # Align the stored feature to the down-sampled rows by index label.
    feature = pd.Series(read_pkl_gzip(path)).loc[df_train.index]
    tmp_train = df_train.copy()
    tmp_train[feature_name] = feature

    use_cols = [col for col in tmp_train.columns if col not in COLUMNS_IGNORE]

    # Fresh per feature, so a partially completed run can never display
    # scores left over from the previous feature.
    score_map = {}

    for fold, valid_month in fold_map.items():
        with timer('  * Make Dataset'):
            train_months = [month for month in group_months if month != valid_month]
            train_mask = tmp_train[COLUMN_GROUP].isin(train_months)
            valid_mask = tmp_train[COLUMN_GROUP] == valid_month

            # Select rows and columns in one .loc call. `use_cols` already
            # excludes the target, so no in-place drop on a filtered slice
            # (chained assignment) is needed.
            x_train = tmp_train.loc[train_mask, use_cols]
            y_train = tmp_train.loc[train_mask, COLUMN_TARGET]
            x_valid = tmp_train.loc[valid_mask, use_cols]
            y_valid = tmp_train.loc[valid_mask, COLUMN_TARGET]

        # Timestamp (truncated to 13 characters) written with each CSV row.
        start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:13]

        #========================================================================
        # Fitting
        #========================================================================
        lgb_train = lgb.Dataset(data=x_train, label=y_train)
        lgb_valid = lgb.Dataset(data=x_valid, label=y_valid)

        with timer("  * Train & Validation"):
            estimator = lgb.train(
                params = params,
                train_set = lgb_train,
                valid_sets = lgb_valid,
                early_stopping_rounds = early_stopping_rounds,
                num_boost_round = num_boost_round,
                verbose_eval = 200
            )
            best_iter = estimator.best_iteration

            oof_pred = estimator.predict(x_valid)
            score = roc_auc_score(y_valid, oof_pred)
            logger.info(f"  * {feature_name} Fold{fold} {valid_month}:{score}")

            score_map[valid_month] = score

            # Append right away so finished folds survive an interrupted run.
            with open(save_file_path, 'a') as f:
                line = f'{start_time},{valid_month},{feature_name},{score}\n'
                f.write(line)

    display(pd.Series(score_map))

    #========================================================================
    # PostProcess
    #========================================================================
#     try:
#         shutil.move(path, './')
#     except FileNotFoundError:
#         print(feature_name)

[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 22:40:50,980 func.utils 126 [INFO]    [<module>]   * 512__V187-V267__diff__ProductCD-S_train Fold0 2018-5:0.9286550078849496 


Early stopping, best iteration is:
[133]	valid_0's auc: 0.928655
[  * Train & Validation] done in 7 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 22:40:58,774 func.utils 126 [INFO]    [<module>]   * 512__V187-V267__diff__ProductCD-S_train Fold1 2018-4:0.9435610103049888 


Early stopping, best iteration is:
[170]	valid_0's auc: 0.943561
[  * Train & Validation] done in 7 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.935909


2019-09-14 22:41:06,271 func.utils 126 [INFO]    [<module>]   * 512__V187-V267__diff__ProductCD-S_train Fold2 2018-3:0.9362576539995895 


Early stopping, best iteration is:
[195]	valid_0's auc: 0.936258
[  * Train & Validation] done in 7 s


2018-5    0.928655
2018-4    0.943561
2018-3    0.936258
dtype: float64

[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 22:41:13,375 func.utils 126 [INFO]    [<module>]   * 512__V258-V78__diff__ProductCD-C_train Fold0 2018-5:0.9289831444252283 


Early stopping, best iteration is:
[114]	valid_0's auc: 0.928983
[  * Train & Validation] done in 6 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.945338


2019-09-14 22:41:23,272 func.utils 126 [INFO]    [<module>]   * 512__V258-V78__diff__ProductCD-C_train Fold1 2018-4:0.9463143517202393 


Early stopping, best iteration is:
[260]	valid_0's auc: 0.946314
[  * Train & Validation] done in 9 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.936391
Early stopping, best iteration is:
[184]	valid_0's auc: 0.936594


2019-09-14 22:41:32,506 func.utils 126 [INFO]    [<module>]   * 512__V258-V78__diff__ProductCD-C_train Fold2 2018-3:0.936593796170409 


[  * Train & Validation] done in 9 s


2018-5    0.928983
2018-4    0.946314
2018-3    0.936594
dtype: float64

[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 22:41:39,046 func.utils 126 [INFO]    [<module>]   * 512__V259-V83__diff__ProductCD-C_train Fold0 2018-5:0.9303159961517549 


Early stopping, best iteration is:
[96]	valid_0's auc: 0.930316
[  * Train & Validation] done in 6 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.946283


2019-09-14 22:41:47,777 func.utils 126 [INFO]    [<module>]   * 512__V259-V83__diff__ProductCD-C_train Fold1 2018-4:0.9466362124346067 


Early stopping, best iteration is:
[210]	valid_0's auc: 0.946636
[  * Train & Validation] done in 8 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 22:41:53,902 func.utils 126 [INFO]    [<module>]   * 512__V259-V83__diff__ProductCD-C_train Fold2 2018-3:0.9357687176235563 


Early stopping, best iteration is:
[135]	valid_0's auc: 0.935769
[  * Train & Validation] done in 6 s


2018-5    0.930316
2018-4    0.946636
2018-3    0.935769
dtype: float64

[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 22:42:01,517 func.utils 126 [INFO]    [<module>]   * 512__V265-V283__diff__ProductCD-S_train Fold0 2018-5:0.93449115562552 


Early stopping, best iteration is:
[175]	valid_0's auc: 0.934491
[  * Train & Validation] done in 7 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.946712


2019-09-14 22:42:10,194 func.utils 126 [INFO]    [<module>]   * 512__V265-V283__diff__ProductCD-S_train Fold1 2018-4:0.9473043321036183 


Early stopping, best iteration is:
[227]	valid_0's auc: 0.947304
[  * Train & Validation] done in 8 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.935659


2019-09-14 22:42:19,074 func.utils 126 [INFO]    [<module>]   * 512__V265-V283__diff__ProductCD-S_train Fold2 2018-3:0.9357463564116789 


Early stopping, best iteration is:
[197]	valid_0's auc: 0.935746
[  * Train & Validation] done in 8 s


2018-5    0.934491
2018-4    0.947304
2018-3    0.935746
dtype: float64

[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 22:42:26,003 func.utils 126 [INFO]    [<module>]   * 512__V29-V131__diff__ProductCD-S_train Fold0 2018-5:0.9291116124822344 


Early stopping, best iteration is:
[139]	valid_0's auc: 0.929112
[  * Train & Validation] done in 6 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.945536


2019-09-14 22:42:34,078 func.utils 126 [INFO]    [<module>]   * 512__V29-V131__diff__ProductCD-S_train Fold1 2018-4:0.9456036658356016 


Early stopping, best iteration is:
[191]	valid_0's auc: 0.945604
[  * Train & Validation] done in 8 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 22:42:42,136 func.utils 126 [INFO]    [<module>]   * 512__V29-V131__diff__ProductCD-S_train Fold2 2018-3:0.9346632252882253 


Early stopping, best iteration is:
[172]	valid_0's auc: 0.934663
[  * Train & Validation] done in 7 s


2018-5    0.929112
2018-4    0.945604
2018-3    0.934663
dtype: float64

[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.931894


2019-09-14 22:42:49,759 func.utils 126 [INFO]    [<module>]   * 512__V314-V131__ratio__ProductCD-H_train Fold0 2018-5:0.9319527504333571 


Early stopping, best iteration is:
[189]	valid_0's auc: 0.931953
[  * Train & Validation] done in 7 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.944177


2019-09-14 22:43:01,906 func.utils 126 [INFO]    [<module>]   * 512__V314-V131__ratio__ProductCD-H_train Fold1 2018-4:0.945784016756363 


Early stopping, best iteration is:
[331]	valid_0's auc: 0.945784
[  * Train & Validation] done in 12 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.93646


2019-09-14 22:43:09,422 func.utils 126 [INFO]    [<module>]   * 512__V314-V131__ratio__ProductCD-H_train Fold2 2018-3:0.9367177386330612 


Early stopping, best iteration is:
[197]	valid_0's auc: 0.936718
[  * Train & Validation] done in 7 s


2018-5    0.931953
2018-4    0.945784
2018-3    0.936718
dtype: float64

[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 22:43:16,613 func.utils 126 [INFO]    [<module>]   * 512__V315-V67__diff__ProductCD-W_train Fold0 2018-5:0.9328496348612372 


Early stopping, best iteration is:
[164]	valid_0's auc: 0.93285
[  * Train & Validation] done in 7 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 22:43:23,456 func.utils 126 [INFO]    [<module>]   * 512__V315-V67__diff__ProductCD-W_train Fold1 2018-4:0.9447266362342188 


Early stopping, best iteration is:
[174]	valid_0's auc: 0.944727
[  * Train & Validation] done in 6 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-14 22:43:30,536 func.utils 126 [INFO]    [<module>]   * 512__V315-V67__diff__ProductCD-W_train Fold2 2018-3:0.9350338015660596 


Early stopping, best iteration is:
[139]	valid_0's auc: 0.935034
[  * Train & Validation] done in 7 s


2018-5    0.932850
2018-4    0.944727
2018-3    0.935034
dtype: float64