In [1]:
%load_ext autoreload
%autoreload 2
import gc
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import timer, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename, logger_func
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import shutil

# Build the logger only when this kernel session does not have one yet;
# re-running the cell keeps the existing `logger` object untouched.
# NOTE(review): presumably this avoids attaching duplicate handlers on re-run — confirm in func.utils.
if 'logger' not in globals():
    logger = logger_func()
    

2019-09-15 20:07:08,340 func.utils 347 [INFO]    [logger_func] start 


In [2]:
# Column names shared by the following cells.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
# Columns that must never be fed to the model as features.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

# glob() returns paths in arbitrary (filesystem-dependent) order, so both
# directories are sorted to make the load order reproducible across machines.
# NOTE(review): presumably the load order determines the column order of
# df_train — confirm against parallel_load_data.
paths_train = sorted(glob('../feature/raw_use/*_train.gz'))
paths_train += sorted(glob('../feature/org_use/*_train.gz'))
# paths_train += sorted(glob('../feature/valid_use/*_train.gz'))

df_train = parallel_load_data(paths_train)

# Attach the group label that the later cells use to split folds.
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group

#========================================================================
# Negative Down Sampling
#========================================================================
frac = 0.2
seed = 1208
np.random.seed(seed)
df_pos = df_train[df_train.isFraud==1]
df_neg = df_train[df_train.isFraud!=1]
# Release the full frame before building the down-sampled one.
del df_train
gc.collect()
print(df_pos.shape, df_neg.shape)
# Keep every positive row and a `frac` share of the negatives. The seed is
# passed explicitly so the draw does not depend on how much of the global
# NumPy random stream was consumed between seeding and sampling.
n_neg_keep = int(df_neg.shape[0] * frac)
df_neg = df_neg.sample(n=n_neg_keep, random_state=seed)
df_train = pd.concat([df_pos, df_neg], axis=0)
print(df_train.shape)

(20663, 126) (569877, 126)
(134638, 126)


In [None]:
#========================================================================
# Add one candidate feature group at a time to the base features and watch
# how the score changes.
# Only feature groups that improve on the base go on to further validation.
#========================================================================
np.random.seed(1208)

save_file_path = '../output/913_ieee__valid_single_feature.csv'

# Start with base set to True.
# The 15-character YYYYMMDD_HHMMSS stamp is cut to its first 14 characters.
start_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')[:14]
is_base = True
is_result = True
is_write = False
to_dir = '../feature/check_trush/'

def get_tree_importance(estimator, use_cols, importance_type="gain"):
    """Return the per-feature importance of a fitted model as a DataFrame.

    Parameters
    ----------
    estimator : object exposing ``feature_importance(importance_type=...)``
        The fitted model whose importances are read.
    use_cols : sequence of str
        Feature names, in the same order as the importance values returned
        by the estimator.
    importance_type : str, default "gain"
        Forwarded unchanged to ``estimator.feature_importance``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``feature`` (the names from ``use_cols``) and
        ``importance`` (float32), one row per feature.

    Raises
    ------
    ValueError
        If the number of importance values differs from ``len(use_cols)``;
        otherwise names and values would be silently misaligned.
    """
    importance = np.asarray(estimator.feature_importance(importance_type=importance_type))
    feature_names = list(use_cols)
    if len(importance) != len(feature_names):
        raise ValueError(
            f'got {len(importance)} importance values for {len(feature_names)} feature names'
        )
    # Build the frame column-wise so each column keeps its own dtype instead
    # of going through an object-dtype transpose.
    return pd.DataFrame({
        'feature': feature_names,
        'importance': importance.astype('float32'),
    })


# valid_paths_train = sorted(glob('../feature/valid/*_train.gz'))
valid_paths_train = sorted(glob('../feature/valid_use/*_train.gz'))
score_map = {}

# fold index -> month (COLUMN_GROUP value) held out for validation in that fold.
fold_map = {
    0: '2018-5',
    1: '2018-4',
    2: '2018-3',
}
# Months taking part in the scheme. Each fold trains on all of them except
# its own held-out month, so '2017-12', '2018-1' and '2018-2' are always
# training data.
all_groups = ['2017-12', '2018-1', '2018-2', '2018-3', '2018-4', '2018-5']

# Training configuration is identical for every feature and fold, so it is
# defined once here.
metric = 'auc'
early_stopping_rounds = 20
num_boost_round = 500
params = {
    'n_jobs': 64,
#     'n_jobs': 32,
#     'n_jobs': 16,
    'seed': 1208,
    'metric': metric,
    'objective': 'binary',
    'num_leaves': 2**7-1,
    'max_depth': -1,
    'subsample': 0.9,
    'subsample_freq': 1,
    'colsample_bytree' : 1.0,
    'lambda_l1' : 0.1,
    'lambda_l2' : 1.0,
    'learning_rate' : 0.1,
}

for path in valid_paths_train:

    # Load one candidate feature, keep only the rows that survived the
    # down-sampling, and append it to a private copy of the base features.
    feature_name = get_filename(path)
    feature = pd.Series(read_pkl_gzip(path)).loc[df_train.index]
    tmp_train = df_train.copy()
    tmp_train[feature_name] = feature

    use_cols = [col for col in tmp_train.columns if col not in COLUMNS_IGNORE]

    # Start from an empty map for every feature so that a score from the
    # previous feature can never appear in this feature's summary.
    score_map = {}

    for fold, holdout_group in fold_map.items():
        with timer('  * Make Dataset'):
            train_groups = [g for g in all_groups if g != holdout_group]
            train = tmp_train[tmp_train[COLUMN_GROUP].isin(train_groups)]
            test  = tmp_train[tmp_train[COLUMN_GROUP] == holdout_group]
            if train.empty or test.empty:
                raise ValueError(
                    f'Fold{fold} ({holdout_group}): empty split, '
                    f'train rows={len(train)}, test rows={len(test)}'
                )

            # Separate the labels; drop() returns a new frame instead of
            # mutating a filtered slice of tmp_train in place.
            Y_TRAIN = train[COLUMN_TARGET]
            train = train.drop(COLUMN_TARGET, axis=1)

            Y_TEST = test[COLUMN_TARGET]
            test = test.drop(COLUMN_TARGET, axis=1)

        # Minute-resolution stamp (first 13 characters of YYYYMMDD_HHMMSS),
        # written with every result row. It overwrites the notebook-level
        # `start_time` on each fold.
        start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:13]

        x_train = train[use_cols]
        y_train = Y_TRAIN
        x_valid = test[use_cols]
        y_valid = Y_TEST

        #========================================================================
        # Fitting
        #========================================================================
        lgb_train = lgb.Dataset(data=x_train, label=y_train)
        lgb_valid = lgb.Dataset(data=x_valid, label=y_valid)

        with timer("  * Train & Validation"):
            # A copy of `params` is passed so every fold starts from the same
            # configuration even if the library were to modify its argument.
            # NOTE(review): `early_stopping_rounds` / `verbose_eval` are the
            # keyword form this notebook was run with; newer LightGBM releases
            # expect callbacks instead — confirm against the installed version.
            estimator = lgb.train(
                params = dict(params),
                train_set = lgb_train,
                valid_sets = lgb_valid,
                early_stopping_rounds = early_stopping_rounds,
                num_boost_round = num_boost_round,
                verbose_eval = 200
            )

            # Score the held-out month.
            oof_pred = estimator.predict(x_valid)
            score = roc_auc_score(y_valid, oof_pred)
            logger.info(f"  * {feature_name} Fold{fold} {fold_map[fold]}:{score}")

            score_map[fold_map[fold]] = score

            # Append one CSV row per (feature, fold) result.
            with open(save_file_path, 'a') as f:
                line = f'{start_time},{fold_map[fold]},{feature_name},{score}\n'
                f.write(line)

    # Per-feature summary: held-out month -> AUC.
    display(pd.Series(score_map))
                
    #========================================================================
    # PostProcess
    #========================================================================
#     try:
#         shutil.move(path, './')
#     except FileNotFoundError:
#         print(feature_name)

[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-15 20:07:18,380 func.utils 126 [INFO]    [<module>]   * 512__V127-V318__ratio__ProductCD-W_train Fold0 2018-5:0.9311536923881087 


Early stopping, best iteration is:
[153]	valid_0's auc: 0.931154
[  * Train & Validation] done in 6 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.94422
Early stopping, best iteration is:
[197]	valid_0's auc: 0.944483


2019-09-15 20:07:25,125 func.utils 126 [INFO]    [<module>]   * 512__V127-V318__ratio__ProductCD-W_train Fold1 2018-4:0.9444833944923151 


[  * Train & Validation] done in 6 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.937986


2019-09-15 20:07:32,403 func.utils 126 [INFO]    [<module>]   * 512__V127-V318__ratio__ProductCD-W_train Fold2 2018-3:0.9383416224142032 


Early stopping, best iteration is:
[215]	valid_0's auc: 0.938342
[  * Train & Validation] done in 7 s


2018-5    0.931154
2018-4    0.944483
2018-3    0.938342
dtype: float64

[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-15 20:07:38,668 func.utils 126 [INFO]    [<module>]   * 512__V127-V78__diff__ProductCD-C_train Fold0 2018-5:0.9317058536143679 


Early stopping, best iteration is:
[172]	valid_0's auc: 0.931706
[  * Train & Validation] done in 6 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[147]	valid_0's auc: 0.94386


2019-09-15 20:07:44,535 func.utils 126 [INFO]    [<module>]   * 512__V127-V78__diff__ProductCD-C_train Fold1 2018-4:0.9438599976690969 


[  * Train & Validation] done in 5 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.935288


2019-09-15 20:07:51,684 func.utils 126 [INFO]    [<module>]   * 512__V127-V78__diff__ProductCD-C_train Fold2 2018-3:0.935976567024954 


Early stopping, best iteration is:
[230]	valid_0's auc: 0.935977
[  * Train & Validation] done in 7 s


2018-5    0.931706
2018-4    0.943860
2018-3    0.935977
dtype: float64

[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-15 20:07:58,063 func.utils 126 [INFO]    [<module>]   * 512__V53-V78__diff__ProductCD-W_train Fold0 2018-5:0.9332905432389992 


[200]	valid_0's auc: 0.932856
Early stopping, best iteration is:
[182]	valid_0's auc: 0.933291
[  * Train & Validation] done in 6 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.943058


2019-09-15 20:08:04,895 func.utils 126 [INFO]    [<module>]   * 512__V53-V78__diff__ProductCD-W_train Fold1 2018-4:0.9431420794489483 


Early stopping, best iteration is:
[202]	valid_0's auc: 0.943142
[  * Train & Validation] done in 6 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[164]	valid_0's auc: 0.936051


2019-09-15 20:08:10,702 func.utils 126 [INFO]    [<module>]   * 512__V53-V78__diff__ProductCD-W_train Fold2 2018-3:0.9360511446196931 


[  * Train & Validation] done in 5 s


2018-5    0.933291
2018-4    0.943142
2018-3    0.936051
dtype: float64

[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.931781


2019-09-15 20:08:17,259 func.utils 126 [INFO]    [<module>]   * 603__addr1-addr2_C8__ProductCD-H_std_train Fold0 2018-5:0.9319469189124221 


Early stopping, best iteration is:
[190]	valid_0's auc: 0.931947
[  * Train & Validation] done in 6 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.944663


2019-09-15 20:08:24,695 func.utils 126 [INFO]    [<module>]   * 603__addr1-addr2_C8__ProductCD-H_std_train Fold1 2018-4:0.9456653585288731 


Early stopping, best iteration is:
[256]	valid_0's auc: 0.945665
[  * Train & Validation] done in 7 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-15 20:08:28,833 func.utils 126 [INFO]    [<module>]   * 603__addr1-addr2_C8__ProductCD-H_std_train Fold2 2018-3:0.9325379384855192 


Early stopping, best iteration is:
[94]	valid_0's auc: 0.932538
[  * Train & Validation] done in 4 s


2018-5    0.931947
2018-4    0.945665
2018-3    0.932538
dtype: float64

[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-15 20:08:34,523 func.utils 126 [INFO]    [<module>]   * 603__card1-card4_C6__ProductCD-C_mean_train Fold0 2018-5:0.9312464205548333 


Early stopping, best iteration is:
[154]	valid_0's auc: 0.931246
[  * Train & Validation] done in 5 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.943167
Early stopping, best iteration is:
[238]	valid_0's auc: 0.943691


2019-09-15 20:08:41,721 func.utils 126 [INFO]    [<module>]   * 603__card1-card4_C6__ProductCD-C_mean_train Fold1 2018-4:0.9436914755380411 


[  * Train & Validation] done in 7 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.936


2019-09-15 20:08:49,306 func.utils 126 [INFO]    [<module>]   * 603__card1-card4_C6__ProductCD-C_mean_train Fold2 2018-3:0.9367260518470196 


Early stopping, best iteration is:
[249]	valid_0's auc: 0.936726
[  * Train & Validation] done in 7 s


2018-5    0.931246
2018-4    0.943691
2018-3    0.936726
dtype: float64

[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-15 20:08:55,049 func.utils 126 [INFO]    [<module>]   * 603__card6-addr2_C4__ProductCD-C_std_train Fold0 2018-5:0.9325214284823844 


Early stopping, best iteration is:
[149]	valid_0's auc: 0.932521
[  * Train & Validation] done in 5 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


2019-09-15 20:09:00,952 func.utils 126 [INFO]    [<module>]   * 603__card6-addr2_C4__ProductCD-C_std_train Fold1 2018-4:0.9419906349968794 


Early stopping, best iteration is:
[158]	valid_0's auc: 0.941991
[  * Train & Validation] done in 5 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
