In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import timer, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename, logger_func
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import shutil

try:
    logger
except NameError:
    logger = logger_func()
    
save_file_path = '../output/valid_single_feature.csv'

2019-09-13 21:02:52,264 func.utils 347 [INFO]    [logger_func] start 


In [2]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

paths_train = glob('../feature/raw_use/*_train.gz')
paths_train += sorted(glob('../feature/org_use/*_train.gz'))
paths_train += sorted(glob('../feature/valid_use/*_train.gz'))

df_train = parallel_load_data(paths_train)

group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group
df_train = df_train[('2018-1' <= df_train[COLUMN_GROUP]) & (df_train[COLUMN_GROUP] <= '2018-5')]

In [3]:
#========================================================================
# Base Featureに検証用Feature Groupを追加して、スコアの変化を見る.
# Baseより向上したFeature Groupのみ、追加検証を行う
#========================================================================

# 最初はbaseをTrueにして
exp_no = 5
is_base = [True, False][1]
is_result = [True, False][1]
is_write  = [True, False][0]
to_dir = '../feature/check_trush/'

def get_tree_importance(estimator, use_cols, importance_type="gain"):
    feim = estimator.feature_importance(importance_type=importance_type)
    feim = pd.DataFrame([np.array(use_cols), feim]).T
    feim.columns = ['feature', 'importance']
    feim['importance'] = feim['importance'].astype('float32')
    return feim


# valid_paths_train = sorted(glob('../feature/valid/*_train.gz'))
valid_paths_train = sorted(glob('../feature/valid_use/*_train.gz'))
loop_no = len(valid_paths_train)
if len(valid_paths_train)==0 and (is_base or is_result):
    loop_no = 1
    
for i in range(loop_no):
    
    if is_result:
        valid_path = valid_paths_train
    else:
        valid_path = valid_paths_train[i:i+1]
        
    
    if is_base or len(valid_path)==0:
        tmp_train = df_train.copy()
        feature_name = 'base'
    else:
        df_feat_train = parallel_load_data(valid_path)
        tmp_train = df_train.join(df_feat_train)
        feature_name = get_filename(valid_path[0])
    
    use_cols = [col for col in tmp_train.columns if col not in COLUMNS_IGNORE]
        
    for fold in range(2):

        with timer('  * Make Dataset'):
            if fold==0:
                dataset = tmp_train[
                    (tmp_train[COLUMN_GROUP] == '2018-2') | 
                    (tmp_train[COLUMN_GROUP] == '2018-3') | 
                    (tmp_train[COLUMN_GROUP] == '2018-5')]
                train = dataset[('2018-2' <= dataset[COLUMN_GROUP]) & (dataset[COLUMN_GROUP] <= '2018-3')]
                test  = dataset[dataset[COLUMN_GROUP] == '2018-5']
            elif fold==1:
                dataset = tmp_train[
                    (df_train[COLUMN_GROUP] == '2018-1') | 
                    (df_train[COLUMN_GROUP] == '2018-2') | 
                    (df_train[COLUMN_GROUP] == '2018-4')]
                train = dataset[('2018-1' <= dataset[COLUMN_GROUP]) & (dataset[COLUMN_GROUP] <= '2018-2')]
                test  = dataset[dataset[COLUMN_GROUP] == '2018-4']
        
            Y_TRAIN = train[COLUMN_TARGET]
            train.drop(COLUMN_TARGET, axis=1, inplace=True)
        
            Y_TEST = test[COLUMN_TARGET]
            test.drop(COLUMN_TARGET, axis=1, inplace=True)
        
        start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:13]
        params = {
#             'n_jobs': 64,
            'n_jobs': 20,
            'seed': 1208,
            'metric': 'auc',
            'objective': 'binary',
            'num_leaves': 2**7-1,
            'max_depth': -1,
            'subsample': 0.9,
            'subsample_freq': 1,
            'colsample_bytree' : 1.0,
            'lambda_l1' : 0.1,
            'lambda_l2' : 1.0,
            'learning_rate' : 0.1,
        }
        
        x_train = train[use_cols]
        y_train = Y_TRAIN
        x_valid = test[use_cols]
        y_valid = Y_TEST
        early_stopping_rounds=20
        num_boost_round=3500
        metric = 'auc'
        params['metric'] = metric
        
        #========================================================================
        # Fitting
        #========================================================================
        lgb_train = lgb.Dataset(data=x_train, label=y_train)
        lgb_valid = lgb.Dataset(data=x_valid, label=y_valid)
        
        with timer("  * Train & Validation"):
            estimator = lgb.train(
                params = params,
                train_set = lgb_train,
                valid_sets = lgb_valid,
                early_stopping_rounds = early_stopping_rounds,
                num_boost_round = num_boost_round,
                verbose_eval = 200
            )
            best_iter = estimator.best_iteration
        
            oof_pred = estimator.predict(x_valid)
            score = roc_auc_score(y_valid, oof_pred)
            cvs = str(score).replace('.', '-')
            logger.info(f"  * {feature_name} Fold{fold}:{score}")
            
            if not is_result and is_write:
                with open(save_file_path, 'a') as f:
                    line = f'{exp_no},{fold},{feature_name},{score}\n'
                    f.write(line)
            
#             feim = get_tree_importance(estimator=estimator, use_cols=x_train.columns)
#             feim.sort_values(by='importance', ascending=False, inplace=True)
#             feim['is_valid'] = feim['feature'].map(valid_map)

    if is_base or is_result:
        sys.exit()
        
    #========================================================================
    # PostProcess
    #========================================================================
    with timer("  * PostProcess"):
        for path in valid_path:
            try:
                shutil.move(path, to_dir)
            except FileNotFoundError:
                print(feature_name)

[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[89]	valid_0's auc: 0.903056


2019-09-13 21:03:01,169 func.utils 115 [INFO]    [<module>]   * base Fold0:0.9030559122865263 


[  * Train & Validation] done in 5 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[61]	valid_0's auc: 0.894666


2019-09-13 21:03:05,671 func.utils 115 [INFO]    [<module>]   * base Fold1:0.8946657373230775 


[  * Train & Validation] done in 4 s


SystemExit: 

In [6]:
list_path = glob('../feature/valid_use/*.gz')
key = ''
from_dir = 'valid_use'
to_dir = '../'
to_dir = 'kernel'

for path in list_path:
    if path.count(key):
        feature_name = get_filename(path)
        move_feature([feature_name.replace('_train', '')], from_dir, to_dir)
# feature_name = result.index[6].replace('_train', '')
# feature_name = '603__card2-addr1_V45_std'
# move_feature([feature_name], from_dir, to_dir)
print(glob('../feature/valid_use/*.gz'))

[]


In [13]:
is_kernel = [True, False][1]
list_path = glob('../feature/*.gz')
key1 = 'uid_'
key2 = 'D15_'
from_dir = './'
to_dir = 'valid_use'
if is_kernel:
    list_path = glob('../feature/kernel/*.gz')
    from_dir = 'kernel'

for path in list_path:
    if path.count('test'):
        continue
    if path.count(key1) and path.count(key2):
        feature_name = get_filename(path)
        move_feature([feature_name.replace('_train', '')], from_dir, to_dir)
# feature_name = result.index[6].replace('_train', '')
# feature_name = '603__card2-addr1_V45_std'
# move_feature([feature_name], from_dir, to_dir)
print(glob('../feature/valid_use/*.gz'))

['../feature/valid_use/ker__uid_D15_mean_test.gz', '../feature/valid_use/ker__uid_D15_std_test.gz', '../feature/valid_use/ker__uid_D15_std_train.gz', '../feature/valid_use/ker__uid_D15_mean_train.gz']


In [5]:
result[result['score_0']>=0.91497]

['../feature/valid_use/ker__uid_D15_mean_test.gz', '../feature/valid_use/ker__uid_D15_std_test.gz', '../feature/valid_use/ker__uid_D15_std_train.gz', '../feature/valid_use/ker__uid_D15_mean_train.gz']


In [3]:
exp_no = 4
df_feature = pd.read_csv(save_file_path)
df_feature.columns = ['exp_no', 'fold', 'feature', 'score']
df_feature = df_feature[df_feature['exp_no']==exp_no]
df_feature.drop('exp_no', axis=1, inplace=True)
fold0 = df_feature[df_feature.fold==0].set_index('feature').drop('fold', axis=1)
fold0.columns = ['score_0']
fold1 = df_feature[df_feature.fold==1].set_index('feature').drop('fold', axis=1)
fold1.columns = ['score_1']

result = fold0.join(fold1)
result = result.drop_duplicates()
result.sort_values(by='score_0', ascending=False, inplace=True)
base_0 = result.loc['base']['score_0']
base_1 = result.loc['base']['score_1']
result[(result['score_0']>=base_0) & (result['score_1']>=base_1)]

Unnamed: 0_level_0,score_0,score_1
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
606__card1-card3-card4-card5-P_emaildomain_prefix_V282__ProductCD-C_std_train,0.914975,0.901624
606__card1-card2-card3-card4-P_emaildomain_prefix_C3__ProductCD-H_std_train,0.914523,0.900061
606__card1-card2-card4-card6_V314__ProductCD-H_std_train,0.913796,0.900398
606__card1-card2-card3-card5-P_emaildomain_prefix_V45__ProductCD-W_std_train,0.913756,0.901527
606__card3-card4-card5-card6-P_emaildomain_prefix_C2__ProductCD-R_std_train,0.913385,0.900132
606__card1-card2-card3-card5-R_emaildomain_prefix_V258__ProductCD-C_std_train,0.913379,0.90251
606__card1-card2-card5-P_emaildomain_prefix_V87__ProductCD-S_std_train,0.913334,0.900775
606__card1-card2-card6_C8__ProductCD-W_std_train,0.913312,0.903334
606__card4-card5-card6-R_emaildomain_prefix_V45__ProductCD-C_std_train,0.913156,0.901745
606__card1-card2-card4-card6-R_emaildomain_prefix_507__C1-C14__ratio__ProductCD-H_std_train,0.913018,0.90228


In [18]:
# for use_less_feature in result[(result['score_0']<base_0) & (result['score_1']<base_1)].index:
fname_list = []
idx = 8138
for i, path in enumerate(path_list[idx:idx+20]):
    fname = get_filename(path)
    if fname.count('test'):
        continue
    fname = fname.replace('_train', '')
    move_feature([fname], 'check_trush', 'valid_use')

In [13]:
path_list = sorted(glob('../feature/check_trush/*.gz'))
for i, path in enumerate(sorted(path_list)):
    if path.count('606__card1-card3-card4-card5-P_emaildomain_prefix_V282__ProductCD-C'):
        print(i)
        sys.exit()

8138


SystemExit: 