In [1]:
%load_ext autoreload
%autoreload 2
import gc
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import timer, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename, logger_func
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import shutil

try:
    logger
except NameError:
    logger = logger_func()
    
save_file_path = '../output/valid_single_feature.csv'

2019-09-16 17:35:52,975 func.utils 347 [INFO]    [logger_func] start 


In [2]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

paths_train = glob('../feature/raw_use/*_train.gz')
paths_train += sorted(glob('../feature/org_use/*_train.gz'))
paths_train += sorted(glob('../feature/valid_use/*_train.gz'))
# paths_train += sorted(glob('../feature/eda_base/fill__cnt*_train.gz'))

# paths_train += glob('../feature/create/524__fill*_train.gz')
# paths_train += glob('../feature/create/524__bin*_train.gz')

df_train = parallel_load_data(paths_train)

group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group

#========================================================================
# Negative Down Sampling
#========================================================================
frac = 0.2
seed = 1208
np.random.seed(seed)
df_pos = df_train[df_train.isFraud==1]
df_neg = df_train[df_train.isFraud!=1]
del df_train
gc.collect()
print(df_pos.shape, df_neg.shape)
df_neg = df_neg.sample(int(df_neg.shape[0] * frac))
df_train = pd.concat([df_pos, df_neg], axis=0)
print(df_train.shape)

(20663, 236) (569877, 236)
(134638, 236)


In [3]:
#========================================================================
# Base Featureに検証用Feature Groupを追加して、スコアの変化を見る.
# Baseより向上したFeature Groupのみ、追加検証を行う
#========================================================================
np.random.seed(1)

# 最初はbaseをTrueにして
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:14]
is_base = [True, False][0]
is_result = [True, False][0]
is_write  = [True, False][1]
to_dir = '../feature/check_trush/'

def get_tree_importance(estimator, use_cols, importance_type="gain"):
    feim = estimator.feature_importance(importance_type=importance_type)
    feim = pd.DataFrame([np.array(use_cols), feim]).T
    feim.columns = ['feature', 'importance']
    feim['importance'] = feim['importance'].astype('float32')
    return feim


# valid_paths_train = sorted(glob('../feature/valid/*_train.gz'))
valid_paths_train = sorted(glob('../feature/valid_use/*_train.gz'))
loop_no = len(valid_paths_train)
if len(valid_paths_train)==0 and (is_base or is_result):
    loop_no = 1
    
score_map = {}
    
for i in range(loop_no):
    
    if is_result:
        valid_path = valid_paths_train
    else:
        valid_path = valid_paths_train[i:i+1]
        
    
    if is_base or len(valid_path)==0:
        tmp_train = df_train.copy()
        feature_name = 'base'
    else:
        df_feat_train = parallel_load_data(valid_path)
        tmp_train = df_train.join(df_feat_train)
        feature_name = get_filename(valid_path[0])
    
    use_cols = [col for col in tmp_train.columns if col not in COLUMNS_IGNORE]
    
    fold_map = {
        0: '2018-5',
        1: '2018-4',
        2: '2018-3',
    }
    
    print('Train Shape:' , tmp_train[use_cols].shape)
        
    for fold in range(3):
        with timer('  * Make Dataset'):
            if fold==0:
                train = tmp_train[
                    (tmp_train[COLUMN_GROUP] == '2017-12') | 
                    (tmp_train[COLUMN_GROUP] == '2018-1') | 
                    (tmp_train[COLUMN_GROUP] == '2018-2') | 
                    (tmp_train[COLUMN_GROUP] == '2018-3') | 
                    (tmp_train[COLUMN_GROUP] == '2018-4')
                    ]
                test  = tmp_train[tmp_train[COLUMN_GROUP] == '2018-5']
            elif fold==1:
                train = tmp_train[
                    (tmp_train[COLUMN_GROUP] == '2017-12') | 
                    (tmp_train[COLUMN_GROUP] == '2018-1') | 
                    (tmp_train[COLUMN_GROUP] == '2018-2') | 
                    (tmp_train[COLUMN_GROUP] == '2018-3') |
                    (tmp_train[COLUMN_GROUP] == '2018-5')
                    ]
                test  = tmp_train[tmp_train[COLUMN_GROUP] == '2018-4']
            elif fold==2:
                train = tmp_train[
                    (tmp_train[COLUMN_GROUP] == '2017-12') | 
                    (tmp_train[COLUMN_GROUP] == '2018-1') | 
                    (tmp_train[COLUMN_GROUP] == '2018-2') | 
                    (tmp_train[COLUMN_GROUP] == '2018-4') |
                    (tmp_train[COLUMN_GROUP] == '2018-5')
                    ]
                test  = tmp_train[tmp_train[COLUMN_GROUP] == '2018-3']
        
            Y_TRAIN = train[COLUMN_TARGET]
            train.drop(COLUMN_TARGET, axis=1, inplace=True)
        
            Y_TEST = test[COLUMN_TARGET]
            test.drop(COLUMN_TARGET, axis=1, inplace=True)
        
        start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:13]
        params = {
#             'n_jobs': 64,
            'n_jobs': 40,
#             'n_jobs': 32,
#             'n_jobs': 16,
            'seed': 1208,
            'metric': 'auc',
            'objective': 'binary',
            'num_leaves': 2**7-1,
            'max_depth': -1,
            'subsample': 0.9,
            'subsample_freq': 1,
            'colsample_bytree' : 1.0,
            'lambda_l1' : 0.1,
            'lambda_l2' : 1.0,
            'learning_rate' : 0.1,
        }
        
        x_train = train[use_cols]
        y_train = Y_TRAIN
        x_valid = test[use_cols]
        y_valid = Y_TEST
        early_stopping_rounds=20
        num_boost_round=500
        metric = 'auc'
        params['metric'] = metric
        
        #========================================================================
        # Fitting
        #========================================================================
        lgb_train = lgb.Dataset(data=x_train, label=y_train)
        lgb_valid = lgb.Dataset(data=x_valid, label=y_valid)
        
        with timer("  * Train & Validation"):
            estimator = lgb.train(
                params = params,
                train_set = lgb_train,
                valid_sets = lgb_valid,
                early_stopping_rounds = early_stopping_rounds,
                num_boost_round = num_boost_round,
                verbose_eval = 200
            )
            best_iter = estimator.best_iteration
        
            oof_pred = estimator.predict(x_valid)
            score = roc_auc_score(y_valid, oof_pred)
            cvs = str(score).replace('.', '-')
            logger.info(f"  * {feature_name} Fold{fold} {fold_map[fold]}:{score}")
            
            score_map[fold_map[fold]] = score
            
            if not is_result and is_write:
                with open(save_file_path, 'a') as f:
                    line = f'{start_time},{fold_map[fold]},{feature_name},{score}\n'
                    f.write(line)
                    
    display(pd.Series(score_map))
            
    if is_base or is_result:
        sys.exit()
        
    #========================================================================
    # PostProcess
    #========================================================================
    with timer("  * PostProcess"):
        for path in valid_path:
            try:
                shutil.move(path, to_dir)
            except FileNotFoundError:
                print(feature_name)

Train Shape: (134638, 233)
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.935601


2019-09-16 17:36:06,645 func.utils 140 [INFO]    [<module>]   * base Fold0 2018-5:0.9357911308433018 


Early stopping, best iteration is:
[191]	valid_0's auc: 0.935791
[  * Train & Validation] done in 9 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.


2019-09-16 17:36:14,588 func.utils 140 [INFO]    [<module>]   * base Fold1 2018-4:0.9455660228363172 


Early stopping, best iteration is:
[170]	valid_0's auc: 0.945566
[  * Train & Validation] done in 7 s
[  * Make Dataset] done in 0 s
Training until validation scores don't improve for 20 rounds.
[200]	valid_0's auc: 0.93894
Early stopping, best iteration is:
[249]	valid_0's auc: 0.939429


2019-09-16 17:36:25,690 func.utils 140 [INFO]    [<module>]   * base Fold2 2018-3:0.9394288244086632 


[  * Train & Validation] done in 11 s


2018-5    0.935791
2018-4    0.945566
2018-3    0.939429
dtype: float64

SystemExit: 

In [2]:
"""
最初の実験で一番優秀だったdatetimeとmax datetime
1.
20190913_22295 / 20190913_22380
base
2018-5: 0.9230621951960908 
2018-4: 0.9398694476303755 
2018-3: 0.9332691393175264 
2.
20190913_22420 / 20190913_22591
base
2018-5: 0.926138
2018-4: 0.941913
2018-3: 0.933296
3.
20190913_23181/ 20190913_23184
base
2018-5: 0.926513
2018-4: 0.942282
2018-3: 0.933743
4.
/ 
base
2018-5: 0.928030
2018-4: 0.943712
2018-3: 0.933903
"""
feim = pd.read_csv('../output/valid_single_feature.csv', header=None)
feim.columns = ['datetime', 'DT-M', 'feature', 'score']

# feim = feim[feim['datetime']<='20190913_22380']
print(feim['datetime'].max())

feim.set_index('feature', inplace=True)
cnt = feim.groupby('feature')['score'].count()
idx = cnt[cnt>=3].index
feim = feim.loc[idx]
may = feim[feim['DT-M']=='2018-5']
may.sort_values(by='score', ascending=False, inplace=True)
feim.loc[may.index]

20190914_09565


Unnamed: 0_level_0,datetime,DT-M,score
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
606__card2-card5-card6_C5__ProductCD-W_std_train,20190914_09485,2018-5,0.930287
606__card2-card5-card6_C5__ProductCD-W_std_train,20190914_09485,2018-4,0.944163
606__card2-card5-card6_C5__ProductCD-W_std_train,20190914_09485,2018-3,0.934483
606__card1-card3-card5-P_emaildomain_prefix_C14__ProductCD-R_std_train,20190914_09482,2018-5,0.929266
606__card1-card3-card5-P_emaildomain_prefix_C14__ProductCD-R_std_train,20190914_09482,2018-4,0.945125
606__card1-card3-card5-P_emaildomain_prefix_C14__ProductCD-R_std_train,20190914_09482,2018-3,0.934089
605__card3-R_emaildomain_prefix_V312_sum_train,20190914_09475,2018-5,0.929064
605__card3-R_emaildomain_prefix_V312_sum_train,20190914_09475,2018-4,0.944294
605__card3-R_emaildomain_prefix_V312_sum_train,20190914_09475,2018-3,0.934500
522__P_emaildomain_prefix-card1-card2-card4-card5_last_C14_train,20190914_09553,2018-5,0.928594


In [6]:
list_path = glob('../feature/valid_use/*.gz')
key = ''
from_dir = 'valid_use'
to_dir = '../'
to_dir = 'kernel'

for path in list_path:
    if path.count(key):
        feature_name = get_filename(path)
        move_feature([feature_name.replace('_train', '')], from_dir, to_dir)
# feature_name = result.index[6].replace('_train', '')
# feature_name = '603__card2-addr1_V45_std'
# move_feature([feature_name], from_dir, to_dir)
print(glob('../feature/valid_use/*.gz'))

[]


In [13]:
is_kernel = [True, False][1]
list_path = glob('../feature/*.gz')
key1 = 'uid_'
key2 = 'D15_'
from_dir = './'
to_dir = 'valid_use'
if is_kernel:
    list_path = glob('../feature/kernel/*.gz')
    from_dir = 'kernel'

for path in list_path:
    if path.count('test'):
        continue
    if path.count(key1) and path.count(key2):
        feature_name = get_filename(path)
        move_feature([feature_name.replace('_train', '')], from_dir, to_dir)
# feature_name = result.index[6].replace('_train', '')
# feature_name = '603__card2-addr1_V45_std'
# move_feature([feature_name], from_dir, to_dir)
print(glob('../feature/valid_use/*.gz'))

['../feature/valid_use/ker__uid_D15_mean_test.gz', '../feature/valid_use/ker__uid_D15_std_test.gz', '../feature/valid_use/ker__uid_D15_std_train.gz', '../feature/valid_use/ker__uid_D15_mean_train.gz']


In [5]:
result[result['score_0']>=0.91497]

['../feature/valid_use/ker__uid_D15_mean_test.gz', '../feature/valid_use/ker__uid_D15_std_test.gz', '../feature/valid_use/ker__uid_D15_std_train.gz', '../feature/valid_use/ker__uid_D15_mean_train.gz']


In [3]:
exp_no = 4
df_feature = pd.read_csv(save_file_path)
df_feature.columns = ['exp_no', 'fold', 'feature', 'score']
df_feature = df_feature[df_feature['exp_no']==exp_no]
df_feature.drop('exp_no', axis=1, inplace=True)
fold0 = df_feature[df_feature.fold==0].set_index('feature').drop('fold', axis=1)
fold0.columns = ['score_0']
fold1 = df_feature[df_feature.fold==1].set_index('feature').drop('fold', axis=1)
fold1.columns = ['score_1']

result = fold0.join(fold1)
result = result.drop_duplicates()
result.sort_values(by='score_0', ascending=False, inplace=True)
base_0 = result.loc['base']['score_0']
base_1 = result.loc['base']['score_1']
result[(result['score_0']>=base_0) & (result['score_1']>=base_1)]

Unnamed: 0_level_0,score_0,score_1
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
606__card1-card3-card4-card5-P_emaildomain_prefix_V282__ProductCD-C_std_train,0.914975,0.901624
606__card1-card2-card3-card4-P_emaildomain_prefix_C3__ProductCD-H_std_train,0.914523,0.900061
606__card1-card2-card4-card6_V314__ProductCD-H_std_train,0.913796,0.900398
606__card1-card2-card3-card5-P_emaildomain_prefix_V45__ProductCD-W_std_train,0.913756,0.901527
606__card3-card4-card5-card6-P_emaildomain_prefix_C2__ProductCD-R_std_train,0.913385,0.900132
606__card1-card2-card3-card5-R_emaildomain_prefix_V258__ProductCD-C_std_train,0.913379,0.90251
606__card1-card2-card5-P_emaildomain_prefix_V87__ProductCD-S_std_train,0.913334,0.900775
606__card1-card2-card6_C8__ProductCD-W_std_train,0.913312,0.903334
606__card4-card5-card6-R_emaildomain_prefix_V45__ProductCD-C_std_train,0.913156,0.901745
606__card1-card2-card4-card6-R_emaildomain_prefix_507__C1-C14__ratio__ProductCD-H_std_train,0.913018,0.90228


In [18]:
# for use_less_feature in result[(result['score_0']<base_0) & (result['score_1']<base_1)].index:
fname_list = []
idx = 8138
for i, path in enumerate(path_list[idx:idx+20]):
    fname = get_filename(path)
    if fname.count('test'):
        continue
    fname = fname.replace('_train', '')
    move_feature([fname], 'check_trush', 'valid_use')

In [13]:
path_list = sorted(glob('../feature/check_trush/*.gz'))
for i, path in enumerate(sorted(path_list)):
    if path.count('606__card1-card3-card4-card5-P_emaildomain_prefix_V282__ProductCD-C'):
        print(i)
        sys.exit()

8138


SystemExit: 