In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import timer, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature

In [None]:
# Key column names of the transaction table.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
# Columns excluded whenever the model's feature list is built.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

# Group labels (values of COLUMN_GROUP) kept for this experiment.
USE_GROUPS = ['2018-2', '2018-3', '2018-5']

# glob() returns paths in a filesystem-dependent order, so both listings are
# sorted to keep the order of the loaded feature files stable between runs.
paths_train = sorted(glob('../feature/raw_use/*_train.gz'))
paths_train += sorted(glob('../feature/org_use/*_train.gz'))

df_train = parallel_load_data(paths_train)

# Attach the pre-computed group label to the base frame.
# NOTE(review): assumes the pickled object aligns with df_train's rows — confirm.
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group

# Keep only the rows that belong to the selected groups.
df_train = df_train[df_train[COLUMN_GROUP].isin(USE_GROUPS)]

In [3]:
#========================================================================
# Add a candidate feature group to the base features and observe how the score changes.
# Only feature groups that improve on the base score go on to further validation.
#========================================================================

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import shutil

def get_tree_importance(estimator, use_cols, importance_type="gain"):
    """Return the estimator's feature importances as a two-column DataFrame.

    Parameters
    ----------
    estimator : object
        Any object exposing ``feature_importance(importance_type=...)`` that
        returns one value per feature.
    use_cols : sequence of str
        Feature names, in the same order as the values returned by the
        estimator.
    importance_type : str, default "gain"
        Passed through unchanged to ``estimator.feature_importance``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` (names) and ``importance`` (float32), one row per
        feature, in the order given by ``use_cols``.

    Raises
    ------
    ValueError
        If ``use_cols`` and the importance vector differ in length. Without
        this check the names and values would be silently misaligned.
    """
    scores = np.asarray(estimator.feature_importance(importance_type=importance_type))
    names = np.asarray(use_cols)
    if len(names) != len(scores):
        raise ValueError(
            f"use_cols has {len(names)} entries but the estimator returned "
            f"{len(scores)} importance values"
        )

    # Build the frame column-wise so each column keeps a sensible dtype instead
    # of round-tripping through a transposed object-dtype frame.
    feim = pd.DataFrame({'feature': names, 'importance': scores})
    feim['importance'] = feim['importance'].astype('float32')
    return feim


# Feature-group screening.
# Candidate feature files from ../feature/valid/ are joined onto the base frame
# in batches. Each batch is trained on groups 2018-2 / 2018-3 and scored on
# group 2018-5; the importance table is saved and the evaluated files are then
# moved to the trash directory.
VALID_BATCH_SIZE = 10
VALID_MAX_BATCHES = 10
to_dir = '../feature/valid_trush/'

# List the candidates ONCE. Evaluated files are moved out of the directory at
# the end of every iteration, so re-globbing inside the loop while slicing by
# batch index would skip files from the second batch onward.
all_valid_paths_train = sorted(glob('../feature/valid/*_train.gz'))

params = {
    'n_jobs': 64,
    'seed': 1208,
    'metric': 'auc',
    'objective': 'binary',
    'num_leaves': 2**7-1,
    'max_depth': -1,
    'subsample': 0.9,
    'subsample_freq': 1,
    'colsample_bytree' : 0.20,
    'lambda_l1' : 0.1,
    'lambda_l2' : 1.0,
    'learning_rate' : 0.1,
}
early_stopping_rounds=20
num_boost_round=3500
metric = 'auc'
params['metric'] = metric

for i in range(VALID_MAX_BATCHES):
    valid_paths_train = all_valid_paths_train[i*VALID_BATCH_SIZE:(i+1)*VALID_BATCH_SIZE]
    if not valid_paths_train:
        # No candidates left; nothing to load or train on.
        break

    # Marks the features under validation in the saved importance table.
    # The key is the file name without directory, '.gz' and '_train'.
    valid_map = {
        re.search(r'/([^/.]*)\.gz$', path).group(1).replace('_train', ''): 1
        for path in valid_paths_train
    }

    with timer('  * Make Dataset'):
        df_feat_train = parallel_load_data(valid_paths_train)
        tmp_train = df_train.join(df_feat_train)

        # Exact membership instead of a lexicographic string range.
        is_train_row = tmp_train[COLUMN_GROUP].isin(['2018-2', '2018-3'])
        train = tmp_train[is_train_row]
        Y_TRAIN = train[COLUMN_TARGET]
        # Non-inplace drop: `train` is a slice of tmp_train, and mutating it in
        # place triggers pandas' SettingWithCopy machinery.
        train = train.drop(columns=COLUMN_TARGET)

        test = tmp_train[tmp_train[COLUMN_GROUP] == '2018-5']
        Y_TEST = test[COLUMN_TARGET]
        test = test.drop(columns=COLUMN_TARGET)

    # Minute-resolution timestamp (YYYYmmdd_HHMM) used in the output file name.
    start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:13]

    use_cols = [col for col in tmp_train.columns if col not in COLUMNS_IGNORE]
    x_train = train[use_cols]
    y_train = Y_TRAIN
    x_valid = test[use_cols]
    y_valid = Y_TEST

    #========================================================================
    # Fitting
    #========================================================================
    lgb_train = lgb.Dataset(data=x_train, label=y_train)
    lgb_valid = lgb.Dataset(data=x_valid, label=y_valid)

    with timer("  * Train & Validation"):
        estimator = lgb.train(
            params = params,
            train_set = lgb_train,
            valid_sets = lgb_valid,
            early_stopping_rounds = early_stopping_rounds,
            num_boost_round = num_boost_round,
            verbose_eval = 200
        )
        best_iter = estimator.best_iteration

        oof_pred = estimator.predict(x_valid)
        score = roc_auc_score(y_valid, oof_pred)
        # The score is embedded in a file name, so '.' is replaced with '-'.
        cvs = str(score).replace('.', '-')
        feim = get_tree_importance(estimator=estimator, use_cols=x_train.columns)
        feim.sort_values(by='importance', ascending=False, inplace=True)
        feim['is_valid'] = feim['feature'].map(valid_map)

    #========================================================================
    # PostProcess
    #========================================================================

    with timer("  * PostProcess"):
        to_pkl_gzip(obj=feim, path=f"../output/selection_feature/{start_time}__CV{cvs}__feature{len(use_cols)}")
        for path in valid_paths_train:
            try:
                shutil.move(path, to_dir)
            except FileNotFoundError:
                # Report the file that could not be moved.
                print(path)

[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[83]	valid_0's auc: 0.91257
[  * Train & Validation] done in 9 s
[  * PostProcess] done in 0 s
[  * Make Dataset] done in 1 s
Training until validation scores don't improve for 20 rounds.


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3325, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-48d9b6f77b06>", line 78, in <module>
    verbose_eval = 200
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/lightgbm/engine.py", line 248, in train
    booster.update(fobj=fobj)
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/lightgbm/basic.py", line 1896, in update
    ctypes.byref(is_finished)))
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2039, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceba

KeyboardInterrupt: 

In [6]:
# Move every feature whose average importance is below the threshold from the
# candidate directory to its trash directory.
#
# NOTE(review): this cell needs `feim` to be the fold-importance table indexed
# by feature name with an `imp_avg` column (the frame loaded with read_pkl_gzip
# from ../output/feature_importances/). The per-batch table produced by the
# screening loop has no `imp_avg` column and would raise a KeyError here.
IMP_AVG_THRESHOLD = 500

# Swap these two values to restore features from the trash directory.
from_dir = 'valid'
to_dir = 'valid_trush'

for feature_name in feim[feim['imp_avg'] < IMP_AVG_THRESHOLD].index:
    try:
        move_feature([feature_name], from_dir, to_dir)
    except FileNotFoundError:
        # The feature file is not in from_dir; report it and carry on.
        print(feature_name)

In [2]:
# Load a previously saved feature-importance table and display it.
feim_path = '../output/feature_importances/20190910_1212__CV0-9410075197344887__feature800.gz'
feim = read_pkl_gzip(feim_path)
feim

Unnamed: 0_level_0,imp_fold1,imp_fold2,imp_fold3,imp_fold4,imp_fold5,imp_fold6,imp_avg
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
507__C11-C14__ratio,55384.425781,53491.351562,53454.464844,57710.449219,51938.207031,55662.804688,54606.953125
512__V87-V78__ratio__ProductCD-C,19726.503906,19645.271484,19932.919922,19025.427734,20935.240234,21938.585938,20200.658203
508__C1-D1__ratio,20962.273438,20288.458984,17401.816406,16774.982422,19952.810547,19637.136719,19169.580078
507__C14-C2__ratio,12338.760742,14870.255859,15617.758789,15504.551758,15143.784180,16368.497070,14973.934570
509__C1-V317__ratio__ProductCD-W,9112.198242,13251.629883,9074.304688,7933.535645,7295.770508,10388.500000,9509.323242
507__C11-C14__diff,2030.500610,8751.769531,8574.958008,9941.893555,11002.894531,9989.223633,8381.873047
507__C14-C8__ratio,7316.846680,6945.955078,9031.683594,7246.200195,8245.911133,7809.406738,7766.000000
507__C1-C14__ratio,8030.936523,6709.392090,7324.322754,7276.584473,7192.876953,7963.055664,7416.193848
509__C13-V308__ratio__ProductCD-W,12200.979492,5139.179199,4622.531250,7101.899902,6224.173340,8245.307617,7255.678223
515__V307-D1__ratio__ProductCD-W,10670.791992,6391.182129,9121.151367,1672.133179,6826.817871,7203.685547,6980.959473
