In [None]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename, logger_func, timer
from func.ml_utils import Regressor
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, GroupKFold
try:
    logger
except NameError:
    logger = logger_func()

In [21]:
COLUMN_TARGET = sys.argv[1]

COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_GROUP = 'DT-M'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'isFraud', 'is_train', 'date', 'DT-M', 'predicted_user_id']

def filter_feature(path):
    if path.count(''):
        return True
    else:
        return False

paths_train = glob('../submit/re_sub/70*_train.gz')
paths_test  = glob('../submit/re_sub/70*_test.gz')
paths_train += glob('../submit/re_sub/is*_train.gz')
paths_test  += glob('../submit/re_sub/is*_test.gz')
paths_train += glob('../submit/re_sub/Tran*_train.gz')
paths_test  += glob('../submit/re_sub/Tran*_test.gz')

print(len(paths_train))

df_train = parallel_load_data(paths_train)
df_test  = parallel_load_data(paths_test)

Y = df_train[COLUMN_TARGET]
df_train.drop(COLUMN_TARGET, axis=1, inplace=True)

10


In [19]:
is_submit = [True, False][0]
n_splits = 6
set_type = 'new_set'

tmp_train = df_train
tmp_test = df_test

#========================================================================
# Train Test で片方に存在しないFeatureを除外
#========================================================================
diff_cols = list(set(tmp_train.columns) - set(tmp_test.columns))

for col in list(set(diff_cols)):
    from_dir = 'valid'
    to_dir = 'valid_trush'
    move_feature([col], from_dir, to_dir)
tmp_train.drop(diff_cols, axis=1, inplace=True)
print(f"  * Diff Features: {len(diff_cols)}")

### DT-M
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
tmp_train[COLUMN_GROUP] = group


model_type = "lgb"
params = {
#     'n_jobs': 60,
    'n_jobs': 96,
#     'n_jobs': 84,
#     'n_jobs': 48,
#     'n_jobs': 36,
    'objective': 'regression',
    'num_leaves': 2**8-1,
    'max_depth': -1,
    'subsample': 0.7,
    'subsample_freq': 1,
    'colsample_bytree' : 0.10,
    'lambda_l1' : 0.1,
    'lambda_l2' : 1.0,
    'learning_rate' : 0.1,
    "early_stopping_rounds": 50,
    "seed": 1208,
    "bagging_seed": 1208,
    "feature_fraction_seed": 1208,
    "drop_seed": 1208,
    'n_splits': n_splits,
    'metric': 'auc',
    'model_type': model_type,
    'fold': ['stratified', 'group'][1],
}
if is_submit:
    params['learning_rate'] = 0.01
    params['learning_rate'] = 0.05
    params['learning_rate'] = 1.0
    params["early_stopping_rounds"] = 1

logger.info(f"* EXP: dataset {set_type} {tmp_train.shape} lr {params['learning_rate']} ")

  * Diff Features: 0


2019-09-30 00:35:52,643 func.utils 58 [INFO]    [<module>] * EXP: dataset new_set (590540, 10) lr 1.0  


In [20]:
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:13]
seed       = params['seed']
model_type = params['model_type']
n_splits = params['n_splits']
validation = params['fold']
early_stopping_rounds = params['early_stopping_rounds']

use_cols = [col for col in tmp_train.columns if col not in COLUMNS_IGNORE]

kfold = list(GroupKFold(n_splits=n_splits).split(tmp_train, Y, tmp_train[COLUMN_GROUP]))

score_list = []
feim_list = []
y_pred = np.zeros(len(tmp_train))
test_preds = []

x_test = df_test[use_cols]

for n_fold, (trn_idx, val_idx) in enumerate(kfold):
    x_train = tmp_train.iloc[trn_idx][use_cols]
    y_train = Y.iloc[trn_idx]
    x_valid = tmp_train.iloc[val_idx][use_cols]
    y_valid = Y.iloc[val_idx]

    val_gr = tmp_train.iloc[val_idx][COLUMN_GROUP].value_counts()
    dtm = val_gr.index.tolist()[0]
    print("="*20)
    with timer(f"  * Fold{n_fold} Validation-{COLUMN_GROUP} {dtm}: {val_gr.values[0]}"):
        score, oof_pred, test_pred, feim, _ = Regressor(
            model_type=model_type,
            x_train=x_train,
            y_train=y_train,
            x_valid=x_valid,
            y_valid=y_valid,
            x_test=x_test,
            params=params,
            early_stopping_rounds = early_stopping_rounds,
        )

    score_list.append(score)
    y_pred[val_idx] = oof_pred
    test_preds.append(test_pred)

    feim.rename(columns={'importance': f'imp_fold{n_fold+1}'}, inplace=True)
    feim.set_index('feature', inplace=True)
    feim_list.append(feim)

cv_score = np.mean(score_list)
cvs = str(cv_score).replace('.', '-')
df_feim = pd.concat(feim_list, axis=1)
df_feim['imp_avg'] = df_feim.mean(axis=1)
df_feim.sort_values(by='imp_avg', ascending=False, inplace=True)

## Save
# Feature Importance
to_pkl_gzip(obj=df_feim, path=f"../output/feature_importances/{start_time}__CV{cvs}__{COLUMN_TARGET}__feature{len(use_cols)}")


with timer("  * Make Prediction Result File."):
    test_pred_avg = np.mean(test_preds, axis=0)
    all_pred = np.append(y_pred, test_pred_avg)
    all_ids = np.append(tmp_train[COLUMN_ID].values, df_test[COLUMN_ID].values)
    pred_result = pd.DataFrame([all_ids, all_pred], index=[COLUMN_ID, 'pred_' + start_time]).T
    pred_result[COLUMN_ID] = pred_result[COLUMN_ID].astype('int')

    #========================================================================
    # Save
    #========================================================================
    # Prediction
    to_pkl_gzip(obj=pred_result, path=f"../output/pred_result/{start_time}__CV{cvs}__all_preds")
    # Submit File
    pred_result.columns = [COLUMN_ID, COLUMN_TARGET]
    pred_result.iloc[len(tmp_train):].to_csv(f"../submit/tmp/{start_time}__CV{cvs}__feature{len(use_cols)}.csv", index=False)

Training until validation scores don't improve for 1 rounds.
Early stopping, best iteration is:
[1]	valid_0's l2: 0.0252483
[  * Fold0 Validation-DT-M 2017-12: 134339] done in 2 s
Training until validation scores don't improve for 1 rounds.
Early stopping, best iteration is:
[1]	valid_0's l2: 0.0364717
[  * Fold1 Validation-DT-M 2018-3: 101968] done in 1 s
Training until validation scores don't improve for 1 rounds.
Early stopping, best iteration is:
[10]	valid_0's l2: 0.0345487
[  * Fold2 Validation-DT-M 2018-1: 92510] done in 1 s
Training until validation scores don't improve for 1 rounds.
Early stopping, best iteration is:
[4]	valid_0's l2: 0.0313687
[  * Fold3 Validation-DT-M 2018-5: 92427] done in 1 s
Training until validation scores don't improve for 1 rounds.
Early stopping, best iteration is:
[7]	valid_0's l2: 0.0345455
[  * Fold4 Validation-DT-M 2018-2: 85725] done in 1 s
Training until validation scores don't improve for 1 rounds.
Early stopping, best iteration is:
[1]	valid_

In [24]:
estimator

NameError: name 'estimator' is not defined