In [1]:
%load_ext autoreload
%autoreload 2
import gc
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import timer, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename, logger_func
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import shutil

# Build the logger only when the notebook namespace does not already hold one,
# so re-running this cell keeps the existing `logger` object instead of
# calling logger_func() again.
if 'logger' not in globals():
    logger = logger_func()

# Output CSV location; not referenced by the cells visible in this part of
# the notebook.
save_file_path = '../output/valid_single_feature.csv'

2019-09-15 18:55:22,191 func.utils 347 [INFO]    [logger_func] start 


In [25]:
from sklearn.model_selection import StratifiedKFold, GroupKFold
from scipy.sparse import vstack, hstack, csr_matrix, load_npz

# Names of the bookkeeping columns in the feature files.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
# Columns that must not be passed to the model as features.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

# glob() yields paths in arbitrary, filesystem-dependent order. Both directory
# listings are sorted so the feature columns always come out in the same
# order; otherwise column positions (and therefore LightGBM's seeded column
# subsampling) could differ from run to run or machine to machine.
paths_train = sorted(glob('../feature/raw_use/*_train.gz'))
paths_train += sorted(glob('../feature/org_use/*_train.gz'))
df_train = parallel_load_data(paths_train)
use_cols = [col for col in df_train.columns if col not in COLUMNS_IGNORE]
# `.values` drops the column labels: from here on `df_train` is a positional
# 2-D array whose column order is given by `use_cols`.
df_train = df_train[use_cols].values

# Target labels, loaded separately from the feature matrix.
Y = read_pkl_gzip('../feature/raw_use/isFraud_train.gz')
# Features and labels are paired purely by row position in the training loop,
# so a length mismatch would silently misalign them.
assert df_train.shape[0] == Y.shape[0], (
    'feature matrix has {} rows but target has {}'.format(df_train.shape[0], Y.shape[0])
)

# Per-row group labels used to keep each group inside a single fold.
# presumably the 'DT-M' (month) value of every training row — TODO confirm
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group_values = read_pkl_gzip(group_kfold_path)
train_index = np.arange(Y.shape[0])

n_splits = 6
# Materialise the (train_idx, valid_idx) pairs as a list so the folds can be
# iterated more than once.
kfold = list(GroupKFold(n_splits=n_splits).split(train_index, Y, group_values))

In [28]:
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

# Number of rows taken per slice when row-indexing the sparse matrix below.
m = 100000
# LightGBM training parameters (binary classification, AUC metric).
params = {
    'n_jobs': 64,
    'seed': 1208,
    'metric': 'auc',
    'objective': 'binary',
    'num_leaves': 2**6-1,
    'max_depth': -1,
    'subsample': 0.9,
    'subsample_freq': 1,
    'colsample_bytree' : 0.3,
    'lambda_l1' : 0.1,
    'lambda_l2' : 1.0,
    'learning_rate' : 0.1,
}
early_stopping_rounds = 100
num_boost_round = 3000
cols_categorical = []

# One model per fold of `kfold`; each fold reports its validation AUC.
# `enumerate` advances `counter` for every fold — the hand-maintained counter
# used previously was never incremented, so every fold was labelled
# "Fold 1" / "Fold0". The header line is 1-based and the score line 0-based,
# exactly as the original format strings specify.
for counter, (train_index, test_index) in enumerate(kfold):

    print('Fold {}\n'.format(counter + 1))

    # Loaded inside the loop and deleted again before training (see `del`
    # below) — presumably to keep peak memory down while the model is fitted.
    train = load_npz('../input/610_train.npz')

    # Select the fold's rows `m` at a time and stack the pieces back together.
    # `// m + 1` means the final slice is empty when the index length is an
    # exact multiple of `m`.
    X_fit = vstack([train[train_index[i*m:(i+1)*m]] for i in range(train_index.shape[0] // m + 1)])
    X_val = vstack([train[test_index[i*m:(i+1)*m]]  for i in range(test_index.shape[0] //  m + 1)])

    # Put the dense feature columns in front of the sparse ones.
    # NOTE: `hstack` comes from the import in the preceding cell; this cell's
    # own scipy.sparse import line does not include it.
    X_fit = hstack((csr_matrix(df_train[train_index]), X_fit))
    X_val = hstack((csr_matrix(df_train[test_index]), X_val))

    # Convert the stacked result to CSR with float32 values.
    X_fit, X_val = csr_matrix(X_fit, dtype='float32'), csr_matrix(X_val, dtype='float32')
    y_fit, y_val = Y[train_index], Y[test_index]

    # Release the full sparse matrix before training starts.
    del train
    gc.collect()

    lgb_train = lgb.Dataset(data=X_fit, label=y_fit)
    lgb_valid = lgb.Dataset(data=X_val, label=y_val)

    estimator = lgb.train(
        params = params,
        train_set = lgb_train,
        valid_sets = lgb_valid,
        early_stopping_rounds = early_stopping_rounds,
        num_boost_round = num_boost_round,
        categorical_feature = cols_categorical,
        verbose_eval = 200
    )

    # Out-of-fold predictions for this fold, used only to compute its AUC.
    oof_pred = estimator.predict(X_val)
    score = roc_auc_score(y_val, oof_pred)
    print(f"Fold{counter}: {score}")

Fold 1

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.90203
[400]	valid_0's auc: 0.903261
Early stopping, best iteration is:
[362]	valid_0's auc: 0.904471
Fold0: 0.9044705182409695
Fold 1

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.932531
[400]	valid_0's auc: 0.93517
[600]	valid_0's auc: 0.9365
Early stopping, best iteration is:
[575]	valid_0's auc: 0.936746
Fold0: 0.9367464310104123
Fold 1

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.936436
[400]	valid_0's auc: 0.939597
Early stopping, best iteration is:
[401]	valid_0's auc: 0.939732
Fold0: 0.9397315672330511
Fold 1

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.926829
[400]	valid_0's auc: 0.929915
[600]	valid_0's auc: 0.930757
Early stopping, best iteration is:
[596]	valid_0's auc: 0.930904
Fold0: 0.9309042966665945
Fold 1

Training until validation scores don't improve f

In [27]:
# NOTE(review): leftover scratch cell — it only evaluates the literal 1 and can be deleted.
1

1