In [36]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from tqdm import tqdm
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.ml_utils import Classifier, rm_no_info_feature
from ieee_utils import ieee_cv
from func.preprocessing import get_dummies
from kaggle_utils import reduce_mem_usage

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET]

# Substrings that decide which raw feature files are loaded. A file is kept
# when its path contains at least one of these tokens (case-sensitive).
# Train and test share the same list so both frames are built from the same
# feature families.
# NOTE(review): the single-letter tokens ('D', 'C', 'V', 'M') match any path
# containing that letter -- e.g. 'D' also matches a name containing
# 'TransactionDT' -- so disabling 'DT' below may not actually exclude it.
# Confirm against the real file names in ../feature/raw_main/.
FEATURE_PATH_TOKENS = (
    # 'DT',
    'Fraud',
    COLUMN_ID,
    'D',
    'C',
    'V',
    'card',
    'addr',
    # 'Reg',
    'P_email',
    'R_email',
    'M',
    'Product',
)


def select_feature_paths(pattern, tokens=FEATURE_PATH_TOKENS):
    """Return the paths matching ``pattern`` that contain any of ``tokens``.

    Args:
        pattern: glob pattern of candidate feature files.
        tokens: iterable of substrings; a path is kept if any one occurs in it.

    Returns:
        A sorted list of matching paths. ``glob`` returns results in arbitrary
        order, so sorting keeps the load order stable between runs.
    """
    return sorted(
        path for path in glob(pattern)
        if any(token in path for token in tokens)
    )


paths_train = select_feature_paths('../feature/raw_main/*_train.gz')
paths_test = select_feature_paths('../feature/raw_main/*_test.gz')

# Load every selected feature file, then pass each frame through the
# project's reduce_mem_usage helper.
df_train = reduce_mem_usage(parallel_load_data(paths_train))
df_test = reduce_mem_usage(parallel_load_data(paths_test))

In [None]:
# Categorical Encode
# The categorical column list is derived from the training frame only, and
# that same list is then handed to get_dummies for both frames.
cols_cat = get_categorical_features(df=df_train, ignore_list=COLUMNS_IGNORE)
df_train, df_test = (
    get_dummies(frame, cols_cat) for frame in (df_train, df_test)
)

In [None]:
# User Group
# pred_user = pd.read_csv('../output/0830_ieee__same_user__pattern-user_keys__card_addr.csv')
pred_user = pd.read_csv('../output/same_user_pattern/20190901_user_ids_share.csv')

# Rows with no predicted user fall back to their own transaction id, so each
# such row gets a non-null integer id of its own.
pred_user['same_user_id'] = (
    pred_user['predicted_user_id']
    .fillna(pred_user[COLUMN_ID])
    .astype('int')
)
pred_user.set_index(COLUMN_ID, inplace=True)

# Attach the user id to both frames by aligning on the transaction id:
# index by id, assign (pandas aligns on the index), then restore the column.
for frame in (df_train, df_test):
    frame.set_index(COLUMN_ID, inplace=True)
    frame['pred_user'] = pred_user['same_user_id']
    frame.reset_index(inplace=True)

In [None]:
# Detach the target column from the training frame (this mutates df_train, so
# the cell cannot be run twice without reloading the data).
Y = df_train.pop(COLUMN_TARGET)

# Group-aware folds: rows sharing a pred_user value never straddle the
# train/validation boundary of a fold.
group_splitter = GroupKFold(n_splits=5)
kfold = list(
    group_splitter.split(X=df_train, y=Y, groups=df_train["pred_user"])
)

In [50]:
# Candidate feature columns of each frame (everything except id / datetime /
# target).
cols_train = [col for col in df_train.columns if col not in COLUMNS_IGNORE]
cols_test = [col for col in df_test.columns if col not in COLUMNS_IGNORE]

# Features usable for modelling are those present in BOTH frames. The list is
# kept in training-column order (duplicates removed) instead of going through
# set intersection, whose iteration order is not stable between sessions.
_test_col_set = set(cols_test)
use_cols = [col for col in dict.fromkeys(cols_train) if col in _test_col_set]

# use_cols must exist before this call: the original cell passed it here
# before defining it, which only worked with a value left over from an
# earlier execution.
# df_train, df_test = rm_no_info_feature(df_train, df_test, use_cols)
# NOTE(review): list_rm is not applied to use_cols anywhere in this cell;
# presumably it names the columns to discard -- confirm and filter if so.
list_rm = rm_no_info_feature(df_train, df_test, use_cols)

In [51]:
# Settings forwarded to ieee_cv. 'fold' is picked by position from the
# available options; index 1 selects 'group'.
fold_options = ['stratified', 'group']
params = dict(
    n_jobs=32,
    seed=1208,
    n_splits=5,
    metric='auc',
    model_type='lgb',
    fold=fold_options[1],
)

cv_score, feim_df, pred_result, score_list = ieee_cv(
    df_train, Y, df_test, use_cols, params
)
# save_log_cv_result(cv_score, feim_df, pred_result, score_list)

# A commented-out manual per-fold Classifier loop (collecting fold scores,
# feature importances, out-of-fold predictions and averaged test predictions)
# was removed from this cell; ieee_cv above is the live code path.

Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.882134
Early stopping, best iteration is:
[277]	valid_0's auc: 0.884488
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[99]	valid_0's auc: 0.878335
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[95]	valid_0's auc: 0.881429
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.881578
Early stopping, best iteration is:
[195]	valid_0's auc: 0.881737
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.882398
Early stopping, best iteration is:
[262]	valid_0's auc: 0.884899


ValueError: Length of passed values is 590545, index implies 1097231

In [44]:
#========================================================================
# Adversarial Validation
#========================================================================
COLUMN_ADV = 'is_train'

# Stack train on top of test. ignore_index gives the combined frame a fresh
# 0..n-1 index; without it the row labels of the two frames are carried over
# and can repeat, which makes label-based lookups ambiguous.
all_data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Guard against a marker column left on the inputs by an earlier run of this
# cell: it must never be visible to the model as a feature.
if COLUMN_ADV in all_data.columns:
    all_data = all_data.drop(columns=COLUMN_ADV)

# Adversarial target: 1 for rows that came from train, 0 for rows from test.
# It is built separately instead of being written into df_train / df_test, so
# those frames are left unmodified for the other cells that use them.
Y_ADV = pd.Series(
    np.concatenate([
        np.ones(len(df_train), dtype='int64'),
        np.zeros(len(df_test), dtype='int64'),
    ]),
    index=all_data.index,
    name=COLUMN_ADV,
)

# Group-aware folds over the combined data, keyed on the predicted user id.
# NOTE: this rebinds `kfold`, replacing the folds built for the fraud model.
kfold = list(GroupKFold(n_splits=5).split(all_data, Y_ADV, all_data["pred_user"]))

In [None]:
# Adversarial-validation CV loop: for every fold, fit a classifier that tries
# to tell train rows from test rows, and collect its score, feature
# importances and out-of-fold predictions.
# NOTE: `params` is rebound here to an empty dict, replacing any earlier value.
params = {}
metric = 'auc'
model_type = 'lgb'
list_score = []
list_feim = []
list_test_pred = []
oof_train = np.zeros(len(all_data))

# NOTE(review): all_data is passed to Classifier with every column it holds;
# whether id / datetime / pred_user columns are excluded inside Classifier is
# not visible from this notebook -- confirm, since they could trivially
# separate train from test.
for i_fold, (trn_idx, val_idx) in enumerate(kfold):
    # get train data and valid data
    # kfold yields positions into all_data, so both features and labels are
    # selected positionally. The label is the adversarial target Y_ADV; the
    # fraud target Y only covers the training rows and is the wrong label here.
    x_train = all_data.iloc[trn_idx]
    y_train = Y_ADV.iloc[trn_idx]
    x_valid = all_data.iloc[val_idx]
    y_valid = Y_ADV.iloc[val_idx]

    score, oof_pred, test_pred, feim, _ = Classifier(
        model_type=model_type,
        x_train=x_train,
        y_train=y_train,
        x_valid=x_valid,
        y_valid=y_valid,
        x_test=[],  # no separate hold-out set is scored in this setup
        params=params,
    )
    list_score.append(score)
    list_feim.append(feim)
    oof_train[val_idx] = oof_pred
    list_test_pred.append(test_pred)
test_pred_avg = np.mean(list_test_pred, axis=0)