In [148]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from tqdm import tqdm
from func.BigQuery import BigQuery
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.ml_utils import Classifier, drop_unique_feature, drop_high_corr_feature, save_feature
from ieee_utils import ieee_cv, save_log_cv_result, valid_submit_prediction
from func.preprocessing import get_dummies
from kaggle_utils import reduce_mem_usage, move_feature

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [143]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET]

paths_train = glob('../feature/raw_use/*_train.gz')
paths_test = glob('../feature/raw_use/*_test.gz')

paths_train = [path for path in paths_train 
               if path.count('Fraud') 
               or path.count(COLUMN_ID)
               or path.count('D')
               or path.count('C')
               or path.count('V')
               or path.count('amt')
               or path.count('card')
               or path.count('addr')
#                or path.count('Reg')
               or path.count('P_email')
               or path.count('R_email')
               or path.count('M')
               or path.count('Product')
              ]
paths_test  = [path for path in paths_test  
               if path.count('Fraud') 
               or path.count(COLUMN_ID)
               or path.count('D')
               or path.count('C')
               or path.count('V')
               or path.count('amt')
               or path.count('card')
               or path.count('addr')
#                or path.count('Reg')
               or path.count('P_email')
               or path.count('R_email')
               or path.count('M')
               or path.count('Product')
              ]

df_train = reduce_mem_usage( parallel_load_data(paths_train) )
df_test  = reduce_mem_usage( parallel_load_data(paths_test) )

Memory usage of dataframe is 298.49 MB
Memory usage after optimization is: 308.06 MB
Decreased by -3.2%
Memory usage of dataframe is 478.39 MB
Memory usage after optimization is: 541.20 MB
Decreased by -13.1%


In [4]:
# Categorical Encode
cols_cat = get_categorical_features(df=df_train, ignore_list=COLUMNS_IGNORE)
df_train = get_dummies(df_train, cols_cat)
df_test  = get_dummies(df_test, cols_cat)

In [10]:
# Save raw features
save_feature(df_train, prefix='raw', dir_save='raw_use', is_train=True, auto_type=False, list_ignore=COLUMNS_IGNORE)
save_feature(df_test, prefix='raw', dir_save='raw_use', is_train=False, auto_type=False, list_ignore=COLUMNS_IGNORE, is_viz=False)

(590540,) | D15
(590540,) | V327
(590540,) | V7
(590540,) | V147
(590540,) | V210
(590540,) | V225
(590540,) | V106
(590540,) | V203
(590540,) | V3
(590540,) | V61
(590540,) | V323
(590540,) | V110
(590540,) | V252
(590540,) | V170
(590540,) | V104
(590540,) | V98
(590540,) | D2
(590540,) | V176
(590540,) | V77
(590540,) | V33
(590540,) | V62
(590540,) | C12
(590540,) | V171
(590540,) | V237
(590540,) | V166
(590540,) | V35
(590540,) | V319
(590540,) | V135
(590540,) | V261
(590540,) | V183
(590540,) | V313
(590540,) | V105
(590540,) | V88
(590540,) | V59
(590540,) | V278
(590540,) | C1
(590540,) | V15
(590540,) | V260
(590540,) | D6
(590540,) | V148
(590540,) | V227
(590540,) | V333
(590540,) | V236
(590540,) | V22
(590540,) | V276
(590540,) | V144
(590540,) | V68
(590540,) | V126
(590540,) | V238
(590540,) | C6
(590540,) | V63
(590540,) | V289
(590540,) | V308
(590540,) | V275
(590540,) | V173
(590540,) | V266
(590540,) | V140
(590540,) | V254
(590540,) | V221
(590540,) | D10
(590540

In [12]:
# User Group
# pred_user = pd.read_csv('../output/0830_ieee__same_user__pattern-user_keys__card_addr.csv')
pred_user = pd.read_csv('../output/same_user_pattern/20190901_user_ids_share.csv')
pred_user['same_user_id'] = pred_user['predicted_user_id']
pred_user.loc[pred_user[pred_user['predicted_user_id'].isnull()].index, 'same_user_id'] = pred_user.loc[pred_user[pred_user['predicted_user_id'].isnull()].index, COLUMN_ID]
pred_user['same_user_id'] = pred_user['same_user_id'].astype('int')
pred_user.set_index(COLUMN_ID, inplace=True)

df_train.set_index(COLUMN_ID, inplace=True)
df_test.set_index(COLUMN_ID, inplace=True)
df_train['pred_user'] = pred_user['same_user_id']
df_test['pred_user'] = pred_user['same_user_id']
df_train.reset_index(inplace=True)
df_test.reset_index(inplace=True)

In [13]:
Y = df_train[COLUMN_TARGET]
df_train.drop(COLUMN_TARGET, axis=1, inplace=True)
kfold = list(GroupKFold(n_splits=5).split(df_train, Y, df_train["pred_user"]))

In [16]:
# Train or Testにしかないfeatureが存在しないようにする
cols_train = [col for col in df_train.columns if col not in COLUMNS_IGNORE]
cols_test = [col for col in df_test.columns if col not in COLUMNS_IGNORE]
use_cols = list(set(cols_train) & set(cols_test))

In [116]:
# 情報をもたない or 重複してるようなfeatureを除く
list_unique_drop = drop_unique_feature(df_train, df_test, use_cols, drop=False)
list_corr_drop = drop_high_corr_feature(df_train, df_test, use_cols, drop=False)
move_feature(list(set(list_corr_drop)), 'raw_use', 'raw_trush', prefix='raw__')

 41%|████      | 216/525 [00:00<00:00, 1067.69it/s]

highly correlated: V329 / V105
highly correlated: V97 / V324
highly correlated: C7 / C12


 60%|██████    | 317/525 [00:00<00:00, 607.55it/s]

highly correlated: V95 / V322
highly correlated: R_emaildomain_servicios-ta.com_dummie / P_emaildomain_servicios-ta.com_dummie
highly correlated: V324 / V280
highly correlated: P_emaildomain_prodigy.net.mx_dummie / R_emaildomain_prodigy.net.mx_dummie


 85%|████████▌ | 448/525 [00:00<00:00, 363.04it/s]

highly correlated: D12 / D4
highly correlated: V323 / V96
highly correlated: V167 / V177
highly correlated: V322 / V279
highly correlated: V324 / V103
highly correlated: V95 / V101
highly correlated: V322 / V101


100%|██████████| 525/525 [00:01<00:00, 444.55it/s]

highly correlated: V322 / V293
highly correlated: V279 / V293





In [104]:
model_type = 'lgb'
params = {
    'n_jobs': 40,
    'seed': 1208,
    'n_splits': 5,
    'metric': 'auc',
    'model_type': model_type,
    'objective': 'binary',
    'fold': ['stratified', 'group'][1],
    
    'num_leaves': 2**6-1,
    'max_depth': -1,
    'subsample': 1.0,
    'subsample_freq': 1,
    'colsample_bytree' : 0.25,
    'lambda_l1' : 0.1,
    'lambda_l2' : 1.0,
    'learning_rate' : 0.1,
}

best_iteration, cv_score, df_feim, pred_result, score_list = ieee_cv(df_train, Y, df_test, use_cols, params)

test_pred = pred_result.values[len(df_train):]
valid_submit_prediction(test_pred)

Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.901542
Early stopping, best iteration is:
[212]	valid_0's auc: 0.901653
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[109]	valid_0's auc: 0.890612
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[51]	valid_0's auc: 0.88186
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[95]	valid_0's auc: 0.896769
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[140]	valid_0's auc: 0.891794


In [141]:
# Importanceの低いFeatureは除外する
useless_features = feim_df[feim_df['imp_avg']<=200].index.tolist()
move_feature(useless_features, 'raw_use', 'raw_trush', prefix='raw__')

In [108]:
#========================================================================
# Adversarial Validation
#========================================================================
df_train['is_train'] = 1
df_test['is_train']  = 0
all_data = pd.concat([df_train, df_test], axis=0)
COLUMN_ADV = 'is_train'

Y_ADV = all_data[COLUMN_ADV]
all_data.drop(COLUMN_ADV, axis=1, inplace=True)
kfold = list(GroupKFold(n_splits=5).split(all_data, Y_ADV, all_data["pred_user"]))

model_type = 'lgb'
params = {
    'n_jobs': 40,
    'seed': 1208,
    'n_splits': 5,
    'metric': 'auc',
    'model_type': model_type,
    'objective': 'binary',
    'fold': ['stratified', 'group'][1],
    
    'num_leaves': 2**6-1,
    'max_depth': -1,
    'subsample': 1.0,
    'subsample_freq': 1,
    'colsample_bytree' : 0.25,
    'lambda_l1' : 0.1,
    'lambda_l2' : 1.0,
    'learning_rate' : 0.1,
}

adv_cv_score, adv_feim_df, adv_pred_result = ieee_cv(
    all_data,
    Y_ADV,
    [],
    use_cols,
    params,
    is_adv=True
)

Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.935023
[400]	valid_0's auc: 0.940121
[600]	valid_0's auc: 0.943592
[800]	valid_0's auc: 0.945213
[1000]	valid_0's auc: 0.946541
[1200]	valid_0's auc: 0.947765
[1400]	valid_0's auc: 0.948202
Early stopping, best iteration is:
[1392]	valid_0's auc: 0.948307
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.935876
[400]	valid_0's auc: 0.942171
[600]	valid_0's auc: 0.944441
[800]	valid_0's auc: 0.946336
[1000]	valid_0's auc: 0.947303
[1200]	valid_0's auc: 0.948015
[1400]	valid_0's auc: 0.948615
[1600]	valid_0's auc: 0.949609
Early stopping, best iteration is:
[1748]	valid_0's auc: 0.949887
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.934275
[400]	valid_0's auc: 0.940662
[600]	valid_0's auc: 0.943786
[800]	valid_0's auc: 0.945787
[1000]	valid_0's auc: 0.947027
[1200]	valid_0's auc: 0.947871
[1400]	valid_0's auc: 0.94856
[1600]	valid_

In [110]:
#========================================================================
# 学習結果やパラメータのログをBigQueryに保存する
#========================================================================
n_features = len(use_cols)
n_rows = df_train.shape[0]
save_log_cv_result(
    best_iteration,
    cv_score,
    feim_df,
    model_type,
    n_features,
    n_rows,
    params,
    pred_result,
    score_list,
    adv_cv_score
)

Setup Dataset dim_ml_dataset.
