In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.ml_utils import Classifier
from func.preprocessing import get_dummies

In [2]:
# Names of the identifier, timestamp and label columns of the transaction table.
COLUMN_ID, COLUMN_DT, COLUMN_TARGET = 'TransactionID', 'TransactionDT', 'isFraud'

# Columns that are never used as model features.
COLUMNS_IGNORE = [
    COLUMN_ID,
    COLUMN_DT,
    COLUMN_TARGET,
]

def select_feature_paths(pattern, include=('ID',), exclude=()):
    """Return the feature files matching ``pattern``, filtered by file name.

    Parameters
    ----------
    pattern : str
        Glob pattern of the candidate feature files.
    include : iterable of str
        Substrings that must ALL occur in the file name.
    exclude : iterable of str
        Substrings that must NOT occur in the file name, e.g.
        ``('V1', 'V2', 'V4', 'V5', 'V6', 'V8', 'day_no')`` to drop those
        feature groups.

    Returns
    -------
    list of str
        Matching paths in sorted order.
    """
    selected = []
    for path in glob(pattern):
        # Match on the file name only, so a directory whose name happens to
        # contain one of the tokens cannot change the selection.
        name = os.path.basename(path)
        if not all(token in name for token in include):
            continue
        if any(token in name for token in exclude):
            continue
        selected.append(path)
    # glob() order depends on the file system; sorting keeps the load order
    # the same from run to run and machine to machine.
    return sorted(selected)


# Only the features whose file name contains 'ID' are loaded for now.
train_paths = select_feature_paths('../feature/raw_main/*_train.gz')
test_paths = select_feature_paths('../feature/raw_main/*_test.gz')

# Load the selected feature files into one frame per split (train first, then test).
train_df, test_df = (
    parallel_load_data(paths) for paths in (train_paths, test_paths)
)

In [3]:
# Transaction IDs of interest, read from a previously exported CSV.
# Every cell of the file is flattened into one array
# (presumably the file holds a single ID column -- TODO confirm).
filepath = '../output/0826_transaction_new_user.csv'
target_ids = pd.read_csv(filepath).values.ravel()

# Train and test stacked row-wise. The original row labels are kept, so the
# index contains duplicates; later cells only index positionally (.iloc).
all_data = pd.concat([train_df, test_df], axis=0, ignore_index=False)

# Training rows restricted to the target IDs.
# .copy() makes this an independent frame: later cells add columns to it, and
# writing into a boolean-mask selection of train_df raises SettingWithCopyWarning.
train_data = train_df[train_df[COLUMN_ID].isin(target_ids)].copy()

# All rows are used here; filter all_data with
# all_data[COLUMN_ID].isin(target_ids) to restrict it to the target IDs as well.
data = all_data

# 1 for rows that carry a label (train), 0 for unlabeled rows (test).
data['is_train'] = data[COLUMN_TARGET].notnull() * 1
print(data.shape)

(1097231, 98)


In [4]:
# The label for this stage is the train/test membership flag.
col_target = 'is_train'

# One-hot encode the categorical columns reported by the project helper.
COLUMNS_CATEGORY = get_categorical_features(data, COLUMNS_IGNORE)
df_ohe = get_dummies(data, COLUMNS_CATEGORY)

# Separate the label from the encoded feature matrix.
Y = df_ohe[col_target]
train = df_ohe.drop(columns=[col_target])

In [6]:
from sklearn.model_selection import StratifiedKFold

# Run tag of the form YYYYmmdd_HHMM (the [:13] slice cuts off the seconds).
start_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')[:13]

# --- cross-validation setup -------------------------------------------------
n_splits = 4
# shuffle is left at its default (off), so the split is deterministic.
kfold = list(StratifiedKFold(n_splits=n_splits).split(train, Y))
params = {}
metric = 'auc'
model_type = 'lgb'

# --- per-fold accumulators --------------------------------------------------
score_list = []
feim_list = []
y_pred = np.zeros(len(train))   # out-of-fold predictions, filled fold by fold
test_preds = []

# --- feature selection ------------------------------------------------------
# Whole column families are dropped by name prefix. The C, card, M, Amt and
# ProductCD families were toggled in earlier experiments and are kept here.
excluded_prefixes = ('D', 'V', 'addr', 'dist', 'R_email', 'P_email')
non_features = COLUMNS_IGNORE + [col_target]
use_cols = [
    col for col in df_ohe.columns
    if col not in non_features and not col.startswith(excluded_prefixes)
]
# A handful of D/V columns are added back explicitly.
use_cols += ['D11', 'V74', 'V34', 'V33', 'V94', 'V73']

model_map = {}   # fold number -> fitted model, consumed by the X-ray cell

for n_fold, (trn_idx, val_idx) in enumerate(kfold):
    x_train = train[use_cols].iloc[trn_idx]
    y_train = Y.iloc[trn_idx]
    x_valid = train[use_cols].iloc[val_idx]
    y_valid = Y.iloc[val_idx]

    # No test set is scored in this run (x_test is empty).
    score, oof_pred, test_pred, feim, model = Classifier(
        model_type=model_type,
        x_train=x_train,
        y_train=y_train,
        x_valid=x_valid,
        y_valid=y_valid,
        x_test=[],
        params=params,
        get_model=True,
    )

    score_list.append(score)
    y_pred[val_idx] = oof_pred
    test_preds.append(test_pred)
    model_map[n_fold] = model

    # One importance column per fold, indexed by feature name, so the folds
    # can be concatenated side by side below.
    feim = feim.rename(
        columns={'importance': f'importance_fold{n_fold+1}'}
    ).set_index('feature')
    feim_list.append(feim)

# --- aggregate over folds ---------------------------------------------------
cv_score = np.mean(score_list)
feim_df = pd.concat(feim_list, axis=1)
feim_df['importance_avg'] = feim_df.mean(axis=1)
feim_df = feim_df.sort_values(by='importance_avg', ascending=False)

New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.790405
[400]	valid_0's auc: 0.795495
[600]	valid_0's auc: 0.797774
[800]	valid_0's auc: 0.800605
Early stopping, best iteration is:
[837]	valid_0's auc: 0.800649


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.823707
[400]	valid_0's auc: 0.830536
[600]	valid_0's auc: 0.833824
[800]	valid_0's auc: 0.837912
[1000]	valid_0's auc: 0.84027
[1200]	valid_0's auc: 0.842315
[1400]	valid_0's auc: 0.843593
[1600]	valid_0's auc: 0.844925
[1800]	valid_0's auc: 0.846374
[2000]	valid_0's auc: 0.847342
Early stopping, best iteration is:
[2063]	valid_0's auc: 0.847665


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.749776
[400]	valid_0's auc: 0.760453
[600]	valid_0's auc: 0.766674
[800]	valid_0's auc: 0.770691
[1000]	valid_0's auc: 0.773624
[1200]	valid_0's auc: 0.77566
[1400]	valid_0's auc: 0.777797
[1600]	valid_0's auc: 0.779756
[1800]	valid_0's auc: 0.781631
[2000]	valid_0's auc: 0.783292
[2200]	valid_0's auc: 0.784731
[2400]	valid_0's auc: 0.786317
[2600]	valid_0's auc: 0.787512
[2800]	valid_0's auc: 0.788402
[3000]	valid_0's auc: 0.789537
[3200]	valid_0's auc: 0.790546
[3400]	valid_0's auc: 0.79153
[3600]	valid_0's auc: 0.792436
[3800]	valid_0's auc: 0.793093
[4000]	valid_0's auc: 0.793905
[4200]	valid_0's auc: 0.794558
[4400]	valid_0's auc: 0.795155
[4600]	valid_0's auc: 0.795839
[4800]	valid_0's auc: 0.796388
[5000]	valid_0's auc: 0.796912
[5200]	valid_0's auc: 0.797516
[5400]	valid_0's auc: 0.798095
[5600]	valid_0's auc: 0.798832
[5800]	valid_0's auc: 0.799515
[6000]	valid_0's auc: 0.800133
[6200]	valid_0

New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds.
[200]	valid_0's auc: 0.746144
[400]	valid_0's auc: 0.751333
Early stopping, best iteration is:
[421]	valid_0's auc: 0.751647


In [None]:
from func.xray_wrapper import Xray_Cal

xray = Xray_Cal(ignore_list=COLUMNS_IGNORE)

# Per-fold X-ray tables are joined on these shared key columns.
merge_keys = ['N', 'feature', 'value']

xray_result = pd.DataFrame()
for fold_num, (trn_idx, val_idx) in zip(tqdm(range(n_splits)), kfold):
    # Point the calculator at the model fitted on this fold.
    xray.model = model_map[fold_num]

    # The base sample is this fold's validation rows (the training rows,
    # trn_idx, were the earlier choice); parallel execution is switched off.
    # The first return value replaces `xray` (presumably the updated
    # calculator -- TODO confirm against Xray_Cal.get_xray).
    xray, tmp_result = xray.get_xray(
        base_xray=train[use_cols].iloc[val_idx, :],
        fold_num=fold_num,
        parallel=False,
    )
    tmp_result = tmp_result.rename(columns={'xray': f'xray_{fold_num}'})

    if len(xray_result) == 0:
        # First fold: nothing to join against yet.
        xray_result = tmp_result
    else:
        xray_result = xray_result.merge(tmp_result, how='inner', on=merge_keys)

# Average the X-ray values over the folds (this is mainly what gets visualized).
xray_cols = [col for col in xray_result.columns if 'xray' in col]
xray_result['xray_avg'] = xray_result[xray_cols].mean(axis=1)

 25%|██▌       | 1/4 [14:40<44:01, 880.37s/it]

FOLD: 0


In [12]:
# Output names used by earlier runs, kept for reference:
#   '../output/0826_ieee__adversarial_clf_xray__high_fraud_user.csv'
#   '../output/0826_ieee__adversarial_15000row_clf_xray__high_fraud_user.csv'
#   '../output/0826_ieee__adversarial_val_idx_clf_xray__high_fraud_user.csv'
xray_csv_path = '../output/0829_ieee__adversarial_val_idx_clf_xray__high_fraud_user.csv'

# Persist the merged per-fold X-ray table without the row index.
xray_result.to_csv(xray_csv_path, index=False)

In [202]:
# From here on the label is the fraud flag.
col_target = COLUMN_TARGET
COLUMNS_CATEGORY = get_categorical_features(train_data, COLUMNS_IGNORE)

# 1 when the purchaser and recipient e-mail domains differ, else 0.
# The column is added with assign(), which returns a new frame: train_data was
# selected from train_df with a boolean mask, so assigning a column into it in
# place raises SettingWithCopyWarning. This also keeps the cell re-runnable.
# NOTE(review): NaN != NaN evaluates to True, so rows where BOTH domains are
# missing are flagged as "different" -- confirm this is intended.
train_data = train_data.assign(
    diff_email=(train_data['P_emaildomain'] != train_data['R_emaildomain']) * 1
)

# One-hot encode the categorical columns, then split label from features.
df_ohe = get_dummies(train_data, COLUMNS_CATEGORY)
train = df_ohe.drop(col_target, axis=1)
Y = df_ohe[col_target]

In [209]:
from sklearn.model_selection import StratifiedKFold

# Run tag of the form YYYYmmdd_HHMM (the [:13] slice cuts off the seconds).
start_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')[:13]

# --- model configuration ----------------------------------------------------
seed = 1208
params = {
    'num_threads': -1,
    'num_leaves': 31,
    'objective': 'binary',
    'boosting': 'gbdt',
    'max_depth': -1,
    'min_child_samples': 20,
    'bagging_freq': 2,
    'bagging_fraction': 0.9,
    'colsample_bytree': 0.3,
    'lambda_l1': 0.1,
    'lambda_l2': 1.0,
    'verbosity': -1,
    # The same seed feeds every source of randomness that is configured here.
    'random_seed': seed,
    'bagging_seed': seed,
    'feature_fraction_seed': seed,
    'data_random_seed': seed,
}

# --- cross-validation setup -------------------------------------------------
n_splits = 5
# shuffle is left at its default (off), so the split is deterministic.
kfold = list(StratifiedKFold(n_splits=n_splits).split(train, Y))
metric = 'auc'
model_type = 'lgb'

# --- per-fold accumulators --------------------------------------------------
score_list = []
feim_list = []
y_pred = np.zeros(len(train))   # out-of-fold predictions, filled fold by fold
test_preds = []

# --- feature selection ------------------------------------------------------
# Column families dropped by name prefix or by substring. R_email columns
# were excluded in an earlier experiment and are kept here.
excluded_prefixes = ('D', 'addr', 'dist', 'M', 'card', 'C')
excluded_substrings = ('ProductCD', 'Amt')
non_features = COLUMNS_IGNORE + [col_target]
use_cols = [
    col for col in df_ohe.columns
    if col not in non_features
    and not col.startswith(excluded_prefixes)
    and not any(token in col for token in excluded_substrings)
]

model_map = {}   # fold number -> fitted model, consumed by the X-ray cell

for n_fold, (trn_idx, val_idx) in enumerate(kfold):
    x_train = train[use_cols].iloc[trn_idx]
    y_train = Y.iloc[trn_idx]
    x_valid = train[use_cols].iloc[val_idx]
    y_valid = Y.iloc[val_idx]

    # No test set is scored in this run (x_test is empty).
    score, oof_pred, test_pred, feim, model = Classifier(
        model_type=model_type,
        x_train=x_train,
        y_train=y_train,
        x_valid=x_valid,
        y_valid=y_valid,
        x_test=[],
        params=params,
        get_model=True,
    )

    score_list.append(score)
    y_pred[val_idx] = oof_pred
    test_preds.append(test_pred)
    model_map[n_fold] = model

    # One importance column per fold, indexed by feature name, so the folds
    # can be concatenated side by side below.
    feim = feim.rename(
        columns={'importance': f'importance_fold{n_fold+1}'}
    ).set_index('feature')
    feim_list.append(feim)

# --- aggregate over folds ---------------------------------------------------
cv_score = np.mean(score_list)
feim_df = pd.concat(feim_list, axis=1)
feim_df['importance_avg'] = feim_df.mean(axis=1)
feim_df = feim_df.sort_values(by='importance_avg', ascending=False)

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[2]	valid_0's auc: 0.630697
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[7]	valid_0's auc: 0.616756
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[11]	valid_0's auc: 0.61114
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[2]	valid_0's auc: 0.673664
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[21]	valid_0's auc: 0.683892


In [210]:
COLUMN_PRED = 'prediction'

# Attach the out-of-fold predictions as a new column. The assignment is
# positional, so it relies on y_pred having one entry per train_data row in
# the same order.
train_data[COLUMN_PRED] = y_pred

# Output names used by earlier runs, kept for reference:
#   '../output/0826_ieee__clf_pred__high_fraud_user.csv'
#   '../output/0826_ieee__clf_pred__high_fraud_user__no_C.csv'
pred_csv_path = '../output/0826_ieee__clf_pred__high_fraud_user__no_C_card_amt.csv'
train_data.to_csv(pred_csv_path, index=False)

In [211]:
# Display the feature-importance table: one column per fold plus their average.
feim_df

Unnamed: 0_level_0,importance_fold1,importance_fold2,importance_fold3,importance_fold4,importance_fold5,importance_avg
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
diff_email,450.226013,847.802979,854.136963,328.868011,1106.977905,717.602356
hour,337.876923,750.405945,1007.667114,347.086761,1100.308472,708.669067
R_emaildomain_gmail.com_dummie,413.959015,272.083405,604.232666,298.169006,294.503693,376.589539
R_emaildomain_hotmail.com_dummie,147.942001,0.0,277.967896,129.268005,227.21637,156.478851
P_emaildomain_hotmail.com_dummie,16.0312,209.996994,151.251907,17.2894,350.631897,149.040283
P_emaildomain_aol.com_dummie,46.923401,130.8255,262.311096,26.742001,146.018311,122.564064
time_zone_6.5-10.0_dummie,0.0,0.0,155.776154,0.0,306.685791,92.492386
P_emaildomain_gmail.com_dummie,0.0,70.030197,159.113113,0.0,165.086121,78.845886
P_emaildomain_yahoo.com_dummie,0.0,52.709789,177.882294,0.0,93.480202,64.814461
P_emaildomain_msn.com_dummie,32.623299,78.543198,70.948425,31.0075,0.0,42.624489


In [None]:
from func.xray_wrapper import Xray_Cal

xray = Xray_Cal(ignore_list=COLUMNS_IGNORE)

# Per-fold X-ray tables are joined on these shared key columns.
merge_keys = ['N', 'feature', 'value']

xray_result = pd.DataFrame()
for fold_num, (trn_idx, val_idx) in zip(tqdm(range(n_splits)), kfold):
    # Point the calculator at the model fitted on this fold.
    xray.model = model_map[fold_num]

    # The base sample is this fold's training rows; parallel execution is off.
    # The first return value replaces `xray` (presumably the updated
    # calculator -- TODO confirm against Xray_Cal.get_xray).
    xray, tmp_result = xray.get_xray(
        base_xray=train[use_cols].iloc[trn_idx, :],
        fold_num=fold_num,
        parallel=False,
    )
    tmp_result = tmp_result.rename(columns={'xray': f'xray_{fold_num}'})

    if len(xray_result) == 0:
        # First fold: nothing to join against yet.
        xray_result = tmp_result
    else:
        xray_result = xray_result.merge(tmp_result, how='inner', on=merge_keys)

# Average the X-ray values over the folds (this is mainly what gets visualized).
xray_cols = [col for col in xray_result.columns if 'xray' in col]
xray_result['xray_avg'] = xray_result[xray_cols].mean(axis=1)

In [156]:
# Persist the merged per-fold X-ray table without the row index.
xray_csv_path = '../output/0826_ieee__clf_xray__high_fraud_user.csv'
xray_result.to_csv(xray_csv_path, index=False)