In [41]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.ml_utils import Classifier
from func.preprocessing import get_dummies

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Column-name constants shared by the rest of the notebook.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
# Columns that are never used as model features.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET]


def _keep_feature_path(path):
    """Return True unless the path contains 'V' or 'day_no'.

    The check is applied to the whole path string (not just the file name),
    exactly as the original inline filters did.
    """
    return 'V' not in path and 'day_no' not in path


# glob() yields paths in arbitrary, filesystem-dependent order; sorting makes
# the feature order reproducible and consistent between train and test.
train_paths = sorted(
    path for path in glob('../feature/raw_main/*_train.gz') if _keep_feature_path(path)
)
test_paths = sorted(
    path for path in glob('../feature/raw_main/*_test.gz') if _keep_feature_path(path)
)

train_df = parallel_load_data(train_paths)
test_df = parallel_load_data(test_paths)

In [57]:
# CSV holding the TransactionID values to analyse; every value in the file is
# flattened into a single 1-D array of IDs.
filepath = '../output/0826_transaction_new_user.csv'
target_ids = pd.read_csv(filepath).values.ravel()

# Stack train and test rows. ignore_index=False keeps each frame's original
# index labels, so downstream code should use positional (.iloc) access.
all_data = pd.concat([train_df, test_df], axis=0, ignore_index=False)

# Train-only subset of the target transactions.
train_data = train_df[train_df[COLUMN_ID].isin(target_ids)]

# Train+test subset. The explicit .copy() makes `data` an independent frame so
# the column assignment below does not write into a slice of `all_data`
# (which triggers pandas' SettingWithCopyWarning).
data = all_data[all_data[COLUMN_ID].isin(target_ids)].copy()

# 1 when the target column is present (non-null), 0 otherwise.
data['is_train'] = data[COLUMN_TARGET].notnull() * 1
print(data.shape)

(17749, 56)


In [61]:
# Build the modelling frame for the `is_train` target.
col_target = 'is_train'

# Detect categorical columns on `data`, the frame that is actually encoded
# below. The original referenced `train`, which is only defined further down
# in this cell and therefore fails (or uses stale state) on a fresh kernel.
COLUMNS_CATEGORY = get_categorical_features(data, COLUMNS_IGNORE)
df_ohe = get_dummies(data, COLUMNS_CATEGORY)

# Feature frame (target removed) and target vector.
train = df_ohe.drop(col_target, axis=1)
Y = df_ohe[col_target]

In [123]:
from sklearn.model_selection import StratifiedKFold

# Run timestamp truncated to 13 characters, i.e. YYYYmmdd_HHMM.
start_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")[:13]

# --- settings ---------------------------------------------------------------
n_splits = 5
params = {}
metric = 'auc'
model_type = 'lgb'

# Materialise the (train_idx, valid_idx) pairs once so they can be re-iterated.
kfold = list(StratifiedKFold(n_splits=n_splits).split(train, Y))

# --- feature selection ------------------------------------------------------
# Keep only columns whose name starts with 'C', excluding the ignore list and
# the current target. Further `startswith` / `count` clauses can be added here
# to include or exclude other column groups.
_excluded = set(COLUMNS_IGNORE) | {col_target}
use_cols = [
    col for col in df_ohe.columns
    if col not in _excluded and col.startswith('C')
]

# --- per-fold accumulators --------------------------------------------------
score_list, feim_list, test_preds = [], [], []
y_pred = np.zeros(len(train))  # out-of-fold predictions, filled fold by fold

for n_fold, (trn_idx, val_idx) in enumerate(kfold):
    x_train, y_train = train.iloc[trn_idx][use_cols], Y.iloc[trn_idx]
    x_valid, y_valid = train.iloc[val_idx][use_cols], Y.iloc[val_idx]

    # Classifier returns a 5-tuple; the last element is not used here.
    score, oof_pred, test_pred, feim, _ = Classifier(
        model_type=model_type,
        x_train=x_train,
        y_train=y_train,
        x_valid=x_valid,
        y_valid=y_valid,
        x_test=[],
        params=params,
    )

    score_list.append(score)
    test_preds.append(test_pred)
    y_pred[val_idx] = oof_pred

    # Tag the importance column with its fold number and index by feature so
    # the per-fold frames line up when concatenated column-wise.
    feim = (
        feim
        .rename(columns={'importance': f'importance_fold{n_fold+1}'})
        .set_index('feature')
    )
    feim_list.append(feim)

# --- aggregate --------------------------------------------------------------
cv_score = np.mean(score_list)
feim_df = pd.concat(feim_list, axis=1)
feim_df = (
    feim_df
    .assign(importance_avg=feim_df.mean(axis=1))
    .sort_values(by='importance_avg', ascending=False)
)

New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[58]	valid_0's auc: 0.870326
Training until validation scores don't improve for 100 rounds.


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[200]	valid_0's auc: 0.902125
Early stopping, best iteration is:
[175]	valid_0's auc: 0.90272
Training until validation scores don't improve for 100 rounds.


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[200]	valid_0's auc: 0.858641
Early stopping, best iteration is:
[197]	valid_0's auc: 0.858657
Training until validation scores don't improve for 100 rounds.


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Early stopping, best iteration is:
[47]	valid_0's auc: 0.894707
Training until validation scores don't improve for 100 rounds.


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[200]	valid_0's auc: 0.85886
Early stopping, best iteration is:
[136]	valid_0's auc: 0.863133


In [124]:
# Display per-fold importances and their mean, sorted by mean importance (descending).
feim_df

Unnamed: 0_level_0,importance_fold1,importance_fold2,importance_fold3,importance_fold4,importance_fold5,importance_avg
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C1,7287.305664,7319.840332,8101.408691,6900.094238,7617.550781,7445.239746
C13,4185.461426,5333.989746,5323.234863,3734.185303,4488.952148,4613.165039
C8,4249.067383,4745.564941,4713.508789,3817.025635,4850.10791,4475.055176
C12,4167.996094,4428.182617,4973.542969,2719.324707,4590.077637,4175.825195
C14,3295.231445,4019.927246,4198.414551,3866.231689,4085.57373,3893.075684
C4,2685.299072,3108.728516,3251.567871,2966.348145,3016.084473,3005.605469
C9,2994.323975,3151.44043,3158.475586,2496.234619,2848.874756,2929.869873
C7,2569.514404,2414.460205,2862.633057,3026.144043,2419.059082,2658.362061
C2,1930.860962,2484.241699,2644.303223,1691.62085,2234.301514,2197.065674
C11,1838.2854,2199.080078,2292.199707,1462.6073,2268.536133,2012.141846


In [105]:
# Build the modelling frame for the fraud target, using only the train-side
# subset of the selected transactions.
col_target = COLUMN_TARGET
COLUMNS_CATEGORY = get_categorical_features(train_data, COLUMNS_IGNORE)
df_ohe = get_dummies(train_data, COLUMNS_CATEGORY)

# Target vector and feature frame (target column removed).
Y = df_ohe[col_target]
train = df_ohe.drop(columns=col_target)

In [127]:
from sklearn.model_selection import StratifiedKFold

# Run timestamp truncated to 13 characters, i.e. YYYYmmdd_HHMM.
start_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")[:13]

# --- settings ---------------------------------------------------------------
n_splits = 5
params = {}
metric = 'auc'
model_type = 'lgb'

# Materialise the (train_idx, valid_idx) pairs once so they can be re-iterated.
kfold = list(StratifiedKFold(n_splits=n_splits).split(train, Y))

# --- feature selection ------------------------------------------------------
# Drop the ignore list and the current target, every column whose name starts
# with 'D', 'addr', 'dist' or 'M', and every column containing 'ProductCD'.
_excluded = set(COLUMNS_IGNORE) | {col_target}
_excluded_prefixes = ('D', 'addr', 'dist', 'M')
use_cols = [
    col for col in df_ohe.columns
    if col not in _excluded
    and not col.startswith(_excluded_prefixes)
    and 'ProductCD' not in col
]

# --- per-fold accumulators --------------------------------------------------
score_list, feim_list, test_preds = [], [], []
y_pred = np.zeros(len(train))  # out-of-fold predictions, filled fold by fold

for n_fold, (trn_idx, val_idx) in enumerate(kfold):
    x_train, y_train = train.iloc[trn_idx][use_cols], Y.iloc[trn_idx]
    x_valid, y_valid = train.iloc[val_idx][use_cols], Y.iloc[val_idx]

    # Classifier returns a 5-tuple; the last element is not used here.
    score, oof_pred, test_pred, feim, _ = Classifier(
        model_type=model_type,
        x_train=x_train,
        y_train=y_train,
        x_valid=x_valid,
        y_valid=y_valid,
        x_test=[],
        params=params,
    )

    score_list.append(score)
    test_preds.append(test_pred)
    y_pred[val_idx] = oof_pred

    # Tag the importance column with its fold number and index by feature so
    # the per-fold frames line up when concatenated column-wise.
    feim = (
        feim
        .rename(columns={'importance': f'importance_fold{n_fold+1}'})
        .set_index('feature')
    )
    feim_list.append(feim)

# --- aggregate --------------------------------------------------------------
cv_score = np.mean(score_list)
feim_df = pd.concat(feim_list, axis=1)
feim_df = (
    feim_df
    .assign(importance_avg=feim_df.mean(axis=1))
    .sort_values(by='importance_avg', ascending=False)
)

New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[62]	valid_0's auc: 0.883629


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.922322
Early stopping, best iteration is:
[158]	valid_0's auc: 0.923


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.881433
Early stopping, best iteration is:
[174]	valid_0's auc: 0.88191


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[53]	valid_0's auc: 0.906352


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.879435
Early stopping, best iteration is:
[146]	valid_0's auc: 0.882914


In [128]:
# Display per-fold importances and their mean, sorted by mean importance (descending).
feim_df

Unnamed: 0_level_0,importance_fold1,importance_fold2,importance_fold3,importance_fold4,importance_fold5,importance_avg
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C1,7973.583496,8781.628906,8433.011719,7533.430176,6532.382324,7850.807129
C13,3986.987061,3100.977051,4293.088379,3887.078369,5420.277344,4137.681641
C14,3194.875977,4439.252441,3635.155029,3505.076904,2768.117920,3508.495605
C12,2564.736084,2926.914551,3483.337402,3100.026367,4157.181152,3246.438965
C2,3162.179688,3300.388428,4090.220947,2712.759766,1868.530396,3026.815918
card1,2497.166260,2949.020752,2987.713379,2135.336426,4505.583008,3014.963867
C6,1937.052002,3053.124023,2626.659668,2167.321533,5052.063477,2967.244141
TransactionAmt,2435.087158,3042.052246,3878.920166,2174.550293,2868.220703,2879.766113
C7,2386.836670,1709.556152,3034.511475,2160.409668,3077.258057,2473.714355
C11,2951.849121,959.682861,3016.070557,2329.572510,1262.238281,2103.882568
