In [52]:
%load_ext autoreload
%autoreload 2
import os
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from func.utils import get_categorical_features
from func.ml_utils import Classifier
from func.BigQuery import BigQuery

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# Column roles: the row identifier and the label are never used as features.
COLUMN_ID = 'TransactionID'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_TARGET]

# Raw transaction tables.
train_df = pd.read_csv('../input/train_transaction.csv')
test_df  = pd.read_csv('../input/test_transaction.csv')

# Feature columns = everything that is neither ignored nor reported as
# categorical by the project helper.
COLUMNS_CATEGORY = get_categorical_features(train_df, COLUMNS_IGNORE)
excluded_cols = COLUMNS_IGNORE + COLUMNS_CATEGORY
use_cols = [name for name in train_df.columns if name not in excluded_cols]

# Pull the label out first: the next line narrows train_df to the feature
# columns only, which drops both COLUMN_ID and COLUMN_TARGET from the frame.
Y = train_df.loc[:, COLUMN_TARGET]
train_df = train_df.loc[:, use_cols]
train_df.head()

Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,86400,68.5,13926,,150.0,142.0,315.0,87.0,19.0,,...,,,,,,,,,,
1,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,,,...,,,,,,,,,,
2,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,,...,,,,,,,,,,
3,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,,,...,,,,,,,,,,
4,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
from sklearn.model_selection import StratifiedKFold

# Run identifier with minute resolution (YYYYMMDD_HHMM); reused later to name
# the prediction series and the training-log entry.
start_time = datetime.datetime.now().strftime('%Y%m%d_%H%M')

n_splits = 5
# Stratified folds without shuffling: rows keep their file order inside folds.
splitter = StratifiedKFold(n_splits=n_splits)
kfold = list(splitter.split(train_df, Y))
params = {}
metric = 'auc'
model_type = 'lgb'
score_list = []
feim_list  = []
# Out-of-fold predictions, one slot per training row.
y_pred = np.zeros(len(train_df))
test_preds = []
# The original read `dataset.columns`, but `dataset` is never defined in this
# notebook (NameError on a fresh kernel). `train_df` is the frame actually
# being modelled, so derive the feature list from it.
use_cols = [col for col in train_df.columns if col not in COLUMNS_IGNORE+COLUMNS_CATEGORY]

for n_fold, (trn_idx, val_idx) in enumerate(kfold):
    x_train = train_df.iloc[trn_idx][use_cols]
    y_train = Y.iloc[trn_idx]
    x_valid = train_df.iloc[val_idx][use_cols]
    y_valid = Y.iloc[val_idx]

    # NOTE(review): `params` returned by Classifier is fed back into the next
    # fold's call — confirm that the helper is meant to be used this way.
    # x_test is empty, so `test_pred` carries no real test-set predictions.
    score, oof_pred, test_pred, feim, _, params = Classifier(
        model_type=model_type,
        x_train=x_train,
        y_train=y_train,
        x_valid=x_valid,
        y_valid=y_valid,
        x_test=[],
        params=params,
    )
    score_list.append(score)
    y_pred[val_idx] = oof_pred
    test_preds.append(test_pred)

    # One importance column per fold, indexed by feature name, so the folds
    # can be concatenated side by side below.
    feim = (
        feim
        .rename(columns={'importance': f'importance_fold{n_fold+1}'})
        .set_index('feature')
    )
    feim_list.append(feim)

# Aggregate: mean fold score and per-feature average importance across folds.
cv_score = np.mean(score_list)
feim_df = pd.concat(feim_list, axis=1)
feim_df['importance_avg'] = feim_df.mean(axis=1)

New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.884668
Early stopping, best iteration is:
[166]	valid_0's auc: 0.888564


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[6]	valid_0's auc: 0.855407


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[6]	valid_0's auc: 0.858865


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.918391
[400]	valid_0's auc: 0.924825
[600]	valid_0's auc: 0.926378
[800]	valid_0's auc: 0.928353
[1000]	valid_0's auc: 0.928763
[1200]	valid_0's auc: 0.928976
Early stopping, best iteration is:
[1159]	valid_0's auc: 0.929609


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.894831
[400]	valid_0's auc: 0.903136
[600]	valid_0's auc: 0.904273
Early stopping, best iteration is:
[553]	valid_0's auc: 0.904977


In [47]:
# Rank features by their fold-averaged importance, most important first,
# then preview the top of the table.
feim_df = feim_df.sort_values(by='importance_avg', ascending=False)
display(feim_df.head())

Unnamed: 0_level_0,importance_fold1,importance_fold2,importance_fold3,importance_fold4,importance_fold5,importance_avg
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
V258,54054.152344,65909.859375,66860.671875,63341.277344,67058.414062,63444.875
C1,30741.880859,22999.960938,22135.201172,31540.441406,30142.542969,27512.005859
C14,17157.5625,16444.818359,18448.966797,22242.355469,23464.916016,19551.722656
TransactionDT,16628.685547,1373.168945,1729.534912,20299.755859,12669.451172,10540.120117
D2,6561.610352,6513.270996,5934.631836,16000.671875,14143.941406,9830.825195


In [48]:
# First 100 feature names in feim_df's current row order; this is the "top 100"
# only because feim_df was sorted by 'importance_avg' (descending) beforehand.
COLUMNS_TOP100 = feim_df['importance_avg'].iloc[:100].index
COLUMNS_TOP100

Index(['V258', 'C1', 'C14', 'TransactionDT', 'D2', 'V201', 'card1', 'card2',
       'V294', 'C13', 'C4', 'TransactionAmt', 'addr1', 'V283', 'D15', 'D10',
       'C11', 'V308', 'card5', 'C2', 'C6', 'C10', 'D1', 'dist1', 'C8', 'D4',
       'C12', 'D3', 'V156', 'V187', 'V317', 'D8', 'card3', 'V33', 'V315',
       'V307', 'C5', 'V34', 'C9', 'V91', 'V313', 'V44', 'V90', 'D11', 'V94',
       'V29', 'V130', 'V87', 'V310', 'V62', 'V69', 'D6', 'V49', 'V53', 'V102',
       'addr2', 'V45', 'V282', 'V67', 'V312', 'V83', 'V314', 'V76', 'V48',
       'D12', 'V61', 'V30', 'V285', 'D13', 'V189', 'V149', 'V209', 'V281',
       'dist2', 'V207', 'V322', 'V295', 'V82', 'V131', 'V266', 'V20', 'V70',
       'V133', 'V74', 'V54', 'D14', 'V320', 'V257', 'V66', 'V318', 'D9',
       'V323', 'V243', 'D5', 'V165', 'V13', 'V225', 'V56', 'V12', 'V36'],
      dtype='object', name='feature')

In [64]:
# Test-set predictions are a zero placeholder of the right length.
test_pred_avg = np.zeros(test_df.shape[0])
all_pred = np.append(y_pred, test_pred_avg)

# The original read `dataset[COLUMN_ID]`, but `dataset` is never defined in this
# notebook, and `train_df` lost its ID column when it was narrowed to the
# feature columns. Re-read just the ID column from the training CSV; row order
# matches `y_pred` because train_df was only column-filtered, never reordered.
train_ids = pd.read_csv('../input/train_transaction.csv', usecols=[COLUMN_ID])[COLUMN_ID]
all_ids = np.append(train_ids, test_df[COLUMN_ID])
result_pred = pd.Series(all_pred, index=all_ids, name='pred_' + start_time)

# NOTE(review): the line below replaces the Series built above with a stored
# result from an earlier run (hard-coded file name), and `read_pkl_gzip` is not
# imported in the notebook's import cell — confirm which behaviour is intended.
# to_pkl_gzip(obj=result_pred, path=f"../output/result_pred/{start_time}__CV{str(cv_score).replace('.', '-')}__all_preds")
result_pred = read_pkl_gzip(path=f"../output/result_pred/20190717_0811__CV0-8874842732034365__all_preds.gz")

In [18]:
from bq_log import pred_table

# pred_table receives a one-column DataFrame, so convert the prediction
# Series before handing it over.
pred_frame = result_pred.to_frame()
pred_table(pred_frame)

2019-07-17 09:59:05,351 func.BigQuery 32 [INFO]    [logger_func] start 
2019-07-17 09:59:05,351 func.BigQuery 32 [INFO]    [logger_func] start 
2019-07-17 09:59:05,351 func.BigQuery 32 [INFO]    [logger_func] start 
2019-07-17 09:59:05,351 func.BigQuery 32 [INFO]    [logger_func] start 
2019-07-17 09:59:05,351 func.BigQuery 32 [INFO]    [logger_func] start 
2019-07-17 09:59:05,615 func.BigQuery 53 [INFO]    [_set_dataset] Setup Dataset dim_ml_dataset. 
2019-07-17 09:59:05,615 func.BigQuery 53 [INFO]    [_set_dataset] Setup Dataset dim_ml_dataset. 
2019-07-17 09:59:05,615 func.BigQuery 53 [INFO]    [_set_dataset] Setup Dataset dim_ml_dataset. 
2019-07-17 09:59:05,615 func.BigQuery 53 [INFO]    [_set_dataset] Setup Dataset dim_ml_dataset. 
2019-07-17 09:59:05,615 func.BigQuery 53 [INFO]    [_set_dataset] Setup Dataset dim_ml_dataset. 
2019-07-17 09:59:05,750 func.BigQuery 73 [INFO]    [create_table] Table 0717_0941__ieee__pred_value_log created. 
2019-07-17 09:59:05,750 func.BigQuery 73 

In [22]:
from bq_log import create_train_log_table, save_train_log

# No leaderboard score is recorded for this run.
lb_score = np.nan
metric = 'auc'

create_train_log_table()

# One record describing this training run; key insertion order is preserved.
log_map = {}
log_map['exp_date']    = start_time
log_map['n_features']  = train_df.shape[1]
log_map['n_rows']      = train_df.shape[0]
log_map['cv_score']    = cv_score
# The log records exactly five fold scores (fold1_score .. fold5_score).
log_map.update({f'fold{i + 1}_score': score_list[i] for i in range(5)})
# The original referenced a bare `seed`, which is never assigned anywhere in
# this notebook (NameError). Take it from the model parameters instead and log
# None when it is absent.
# NOTE(review): assumes `params` is a dict and that the seed is stored under
# 'seed' or 'random_state' — confirm against the Classifier helper.
log_map['seed']        = (
    params.get('seed', params.get('random_state')) if isinstance(params, dict) else None
)
log_map['metric']      = metric
log_map['model_type']  = model_type
save_train_log(log_map, params)

NameError: name 'np' is not defined