In [5]:
%load_ext autoreload
%autoreload 2
import os
import sys
import gc
import datetime
import random
import pandas as pd
import numpy as np
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename, logger_func
# Keep any `logger` that already exists in the namespace; otherwise build one
# with logger_func().
if 'logger' not in globals():
    logger = logger_func()

# Timestamp of this run as 'YYYYMMDD_HHMM' (13 characters, seconds dropped);
# it is embedded in the name of the prediction file written at the end.
start_time = datetime.datetime.now().strftime("%Y%m%d_%H%M")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
sys.path.append('../../../tool/ctrNet-tool/')
import ctrNet
import tensorflow as tf
from sklearn.model_selection import train_test_split
from src import misc_utils

In [None]:
# Names of the special (non-feature) columns referenced throughout the notebook.
COLUMN_ID, COLUMN_DT = 'TransactionID', 'TransactionDT'
COLUMN_TARGET, COLUMN_GROUP = 'isFraud', 'DT-M'

# Columns excluded when feature lists are built (passed as `ignore_list` and
# filtered out of `use_cols`).
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP] + ['is_train', 'date']

def filter_feature(path):
    """Return True when `path` contains the filter pattern, else False.

    The pattern is currently the empty string. `str.count('')` is always at
    least 1, so every string path is accepted.
    """
    return bool(path.count(''))

# `glob` is called below but is not imported by any visible cell; import it
# here so the cell also runs on a fresh kernel.
from glob import glob

# Collect the *_train.gz / *_test.gz files matched by each prefix pattern,
# keeping the original pattern order.
paths_train = []
paths_test = []
for prefix in (
    '../submit/re_sub/*',
    '../submit/add_feature/*',
    '../feature/valid_use/531*',
    '../feature/valid_use/532*',
):
    paths_train += glob(f'{prefix}_train.gz')
    paths_test += glob(f'{prefix}_test.gz')

# NOTE(review): these two lists are never filled or read in the visible cells.
paths_train_feature = []
paths_test_feature  = []

df_train = parallel_load_data(paths_train)
df_test  = parallel_load_data(paths_test)

# Split the label off the training frame and remember the number of training
# rows so train and test can be separated again after they are concatenated.
Y = df_train[COLUMN_TARGET]
df_train.drop(COLUMN_TARGET, axis=1, inplace=True)
len_train = df_train.shape[0]

In [12]:
#========================================================================
# Loading Dataset & Bucketing
#========================================================================
def make_bucket(data, num=10):
    """Return `num` upper edges that split `data` into equal-frequency buckets.

    Edge ``i`` is the element at position ``len(data) * (i + 1) // num - 1`` of
    the sorted values, so the last edge is always the maximum of `data`.

    Args:
        data: Non-empty sequence of mutually comparable values. It is not
            modified; sorting happens on a copy.
        num: Number of edges to return.

    Returns:
        A list of `num` non-decreasing edges, usable as `bins` for
        `np.digitize`.

    Raises:
        IndexError: If `data` is empty and `num` is positive.
    """
    ordered = sorted(data)
    size = len(ordered)
    bins = []
    for i in range(num):
        # With fewer values than buckets the raw position is -1 for the leading
        # edges; that would wrap around to the maximum and make the edges
        # non-monotonic. Clamp it to the first element instead.
        position = max(int(size * (i + 1) // num) - 1, 0)
        bins.append(ordered[position])
    return bins

# Stack train on top of test (fresh 0..n-1 index) so each numeric feature is
# bucketed with one shared set of edges; the first `len_train` rows are train.
df_feat = pd.concat([df_train, df_test], axis=0, ignore_index=True)
del df_train, df_test
gc.collect()

cols_num = get_numeric_features(df_feat, ignore_list=COLUMNS_IGNORE)

# NOTE(review): `tqdm` is not imported in any visible cell; it must already be
# in the kernel namespace (e.g. `from tqdm import tqdm`) for this loop to run.
for f in tqdm(cols_num):
    col = df_feat[f]
    # Series.mode() ignores NaN by default, so there is no need to row-filter
    # the whole frame once per feature just to find the most frequent value.
    mode = col.mode()[0]
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which is not guaranteed to update df_feat.
    filled = col.fillna(mode)
    # Replace the raw values with the index of their equal-frequency bucket.
    bins = make_bucket(filled.tolist(), num=50)
    df_feat[f] = np.digitize(filled, bins=bins)

100%|██████████| 141/141 [50:38<00:00, 21.81s/it]


In [13]:
cols_cat = get_categorical_features(df_feat, ignore_list=COLUMNS_IGNORE)

# Fill missing categorical values with 0. The result is assigned back because
# fillna(inplace=True) on a column selection is not guaranteed to write through
# to df_feat (it is a no-op under pandas Copy-on-Write).
for col in cols_cat:
    df_feat[col] = df_feat[col].fillna(0)

# df_feat was built as [train rows, test rows]; split it back by position.
train = df_feat.iloc[:len_train, :]
test = df_feat.iloc[len_train:, :]

# Model inputs: every column that is not an id / time / target / helper column.
use_cols = [col for col in train.columns if col not in COLUMNS_IGNORE]

In [14]:
# Settings for the ctrNet model, collected in one mapping and then expanded
# into the HParams object that ctrNet.build_model() receives.
hparam_settings = dict(
    model='xdeepfm',
    norm=True,
    batch_norm_decay=0.9,
    hidden_size=[128, 128],
    cross_layer_sizes=[128, 128, 128],
    k=8,
    hash_ids=int(2e5),
    batch_size=1024,
    optimizer="adam",
    learning_rate=0.001,
    num_display_steps=1000,
    num_eval_steps=1000,
    epoch=1,
    metric='auc',
    activation=['relu', 'relu', 'relu'],
    cross_activation='identity',
    init_method='uniform',
    init_value=0.1,
    # One input per feature column selected above.
    feature_nums=len(use_cols),
    # Number of folds; read back as hparam.kfold by the split and training cells.
    kfold=5,
)
hparam = tf.contrib.training.HParams(**hparam_settings)
misc_utils.print_hparams(hparam)

  activation=['relu', 'relu', 'relu']
  batch_norm_decay=0.9
  batch_size=1024
  cross_activation=identity
  cross_layer_sizes=[128, 128, 128]
  epoch=1
  feature_nums=148
  hash_ids=200000
  hidden_size=[128, 128]
  init_method=uniform
  init_value=0.1
  k=8
  kfold=5
  learning_rate=0.001
  metric=auc
  model=xdeepfm
  norm=True
  num_display_steps=1000
  num_eval_steps=1000
  optimizer=adam


# Training model

In [20]:
# Attach the label by position, not by index label: the rows of `train` are
# the first `len_train` rows of the re-indexed concatenation, in the same
# order as `Y`, whereas the original index of `Y` is not guaranteed to be
# 0..n-1. `assign` returns new frames, so nothing is written onto the `iloc`
# slices of df_feat (no SettingWithCopyWarning) and the cell can be re-run.
train = train.assign(**{COLUMN_TARGET: np.asarray(Y)})
# The test rows have no label; the column exists only so both frames share a schema.
test = test.assign(**{COLUMN_TARGET: np.nan})

In [None]:
# Randomly partition the row positions of `train` into hparam.kfold disjoint
# folds. Every fold is stored as a list of positions; the last fold takes
# whatever rows remain, so all rows are covered exactly once.
index = set(range(train.shape[0]))
# Integer division avoids float rounding in the fold size.
fold_size = train.shape[0] // hparam.kfold
K_fold = []
for i in range(hparam.kfold):
    # random.sample() no longer accepts a set (deprecated in Python 3.9,
    # TypeError from 3.11), so draw from a sorted list of the remaining rows.
    remaining = sorted(index)
    if i == hparam.kfold - 1:
        fold = remaining
    else:
        fold = random.sample(remaining, fold_size)
    index = index - set(fold)
    print("Number:", len(fold))
    K_fold.append(fold)

In [39]:
# K-fold training.
#   y_pred: out-of-fold prediction for every training row (each row is scored
#           once, by the model that did not train on it).
#   y_test: test prediction averaged over the hparam.kfold models.
# NOTE(review): assumes model.infer() returns a 1-D array with one score per
# input row, as the original indexing into y_pred implied; confirm against ctrNet.
y_pred = np.zeros(train.shape[0])
y_test = np.zeros(test.shape[0])
for i in range(hparam.kfold):
    print("Fold", i)

    # list() so random.sample() also works if a fold was stored as a set.
    fold_index = list(K_fold[i])
    # Only a 10% sample of the held-out fold is used for the periodic
    # evaluation during training, which keeps those evaluations cheap.
    dev_index = random.sample(fold_index, int(0.1 * len(fold_index)))
    train_index = []
    for j in range(hparam.kfold):
        if j != i:
            train_index += K_fold[j]

    x_train = train.iloc[train_index][use_cols]
    y_train = train.iloc[train_index][COLUMN_TARGET]
    x_val = train.iloc[dev_index][use_cols]
    y_val = train.iloc[dev_index][COLUMN_TARGET]
    x_fold = train.iloc[fold_index][use_cols]
    y_fold = train.iloc[fold_index][COLUMN_TARGET]

    model = ctrNet.build_model(hparam)
    model.train(train_data=(x_train, y_train), dev_data=(x_val, y_val))
    print("Training Done! Inference...")

    # Score the complete held-out fold and store it undivided: each training
    # row belongs to exactly one fold, so there is nothing to average here.
    y_pred[fold_index] = model.infer(dev_data=(x_fold, y_fold))
    # Every fold's model scores the test set; average those predictions.
    y_test += model.infer(dev_data=(test[use_cols], test[COLUMN_TARGET])) / hparam.kfold

Fold 0
# Trainable variables
  emb_v1:0, (200000, 1), 
  emb_v2:0, (200000, 8), 
  Variable:0, (1184, 128), 
  norm_0/beta:0, (128,), 
  norm_0/gamma:0, (128,), 
  Variable_1:0, (128, 128), 
  norm_1/beta:0, (128,), 
  norm_1/gamma:0, (128,), 
  Variable_2:0, (128, 1), 
  exfm_part/f_0:0, (1, 21904, 128), 
  exfm_part/f_1:0, (1, 9472, 128), 
  exfm_part/f_2:0, (1, 9472, 128), 
  exfm_part/w_nn_output:0, (256, 1), 
  exfm_part/b_nn_output:0, (1,), 
  epoch 0 step 1000 lr 0.001 logloss 0.619496 gN 0.30, Tue Mar 12 22:13:13 2019
# Epcho-time 223.05s Eval AUC 0.721773. Best AUC 0.721773.
  epoch 0 step 2000 lr 0.001 logloss 0.605771 gN 0.24, Tue Mar 12 22:17:28 2019
# Epcho-time 477.70s Eval AUC 0.729980. Best AUC 0.729980.
  epoch 0 step 3000 lr 0.001 logloss 0.602579 gN 0.23, Tue Mar 12 22:21:42 2019
# Epcho-time 732.48s Eval AUC 0.732971. Best AUC 0.732971.
  epoch 0 step 4000 lr 0.001 logloss 0.600521 gN 0.22, Tue Mar 12 22:25:58 2019
# Epcho-time 988.38s Eval AUC 0.734562. Best AUC 0.

In [41]:
# Write the test predictions accumulated by the training loop to a file whose
# name carries the timestamp of this run.
test_pred_path = f'../output/pred_result/{start_time}_ieee__test_oof_Xdeepfm'
to_pkl_gzip(obj=y_test, path=test_pred_path)

0.502331614968106


KeyError: "['MachineIdentifier'] not in index"

(7853253,)