In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename, logger_func, timer
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature
logger = logger_func()

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_fixlen_feature_names
import torch

2019-09-23 14:30:41,106 func.utils 347 [INFO]    [logger_func] start 


In [3]:
# Scratch data: a small float array containing +inf, used by the next cell to try out inf replacement.
a = np.array([np.inf, 1, 0])

In [5]:
# Replace entries equal to +inf with 2 and keep every other entry as-is
# (displays array([2., 1., 0.]) for the scratch array `a`).
np.where(a==np.inf, 2, a)

array([2., 1., 0.])

In [2]:
# Column names shared across the notebook.
COLUMN_ID     = 'TransactionID'
COLUMN_DT     = 'TransactionDT'
COLUMN_TARGET = 'isFraud'   # label column, split off from the training frame after loading
COLUMN_GROUP  = 'DT-M'      # group label column used for the GroupKFold split

# Columns that are not meant to be model inputs.
# NOTE(review): this list is defined here but not referenced in the visible cells - confirm usage.
COLUMNS_IGNORE = [
    COLUMN_ID,
    COLUMN_DT,
    COLUMN_TARGET,
    COLUMN_GROUP,
    'is_train',
    'date',
]

def filter_feature(path):
    """Tell whether a feature file path passes the filter.

    The check counts occurrences of the empty string in ``path`` and returns
    ``True`` when that count is non-zero. For any ``str`` the count is
    ``len(path) + 1``, so every string path is accepted.

    NOTE(review): the empty pattern looks like a placeholder for a substring
    filter - confirm the intended pattern.

    Args:
        path: Object exposing ``count``; in practice a feature file path string.

    Returns:
        bool: ``True`` if ``path.count('')`` is non-zero, ``False`` otherwise.
    """
    return bool(path.count(''))

# Locate the feature files of each split.
paths_train = glob('../feature/raw_use/*_train.gz')
paths_test  = glob('../feature/raw_use/*_test.gz')

# Alternative feature sources, currently disabled:
# paths_train = glob('../submit/re_sub/*_train.gz')
# paths_test  = glob('../submit/re_sub/*_test.gz')
# paths_train += glob('../submit/add_feature/*_train.gz')
# paths_test  += glob('../submit/add_feature/*_test.gz')
# paths_train += glob('../feature/valid_use/531*_train.gz')
# paths_test  += glob('../feature/valid_use/531*_test.gz')
# paths_train += glob('../feature/valid_use/532*_train.gz')
# paths_test  += glob('../feature/valid_use/532*_test.gz')
# paths_train += glob('../feature/valid_trush/532*uid3*_train.gz')
# paths_test  += glob('../feature/valid_trush/532*uid3*_test.gz')

# Placeholders for extra feature paths (left empty here).
paths_train_feature, paths_test_feature = [], []

# Load train first, then test, through the project loader.
df_train, df_test = (
    parallel_load_data(split_paths) for split_paths in (paths_train, paths_test)
)

# Keep the label aside and remove it from the training features in place.
Y = df_train[COLUMN_TARGET]
df_train.drop(columns=[COLUMN_TARGET], inplace=True)

In [None]:
# Stack train and test rows into one frame so that imputation, label encoding
# and scaling below are computed over both splits together.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# No sparse (categorical) inputs are configured; every column is treated as dense.
sparse_features = []
dense_features = data.columns

data[sparse_features] = data[sparse_features].fillna('-1', )

with timer("Fill Na"):
    # Impute each dense column with ITS OWN mean.
    # The previous version filled the whole dense frame with the mean of the
    # current column on every iteration, so after the first pass every NaN in
    # every column held the first column's mean (and each further iteration
    # was a wasted full-frame fillna).
    for col in tqdm(dense_features):
        avg = data[col].mean()
        data[col] = data[col].fillna(avg)

target = [COLUMN_TARGET]

# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])


with timer("Min Max Scalar"):
    mms = MinMaxScaler(feature_range=(0, 1))
    # Drop the ratio feature(s) whose name contains this pattern before scaling.
    # NOTE(review): presumably excluded because they contain inf values that the
    # scaler rejects - confirm. From here on `dense_features` is a plain list.
    dense_features = [col for col in dense_features if '513__D2-D4__ratio__ProductCD-W' not in col]
    data[dense_features] = mms.fit_transform(data[dense_features])

In [None]:
# 2.count #unique features for each sparse field,and record dense feature field name

# Sparse fields first (vocabulary size = number of distinct values in `data`),
# followed by one 1-dimensional dense field per dense feature.
fixlen_feature_columns = [
    SparseFeat(name, data[name].nunique()) for name in sparse_features
]
fixlen_feature_columns.extend(DenseFeat(name, 1) for name in dense_features)

# The DNN part and the linear part share the very same column definitions.
dnn_feature_columns = linear_feature_columns = fixlen_feature_columns

# Names used further down to pick the model input columns.
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)

In [None]:
from sklearn.model_selection import GroupKFold

# Split the preprocessed frame back into its train and test parts.
# The split point is captured before `df_train` is rebound so both slices
# provably use the same boundary.
n_train = len(df_train)
# `.copy()` makes the training part an independent frame: a column is added to
# it below, and assigning into a bare `iloc` slice of `data` triggers pandas'
# SettingWithCopyWarning. This costs one extra copy of the training rows.
df_train = data.iloc[:n_train].copy()
df_test  = data.iloc[n_train:]

n_splits = 6
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
# Precomputed group labels read from a pickle on disk.
# NOTE(review): presumably one label per training row, in row order - confirm.
# If this is a pandas Series, the assignment below aligns on the index
# rather than on position.
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group

# Materialise the (train_idx, valid_idx) pairs once so later cells can reuse them.
kfold = list(GroupKFold(n_splits=n_splits).split(df_train, Y, df_train[COLUMN_GROUP]))
print("Set Kfold")

In [None]:
# 3.generate input data for model

# Only the dense columns are fed to the model.
# NOTE(review): `fixlen_feature_names` would also list sparse fields if any are
# configured; `use_cols` would then have to include them too.
use_cols = dense_features
oof_pred = np.zeros(df_train.shape[0])   # out-of-fold predictions, one slot per training row
x_test = df_test[use_cols]
test_preds = []                          # one test prediction array per fold
test_model_input = [x_test[name] for name in fixlen_feature_names]

# Device selection does not depend on the fold, so it is resolved once up front.
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

for n_fold, (trn_idx, val_idx) in enumerate(kfold):

    with timer("Preset Dataset."):
        x_train = df_train.iloc[trn_idx][use_cols]
        y_train = Y.iloc[trn_idx]
        x_valid = df_train.iloc[val_idx][use_cols]
        y_valid = Y.iloc[val_idx]

        # The model takes one input per feature, ordered by feature name.
        train_model_input = [x_train[name] for name in fixlen_feature_names]
        valid_model_input = [x_valid[name] for name in fixlen_feature_names]

    # 4.Define Model,train,predict and evaluate

    print("Start Train and Predict.")
    with timer("Fitting"):

        # A fresh model is built for every fold.
        model = DeepFM(
            linear_feature_columns=linear_feature_columns,
            dnn_feature_columns=dnn_feature_columns,
            task='binary',
            l2_reg_embedding=1e-5,
            device=device
        )

        model.compile(
            "adagrad",
            "binary_crossentropy",
            metrics=["binary_crossentropy", "auc"],
        )
        # validation_split=0.0: the held-out fold is scored explicitly below
        # instead of being carved out of the training rows.
        model.fit(
            train_model_input,
            y_train.values,
            batch_size=1024,
            epochs=7,
            validation_split=0.0,
            verbose=2
        )

    with timer("Predict"):

        pred_ans = model.predict(valid_model_input, 256)
        # `predict` returns a column array of shape (n, 1). Writing that into the
        # 1-D slice `oof_pred[val_idx]` cannot be broadcast and raises
        # ValueError, so a flattened view is used for the metrics and the
        # out-of-fold store.
        valid_pred = np.ravel(pred_ans)
        print("")
        print("test LogLoss", round(log_loss(y_valid, valid_pred), 4))
        print("test AUC", round(roc_auc_score(y_valid.values, valid_pred), 4))

        oof_pred[val_idx] = valid_pred

        # Test predictions are stored exactly as returned by `predict`.
        test_pred = model.predict(test_model_input, 256)
        test_preds.append(test_pred)

In [35]:
# Inspect the validation predictions left in `pred_ans` by the training loop;
# the displayed array is 2-D with a single column (float32).
pred_ans

array([[0.01531531],
       [0.01715467],
       [0.00604828],
       ...,
       [0.02184101],
       [0.00815489],
       [0.01425854]], dtype=float32)