In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days, date_add_times
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
# Names of the bookkeeping columns; none of these are treated as features.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
COLUMNS_IGNORE = [
    COLUMN_ID,
    COLUMN_DT,
    COLUMN_TARGET,
    COLUMN_GROUP,
    'is_train',
    'date',
]

# Feature files per split: every 524__fill* file whose path contains 'cnt_',
# followed by the isFraud target file.
# Disabled: the 524__bin*_train.gz / 524__bin*_test.gz files under
# ../feature/create/ are currently not loaded.
paths_train = [
    path
    for path in glob('../feature/create/524__fill*_train.gz')
    if 'cnt_' in path
] + glob('../feature/eda_base/isFraud_train.gz')

paths_test = [
    path
    for path in glob('../feature/create/524__fill*_test.gz')
    if 'cnt_' in path
] + glob('../feature/eda_base/isFraud_test.gz')

# Load every listed file into one frame per split.
df_train = parallel_load_data(paths_train)
df_test = parallel_load_data(paths_test)

In [3]:
# Preview the first rows of the loaded training frame.
df_train.head()

Unnamed: 0,524__fill__cnt_C14_fraud_mean,524__fill__cnt_C11_fraud_mean,524__fill__cnt_V133_fraud_mean,524__fill__cnt_card4_fraud_mean,524__fill__cnt_TransactionAmt_fraud_mean,524__fill__cnt_C7_fraud_mean,524__fill__cnt_card5_fraud_mean,524__fill__cnt_addr1_fraud_mean,524__fill__cnt_V53_fraud_mean,524__fill__cnt_V67_fraud_mean,...,524__fill__cnt_V127_fraud_mean,524__fill__cnt_cents_fraud_mean,524__fill__cnt_V187_fraud_mean,524__fill__cnt_V49_fraud_mean,524__fill__cnt_C3_fraud_mean,524__fill__cnt_V4_fraud_mean,524__fill__cnt_V283_fraud_mean,524__fill__cnt_P_emaildomain_fraud_mean,524__fill__cnt_V294_fraud_mean,isFraud
0,0.030418,0.041655,0.056426,0.099153,0.035139,0.02638,0.04918,0.01717,0.020873,0.029015,...,0.028004,0.019868,0.021483,0.045776,0.037935,0.018964,0.040691,0.031235,0.097308,0
1,0.030418,0.02885,0.03088,0.035718,0.019681,0.02638,0.070953,0.025597,0.046045,0.029015,...,0.036609,0.041006,0.021483,0.053368,0.037935,0.06442,0.040691,0.04623,0.029483,0
2,0.030418,0.02885,0.03088,0.037841,0.021559,0.02638,0.011701,0.036783,0.020873,0.029015,...,0.036609,0.041006,0.021483,0.053368,0.037935,0.018964,0.040691,0.102399,0.029483,0
3,0.030418,0.02885,0.1989,0.035718,0.043227,0.02638,0.013251,0.034418,0.020873,0.029015,...,0.081336,0.041006,0.021483,0.053368,0.037935,0.06442,0.01673,0.024885,0.105263,0
4,0.030418,0.02885,0.03088,0.035718,0.043227,0.02638,0.070953,0.032592,0.10475,0.10475,...,0.036609,0.041006,0.075021,0.045776,0.037935,0.06442,0.040691,0.04623,0.029483,0


In [9]:
# Feature columns to combine: every column except the bookkeeping ones and the
# 'all_score' column this cell creates. Excluding 'all_score' keeps the cell
# idempotent: without it, re-running would fold the previous score into
# use_cols and into the new score.
use_cols = [
    col for col in df_train.columns
    if col not in COLUMNS_IGNORE and col != 'all_score'
]

# Combined score per row: product of (1 + feature value) over all feature columns.
# (A row-sum variant used to be computed first but was immediately overwritten.)
df_train['all_score'] = (df_train[use_cols] + 1).product(axis=1)

In [6]:
def _cnt_cols_matching(*needles):
    """Return the sorted names in use_cols that contain 'cnt' and at least one of needles.

    Matching is plain, case-sensitive substring containment.
    """
    return sorted(
        name for name in use_cols
        if 'cnt' in name and any(needle in name for needle in needles)
    )


# Count-feature columns grouped by the substring found in their name.
# NOTE(review): 'C' and 'V' are bare substrings, so any column name containing
# that capital letter lands in the group — confirm this is intended.
cols_cnt_card = _cnt_cols_matching('card')
cols_cnt_addr = _cnt_cols_matching('addr')
cols_cnt_C = _cnt_cols_matching('C')
cols_cnt_V = _cnt_cols_matching('V')
cols_cnt_Amt = _cnt_cols_matching('TransactionAmt', 'cents')

In [10]:
# Use the fully-qualified option name. pandas resolves option names by pattern,
# and the bare 'max_rows' is ambiguous on versions that also define
# 'styler.render.max_rows', which raises OptionError.
pd.set_option('display.max_rows', 500)

# Write the combined score next to the target label, without the index column.
df_train[['all_score', COLUMN_TARGET]].to_csv('../output/fraud_score.csv', index=False)