In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

train_paths = glob('../feature/eda_base/*_train.gz')
test_paths = glob('../feature/eda_base/*_test.gz')

train_paths = [path for path in train_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count('V')
               or path.count('Product')
              ]
test_paths = [path for path in test_paths 
               if path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count('V')
               or path.count('Product')
              ]

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
if COLUMN_ID in data.columns:
    data.set_index(COLUMN_ID, inplace=True)

base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test], axis=0)

cols_num = get_numeric_features(data, COLUMNS_IGNORE)
cols_num = [col for col in cols_num if col.count('C') or col.count('D')]
data[cols_num] = data[cols_num].astype('float32')

df_user_id_ca = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr.csv').set_index(COLUMN_ID)
df_user_id_cap = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr_pemail.csv').set_index(COLUMN_ID)
df_user_id_capm = pd.read_csv('../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv').set_index(COLUMN_ID)
df_user_id_bear = pd.read_csv('../output/same_user_pattern/20190901_user_ids_share.csv').set_index(COLUMN_ID)

data['user_id_card_addr'] = df_user_id_ca['predicted_user_id']
data['user_id_card_addr_pemail'] = df_user_id_cap['predicted_user_id']
data['user_id_card_addr_pemail_M'] = df_user_id_capm['predicted_user_id']
data['user_id_bear'] = df_user_id_bear['predicted_user_id']

In [3]:
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

data['datetime'] = data['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x) - datetime.timedelta(seconds = 14400) ))
data['datetime'].fillna(datetime.date(2020, 1, 1), inplace=True)
data['date'] = data['datetime'].map(lambda x: x.date())

In [4]:
#========================================================================
# C同士のdiff, ratio
# 類似してるCでまとめる
#========================================================================
feim = read_pkl_gzip('../output/feature_importances/20190905_1024__CV0-9434494228779833__feature155.gz')
cols_V = []
for v in feim.loc[[i for i in feim.index if i.count('V')]].index:
    cols_V.append(v.replace('raw__', ''))
cols_V

['V187',
 'V259',
 'V265',
 'V294',
 'V308',
 'V258',
 'V307',
 'V317',
 'V315',
 'V313',
 'V310',
 'V70',
 'V283',
 'V87',
 'V91',
 'V45',
 'V156',
 'V312',
 'V62',
 'V127',
 'V314',
 'V130',
 'V320',
 'V54',
 'V306',
 'V83',
 'V48',
 'V76',
 'V90',
 'V282',
 'V29',
 'V128',
 'V61',
 'V131',
 'V53',
 'V49',
 'V38',
 'V133',
 'V281',
 'V78',
 'V44',
 'V165',
 'V267',
 'V37',
 'V67',
 'V318',
 'V94']

In [43]:
v_set1 = ['V187', 'V67', 'V267']
v_set2 = ['V37', 'V38', 'V44', 'V45']
v_set3 = ['V62', 'V61', 'V83']
v_set4 = ['V281', 'V282', 'V283']
v_set5 = ['V306', 'V307', 'V308', 'V317', 'V318', 'V320', 'V127', 'V128', 'V133', 'V265', 'V294']
v_set6 = ['V90', 'V91', 'V49', 'V48', 'V70', 'V29']
v_set7 = ['V310', 'V311', 'V312', 'V312', 'V313', 'V314', 'V315']
v_set8 = ['V87', 'V78', 'V258', 'V259']
v_set9 = ['V267', 'V130', 'V131']
remain = list(set(cols_V) -set(v_set1) - set(v_set2) - set(v_set3)  - set(v_set4) - set(v_set5)  - set(v_set6) - set(v_set7) - set(v_set8))

In [49]:
from sklearn.decomposition import PCA

list_v = [
    v_set1,
    v_set2,
    v_set3,
    v_set4,
    v_set5,
    v_set6,
    v_set7,
    v_set8,
    v_set9,
    remain,
]

for v_set in list_v:

    pca = PCA(n_components=1)
    data[v_set] = data[v_set].fillna(-1)
    pca.fit(data[v_set])
    v_name = '-'.join(v_set)
    data[f"pca__{v_name}"] = pca.transform(data[v_set])

In [52]:
#========================================================================
# ProductCDあたりのC
#========================================================================
prefix = '510'
dir_save = 'valid'
cols_feature = [col for col in data.columns if col.count('pca__')]

for col in tqdm(cols_feature):
    train = data.iloc[:len(base_train)]
    test  = data.iloc[len(base_train):]
    
    save_feature(train[[col]], prefix, dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(test[ [col]], prefix, dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

  0%|          | 0/10 [00:00<?, ?it/s]

(590540,) | pca__V187-V67-V267
(506691,) | pca__V187-V67-V267


 10%|█         | 1/10 [00:02<00:25,  2.86s/it]

(590540,) | pca__V37-V38-V44-V45
(506691,) | pca__V37-V38-V44-V45


 20%|██        | 2/10 [00:05<00:23,  2.88s/it]

(590540,) | pca__V62-V61-V83
(506691,) | pca__V62-V61-V83


 30%|███       | 3/10 [00:08<00:20,  2.86s/it]

(590540,) | pca__V281-V282-V283
(506691,) | pca__V281-V282-V283


 40%|████      | 4/10 [00:11<00:17,  2.88s/it]

(590540,) | pca__V306-V307-V308-V317-V318-V320-V127-V128-V133-V265-V294
(506691,) | pca__V306-V307-V308-V317-V318-V320-V127-V128-V133-V265-V294


 50%|█████     | 5/10 [00:14<00:14,  2.92s/it]

(590540,) | pca__V90-V91-V49-V48-V70-V29
(506691,) | pca__V90-V91-V49-V48-V70-V29


 60%|██████    | 6/10 [00:17<00:11,  2.90s/it]

(590540,) | pca__V310-V311-V312-V312-V313-V314-V315
(506691,) | pca__V310-V311-V312-V312-V313-V314-V315


 70%|███████   | 7/10 [00:20<00:08,  2.91s/it]

(590540,) | pca__V87-V78-V258-V259
(506691,) | pca__V87-V78-V258-V259


 80%|████████  | 8/10 [00:23<00:05,  2.91s/it]

(590540,) | pca__V267-V130-V131
(506691,) | pca__V267-V130-V131


 90%|█████████ | 9/10 [00:26<00:02,  2.90s/it]

(590540,) | pca__V54-V76-V165-V130-V156-V131-V94-V53
(506691,) | pca__V54-V76-V165-V130-V156-V131-V94-V53


100%|██████████| 10/10 [00:29<00:00,  2.94s/it]
