In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
from pathlib import Path
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature

In [2]:
# Names of the bookkeeping columns of the dataset.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
# Label column; it is the value averaged by the target-encoding cell below.
COLUMN_TARGET = 'isFraud'

# Columns handed to save_feature(list_ignore=...) so they are not treated as
# features (NOTE(review): exact handling lives in func.ml_utils — confirm).
COLUMNS_IGNORE = [
    COLUMN_ID,
    COLUMN_DT,
    COLUMN_TARGET,
    'is_train',
    'date',
]

def filter_feature(path):
    """Decide whether the feature file at ``path`` should be loaded.

    The needle passed to ``count`` is the empty string, and for a ``str``
    that count is always ``len(path) + 1`` (never zero), so this filter
    currently keeps every path. Put a real substring in the needle to
    restrict which feature files are loaded.

    Args:
        path: Feature file path (a ``str`` as returned by ``glob``).

    Returns:
        bool: ``True`` when the needle occurs in ``path`` at least once.
    """
    return bool(path.count(''))


# Load every cached base feature for train and test.
# glob() returns its matches in arbitrary, filesystem-dependent order, so the
# paths are sorted to make the load order identical on every run and machine.
# NOTE(review): presumably this also fixes the column order of the loaded
# frames (and therefore the order in which features are processed below) —
# confirm against parallel_load_data.
paths_train = sorted(glob('../feature/eda_base/*_train.gz'))
paths_train = [path for path in paths_train if filter_feature(path)]
df_train = parallel_load_data(paths_train)

paths_test = sorted(glob('../feature/eda_base/*_test.gz'))
paths_test = [path for path in paths_test if filter_feature(path)]
df_test = parallel_load_data(paths_test)

# Attach the pre-computed group label to the training rows; the
# target-encoding cell uses this column as its hold-out key.
# NOTE(review): assumes `group` is row-aligned with df_train — confirm.
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
COLUMN_GROUP = 'DT-M'
df_train[COLUMN_GROUP] = group

In [4]:
#========================================================================
# Target Encoding
# The held-out month's own target is treated as unknown (null): the
# encoding attached to the rows of a train month is computed from the
# other months only, and the test encoding from all train months.
#========================================================================
dir_save = 'valid'

# Weights used for the 'test' pass and for any group label that is not listed
# in MONTHS_ORDERED: the later the month, the larger its weight.
DEFAULT_WEIGHT_MAP = {
    '2017-12': 0.25,
    '2018-1' : 0.40,
    '2018-2' : 0.55,
    '2018-3' : 0.70,
    '2018-4' : 0.85,
    '2018-5' : 1.0,
}
# Train months in chronological order, and the weights handed out to the
# remaining months when one of them is held out.
MONTHS_ORDERED = ['2017-12', '2018-1', '2018-2', '2018-3', '2018-4', '2018-5']
HELDOUT_WEIGHTS = [1.0, 0.8, 0.6, 0.4, 0.2]


def heldout_weight_map(dtm):
    """Return the month -> weight mapping used while ``dtm`` is held out.

    The remaining months are ranked "preceding months, nearest first, then
    following months, nearest first" and receive 1.0, 0.8, 0.6, 0.4, 0.2 in
    that order; e.g. holding out '2018-3' gives
    {'2018-2': 1.0, '2018-1': 0.8, '2017-12': 0.6, '2018-4': 0.4, '2018-5': 0.2}.
    This reproduces the per-month tables that were previously spelled out as
    an if/elif chain.

    Args:
        dtm: Held-out group label, or 'test' (or any label not present in
            ``MONTHS_ORDERED``).

    Returns:
        dict: A fresh mapping; a copy of ``DEFAULT_WEIGHT_MAP`` when ``dtm``
        is not one of ``MONTHS_ORDERED``.
    """
    if dtm not in MONTHS_ORDERED:
        return dict(DEFAULT_WEIGHT_MAP)
    pos = MONTHS_ORDERED.index(dtm)
    nearest_first = MONTHS_ORDERED[:pos][::-1] + MONTHS_ORDERED[pos + 1:]
    return dict(zip(nearest_first, HELDOUT_WEIGHTS))


cols_addr = [col for col in df_train.columns if col.startswith('addr')]
cols_DTM = df_train[COLUMN_GROUP].unique().tolist()
print(cols_addr)

for col in tqdm(cols_addr):
    base_train = df_train[[col, COLUMN_GROUP]]
    base_test = df_test[[col]]
    # The aggregations only read these three columns, so slice them once per
    # feature instead of copying the whole training frame for every month.
    source = df_train[[COLUMN_GROUP, col, COLUMN_TARGET]]
    name_mean = f'{col}_mean'                # per (month, value) fraud rate
    name_all = f'{col}_fraud_all_mean'       # per value fraud rate, all visible months
    list_dtm = []

    for dtm in cols_DTM + ['test']:
        weight_map = heldout_weight_map(dtm)

        # Hide the held-out month; the 'test' pass sees every train month.
        if dtm != 'test':
            df = source[source[COLUMN_GROUP] != dtm]
        else:
            df = source

        # mean() + reset_index(name=...) instead of agg({new_name: 'mean'}):
        # the dict-renaming form of SeriesGroupBy.agg was removed in pandas 1.0.
        tmp = (
            df.groupby([COLUMN_GROUP, col])[COLUMN_TARGET]
            .mean()
            .reset_index(name=name_mean)
        )
        tmp_all = (
            df.groupby(col)[COLUMN_TARGET]
            .mean()
            .reset_index(name=name_all)
        )
        # One row per (value, visible month), carrying the overall rate too.
        merge = tmp_all.merge(tmp, how='left', on=col)

        # Monthly rate relative to the overall rate. The "+ 1" in this and the
        # other denominators is kept as-is (presumably a guard against
        # division by zero — note that it also compresses the ratio).
        merge['ratio'] = merge[name_mean] / (merge[name_all] + 1)
        merge['no_fraud'] = (merge[name_mean] == 0)

        # Weighted Mean
        merge['weight'] = merge[COLUMN_GROUP].map(weight_map)
        merge[f'{col}_weight'] = merge['weight'] * merge[name_mean]
        merge['ratio_weight'] = merge['weight'] * merge['ratio']

        merge = merge.set_index(col)
        # One row per value, seeded with the overall rate. tmp_all already has
        # exactly that shape, so no drop_duplicates over `merge` is needed.
        tmp_base = tmp_all.set_index(col)

        # Build the per-value groupby once, after every derived column exists;
        # each result below is aligned to tmp_base through the shared index.
        by_key = merge.groupby(col)
        prob = by_key[name_mean]
        ratio = by_key['ratio']
        prob_max, prob_min = prob.max(), prob.min()
        ratio_max, ratio_min = ratio.max(), ratio.min()
        weight_sum = by_key['weight'].sum()

        # Number of visible months in which the value had no fraud at all.
        tmp_base[f'{col}_no_fraud_sum'] = by_key['no_fraud'].sum()

        tmp_base[f'{col}_monthly_fraud_prob_max_min_diff'] = prob_max - prob_min
        tmp_base[f'{col}_monthly_fraud_prob_max_min_ratio'] = prob_max / (prob_min + 1)
        tmp_base[f'{col}_monthly_fraud_prob_std'] = prob.std()

        tmp_base[f'{col}_monthly_fraud_ratio_max_min_diff'] = ratio_max - ratio_min
        tmp_base[f'{col}_monthly_fraud_ratio_max_min_ratio'] = ratio_max / (ratio_min + 1)
        tmp_base[f'{col}_monthly_fraud_ratio_std'] = ratio.std()

        tmp_base[f'{col}_fraud_prob_weighted_mean'] = by_key[f'{col}_weight'].sum() / weight_sum
        tmp_base[f'{col}_fraud_prob_trend'] = tmp_base[f'{col}_fraud_prob_weighted_mean'] / (tmp_base[name_all] + 1)

        tmp_base[f'{col}_fraud_ratio_weighted_mean'] = by_key['ratio_weight'].sum() / weight_sum

        # std over a single month and 0/0 weighted means are NaN; store 0.
        tmp_base = tmp_base.fillna(0)

        if dtm != 'test':
            # Tag the table with the month whose rows it will be joined to.
            tmp_base[COLUMN_GROUP] = dtm
            list_dtm.append(tmp_base)
        else:
            test_TE = tmp_base

    train_TE = pd.concat(list_dtm, axis=0)
    # Train rows pick up the table computed without their own month; test rows
    # pick up the table computed from all train months.
    result_train = base_train.merge(train_TE, how='left', on=[col, COLUMN_GROUP])
    result_test = base_test.merge(test_TE, how='left', on=[col])

    cols_save = [name for name in result_train.columns if '_fraud_' in name]

    save_feature(result_train[cols_save], '519', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
    save_feature(result_test[cols_save],  '519', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

  0%|          | 0/2 [00:00<?, ?it/s]

['addr2', 'addr1']
(590540,) | addr2_fraud_all_mean
(590540,) | addr2_no_fraud_sum
(590540,) | addr2_monthly_fraud_prob_max_min_diff
(590540,) | addr2_monthly_fraud_prob_max_min_ratio
(590540,) | addr2_monthly_fraud_prob_std
(590540,) | addr2_monthly_fraud_ratio_max_min_diff
(590540,) | addr2_monthly_fraud_ratio_max_min_ratio
(590540,) | addr2_monthly_fraud_ratio_std
(590540,) | addr2_fraud_prob_weighted_mean
(590540,) | addr2_fraud_prob_trend
(590540,) | addr2_fraud_ratio_weighted_mean
(506691,) | addr2_fraud_all_mean
(506691,) | addr2_no_fraud_sum
(506691,) | addr2_monthly_fraud_prob_max_min_diff
(506691,) | addr2_monthly_fraud_prob_max_min_ratio
(506691,) | addr2_monthly_fraud_prob_std
(506691,) | addr2_monthly_fraud_ratio_max_min_diff
(506691,) | addr2_monthly_fraud_ratio_max_min_ratio
(506691,) | addr2_monthly_fraud_ratio_std
(506691,) | addr2_fraud_prob_weighted_mean
(506691,) | addr2_fraud_prob_trend
(506691,) | addr2_fraud_ratio_weighted_mean


 50%|█████     | 1/2 [00:13<00:13, 13.66s/it]

(590540,) | addr1_fraud_all_mean
(590540,) | addr1_no_fraud_sum
(590540,) | addr1_monthly_fraud_prob_max_min_diff
(590540,) | addr1_monthly_fraud_prob_max_min_ratio
(590540,) | addr1_monthly_fraud_prob_std
(590540,) | addr1_monthly_fraud_ratio_max_min_diff
(590540,) | addr1_monthly_fraud_ratio_max_min_ratio
(590540,) | addr1_monthly_fraud_ratio_std
(590540,) | addr1_fraud_prob_weighted_mean
(590540,) | addr1_fraud_prob_trend
(590540,) | addr1_fraud_ratio_weighted_mean
(506691,) | addr1_fraud_all_mean
(506691,) | addr1_no_fraud_sum
(506691,) | addr1_monthly_fraud_prob_max_min_diff
(506691,) | addr1_monthly_fraud_prob_max_min_ratio
(506691,) | addr1_monthly_fraud_prob_std
(506691,) | addr1_monthly_fraud_ratio_max_min_diff
(506691,) | addr1_monthly_fraud_ratio_max_min_ratio
(506691,) | addr1_monthly_fraud_ratio_std
(506691,) | addr1_fraud_prob_weighted_mean
(506691,) | addr1_fraud_prob_trend
(506691,) | addr1_fraud_ratio_weighted_mean


100%|██████████| 2/2 [00:30<00:00, 14.59s/it]
