In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
# Load a saved feature-importance table; only its index (feature names) is used here.
feim = read_pkl_gzip('../output/feature_importances/20190905_1024__CV0-9434494228779833__feature155.gz')
# Keep the index labels that contain "V", then remove every 'raw__' substring from them.
v_labels = [label for label in feim.index if label.count('V')]
cols_V = [label.replace('raw__', '') for label in feim.loc[v_labels].index]
# cols_V

In [3]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one of these substrings.
_PATH_KEYWORDS = (COLUMN_DT, COLUMN_ID, 'time_zone', 'hour', 'Product', 'V')


def _is_wanted_path(path):
    """Return True when `path` contains any of the substrings in _PATH_KEYWORDS."""
    return any(keyword in path for keyword in _PATH_KEYWORDS)


train_paths = list(filter(_is_wanted_path, glob('../feature/eda_base/*_train.gz')))
test_paths = list(filter(_is_wanted_path, glob('../feature/eda_base/*_test.gz')))

# Load the selected feature files and stack train on top of test (row order: train, then test).
df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], ignore_index=True)
if COLUMN_ID in data.columns:
    data = data.set_index(COLUMN_ID)

# Base frames, indexed by COLUMN_ID.
base_train = read_pkl_gzip('../input/base_train.gz').set_index(COLUMN_ID)
base_test = read_pkl_gzip('../input/base_test.gz').set_index(COLUMN_ID)
base = pd.concat([base_train, base_test])

# Cast the numeric columns whose name contains "C" or "D" to float32.
cols_num = [
    name for name in get_numeric_features(data, COLUMNS_IGNORE)
    if 'C' in name or 'D' in name
]
data[cols_num] = data[cols_num].astype('float32')


def _read_user_ids(csv_path):
    """Read one user-id CSV and index it by COLUMN_ID."""
    return pd.read_csv(csv_path).set_index(COLUMN_ID)


df_user_id_ca = _read_user_ids('../output/same_user_pattern/0903__same_user_id__card_addr.csv')
df_user_id_cap = _read_user_ids('../output/same_user_pattern/0903__same_user_id__card_addr_pemail.csv')
df_user_id_capm = _read_user_ids('../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv')
df_user_id_bear = _read_user_ids('../output/same_user_pattern/20190901_user_ids_share.csv')

# Attach each 'predicted_user_id' series to `data`; the assignment aligns on the index.
for user_col, user_frame in (
    ('user_id_card_addr', df_user_id_ca),
    ('user_id_card_addr_pemail', df_user_id_cap),
    ('user_id_card_addr_pemail_M', df_user_id_capm),
    ('user_id_bear', df_user_id_bear),
):
    data[user_col] = user_frame['predicted_user_id']

Process ForkPoolWorker-52:
Process ForkPoolWorker-27:
Process ForkPoolWorker-115:
Process ForkPoolWorker-61:
Process ForkPoolWorker-103:
Process ForkPoolWorker-95:
Process ForkPoolWorker-128:
Process ForkPoolWorker-41:
Process ForkPoolWorker-96:
Process ForkPoolWorker-101:
Process ForkPoolWorker-127:
Process ForkPoolWorker-86:
Process ForkPoolWorker-97:
Process ForkPoolWorker-68:
Process ForkPoolWorker-116:
Process ForkPoolWorker-22:
Process ForkPoolWorker-124:
Process ForkPoolWorker-77:
Traceback (most recent call last):
Process ForkPoolWorker-78:
Process ForkPoolWorker-108:
Process ForkPoolWorker-47:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Process ForkPoolWork

In [4]:
# Remove every column containing "V" that is not listed in cols_V, then release the raw frames.
all_V = [name for name in data.columns if 'V' in name]
drop_V = list(set(all_V).difference(cols_V))
data.drop(columns=drop_V, inplace=True)
del df_train
del df_test
gc.collect()

106

In [5]:
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# TransactionDT is used as a number of seconds added to START_DATE; 14400 s (4 h) is then
# subtracted. Vectorised so that a missing TransactionDT becomes NaT instead of raising
# inside datetime.timedelta().
transaction_offset = pd.to_timedelta(data['TransactionDT'], unit='s') - pd.Timedelta(seconds=14400)
data['datetime'] = pd.Timestamp(startdate) + transaction_offset
# Placeholder for rows without a timestamp. It has to be a datetime (not a datetime.date):
# a date object would break the datetime dtype and has no `.date()` for the next step.
data['datetime'] = data['datetime'].fillna(pd.Timestamp(2020, 1, 1))
# Calendar day of each transaction as datetime.date objects.
data['date'] = data['datetime'].dt.date

In [7]:
#========================================================================
# Per-ProductCD copies of each selected V feature
#========================================================================
cols_pcd = data['ProductCD'].unique()
# The row masks depend only on ProductCD, so build one per product code up front.
pcd_masks = {pcd: data['ProductCD'].isin([pcd]) for pcd in cols_pcd}

for col in tqdm(cols_V):
    for pcd, is_pcd in pcd_masks.items():
        feature_name = f'{col}__ProductCD-{pcd}'
        # -1 everywhere, then the original value on the rows of this product code.
        data[feature_name] = -1
        data.loc[is_pcd, feature_name] = data.loc[is_pcd, col]

100%|██████████| 47/47 [01:48<00:00,  3.81s/it]


In [8]:
# Sorted names of all non-ignored columns containing "V": the selected V features
# together with their per-ProductCD variants.
cols_V_PCD = sorted(
    name for name in data.columns
    if 'V' in name and name not in COLUMNS_IGNORE
)
len(cols_V_PCD)

282

In [9]:
#========================================================================
# FE Aggregation User ID & TimeSeries Date
#========================================================================

def parallel_agg(df, base_key, base_date, n_day, feature):
    """Return the per-`base_key` mean of `feature` as a one-column DataFrame.

    NOTE: the `parallel_agg` defined immediately after this one in the same cell
    reuses the name and shadows this definition.

    Args:
        df: DataFrame containing the `base_key` and `feature` columns.
        base_key: Name of the column to group by.
        base_date: Not used by the body; kept so the signature is unchanged.
        n_day: Only used to build the name of the output column.
        feature: Name of the column to average.

    Returns:
        DataFrame indexed by `base_key` with a single column named
        f'{base_key}_day{n_day}_{feature}_mean'.
    """
    mean_col = f'{base_key}_day{n_day}_{feature}_mean'
    # Passing a {new_name: func} dict to SeriesGroupBy.agg is rejected by pandas >= 1.0
    # ("nested renamer is not supported"), so compute the mean and name it explicitly.
    return df.groupby(base_key)[feature].mean().to_frame(mean_col)


def parallel_agg(df, base_key, n_day, feature, end_dates=None):
    """Trailing-window mean of `feature` per `base_key`, evaluated at every end date.

    For each end date, the rows whose 'date' lies between
    date_add_days(end_date, -n_day) and end_date (both inclusive) are grouped by
    `base_key` and averaged. Only keys that have at least one row exactly on the
    end date are kept for that date.

    Args:
        df: DataFrame with the columns `base_key`, 'date' and `feature`.
        base_key: Name of the column to group by.
        n_day: Number of days passed (negated) to date_add_days to get the window start.
        feature: Name of the column to average.
        end_dates: Iterable of end dates. Defaults to the module-level `list_end_date`,
            which is what the function always used before this parameter existed.

    Returns:
        DataFrame indexed by (`base_key`, 'date') with a single column named
        f'{base_key}_day{n_day}_{feature}_mean'.
    """
    if end_dates is None:
        end_dates = list_end_date
    mean_col = f'{base_key}_day{n_day}_{feature}_mean'

    list_term_df = []
    for end_date in tqdm(end_dates):
        start_date = date_add_days(end_date, n_day * -1)
        # Keys that appear on the end date itself.
        tmp_user = df.loc[df['date'] == end_date, [base_key]].drop_duplicates()
        in_window = df[(start_date <= df['date']) & (df['date'] <= end_date)]

        # Dict-renaming in SeriesGroupBy.agg is rejected by pandas >= 1.0; take the mean
        # and name the column explicitly instead.
        result = in_window.groupby(base_key)[feature].mean().rename(mean_col).reset_index()

        tmp_user = tmp_user.merge(result, on=base_key, how='inner')
        tmp_user['date'] = end_date
        list_term_df.append(tmp_user)
    df_agg = pd.concat(list_term_df, axis=0)
    df_agg.set_index([base_key, 'date'], inplace=True)
    return df_agg
    

# Aggregate per user over date-bounded trailing windows.
dir_save = 'valid'
df = data
list_base_key = [col for col in df.columns if col.count('user_id')]
# [1:] skips the earliest date.
list_end_date = sorted(df['date'].unique())[1:]
list_base_date = list_end_date
# [1:] drops the first entry, i.e. the 1-day window.
list_n_day = [1, 3, 5, 7, 10, 14, 21, 28, 31, 62, 93, 124, 180, 270, 360][1:]
# list_n_day = [1, 3, 5, 7, 10, 14]

# The first len(base_train) rows of `data` are taken as the train part, the rest as test.
n_train = len(base_train)

for base_key in list_base_key:
    for feature in cols_V_PCD:

        # Slice the three needed columns once instead of once per parallel task.
        df_feature = df[[base_key, 'date', feature]]
        # Local names on purpose: rebinding base_train/base_test here would overwrite
        # the module-level base frames that other cells read.
        feat_train = df_feature.iloc[:n_train]
        feat_test = df_feature.iloc[n_train:]

        # One task per window length.
        list_p = Parallel(60)(
            delayed(parallel_agg)(df_feature, base_key, n_day, feature)
            for n_day in list_n_day
        )

        # Side-by-side join of all window lengths on the (base_key, date) index.
        df_agg = pd.concat(list_p, axis=1)
        df_agg.reset_index(inplace=True)
        # Convert the 'date' values back to datetime.date (presumably so that they
        # match the merge key type of data['date'] — TODO confirm).
        df_agg['date'] = df_agg['date'].map(lambda x: x.date())

        base_train_agg = feat_train.merge(df_agg, how='left', on=[base_key, 'date'])
        base_test_agg = feat_test.merge(df_agg, how='left', on=[base_key, 'date'])

        cols_agg = [col for col in base_train_agg.columns if col.count(f'{base_key}_day')]

        # Ratio and difference between each window mean and the row's own value.
        # NOTE(review): rows where the feature equals -1 (the fill value of the
        # per-ProductCD columns) give a zero denominator — confirm that is intended.
        for agg_frame in (base_train_agg, base_test_agg):
            for col in cols_agg:
                agg_frame[f"{col}_org_ratio"] = agg_frame[col] / (agg_frame[feature] + 1)
                agg_frame[f"{col}_org_diff"] = agg_frame[col] - agg_frame[feature]

        cols_save = [col for col in base_train_agg.columns if col.count('org_ratio') or col.count('org_diff')]

        save_feature(base_train_agg[cols_save], '503', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
        save_feature(base_test_agg[cols_save],  '503', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

(590540,) | user_id_card_addr_day3_V127_mean_org_ratio
(590540,) | user_id_card_addr_day3_V127_mean_org_diff
(590540,) | user_id_card_addr_day5_V127_mean_org_ratio
(590540,) | user_id_card_addr_day5_V127_mean_org_diff
(590540,) | user_id_card_addr_day7_V127_mean_org_ratio
(590540,) | user_id_card_addr_day7_V127_mean_org_diff
(590540,) | user_id_card_addr_day10_V127_mean_org_ratio
(590540,) | user_id_card_addr_day10_V127_mean_org_diff
(590540,) | user_id_card_addr_day14_V127_mean_org_ratio
(590540,) | user_id_card_addr_day14_V127_mean_org_diff
(590540,) | user_id_card_addr_day21_V127_mean_org_ratio
(590540,) | user_id_card_addr_day21_V127_mean_org_diff
(590540,) | user_id_card_addr_day28_V127_mean_org_ratio
(590540,) | user_id_card_addr_day28_V127_mean_org_diff
(590540,) | user_id_card_addr_day31_V127_mean_org_ratio
(590540,) | user_id_card_addr_day31_V127_mean_org_diff
(590540,) | user_id_card_addr_day62_V127_mean_org_ratio
(590540,) | user_id_card_addr_day62_V127_mean_org_diff
(590540

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



(590540,) | user_id_card_addr_day3_V314__ProductCD-H_mean_org_ratio
(590540,) | user_id_card_addr_day3_V314__ProductCD-H_mean_org_diff
(590540,) | user_id_card_addr_day5_V314__ProductCD-H_mean_org_ratio
(590540,) | user_id_card_addr_day5_V314__ProductCD-H_mean_org_diff
(590540,) | user_id_card_addr_day7_V314__ProductCD-H_mean_org_ratio
(590540,) | user_id_card_addr_day180_V314__ProductCD-H_mean_org_diff
(590540,) | user_id_card_addr_day270_V314__ProductCD-H_mean_org_ratio
(590540,) | user_id_card_addr_day270_V314__ProductCD-H_mean_org_diff
(590540,) | user_id_card_addr_day360_V314__ProductCD-H_mean_org_ratio
(590540,) | user_id_card_addr_day360_V314__ProductCD-H_mean_org_diff
(506691,) | user_id_card_addr_day3_V314__ProductCD-H_mean_org_ratio
(506691,) | user_id_card_addr_day3_V314__ProductCD-H_mean_org_diff
(506691,) | user_id_card_addr_day5_V314__ProductCD-H_mean_org_ratio
(506691,) | user_id_card_addr_day5_V314__ProductCD-H_mean_org_diff
(506691,) | user_id_card_addr_day7_V314__Prod

Process ForkPoolWorker-3255:
Process ForkPoolWorker-3272:
Process ForkPoolWorker-3298:
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

Process ForkPoolWorker-3276:
Process ForkPoolWorker-3291:
Process ForkPoolWorker-3266:
Process ForkPoolWorker-3270:
Process ForkPoolWorker-3285:
Process ForkPoolWorker-3286:
Process ForkPoolWorker-3289:
Process ForkPoolWorker-3290:
Process ForkPoolWorker-3273:
Process ForkPoolWorker-3279:
Process ForkPoolWorker-3274:
Process ForkPoolWorker-3252:
Process ForkPoolWorker-3275:
Process ForkPoolWorker-3296:
Process ForkPoolWorker-3293:
Process ForkPoolWorker-3260:
Process ForkPoolWorker-3269:
Process ForkPoolWorker-3239:
Process ForkPoolWorker-3280:
Process ForkPoolWorker-3243:
Process ForkPoolWorker-3254:
Process ForkPoolWorker-3267:
Process ForkPoolWorker-3258:
Process ForkPoolWorker-3292:
Process ForkPoolWorker-3271:
Process ForkPoolWorker-3295:
Process ForkPoolWorker-3284:
Process ForkPoolWorker-3

Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 833, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 521, in wrap_future_result
    return future.result(timeout=timeout)
  File "/home/yryrgogo/anaconda3/lib/python3.7/concurrent/futures/_base.py", line 427, in result
    self._condition.wait(timeout)
  File "/home/yryrgogo/anaconda3/lib/python3.7/threading.py", line 296, in wait
    waiter.acquire()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 934, in __call__
    self.retrieve()
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 855, in retrieve
    backend.abort_everything(ensure_ready=ensure_ready)
 

Process ForkPoolWorker-3233:
Process ForkPoolWorker-3304:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last

Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 833, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 521, in wrap_future_result
    return future.result(timeout=timeout)
  File "/home/yryrgogo/anaconda3/lib/python3.7/concurrent/futures/_base.py", line 427, in result
    self._condition.wait(timeout)
  File "/home/yryrgogo/anaconda3/lib/python3.7/threading.py", line 296, in wait
    waiter.acquire()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 934, in __call__
    self.retrieve()
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 855, in retrieve
    backend.abort_everything(ensure_ready=ensure_ready)
 

  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()



During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2039, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'TypeError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/IPython/core/ultratb.py", line 319, in wrapped
    return f(*args, **kwargs)
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/IPython/core/ultratb.py", line 353, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/home/yryrgogo/ana

Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()


TypeError: can only concatenate str (not "list") to str

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/yryrgogo/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap


Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 833, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 521, in wrap_future_result
    return future.result(timeout=timeout)
  File "/home/yryrgogo/anaconda3/lib/python3.7/concurrent/futures/_base.py", line 427, in result
    self._condition.wait(timeout)
  File "/home/yryrgogo/anaconda3/lib/python3.7/threading.py", line 296, in wait
    waiter.acquire()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 934, in __call__
    self.retrieve()
  File "/home/yryrgogo/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 855, in retrieve
    backend.abort_everything(ensure_ready=ensure_ready)
 

In [11]:
# Show the contents of cols_V_PCD (the whole list is printed).
cols_V_PCD

['V127',
 'V127__ProductCD-C',
 'V127__ProductCD-H',
 'V127__ProductCD-R',
 'V127__ProductCD-S',
 'V127__ProductCD-W',
 'V128',
 'V128__ProductCD-C',
 'V128__ProductCD-H',
 'V128__ProductCD-R',
 'V128__ProductCD-S',
 'V128__ProductCD-W',
 'V130',
 'V130__ProductCD-C',
 'V130__ProductCD-H',
 'V130__ProductCD-R',
 'V130__ProductCD-S',
 'V130__ProductCD-W',
 'V131',
 'V131__ProductCD-C',
 'V131__ProductCD-H',
 'V131__ProductCD-R',
 'V131__ProductCD-S',
 'V131__ProductCD-W',
 'V133',
 'V133__ProductCD-C',
 'V133__ProductCD-H',
 'V133__ProductCD-R',
 'V133__ProductCD-S',
 'V133__ProductCD-W',
 'V156',
 'V156__ProductCD-C',
 'V156__ProductCD-H',
 'V156__ProductCD-R',
 'V156__ProductCD-S',
 'V156__ProductCD-W',
 'V165',
 'V165__ProductCD-C',
 'V165__ProductCD-H',
 'V165__ProductCD-R',
 'V165__ProductCD-S',
 'V165__ProductCD-W',
 'V187',
 'V187__ProductCD-C',
 'V187__ProductCD-H',
 'V187__ProductCD-R',
 'V187__ProductCD-S',
 'V187__ProductCD-W',
 'V258',
 'V258__ProductCD-C',
 'V258__ProductCD

In [None]:
#========================================================================
# FE Aggregation User ID & TimeSeries Hour
#========================================================================


In [13]:
# for df, agg_cols in zip(arg_df_list, arg_list):
def parallel_agg(df, agg_cols):
    """Save, per column and grouping key, the within-key (max - min) spread as a feature.

    For every column in `agg_cols` and every key in the module-level `list_key`,
    the column's max and min per key are computed, their difference is merged onto
    the train and test base frames, and the result is written with save_feature.

    Relies on module-level names: list_key, base_train, base_test, get_new_columns,
    save_feature, dir_save and COLUMNS_IGNORE.
    NOTE(review): get_new_columns is neither defined nor imported in the visible
    cells; the code below needs it to return names containing '_max' and '_min'.

    Args:
        df: DataFrame holding the key columns and the columns in `agg_cols`.
            Presumably shares its index with base_train/base_test, since the key
            is attached with an index join — TODO confirm.
        agg_cols: Names of the columns to aggregate.

    Returns:
        Tuple (error_keys, error_cols) of equal length: each pair is a key that was
        skipped because df[key] has no `to_frame` (i.e. it is not a single Series)
        and the column that was being processed at that moment.
    """
    error_keys = []
    error_cols = []
    for col in agg_cols:
#         aggs[col] = ['mean', 'max', 'min', 'std']
        aggs = {col: ['max', 'min']}

        for key in list_key:

            # A key that does not select a single Series cannot be grouped on.
            # Record it and skip, instead of falling through to the groupby below.
            try:
                df[key].to_frame()
            except AttributeError:
                error_keys.append(key)
                error_cols.append(col)
                continue

            tmp_base_train = base_train.join(df[key])
            tmp_base_test = base_test.join(df[key])

            df_agg = df.groupby(key).agg(aggs)
            df_agg.columns = get_new_columns(key+'_', aggs)
            max_col = [name for name in df_agg.columns if name.count('_max')][0]
            min_col = [name for name in df_agg.columns if name.count('_min')][0]
            df_agg[max_col+'_min_diff'] = df_agg[max_col] - df_agg[min_col]

            # Only the spread is kept as a feature.
            df_agg.drop([max_col, min_col], axis=1, inplace=True)

            df_agg.reset_index(inplace=True)

            base_train_agg = tmp_base_train.merge(df_agg, on=key, how='left')
            base_test_agg = tmp_base_test.merge(df_agg, on=key, how='left')

            del df_agg, tmp_base_train, tmp_base_test
            gc.collect()

            print(base_train_agg.shape, base_test_agg.shape)
            cols_feature = [
                name for name in base_train_agg.columns
                if name not in COLUMNS_IGNORE and name != key and name != 'D1']
            save_feature(base_train_agg[cols_feature], '502', dir_save, is_train=True, auto_type=True, list_ignore=COLUMNS_IGNORE)
            save_feature(base_test_agg[cols_feature],  '502', dir_save, is_train=False, auto_type=True, list_ignore=COLUMNS_IGNORE)

            del base_train_agg, base_test_agg
            gc.collect()

    return error_keys, error_cols

In [14]:
# One task per (frame, column list) pair; the return value of each call is collected in `err`.
# NOTE(review): n_jobs, arg_df_list and arg_list are not defined in the visible cells.
err = Parallel(n_jobs)(
    delayed(parallel_agg)(frame, columns)
    for frame, columns in zip(arg_df_list, arg_list)
)

In [None]:
# NOTE(review): df_feat is not defined in any visible cell — confirm it exists before running.
df_feat.head()