In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename

In [None]:
# Key columns of the transaction table.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET]

# A feature file is loaded when its path contains any of these substrings.
# This is a plain substring test, so e.g. 'C1' also selects a path containing
# 'C13' (a later cell reads rawdata['C13']).
_FEATURE_KEYS = ('DT', 'Fraud', 'D1', 'D3', 'C1', 'V221', 'V285', 'ID')


def _is_wanted(path):
    """Return True when `path` contains at least one of `_FEATURE_KEYS`."""
    return any(key in path for key in _FEATURE_KEYS)


train_paths = [p for p in glob('../feature/raw_main/*_train.gz') if _is_wanted(p)]
test_paths = [p for p in glob('../feature/raw_main/*_test.gz') if _is_wanted(p)]

# Load the selected feature files for each split.
train_df = parallel_load_data(train_paths)
test_df = parallel_load_data(test_paths)

In [52]:
# Index both splits by TransactionID so the per-transaction "same user" CSVs
# joined below align on it.
if COLUMN_ID in train_df.columns:
    train_df.set_index(COLUMN_ID, inplace=True)
    test_df.set_index(COLUMN_ID, inplace=True)
rawdata = pd.concat([train_df, test_df], axis=0)

# Shift both counters by one. NOTE(review): presumably this keeps zeros out of
# the denominator of the C1/C13 ratios computed in the loop cell - confirm.
rawdata['C1'] += 1
rawdata['C13'] += 1

# Attach every precomputed same-user id as a `same_user__<suffix>` column.
# The suffix is whatever follows the shared prefix plus one separator
# character, with 'csv' and '.' stripped. Deriving the offset from the prefix
# keeps it in sync with the glob pattern (it used to be the literal 56).
same_user_prefix = '../output/0830_ieee__same_user__pattern-user_keys__card'
list_path = glob(same_user_prefix + '*')
for path in list_path:
    fname = path[len(same_user_prefix) + 1:].replace('csv', '').replace(r'.', '')
    same_user = pd.read_csv(path).set_index(COLUMN_ID)
    rawdata[f'same_user__{fname}'] = same_user['same_user_id']

cols_C = ['C1', 'C13']
cols_V = ['V221', 'V285']

# Grouping key used by the rest of the notebook.
# Alternative that was tried: 'same_user__addr_pemail_M'.
col_same = 'same_user__addr_pemail'
if col_same not in rawdata.columns:
    raise KeyError(
        f'{col_same!r} is missing from rawdata; no file matching '
        f'{same_user_prefix}* produced that column'
    )

# Order rows per user and by time so a later groupby(...).shift(1) yields the
# previous transaction of the same user.
rawdata.sort_values(by=[col_same, COLUMN_DT], inplace=True)

In [53]:
def _thres_ratio(c1, d1, d3):
    """Return the per-row ratio threshold derived from C1, D1 and D3.

    Rules are evaluated in order and the first match wins, exactly like a
    chained conditional expression:

    1. D1 <= 30 and D3 <= 2
    2. D1 > 30
    3. D3 >= 3

    Within each regime the threshold depends on the C1 tier. Rows matching no
    rule get 2. Comparisons against NaN are False, so a row with a missing
    D1/D3 simply skips the regimes that test the missing value.

    Parameters
    ----------
    c1, d1, d3 : array-like of numbers, all of the same length.

    Returns
    -------
    numpy.ndarray of float thresholds, one per row.
    """
    c1 = np.asarray(c1, dtype=float)
    d1 = np.asarray(d1, dtype=float)
    d3 = np.asarray(d3, dtype=float)

    d1_le30_d3_le2 = (d1 <= 30) & (d3 <= 2)
    d1_gt30 = d1 > 30
    d3_ge3 = d3 >= 3

    # Tiers shared by all three regimes.
    low_tiers = [(c1 == 1, 1), (c1 == 2, 2), (c1 == 3, 3), (c1 <= 5, 2)]
    tiers_by_regime = [
        (d1_le30_d3_le2, low_tiers + [(c1 <= 12, 1.6), (c1 <= 20, 1.3),
                                      (c1 <= 30, 1.2), (c1 <= 9999, 1.1)]),
        (d1_gt30, low_tiers + [(c1 <= 12, 1.8), (c1 <= 20, 1.5),
                               (c1 <= 40, 1.4), (c1 > 40, 2)]),
        (d3_ge3, low_tiers + [(c1 <= 12, 1.8), (c1 <= 20, 1.5),
                              (c1 <= 30, 1.4)]),
    ]

    conditions, values = [], []
    for regime, tiers in tiers_by_regime:
        for c1_test, value in tiers:
            conditions.append(regime & c1_test)
            values.append(value)

    # np.select picks the value of the first True condition per row.
    return np.select(conditions, values, default=2)


# Vectorised replacement for a row-wise apply that read the row by integer
# position (x[0], x[1], x[2]); columns are now addressed by label.
rawdata['thres_ratio'] = _thres_ratio(rawdata['C1'], rawdata['D1'], rawdata['D3'])

In [54]:
# Work on an independent deep copy so `rawdata` is left untouched by the cells below.
data = rawdata.copy(deep=True)

In [111]:
# Iteratively refine the user grouping.
#
# In each pass, every row is compared with the previous row of its current
# group (rows keep the order `data` already has). When C1 or C13 grows by more
# than the row's `thres_ratio`, the row is flagged (2 instead of 1). The triple
# (group id, C1 flag, C13 flag) then defines the next, finer group id, which is
# stored as `new_user_id__card_addr_pemail__<pass>`.
#
# Differences from the earlier version of this cell:
# * `data` keeps its index. Previously `set_index(col_same)` threw the existing
#   index away on the first pass, so the frame lost its row key.
# * `col_same` is no longer overwritten. The current grouping key lives in
#   `group_col`, so the cell can be re-run and later cells still see the
#   original key.
# * Flag/id strings are built from the numpy rows instead of a row-wise apply
#   that indexed the row by integer position.
group_col = col_same
ratio_cols = ('C1', 'C13')

for no in tqdm(range(1, 11, 1)):

    for col in ratio_cols:
        current = data[col]
        previous = data.groupby(group_col)[col].shift(1)
        # No usable predecessor (first row of a group, or predecessor is NaN):
        # fall back to the row's own value.
        data[f'{col}_shift1'] = np.where(previous.isna(), current, previous)

        data[f'{col}_diff_ratio'] = current / data[f'{col}_shift1'].abs()
        # 2 when the ratio exceeds the threshold, otherwise 1 (NaN compares False).
        data[f'diff_user_no_{col}'] = (data[f'{col}_diff_ratio'] > data['thres_ratio']) * 1 + 1

    col_tmp_id = f'tmp_user_id__{no}'
    col_new_id = f'new_user_id__card_addr_pemail__{no}'
#     col_new_id = f'new_user_id__card_addr_pemail_M__{no}'

    # to_numpy() yields each row in the columns' common dtype, which is what a
    # row-wise apply would have formatted, so the key strings are unchanged.
    key_rows = data[[group_col, 'diff_user_no_C1', 'diff_user_no_C13']].to_numpy()
    data[col_tmp_id] = [f'{user}-{c1_flag}-{c13_flag}' for user, c1_flag, c13_flag in key_rows]

    # Number the distinct keys 0..k-1 in sorted (string) order.
    data[col_new_id] = pd.factorize(data[col_tmp_id], sort=True)[0]

    # The next pass splits the groups that were just created.
    group_col = col_new_id






  0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A




 10%|█         | 1/10 [00:38<05:47, 38.62s/it][A[A[A[A[A




 20%|██        | 2/10 [01:19<05:13, 39.24s/it][A[A[A[A[A




 30%|███       | 3/10 [02:00<04:38, 39.83s/it][A[A[A[A[A




 40%|████      | 4/10 [02:42<04:02, 40.42s/it][A[A[A[A[A




 50%|█████     | 5/10 [03:24<03:24, 40.91s/it][A[A[A[A[A




 60%|██████    | 6/10 [04:04<02:42, 40.70s/it][A[A[A[A[A




 70%|███████   | 7/10 [04:45<02:01, 40.66s/it][A[A[A[A[A




 80%|████████  | 8/10 [05:26<01:21, 40.77s/it][A[A[A[A[A




 90%|█████████ | 9/10 [06:08<00:41, 41.13s/it][A[A[A[A[A




100%|██████████| 10/10 [06:49<00:00, 41.24s/it][A[A[A[A[A




[A[A[A[A[A

In [110]:
# Inspect original same-user groups that have more than 3 rows with a C1 value.
# The group sizes are counted on the same key the frame is indexed by below.
# Grouping by `col_same` is not safe here: the loop cell may have repointed it
# to a new-id column, whose labels do not exist in this index.
base_col = 'same_user__addr_pemail'
inspect_cols = [col_new_id, 'D1', 'D3', 'C1_diff_ratio', 'C1', 'C1_shift1']

group_sizes = data.groupby(base_col)['C1'].count()
multi_idx = group_sizes[group_sizes > 3].index
data.set_index(base_col).loc[multi_idx, inspect_cols]

Unnamed: 0_level_0,new_user_id__card_addr_pemail__1,D1,D3,C1_diff_ratio,C1,C1_shift1
same_user__addr_pemail,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
109,10088,238.0,0.0,1.000000,7.0,7.0
109,10088,238.0,0.0,1.000000,7.0,7.0
109,10088,462.0,224.0,1.285714,9.0,7.0
109,10088,464.0,2.0,1.000000,9.0,9.0
142,47106,0.0,,1.000000,2.0,2.0
142,47106,0.0,0.0,1.000000,2.0,2.0
142,47106,0.0,0.0,1.000000,2.0,2.0
142,47106,0.0,0.0,1.000000,2.0,2.0
142,47106,0.0,0.0,1.000000,2.0,2.0
148,53891,0.0,,1.000000,2.0,2.0


In [91]:
# Sanity check: list rows whose C1 grew relative to the previous row of the group.
check_cols = [
    col_new_id,
    'same_user__addr_pemail',
    'thres_ratio',
    'D1',
    'D3',
    'C1_diff_ratio',
    'C1',
    'C1_shift1',
    'C13_diff_ratio',
    'C13',
    'C13_shift1',
]
tmp = data.reset_index()[check_cols]
tmp = tmp.sort_values(by=['same_user__addr_pemail', 'D1'])
tmp[tmp['C1_diff_ratio'] > 1]

Unnamed: 0,new_user_id__card_addr_pemail__1,same_user__addr_pemail,thres_ratio,D1,D3,C1_diff_ratio,C1,C1_shift1,C13_diff_ratio,C13,C13_shift1


In [105]:
# Export the original same-user id together with every generated
# `new_user...` id column.
# NOTE(review): the file name ends in `_M`, while the grouping key used in
# this notebook is the non-`_M` 'same_user__addr_pemail' - confirm the name.
cols_new = [name for name in data.columns if 'new_user' in name]
export_cols = ['same_user__addr_pemail', *cols_new]
data[export_cols].to_csv('../output/0831_ieee__some_new_user_id__card_addr_pemail_M.csv')