In [129]:
%load_ext autoreload
%autoreload 2
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days, diff_of_days, diff_of_times
from glob import glob
from matplotlib import pyplot as plt
from pathlib import Path
from tqdm import tqdm
from train_prophet import main_prophet
import datetime
import math
import os
import numpy as np
import pandas as pd

HOME = os.path.expanduser('~')
sys.path.append(f"{HOME}/github/data_mining/library")
from parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET]

train_paths = glob('../feature/raw_main/*_train.gz')
test_paths = glob('../feature/raw_main/*_test.gz')

train_paths = [path for path in train_paths 
               if path.count('DT') 
               or path.count('Fraud') 
               or path.count('D')
               or path.count('C1')
               or path.count('C13')
               or path.count('V221')
               or path.count('V285')
               or path.count('card')
               or path.count('addr')
               or path.count('Reg')
               or path.count('P_email')
               or path.count('M')
               or path.count('Product')
              ]
test_paths  = [path for path in test_paths  
               if path.count('DT') 
               or path.count('Fraud') 
               or path.count('D')
               or path.count('C1')
               or path.count('C13')
               or path.count('V221')
               or path.count('V285')
               or path.count('card')
               or path.count('addr')
               or path.count('Reg')
               or path.count('P_email')
               or path.count('M')
               or path.count('Product')
              ]

train_df = parallel_load_data(train_paths)
test_df = parallel_load_data(test_paths)

In [None]:
START_DATE = '2017-11-04'
# START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
train_df['datetime'] = train_df['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x) - datetime.timedelta(seconds = 14400) ))
test_df['datetime'] = test_df['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x) - datetime.timedelta(seconds = 14400) ))
train_df['datetime'].fillna(datetime.date(2020, 1, 1), inplace=True)
test_df['datetime'].fillna(datetime.date(2020, 1, 1), inplace=True)
train_df['date'] = train_df['datetime'].map(lambda x: x.date())
test_df['date']  =  test_df['datetime'].map(lambda x: x.date())
train_df['hour'] = train_df['datetime'].map(lambda x: x.hour)
test_df['hour']  =  test_df['datetime'].map(lambda x: x.hour)

In [None]:
list_regist = []
for d, diff in tqdm(train_df[['date', 'D1']].values):
    if diff < 999999:
        regist = date_add_days(d, -1*diff)
    else:
        regist = date_add_days(d, 0)
    list_regist.append(str(regist))

train_df['Regist_date'] = list_regist
    
list_regist = []
for d, diff in tqdm(test_df[['date', 'D1']].values):
    if diff < 999999:
        regist = date_add_days(d, -1*diff)
    else:
        regist = date_add_days(d, 0)
    list_regist.append(str(regist))

test_df['Regist_date'] = list_regist

In [None]:
data = pd.concat([train_df, test_df], axis=0, ignore_index=True)

異なるuser keyで様々なユーザー特定パターン（正確でなくてもよい）を作り、それぞれのパターンで特徴を作る  
正確でなくてもよい理由としては、粒度を粗くしてもある条件を外した特徴量になるだけなので、問題ない

In [None]:
user_info = [
    'ProductCD',
    'Regist_date'
]
list_card = [
    'card1',
    'card2',
    'card3',
    'card4',
    'card5',
    'card6',
]
list_addr = [
    'addr1',
    'addr2',
]
list_pemail = [
    'P_emaildomain'
]
list_M = [col for col in data.columns if col.startswith('M')]

user_keys__card = user_info + list_card

user_keys__card_addr = user_info + list_card + list_addr
user_keys__card_pemail = user_info + list_card + list_pemail
user_keys__card_M = user_info + list_card + list_M

user_keys__card_addr_pemail = user_info + list_card + list_addr + list_pemail
user_keys__card_addr_M = user_info + list_card + list_addr + list_M
user_keys__card_pemail_M = user_info + list_card + list_pemail + list_M

user_keys__card_addr_pemail_M = user_info + list_card + list_addr + list_pemail + list_M

list_user_keys = [
  ['user_keys__card',      user_keys__card,],
  ['user_keys__card_addr',      user_keys__card_addr,],
  ['user_keys__card_pemail',      user_keys__card_pemail,],
  ['user_keys__card_M',      user_keys__card_M,],
  ['user_keys__card_addr_M',      user_keys__card_addr_M,],
  ['user_keys__card_addr_pemail',      user_keys__card_addr_pemail,],
  ['user_keys__card_pemail_M',      user_keys__card_pemail_M,],
  ['user_keys__card_addr_pemail_M',      user_keys__card_addr_pemail_M,],
]
check_Ds = ['D1', 'D2', 'D3', 'D4', 'D5', 'datetime']

data['D1'].fillna(-1, inplace=True)
data[user_info + list_card] = data[user_info + list_card].astype('str').fillna('#')

user_keyで粗いグループを作ったら、そのグループの中で同じユーザーを探索し、条件が合えば仲間にしてさらにグループを作っていくイメージ

In [None]:
user_keys = user_keys__card_addr_pemail

uid = 'user_id'
df = data.copy()

for col in user_keys:
    df[col].fillna('#', inplace=True)

df[user_keys] =  df[user_keys].astype('str')

user_df = df.groupby(user_keys)[COLUMN_ID].min()
user_df = user_df.to_frame(uid)
user_df = df.set_index(user_keys).join(user_df)

check_cols = [COLUMN_ID, uid, COLUMN_TARGET, 'datetime', 'Regist_date'] + ['D1', 'D3', 'C1', 'C13', 'V221', 'V285']
user_df = user_df.reset_index()[check_cols]

if uid not in user_df.columns:
    user_df.reset_index(inplace=True)
    
user_df['hour'] = user_df['datetime'].map(lambda x: x.hour)
user_df['D3'].fillna(-99999, inplace=True)
user_df['C1'].fillna(-99999, inplace=True)
user_df['C13'].fillna(-99999, inplace=True)
user_df['V221'].fillna(-99999, inplace=True)
user_df['V285'].fillna(-99999, inplace=True)


# user_df.sort_values(by=[uid, 'datetime'], inplace=True)

# user_df['diff_from_regist'] = user_df['datetime'] - user_df['Regist_date'].map(lambda x: datetime.datetime(int(x[:4]), int(x[5:7]), int(x[8:10]) ) )
# user_df['diff_from_regist'] = user_df['diff_from_regist'].map(lambda x: x.days)

# user_df['Regist_date_add_D1'] = user_df[['Regist_date', 'D1']].apply(lambda x: date_add_days(x[0], x[1]), axis=1)
# user_df['Regist_date_add_D1-D3'] = user_df[['Regist_date_add_D1', 'D3']].apply(lambda x: date_add_days(x[0], -1*x[1]), axis=1)
# user_df['Regist_date_add_D1-D3-1'] = user_df[['Regist_date_add_D1', 'D3']].apply(lambda x: date_add_days(x[0], -1*x[1]-1), axis=1)

# user_df['date__before1'] = user_df.groupby(uid)['date'].shift(1)
# user_df['date__before2'] = user_df.groupby(uid)['date'].shift(2)
# user_df['date__before3'] = user_df.groupby(uid)['date'].shift(3)
# user_df['D1__before1']   = user_df.groupby(uid)['D1'].shift(1)
# user_df['D1__before2']   = user_df.groupby(uid)['D1'].shift(2)

# user_df['diff_D1__before1'] = user_df[['D1', 'D1__before1']].apply(lambda x: x[0] - x[1] , axis=1)
# user_df['diff_D1__before2'] = user_df[['D1', 'D1__before2']].apply(lambda x: x[0] - x[1] , axis=1)
# user_df['diff_day__before1'] = user_df[['date', 'date__before1']].apply(lambda x: diff_of_days(x[0], x[1]) , axis=1)
# user_df['diff_day__before2'] = user_df[['date', 'date__before2']].apply(lambda x: diff_of_days(x[0], x[1]) , axis=1)
# user_df['diff_day__before3'] = user_df[['date', 'date__before3']].apply(lambda x: diff_of_days(x[0], x[1]) , axis=1)

# user_df.to_csv('../output/0901_ieee__user_df_card_addr_pemail__for_specific_same_user.csv', index=False)

In [155]:
col_no = 'no'
user_df['no'] = np.arange(user_df.shape[0])+1

tmp = user_df.groupby(uid)['C1'].count().to_frame('user_cnt')
df_multi_user = tmp[tmp['user_cnt']>1]
multi_user = df_multi_user.index.tolist()

list_eval_group = []
list_user_map = []

arg_list = get_parallel_arg_list(n_jobs=16, arg_list=multi_user)

In [156]:
def parallel_explore_same_user(group_no_list):
    
    list_user_map = []
    for group_no in group_no_list:

        group = user_df[user_df[uid]==group_no]
        group.sort_values(by='datetime', inplace=True)
        users = group[col_no].unique().tolist()
        user_map = {}
        
        while True:
            
            remain_user = list( set(users) - set(list(user_map.keys())) )
            
            base_user = remain_user[0]
            user_map[base_user] = base_user
            list_same_user = []
            
            base = group[group[col_no]==base_user]
            base_dt = base['datetime'].values[0]
            base_d1 = base['D1'].values[0]
            base_d3 = base['D3'].values[0]
            base_c1 = base['C1'].values[0]
            base_c13 = base['C13'].values[0]
            base_v221 = base['V221'].values[0]
            base_v285 = base['V285'].values[0]
            base_hour = base['hour'].values[0]
            base_fraud = base[COLUMN_TARGET].values[0]
            
            for i in range(1, len(remain_user), 1):
            
                tmp_user = remain_user[i]
                
                tmp = group[group[col_no]==tmp_user]
                tmp_dt = tmp['datetime'].values[0]
                tmp_d1 = tmp['D1'].values[0]
                tmp_d3 = tmp['D3'].values[0]
                tmp_c1 = tmp['C1'].values[0]
                tmp_c13 = tmp['C13'].values[0]
                tmp_v221 = tmp['V221'].values[0]
                tmp_v285 = tmp['V285'].values[0]
                tmp_hour = tmp['hour'].values[0]
                tmp_fraud = tmp[COLUMN_TARGET].values[0]
                
                diff_d1   = tmp_d1 - base_d1
                diff_c1   = abs(tmp_c1 - base_c1)
                minus_c1   = tmp_c1 < base_c1
                if minus_c1:
                    thres_c1 = 1
                    thres_c1_2 = 1
                else:
                    thres_c1 = 2
                    thres_c1_2 = 4
                
                diff_c13  = abs(tmp_c13 - base_c13)
                diff_v221 = abs(tmp_v221 - base_v221)
                diff_v285 = abs(tmp_v285 - base_v285)
                
                ratio_c1   = abs(tmp_c1  / base_c1)
                ratio_c13  = abs(tmp_c13 / base_c13)
                
    #             if base_user==58:
    #                 print("="*12)
    #                 print(f"base {base_user} {base_dt}  ->  D1:{base_d1} D3:{base_d3} C1:{base_c1} C13:{base_c13} V221:{base_v221} V285:{base_v285}")
    #                 print(f"tmp  {tmp_user} {tmp_dt}  ->  D1:{tmp_d1} D3:{tmp_d3} C1:{tmp_c1} C13:{tmp_c13} V221:{tmp_v221} V285:{tmp_v285}")
    #                 print(f"diff_D1 {diff_d1} diffD1-D3: {diff_d1-tmp_d3} diff_C1:{diff_c1} diff_C13:{diff_c13} diff_V221:{diff_v221} diff_V285:{diff_v285}")
    #                 print("="*12)
                
                # D3がNullの場合
                if base_d3==-99999 or tmp_d3==-99999:
                    # TODO
                    # ここでFraud条件を入れると、train-test間を同じユーザーにまとめることはできなくなる
                    # 両パターンを作成し、Fraud含めないパターンでまたいたユーザーが本当にまたいでるかチェックしたい
                    if base_fraud==tmp_fraud and diff_c1<=thres_c1 and diff_c13<=thres_c1 and diff_v221<=2 and diff_v285<=2:
                        pass
                    elif base_c1>=20 and tmp_c1>=20 and (0.9<=ratio_c1 and ratio_c1<=1.1) and (0.9<=ratio_c13 or ratio_c13<=1.1) and diff_v221<=2 and diff_v285<=2:
                        pass
                    elif base_fraud==tmp_fraud and base_c1>=20 and tmp_c1>=20 and (0.9<=ratio_c1 and ratio_c1<=1.1) and (0.9<=ratio_c13 or ratio_c13<=1.1) and diff_v221>90000 and diff_v285>90000:
                        pass
                    elif base_fraud==tmp_fraud and base_c1>=20 and tmp_c1>=20 and (0.9<=ratio_c1 and ratio_c1<=1.1) and (0.9<=ratio_c13 or ratio_c13<=1.1) and diff_v221>90000 and diff_v285<=2:
                        pass
                    elif base_fraud==tmp_fraud and base_c1>=20 and tmp_c1>=20 and (0.9<=ratio_c1 and ratio_c1<=1.1) and (0.9<=ratio_c13 or ratio_c13<=1.1) and diff_v221<=2 and diff_v285>90000:
                        pass
                    else:
                        continue
                    
                # D3がNullsでない場合
                # D3の差分が時差を考慮してもおかしい場合は別ユーザーにする（時差が1~2時間以上だと強制的にはじいてしまうが）
                elif (diff_d1 == tmp_d3) or abs(diff_d1 - tmp_d3)==1 and (tmp_hour>=23 or tmp_hour<=1):
                    pass
                else:
                    # 別ユーザーとみなす
                    continue
                    
                    
                # D3がNullsでない場合
                if base_d3!=-99999 and tmp_d3!=-99999:
                    # 別ユーザーとみなす
                    if diff_c1<-10 or diff_c1>1000:
                        continue
                    if diff_c13<-10 or diff_c13>1000:
                        continue
                        
                    # V221, 285の変化量が1トランザクションで10以上はないという想定
                    if (10<= diff_v221) and (diff_v221<=50):
                        continue
                    if (10<= diff_v285) and (diff_v285<=30):
                        continue
                    
                    if diff_c1<=thres_c1_2 and diff_c13<=thres_c1_2 and diff_v221<=2 and diff_v285<=2 and diff_d1>0:
                        pass
                    elif diff_c1<=thres_c1 and diff_c13<=thres_c1 and diff_v221<=1 and diff_v285<=1 and diff_d1==0:
                        pass
                    elif base_c1>=20 and tmp_c1>=20 and (0.8<=ratio_c1 and ratio_c1<=1.6) and (0.8<=ratio_c13 or ratio_c13<=1.6) and diff_d1!=0:
                        pass
                    else:
                        # 別ユーザーとみなす
                        continue
                    
                user_map[tmp_user] = base_user
                
            if len(user_map.keys()) == len(users):
                break
                
        tmp_user = pd.Series(user_map).to_frame('same_user')
        tmp_user['group_no'] = same_group
        list_user_map.append(tmp_user)
    return tmp_user

In [None]:
list_p = Parallel(16)([delayed(parallel_explore_same_user)(group_no_list) for group_no_list in arg_list])

In [125]:
tmp = user_df.set_index('no').join( pd.concat(list_user_map), how='inner')
tmp[tmp['group_no']>=10].sort_values(by='same_user')

Unnamed: 0,TransactionID,user_id,isFraud,datetime,Regist_date,D1,D3,C1,C13,V221,V285,hour,same_user,group_no
27,3017689,3017689,1.0,2017-11-12 16:57:08,2016-07-01,499.0,-99999.0,4.0,0.0,1.0,-99999.0,16,27,10
28,3028580,3017689,1.0,2017-11-15 14:45:13,2016-07-01,502.0,-99999.0,4.0,0.0,1.0,0.0,14,28,10
29,3028585,3017689,1.0,2017-11-15 14:46:20,2016-07-01,502.0,-99999.0,4.0,0.0,1.0,0.0,14,28,10
30,3028936,3017689,1.0,2017-11-15 15:48:23,2016-07-01,502.0,-99999.0,5.0,1.0,1.0,0.0,15,28,10
31,3028956,3017689,1.0,2017-11-15 15:52:08,2016-07-01,502.0,-99999.0,5.0,1.0,1.0,0.0,15,28,10
32,3038102,3017689,0.0,2017-11-17 18:13:53,2016-07-01,504.0,-99999.0,3306.0,2067.0,-99999.0,0.0,18,32,10
33,3106587,3017689,0.0,2017-11-30 19:19:31,2016-07-01,517.0,-99999.0,41.0,6.0,-99999.0,0.0,19,33,10
55,3371836,3017689,0.0,2018-02-23 13:22:19,2016-07-01,602.0,-99999.0,1.0,1.0,-99999.0,0.0,13,34,10
54,3369423,3017689,0.0,2018-02-22 16:36:32,2016-07-01,601.0,-99999.0,0.0,0.0,-99999.0,0.0,16,34,10
51,3345228,3017689,0.0,2018-02-14 14:48:13,2016-07-01,593.0,-99999.0,0.0,0.0,-99999.0,0.0,14,34,10


In [None]:
# ユーザーIDの作成
cnt = 0
p_cnt = 0
list_user = []
for user, d3, b1, b2, b3, D1_b1, D1_b2 in user_df[[
    uid,
    'D3',
    'diff_day__before1',
    'diff_day__before2',
    'diff_day__before3',
    'diff_D1__before1',
    'diff_D1__before2'
]].values:
    p_cnt+=1
#         if d3==b3 or d3==b3-1:
    if d3==b3:
        current_user = b3_user
#         elif d3==b2 or d3==b2-1:
    elif d3==b2:
        current_user = b2_user
#         elif d3==b1 or d3==b1-1:
    elif d3==b1:
        current_user = b1_user
#         elif D1_b1 == b1 and D1_b1 != 0:
#             current_user = b1_user
#         elif D1_b2 == b2 and D1_b2 != 0:
#             current_user = b2_user
    else:
        cnt += 1
    current_user = cnt

    if p_cnt>=3:
        b3_user = b2_user
    if p_cnt>=2:
        b2_user = b1_user
    b1_user = cnt
    list_user.append(current_user)

user_df[col_same_user] = list_user
print(f"Extract Same User ID.", user_df.shape)
user_df = user_df[[COLUMN_ID, col_same_user, uid]]

In [None]:
for pattern, user_keys in tqdm(list_user_keys):
    df_new_user = create_new_user_id(data)
    df_new_user.to_csv(f'../output/0830_ieee__same_user__pattern-{pattern}.csv', index=False)