In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [None]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

train_paths = glob('../feature/eda_base/*_train.gz')
test_paths = glob('../feature/eda_base/*_test.gz')

train_paths = [path for path in train_paths 
               if (path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count(COLUMN_TARGET)
               or path.count('V')
               or path.count('C')
               or path.count('D')
               or path.count('card')
               or path.count('addr')
               or path.count('domain')
               or path.count('TransactionAmt')
               or path.count('Product')
               )
               and not path.count('new_uid')
               and not path.count('fill')
               and not path.count('bin')
               and not path.count('129')
              ]
test_paths = [path for path in test_paths 
               if (path.count(COLUMN_DT) 
               or path.count(COLUMN_ID)
               or path.count(COLUMN_TARGET)
               or path.count('V')
               or path.count('C')
               or path.count('D')
               or path.count('card')
               or path.count('addr')
               or path.count('domain')
               or path.count('TransactionAmt')
               or path.count('Product')
               )
               and not path.count('new_uid')
               and not path.count('fill')
               and not path.count('bin')
               and not path.count('129')
              ]

df_train = parallel_load_data(train_paths)
print(df_train.shape)
df_test = parallel_load_data(test_paths)
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
train_length = df_train.shape[0]

df_user_id_ca = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr.csv').set_index(COLUMN_ID)
df_user_id_cap = pd.read_csv('../output/same_user_pattern/0903__same_user_id__card_addr_pemail.csv').set_index(COLUMN_ID)
df_user_id_capm = pd.read_csv('../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv').set_index(COLUMN_ID)
df_user_id_bear = pd.read_csv('../output/same_user_pattern/20190901_user_ids_share.csv').set_index(COLUMN_ID)

data.set_index(COLUMN_ID, inplace=True)
data['user_id_bear'] = df_user_id_bear['predicted_user_id']
data.reset_index(inplace=True)

# df_train['user_id_card_addr'] = df_user_id_ca['predicted_user_id']
# df_train['user_id_card_addr_pemail'] = df_user_id_cap['predicted_user_id']
# df_train['user_id_card_addr_pemail_M'] = df_user_id_capm['predicted_user_id']

In [3]:
START_DATE = '2017-12-01'
# START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

data['datetime'] = data['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x) - datetime.timedelta(seconds = 14400) ))
data['datetime'].fillna(datetime.date(2020, 1, 1), inplace=True)
data['date'] = data['datetime'].map(lambda x: x.date())

# df_train['datetime'] = df_train['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x) - datetime.timedelta(seconds = 14400) ))
# df_train['datetime'].fillna(datetime.date(2020, 1, 1), inplace=True)
# df_train['date'] = df_train['datetime'].map(lambda x: x.date())

list_regist = []
for d, diff in tqdm(data[['date', 'D1']].values):
    if diff < 999999:
        regist = date_add_days(d, -1*diff)
    else:
        regist = date_add_days(d, 0)
    list_regist.append(str(regist))

data['Regist_date'] = list_regist

100%|██████████| 1097231/1097231 [00:04<00:00, 261451.42it/s]


In [4]:
df_check = read_pkl_gzip(path='../output/0919_ieee__eight_residual_50000over.gz')
df_check = df_check.merge(df_train, how='inner', on=COLUMN_ID)

cols_uid = [col for col in df_check.columns if col.count('130_')]

col_bear  = 'user_id_bear'
col_uid_1 = 'user_id_card_addr'
col_uid_2 = 'user_id_card_addr_pemail'
col_uid_3 = 'user_id_card_addr_pemail_M'

list_uid = [
    col_bear ,
    col_uid_1,
    col_uid_2,
    col_uid_3,
] + cols_uid

# for uid in list_uid:
#     cnt_map = df_check[uid].value_counts()
#     df_check[f'cnt__{uid}'] = df_check[uid].map(cnt_map)

In [27]:
cols_sort = ['datetime', 'user_id_bear', 'isFraud', 'residual', '_eight_rank', '_pred_rank', 'C1', 'C11', 'C13', 'P_emaildomain', 'ProductCD', 'R_emaildomain', 'Regist_date', 'D1', 'D3', 'addr1', 'card1', 'card2', 'card3', 'V95', 'V97', 'V127', 'V128', 'TransactionAmt', 'C10', 'C12', 'C14', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'D2', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'V101', 'V102', 'V103', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V123', 'V124', 'V125', 'V130', 'V131', 'V133', 'V156', 'V165', 'V187', 'V2', 'V201', 'V243', 'V258', 'V259', 'V265', 'V267', 'V281', 'V282', 'V283', 'V29', 'V294', 'V3', 'V306', 'V307', 'V308', 'V310', 'V312', 'V313', 'V314', 'V315', 'V317', 'V318', 'V320', 'V37', 'V38', 'V4', 'V44', 'V45', 'V48', 'V49', 'V5', 'V53', 'V54', 'V6', 'V61', 'V62', 'V67', 'V7', 'V70', 'V76', 'V78', 'V83', 'V87', 'V90', 'V91', 'V94', '__DT-M', '__DT-W', 'addr2', 'card4', 'card5', 'card6']
cols_all = df_check.columns
cols_remain = sorted(list(set(cols_all) - set(cols_sort)))
df_check = df_check[cols_sort + cols_remain]

In [16]:
pd.set_option('max_columns', 200)
pd.set_option('max_rows', 200)
col_amt = 'TransactionAmt'
cols_V = [col for col in tmp.columns if col.startswith('V')]
cols_C = [col for col in tmp.columns if col.startswith('C')]
cols_D = [col for col in tmp.columns if col.startswith('D')]

In [59]:
data['C1_C14_ratio'] = data['C1'] / data['C14']
data['ratio_id'] = data['C1_C14_ratio'].map(lambda x: np.round(x, 3)).astype('str').fillna('#')  + '_' + data['addr1'].astype('str')
# + '_' + data['card1'].astype('str') \

check_ids = data['ratio_id'].value_counts().head(20).index


# df_check['C1_C14_ratio'] = df_check['C1'] / df_check['C14']
# # df_check[df_check[col_bear]==200780.0][['pred', 'C1', 'C14', 'C1_C14_ratio']].sort_values(by='pred')
# # df_check[(df_check['C1_C14_ratio']==1) & (df_check['card1'])]
# df_check['ratio_id'] = df_check['C1_C14_ratio'].map(lambda x: np.round(x, 3)).astype('str') + '_' + df_check['card1'].astype('str')
# df_check['ratio_id'].value_counts().head(20)

In [60]:
for uid in check_ids:
    tmp = data[data['ratio_id']==uid]
    cnt = tmp[col_bear].value_counts()
    ids = cnt[cnt==1].index
    print(ids.shape)
    sys.exit()
    
#     df_train[df_train[col_bear].isin(ids)][[col_bear, 'Regist_date', 'datetime'] + cols_D].sort_values(by='Regist_date')
# print(ids.shape)
# tmp.sort_values(by='datetime', inplace=True)
# tmp[[col_bear] + cols_C + cols_D].sort_values(by=col_bear)

(21809,)


SystemExit: 

In [125]:
# df_filter = data[data[col_bear].isin(ids)]
# cnt = df_filter[col_bear].value_counts()
# cnt = data[col_bear].value_counts()
# ids = cnt[cnt==1].index
tmp = data[data[col_bear].isin(ids)]
# save = tmp[['ProductCD', 'isFraud', 'Regist_date', 'datetime', 'D1', 'D2', 'D3', 'C1', 'C14', 'V95', 'V97', col_amt, 'V130', 'V131', 'V307', 'V308']].sort_values(by='datetime').sort_values(by=['Regist_date'])
save = tmp[[COLUMN_ID, 'ProductCD', 'isFraud', 'Regist_date', 'datetime', 'card1', 'card2', 'card3', 'D1', 'D2', 'D3', 'D6', 'D7', 'D8', 'D13', 'D14', 'D15', 'C1', 'C14', 'V95', 'V97', col_amt, 'V130', 'V131', 'V307', 'V308', col_bear]].sort_values(by='datetime').sort_values(by=['Regist_date'])

In [None]:
save_r = save[save['ProductCD']=='R']
# save_r.to_csv('../output/0919_probing_R_addr1_C1_C14_ratio.csv')

tmp = save_r[~save_r['D8'].isnull()]
tmp = tmp[[COLUMN_ID, 'datetime', 'D8', 'isFraud']]
list_user_r = []
already = []

for uid, d, d8, t in tqdm(tmp[[COLUMN_ID, 'datetime', 'D8', 'isFraud']].values):
    if uid in set(already):
        continue
    tmp['diff'] = (tmp['D8'] - d8).astype('int')
    candidates = tmp[(tmp['datetime'] - d).map(lambda x: x.total_seconds()/60/60).astype('int') == tmp['diff']]
    if candidates.shape[0]>1:
#     if candidates['isFraud'].isnull().sum()>1:
#     if candidates['isFraud'].sum()>1:
#         display(candidates)
        list_user_r.append(candidates[COLUMN_ID].values)
        already += candidates[COLUMN_ID].values.tolist()

In [182]:
save_r = save[save['ProductCD']=='R']
# save_r.to_csv('../output/0919_probing_R_addr1_C1_C14_ratio.csv')

tmp = save_r[~saver['D3'].isnull()]
tmp = tmp[[COLUMN_ID, 'datetime', 'Regist_date', 'D1', 'D3', 'D8', 'isFraud']]
list_user_R_D3 = []
already_R_D3 = []

for uid, d, rd, d1,d3, d8, t in tmp[[COLUMN_ID, 'datetime', 'Regist_date', 'D1', 'D3', 'D8', 'isFraud']].values:
    if uid in set(already_R_D3):
        continue
    diff = d1 - d3
    candidates = tmp[((tmp['D1'] == diff) | (tmp[COLUMN_ID]==uid)) & (tmp['Regist_date']==rd) & (tmp['D3']!=0)]
    if candidates.shape[0]>1:
#         display(candidates.sort_values(by='datetime'))
        list_user_R_D3.append(candidates[COLUMN_ID].values)
        already_R_D3 += candidates[COLUMN_ID].values.tolist()

In [178]:
save_s = save[save['ProductCD']=='S']
# save_s.to_csv('../output/0919_probing_R_addr1_C1_C14_ratio.csv')

tmp = save_s[~save_s['D8'].isnull()]
tmp = tmp[[COLUMN_ID, 'datetime', 'D8', 'isFraud']]
list_user_S_D8 = []
already_S_D8 = []

for uid, d, d8, t in tmp[[COLUMN_ID, 'datetime', 'D8', 'isFraud']].values:
    if uid in set(already_S_D8):
        continue
    tmp['diff'] = (tmp['D8'] - d8).astype('int')
    candidates = tmp[(tmp['datetime'] - d).map(lambda x: x.total_seconds()/60/60).astype('int') == tmp['diff']]
    if candidates.shape[0]>1:
#     if candidates['isFraud'].isnull().sum()>1:
#     if candidates['isFraud'].sum()>1:
#         display(candidates)
        list_user_S_D8.append(candidates[COLUMN_ID].values)
        already_S_D8 += candidates[COLUMN_ID].values.tolist()

In [179]:
save_s = save[save['ProductCD']=='S']
# save_s.to_csv('../output/0919_probing_R_addr1_C1_C14_ratio.csv')

tmp = save_s[~save_s['D3'].isnull()]
tmp = tmp[[COLUMN_ID, 'datetime', 'Regist_date', 'D1', 'D3', 'D8', 'isFraud']]
list_user_S_D3 = []
already_S_D3 = []

for uid, d, rd, d1,d3, d8, t in tmp[[COLUMN_ID, 'datetime', 'Regist_date', 'D1', 'D3', 'D8', 'isFraud']].values:
    if uid in set(already_S_D3):
        continue
    diff = d1 - d3
    candidates = tmp[((tmp['D1'] == diff) | (tmp[COLUMN_ID]==uid)) & (tmp['Regist_date']==rd) & (tmp['D3']!=0)]
    if candidates.shape[0]>1:
#         display(candidates.sort_values(by='datetime'))
        list_user_S_D3.append(candidates[COLUMN_ID].values)
        already_S_D3 += candidates[COLUMN_ID].values.tolist()

In [177]:
save_w = save[save['ProductCD']=='W']
# save_w.to_csv('../output/0919_probing_R_addr1_C1_C14_ratio.csv')

tmp = save_w[~save_w['D8'].isnull()]
tmp = tmp[[COLUMN_ID, 'datetime', 'D8', 'isFraud']]
list_user_W_D8 = []
already_W_D8 = []

for uid, d, d8, t in tmp[[COLUMN_ID, 'datetime', 'D8', 'isFraud']].values:
    if uid in set(already_W_D8):
        continue
    tmp['diff'] = (tmp['D8'] - d8).astype('int')
    candidates = tmp[(tmp['datetime'] - d).map(lambda x: x.total_seconds()/60/60).astype('int') == tmp['diff']]
    if candidates.shape[0]>1:
#     if candidates['isFraud'].isnull().sum()>1:
#     if candidates['isFraud'].sum()>1:
#         display(candidates)
        list_user_W_D8.append(candidates[COLUMN_ID].values)
        already_W_D8 += candidates[COLUMN_ID].values.tolist()

In [175]:
save_w = save[save['ProductCD']=='W']
# save_w.to_csv('../output/0919_probing_R_addr1_C1_C14_ratio.csv')

tmp = save_w[~save_w['D3'].isnull()]
tmp = tmp[[COLUMN_ID, 'datetime', 'Regist_date', 'D1', 'D3', 'D8', 'isFraud']]
list_user_W_D3 = []
already_W_D3 = []

for uid, d, rd, d1,d3, d8, t in tmp[[COLUMN_ID, 'datetime', 'Regist_date', 'D1', 'D3', 'D8', 'isFraud']].values:
    if uid in set(already_W_D3):
        continue
    diff = d1 - d3
    candidates = tmp[((tmp['D1'] == diff) | (tmp[COLUMN_ID]==uid)) & (tmp['Regist_date']==rd) & (tmp['D3']!=0)]
    if candidates.shape[0]>1:
#         display(candidates.sort_values(by='datetime'))
        list_user_W_D3.append(candidates[COLUMN_ID].values)
        already_W_D3 += candidates[COLUMN_ID].values.tolist()

In [176]:
save_c = save[save['ProductCD']=='C']
# save_c.to_csv('../output/0919_probing_R_addr1_C1_C14_ratio.csv')

tmp = save_c[~save_c['D8'].isnull()]
tmp = tmp[[COLUMN_ID, 'datetime', 'D8', 'isFraud']]
list_user_C_D8 = []
already_C_D8 = []

for uid, d, d8, t in tmp[[COLUMN_ID, 'datetime', 'D8', 'isFraud']].values:
    if uid in set(already_C_D8):
        continue
    tmp['diff'] = (tmp['D8'] - d8).astype('int')
    candidates = tmp[(tmp['datetime'] - d).map(lambda x: x.total_seconds()/60/60).astype('int') == tmp['diff']]
    if candidates.shape[0]>1:
#     if candidates['isFraud'].isnull().sum()>1:
#     if candidates['isFraud'].sum()>1:
#         display(candidates)
        list_user_C_D8.append(candidates[COLUMN_ID].values)
        already_C_D8 += candidates[COLUMN_ID].values.tolist()

In [170]:
save_c = save[save['ProductCD']=='C']
# save_c.to_csv('../output/0919_probing_R_addr1_C1_C14_ratio.csv')

tmp = save_c[~save_c['D3'].isnull()]
tmp = tmp[[COLUMN_ID, 'datetime', 'Regist_date', 'D1', 'D3', 'D8', 'isFraud']]
list_user_C_D3 = []
already_C_D3 = []

for uid, d, rd, d1,d3, d8, t in tmp[[COLUMN_ID, 'datetime', 'Regist_date', 'D1', 'D3', 'D8', 'isFraud']].values:
    if uid in set(already_C_D3):
        continue
    diff = d1 - d3
    candidates = tmp[((tmp['D1'] == diff) | (tmp[COLUMN_ID]==uid)) & (tmp['Regist_date']==rd) & (tmp['D3']!=0)]
    if candidates.shape[0]>1:
#         display(candidates.sort_values(by='datetime'))
        list_user_C_D3.append(candidates[COLUMN_ID].values)
        already_C_D3 += candidates[COLUMN_ID].values.tolist()

In [180]:
save_h = save[save['ProductCD']=='H']
# save_h.to_csv('../output/0919_probing_R_addr1_C1_C14_ratio.csv')

tmp = save_h[~save_h['D8'].isnull()]
tmp = tmp[[COLUMN_ID, 'datetime', 'D8', 'isFraud']]
list_user_H_D8 = []
already_H_D8 = []

for uid, d, d8, t in tmp[[COLUMN_ID, 'datetime', 'D8', 'isFraud']].values:
    if uid in set(already_H_D8):
        continue
    tmp['diff'] = (tmp['D8'] - d8).astype('int')
    candidates = tmp[(tmp['datetime'] - d).map(lambda x: x.total_seconds()/60/60).astype('int') == tmp['diff']]
    if candidates.shape[0]>1:
#     if candidates['isFraud'].isnull().sum()>1:
#     if candidates['isFraud'].sum()>1:
#         display(candidates)
        list_user_H_D8.append(candidates[COLUMN_ID].values)
        already_H_D8 += candidates[COLUMN_ID].values.tolist()

In [181]:
save_h = save[save['ProductCD']=='H']
# save_h.to_csv('../output/0919_probing_R_addr1_C1_C14_ratio.csv')

tmp = save_h[~save_h['D3'].isnull()]
tmp = tmp[[COLUMN_ID, 'datetime', 'Regist_date', 'D1', 'D3', 'D8', 'isFraud']]
list_user_H_D3 = []
already_H_D3 = []

for uid, d, rd, d1,d3, d8, t in tmp[[COLUMN_ID, 'datetime', 'Regist_date', 'D1', 'D3', 'D8', 'isFraud']].values:
    if uid in set(already_H_D3):
        continue
    diff = d1 - d3
    candidates = tmp[((tmp['D1'] == diff) | (tmp[COLUMN_ID]==uid)) & (tmp['Regist_date']==rd) & (tmp['D3']!=0)]
    if candidates.shape[0]>1:
#         display(candidates.sort_values(by='datetime'))
        list_user_H_D3.append(candidates[COLUMN_ID].values)
        already_H_D3 += candidates[COLUMN_ID].values.tolist()

In [None]:
cnt = 0
save['d8_progress'] = np.nan

for users in tqdm(list_user_R_D3):
# tmp = np.hstack(list_user_r)
    save.loc[save[COLUMN_ID].isin(users), 'd8_progress'] = cnt
    cnt += 1
    
for users in tqdm(list_user_S_D8):
# tmp = np.hstack(list_user_r)
    save.loc[save[COLUMN_ID].isin(users), 'd8_progress'] = cnt
    cnt += 1
    
    
for users in tqdm(list_user_W_D8):
# tmp = np.hstack(list_user_r)
    save.loc[save[COLUMN_ID].isin(users), 'd8_progress'] = cnt
    cnt += 1
    
    
for users in tqdm(list_user_C_D8):
# tmp = np.hstack(list_user_r)
    save.loc[save[COLUMN_ID].isin(users), 'd8_progress'] = cnt
    cnt += 1
    
    
for users in tqdm(list_user_H_D8):
# tmp = np.hstack(list_user_r)
    save.loc[save[COLUMN_ID].isin(users), 'd8_progress'] = cnt
    cnt += 1

In [None]:
cnt = 0
save['d3_progress'] = np.nan

for users in tqdm(list_user_R_D3):
# tmp = np.hstack(list_user_r)
    save.loc[save[COLUMN_ID].isin(users), 'd3_progress'] = cnt
    cnt += 1
    
for users in tqdm(list_user_S_D3):
# tmp = np.hstack(list_user_r)
    save.loc[save[COLUMN_ID].isin(users), 'd3_progress'] = cnt
    cnt += 1
    
    
for users in tqdm(list_user_W_D3):
# tmp = np.hstack(list_user_r)
    save.loc[save[COLUMN_ID].isin(users), 'd3_progress'] = cnt
    cnt += 1
    
    
for users in tqdm(list_user_C_D3):
# tmp = np.hstack(list_user_r)
    save.loc[save[COLUMN_ID].isin(users), 'd3_progress'] = cnt
    cnt += 1
    
    
for users in tqdm(list_user_H_D3):
# tmp = np.hstack(list_user_r)
    save.loc[save[COLUMN_ID].isin(users), 'd3_progress'] = cnt
    cnt += 1

In [205]:
to_pkl_gzip(obj=save[[COLUMN_ID, 'd3_progress', 'd8_progress']], path='../output/0920_ieee__d3_d8_progress_ProductCD')