In [None]:
%load_ext autoreload
%autoreload 2
from glob import glob
import os
from pathlib import Path
import gc
import re
import sys
import yaml
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename, logger_func
from ieee_train import eval_train, eval_check_feature
from kaggle_utils import reduce_mem_usage, move_feature

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
plt.figure(figsize=(12, 12))

try:
    logger
except NameError:
    logger = logger_func()

In [None]:
# Column names shared by the cells below.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'

# Columns listed here are bookkeeping columns rather than model inputs
# (presumably excluded from the feature set by the training code - confirm).
COLUMNS_IGNORE = [
    COLUMN_ID,
    COLUMN_DT,
    COLUMN_TARGET,
    COLUMN_GROUP,
    'is_train',
    'datetime',
    'date',
    'year',
    'month',
    'DT-M',  # same value as COLUMN_GROUP; the duplicate entry is kept as-is
]

def filter_feature(path):
    """Tell whether ``path`` contains the filter pattern.

    NOTE(review): the pattern is the empty string, and ``str.count('')``
    is ``len(path) + 1``, so the result is True for every string. Confirm
    which substring was meant to be matched here.

    Args:
        path: File path (any ``str``).

    Returns:
        bool: True when the pattern occurs at least once in ``path``.
    """
    return path.count('') > 0

# File-name prefixes of the feature files to load; each prefix is globbed
# once for the *_train.gz files and once for the *_test.gz files.
feature_prefixes = [
    'Tran',
    'is',
    '528__ugr_R_emaildomain_C1_C14_ratio_agg_V35_52',
]

paths_train = []
paths_test = []
for prefix in feature_prefixes:
    paths_train += glob(f'../submit/re_sub/{prefix}*_train.gz')
    paths_test += glob(f'../submit/re_sub/{prefix}*_test.gz')
print(len(paths_train))

paths_train_feature = []
paths_test_feature = []

# Memory reduction via reduce_mem_usage() was disabled in the original cell,
# so the frames are loaded as-is.
df_train = parallel_load_data(paths_train)
df_test = parallel_load_data(paths_test)

# Train rows first, test rows after; the row labels of both frames are kept.
data = pd.concat([df_train, df_test], axis=0)

In [None]:
#========================================================================
# Bear's score
#========================================================================

from sklearn.metrics import roc_auc_score

# Also assigned (same value) in the "Bear's ID" cell further down; set here
# so this cell does not depend on that later cell having run.
col_bear = 'predicted_user_id'

# Columns 0-3 and 6 of the probing sheet. This is the net effect of the
# original two-step selection ([0, 1, 2, 3, 4, 6] followed by [0, 1, 2, 3, 5]).
bear_probing = pd.read_csv('../input/20190913_ieee__bear_probing.csv').iloc[:, [0, 1, 2, 3, 6]]
bear_probing.columns = [COLUMN_ID, COLUMN_DT, col_bear, 'data_type', 'bear_probing']

# The inner merge keeps only transactions present in `data`. 'DT-M' is
# attached when `data` already has it (it is created in a later cell).
cols_from_data = [COLUMN_ID] + [col for col in ['DT-M'] if col in data.columns]
bear_probing = bear_probing.merge(data[cols_from_data], how='inner', on=COLUMN_ID)

# Kept for backward compatibility. Later cells rebind `bear`, which is why
# bear_validation() reads `bear_probing` instead.
bear = bear_probing

submission = pd.read_csv('../input/sample_submission.csv')
submission.columns = [COLUMN_ID, 'pred']


def bear_validation(test_pred, col_label='bear_probing'):
    """Score test predictions against the probing sheet.

    Writes ``test_pred`` into the notebook-level ``submission['pred']``
    (this side effect is kept from the original), joins it with
    ``bear_probing`` on ``COLUMN_ID`` and computes ROC AUC on the rows
    tagged ``test_public``, on the rows tagged ``test_private`` and on all
    joined rows.

    NOTE(review): the original indexed ``COLUMN_TARGET`` ('isFraud'), which
    is not a column of the joined frame and raised ``KeyError``.
    'bear_probing' is the only label-like column left after the renaming
    above, so it is used as the default - confirm it holds the labels.

    Args:
        test_pred: Predictions, one per row of ``submission`` and in the
            same row order.
        col_label: Column of the joined frame holding the reference labels.

    Returns:
        tuple: ``(public_score, private_score, all_score)``.

    Raises:
        ValueError: Propagated from ``roc_auc_score`` or from the column
            assignment when ``test_pred`` has the wrong length.
    """
    submission['pred'] = test_pred
    bear_score = submission.merge(bear_probing, how='inner', on=COLUMN_ID)
    public = bear_score[bear_score['data_type'] == 'test_public']
    private = bear_score[bear_score['data_type'] == 'test_private']

    def _auc(frame):
        # ROC AUC of the predictions in `frame` against the reference labels.
        return roc_auc_score(frame[col_label].values, frame['pred'].values)

    public_score = _auc(public)
    private_score = _auc(private)
    all_score = _auc(bear_score)

    return public_score, private_score, all_score

In [None]:
# TransactionDT is interpreted as seconds elapsed since this timestamp.
startdate = datetime.datetime(2017,12,1)

# Vectorised replacements for the former row-wise apply/map lambdas (which
# also relied on positional `x[0]` access on a label-indexed row).
data['datetime'] = pd.Timestamp(startdate) + pd.to_timedelta(data[COLUMN_DT], unit='s')
data['year'] = data['datetime'].dt.year
# Month 6 is relabelled as month 5, so June rows share the 'DT-M' value of May.
data['month'] = data['datetime'].dt.month.replace(6, 5)

# 'DT-M' is '<year>-<month>' with an unpadded month, e.g. '2018-7'.
data['DT-M'] = data['year'].astype(str) + '-' + data['month'].astype(str)

In [None]:
#========================================================================
# Bear's ID
#========================================================================
col_bear = 'predicted_user_id'
same_user_path = '../output/same_user_pattern/20190901_user_ids_share.csv'

# Only the transaction id and the predicted user id are needed from the file.
bear = pd.read_csv(same_user_path)[[COLUMN_ID, col_bear]]

# Rows without a predicted user all receive the shared sentinel id -1.
# (A variant that gave every such row its own fresh id above the current
# maximum was left disabled in the original cell.)
bear[col_bear] = bear[col_bear].fillna(-1).astype('int')

# Inner join: transactions absent from the user-id file are dropped from `data`.
data = pd.merge(data, bear[[COLUMN_ID, col_bear]], how='inner', on=COLUMN_ID)

In [None]:
df_lb = read_pkl_gzip('../output/pred_result/20190927_0221__CV0-9594791704263358__all_preds.gz')
df_bear = read_pkl_gzip('../output/pred_result/20190929_1132__CV0-912702787903791__all_preds_Bear_GROUPK.gz')
df_lb.columns = [COLUMN_ID, 'pred_lb']
df_bear.columns = [COLUMN_ID, 'pred_bear']

# Pair the two prediction sets by transaction id. The previous version
# subtracted and assigned by row label, which is only correct when all
# three frames happen to share the same row order and index.
pred_pair = df_lb.merge(df_bear, how='inner', on=COLUMN_ID).set_index(COLUMN_ID)

# Absolute difference of the predictions, rounded to two decimals first.
diff_by_id = (pred_pair['pred_lb'] - pred_pair['pred_bear']).round(2).abs()

# Ids missing from either prediction file get NaN. Duplicate ids in the
# prediction files make this lookup raise instead of mis-assigning silently.
data['diff_pred'] = data[COLUMN_ID].map(diff_by_id)

In [None]:
#========================================================================
# Differences below 0.02 are treated as noise.
# IDs whose two predictions differ by 0.02 or more are considered to be
# affected by the leak.
# Presumably users that bear's id identifies cleanly show larger differences,
# while noisy user_ids show smaller ones.
# NOTE(review): the text above says 0.02 but the code uses 0.01 - confirm.
#========================================================================
threshold = 0.01
diff_pred = data['diff_pred']
# Rows whose diff_pred is NaN fall into neither subset.
df_leak = data[diff_pred >= threshold]
df_solo = data[diff_pred < threshold]

# As expected, many of the users with a large difference are in public/private.
# (Missing targets are shown as -1.)
for subset in (df_leak, df_solo):
    display(subset[COLUMN_TARGET].fillna(-1).value_counts())

# Number of user ids occurring exactly once in the subset, divided by the
# subset's row count.
for label, subset in (('leak', df_leak), ('solo', df_solo)):
    n_single = (subset[col_bear].value_counts()==1).sum()
    print(f'solo ratio {label}:' , subset.shape, n_single / subset.shape[0])

In [None]:
#========================================================================
# Optimisation is carried out separately for each of the ID groups
# stored here.
#========================================================================
leak_ids, solo_ids = (frame[COLUMN_ID].values for frame in (df_leak, df_solo))

In [None]:
# Number of distinct 'DT-M' values each predicted user appears in.
# `.count().to_frame('count')` replaces the dict-renaming
# `.agg({'count': 'count'})`, which newer pandas rejects as a nested renamer.
cnt = (
    data[[col_bear, 'DT-M']]
    .drop_duplicates()
    .groupby(col_bear)['DT-M']
    .count()
    .to_frame('count')
)

# Users seen in more than one 'DT-M'. The previous `cnt[cnt>1].index` masked
# values with NaN instead of filtering rows, so it returned every user.
leak_bear = cnt[cnt['count'] > 1].index

all_bear = list(data[col_bear].values)
# Users seen in at most one 'DT-M' ...
list_solo = cnt[cnt['count']<=1].index.tolist()
# ... and all remaining users.
list_leak = list(set(all_bear) - set(list_solo))

print(len(list_solo), len(list_leak))
print(len(list_solo) + len(list_leak))

In [None]:
# Transaction ids belonging to single-month users and to the remaining users.
is_solo_user = data[col_bear].isin(list_solo)
is_leak_user = data[col_bear].isin(list_leak)
bear_solo_id = data.loc[is_solo_user, COLUMN_ID].tolist()
bear_leak_id = data.loc[is_leak_user, COLUMN_ID].tolist()

n_solo_rows = len(bear_solo_id)
n_leak_rows = len(bear_leak_id)
print(n_solo_rows , n_leak_rows)
print(n_solo_rows + n_leak_rows)

# NOTE(review): the file names say "bear_*", but what is written is
# leak_ids / solo_ids (the diff_pred split), not bear_leak_id / bear_solo_id
# computed above - confirm this is intended.
for ids, out_path in (
    (leak_ids, '../output/923_ieee__bear_leak_ids'),
    (solo_ids, '../output/923_ieee__bear_solo_ids'),
):
    to_pkl_gzip(obj=ids, path=out_path)

In [None]:
# Size of the single-month id list, then how many distinct ids of it are
# not in leak_ids.
print(len(bear_solo_id))
solo_not_in_leak = set(bear_solo_id).difference(leak_ids)
print( len(solo_not_in_leak) )

In [None]:
# Reload the prediction file that was read into df_lb earlier; df_pred keeps
# the column names stored in the file (df_lb had its columns renamed).
df_pred = read_pkl_gzip('../output/pred_result/20190927_0221__CV0-9594791704263358__all_preds.gz')

In [None]:
# Predictions restricted to the transactions of single-month users.
solo_mask = df_pred[COLUMN_ID].isin(bear_solo_id)
df_pred_solo = df_pred[solo_mask]
df_pred_solo.shape

In [None]:
# Final leak group: transactions of multi-month users, minus every
# transaction whose prediction difference fell under the threshold.
leak_id_set = set(bear_leak_id) - set(solo_ids)
result_leak_ids = list(leak_id_set)

# Final solo group: transactions of single-month users together with the
# under-threshold transactions.
solo_id_pool = list(bear_solo_id) + list(solo_ids)
result_solo_ids = list(set(solo_id_pool))

n_leak_ids = len(result_leak_ids)
n_solo_ids = len(result_solo_ids)
print(n_leak_ids , n_solo_ids)
print(n_leak_ids + n_solo_ids)

for ids, out_path in (
    (result_leak_ids, '../output/923_ieee__leak_ids'),
    (result_solo_ids, '../output/923_ieee__solo_ids'),
):
    to_pkl_gzip(obj=ids, path=out_path)

In [None]:
# Number of distinct ids in the final solo group.
len(set(result_solo_ids))

In [None]:
#========================================================================
# Join Prediction
#========================================================================
# Every `.values` assignment below is positional: it assumes the prediction
# files are in the same row order as df_train / df_test (TODO confirm).
# Only the 'pred_bear' files are joined by transaction id.

# The pickle is treated as train rows followed by test rows. It is read once,
# and the split point is taken BEFORE the inner merge below, because that
# merge can change len(df_train).
all_pred_holy = read_pkl_gzip('../output/pred_result/20190925_1450__CV0-9581588018233685__all_preds.gz').iloc[:, 1].values
n_train = len(df_train)

# ---- train ----
pred = all_pred_holy[:n_train]
df_train['pred_holy'] = pred

oof_haku = pd.read_csv('../output/oof/oof.csv').iloc[:, 1].values
df_train['pred_haku'] = oof_haku

oof_hmd = pd.read_csv('../output/oof/20190925_hmdhmd_oof.csv').iloc[:, 1].values
df_train['pred_hmd'] = oof_hmd

oof_bear = pd.read_csv('../output/oof/20190925_all_uid_agg_stats_lr001_oof_features1381_oof0.95_pub0.984_pri0.989.csv')
oof_bear.columns = [COLUMN_ID, 'pred_bear']
df_train = df_train.merge(oof_bear, how='inner', on=COLUMN_ID)

# ---- test ----
pred = all_pred_holy[n_train:]
df_test['pred_holy'] = pred

test_haku = pd.read_csv('../output/oof/submission.csv').iloc[:, 1].values
df_test['pred_haku'] = test_haku

test_hmd = pd.read_csv('../output/oof/20190925_hmdhmd_pred.csv').iloc[:, 1].values
df_test['pred_hmd'] = test_hmd

test_bear = pd.read_csv('../output/oof/20190925_all_uid_agg_stats_lr001_pred_features1381_oof0.95_pub0.984_pri0.989.csv')
test_bear.columns = [COLUMN_ID, 'pred_bear']
df_test = df_test.merge(test_bear, how='inner', on=COLUMN_ID)

In [None]:
#========================================================================
# Filter to the bear's ids that are going to be inspected
#========================================================================
# Chronological order, so that `.first()` below is each user's earliest row.
data = data.sort_values(by=COLUMN_DT)
bear_first = data.groupby(col_bear)['DT-M'].first()
bear_first_dt = data.groupby(col_bear)['datetime'].first()
bear_cnt = data.groupby(col_bear)['DT-M'].count()

# Users whose first transaction is on or after 2018-07-01. The comparison
# uses the timestamp: comparing 'DT-M' strings against '2018-7' is
# lexicographic and wrongly excluded '2018-10', '2018-11' and '2018-12'.
bear_test_user = bear_first_dt[bear_first_dt >= datetime.datetime(2018, 7, 1)].index
# Users with more than three transactions.
bear_multi_cnt_user = bear_cnt[bear_cnt>3].index
bear_valid_user = list(set(bear_test_user) & set(bear_multi_cnt_user))
print(len(bear_valid_user))

# The model predictions were attached to df_train / df_test, not to `data`,
# so they are gathered from those frames and joined by transaction id.
cols_model_pred = ['pred_holy', 'pred_haku', 'pred_hmd', 'pred_bear']
df_model_pred = pd.concat(
    [frame[[COLUMN_ID] + cols_model_pred] for frame in (df_train, df_test)],
    axis=0,
)

# Taken from `data` (which already carries col_bear and COLUMN_DT) so the
# sort below has the columns it needs, whichever cell last rebound `bear`.
df_bear_valid = data.loc[data[col_bear].isin(bear_valid_user), [COLUMN_ID, COLUMN_DT, col_bear]]
df_bear_valid = df_bear_valid.merge(df_model_pred, how='inner', on=COLUMN_ID)

# Fully qualified option name: the bare 'max_rows' pattern can match more
# than one option on newer pandas.
pd.set_option('display.max_rows', 1400)
df_bear_valid.sort_values(by=[col_bear, COLUMN_DT], ascending=False)

In [None]:
# Value counts of the feature in train and in test, side by side, ordered
# by how often each value occurs in test.
col_ugr = '528__ugr_R_emaildomain_C1_C14_ratio_agg_V35_52_mean_mean'
cnt_train = df_train[col_ugr].value_counts().rename('cnt_train')
cnt_test = df_test[col_ugr].value_counts().rename('cnt_test')
cnt_adv = pd.concat([cnt_train, cnt_test], axis=1).sort_values(by='cnt_test', ascending=False)
cnt_adv

In [None]:
from sklearn.metrics import roc_auc_score

# Bucket the feature by its value rounded to two decimals.
df_train['round_ugr'] = df_train['528__ugr_R_emaildomain_C1_C14_ratio_agg_V35_52_mean_mean'].round(2)
list_val = []

# Model suffixes, in the column order expected downstream
# (score_haku, score_hmd, score_holy, score_bear, then the means likewise).
model_names = ['haku', 'hmd', 'holy', 'bear']

# groupby yields the buckets in ascending order and skips NaN keys, and it
# avoids rescanning the whole frame per bucket. Loop-local names are used so
# that the notebook-level `bear`, `cnt` and `tmp` objects are not overwritten.
for val, df_bucket in tqdm(df_train.groupby('round_ugr')):

    y_train = df_bucket[COLUMN_TARGET].values
    bucket_preds = {name: df_bucket[f'pred_{name}'].values for name in model_names}

    try:
        bucket_scores = [roc_auc_score(y_train, bucket_preds[name]) for name in model_names]
    except ValueError:
        # roc_auc_score rejected the bucket (e.g. only one class present in
        # y_train); as before, all four scores are recorded as NaN.
        bucket_scores = [np.nan] * len(model_names)

    bucket_means = [np.mean(bucket_preds[name]) for name in model_names]

    # Row layout: ugr, cnt, 4 scores, max_target, 4 means.
    list_val.append([val, df_bucket.shape[0]] + bucket_scores + [np.max(y_train)] + bucket_means)

In [None]:
# Fully qualified option name: the bare 'max_rows' pattern can match more
# than one option on newer pandas.
pd.set_option('display.max_rows', 300)

# Build the per-bucket train table from the rows collected by the loop above.
# This line was commented out, so on a fresh kernel `ugr_train` was undefined.
ugr_train = pd.DataFrame(list_val, columns=['ugr', 'cnt', 'score_haku', 'score_hmd', 'score_holy', 'score_bear', 'max_target', 'mean_haku', 'mean_hmd', 'mean_holy', 'mean_bear'])

# Prefix every column except the join key 'ugr' with 'train_'.
ugr_train.columns = [f"train_{col}" if not col.count('ugr') else col  for col in ugr_train.columns]

In [None]:
# Bucket the feature by its value rounded to two decimals.
df_test['round_ugr'] = df_test['528__ugr_R_emaildomain_C1_C14_ratio_agg_V35_52_mean_mean'].round(2)
list_val = []

# Model suffixes, in the column order expected downstream.
model_names = ['haku', 'hmd', 'holy', 'bear']

# groupby yields the buckets in ascending order and skips NaN keys.
# Loop-local names are used so that the notebook-level `bear`, `cnt` and
# `tmp` objects are not overwritten. The former try/except had two identical
# branches, so it is dropped.
for val, df_bucket in tqdm(df_test.groupby('round_ugr')):

    y_test = df_bucket[COLUMN_TARGET].values
    bucket_means = [np.mean(df_bucket[f'pred_{name}'].values) for name in model_names]

    # Row layout: ugr, cnt, max_target, 4 means.
    list_val.append([val, df_bucket.shape[0], np.max(y_test)] + bucket_means)

In [None]:
# Per-bucket test table; every column except the join key 'ugr' receives a
# 'test_' prefix.
test_columns = ['ugr', 'cnt', 'max_target', 'mean_haku', 'mean_hmd', 'mean_holy', 'mean_bear']
ugr_test = pd.DataFrame(list_val, columns=test_columns).rename(
    columns=lambda col: col if 'ugr' in col else f"test_{col}"
)
ugr_test.head()

In [None]:
# Outer join on the bucket value, so buckets present in only one of the two
# tables are kept (with NaN on the other side).
df_ugr = pd.merge(ugr_train, ugr_test, how='outer', on='ugr')
df_ugr.head()

In [None]:
# Write the combined train/test bucket table, without the row index.
ugr_score_path = '../output/0928_ieee__528__ugr_R_emaildomain_C1_C14_ratio_agg_V35_52_mean_mean__round2__score.csv'
df_ugr.to_csv(ugr_score_path, index=False)

In [None]:
# 'DT-M', 'datetime' and the predicted user id are only added to `data` by
# the earlier cells; pull whichever of them df_train still lacks, by
# transaction id, so this cell does not depend on leftover kernel state.
cols_missing = [col for col in ['DT-M', 'datetime', col_bear] if col not in df_train.columns]
if cols_missing:
    df_train = df_train.merge(data[[COLUMN_ID] + cols_missing], how='left', on=COLUMN_ID)

# Every column whose name contains 'pred': the model predictions and the
# predicted user id. (This definition was commented out before.)
cols_pred = [col for col in df_train.columns if col.count('pred')]

# Rows of the bucket whose rounded feature value lies strictly between
# 0.959 and 0.961.
in_band = (0.959<df_train['round_ugr']) & (df_train['round_ugr']<0.961)
tmp = df_train.loc[in_band, [COLUMN_ID, 'DT-M', 'datetime', COLUMN_TARGET] + cols_pred]
tmp = tmp.sort_values(by=COLUMN_TARGET, ascending=False)
# Keep only the rows whose target is 1.
tmp = tmp[tmp[COLUMN_TARGET]==1]

# Predicted user ids that have at least one target==1 row in that bucket.
low_ids = tmp[col_bear].unique()

In [None]:
# All train rows of the users collected in low_ids, restricted to the id,
# time, target and prediction columns.
cols_low = [COLUMN_ID, 'DT-M', 'datetime', COLUMN_TARGET] + cols_pred
train_low = df_train.loc[df_train[col_bear].isin(low_ids), cols_low]

In [None]:
# Row and column count of the selected train rows (the per-user
# chronological sort below is disabled).
# train_low.sort_values(by=[col_bear, 'datetime'], inplace=True)
train_low.shape