In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from joblib import delayed, Parallel

In [2]:
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is kept when its path contains at least one of these
# substrings. The check runs on the whole path string (directories included),
# and the single-letter keys 'V', 'C', 'D' are case-sensitive.
# Selectors that were switched off in the original filter, kept for reference:
#   'uid_', 'uid2_t', 'uid3_t', 'uid4_t', 'uid5_t', 'card', 'addr', 'domain'
PATH_INCLUDE_KEYS = (
    COLUMN_DT,
    COLUMN_ID,
    COLUMN_TARGET,
    'V',
    'C',
    'D',
    'TransactionAmt',
    'Product',
)
# A feature file is dropped when its path contains any of these substrings,
# even if it matched an include key.
PATH_EXCLUDE_KEYS = ('fill', 'bin', '129', '130')


def _is_selected_feature_path(path):
    """Return True when `path` passes the feature-file filter.

    A path is selected when it contains any of PATH_INCLUDE_KEYS, or contains
    both '526' and 'mean', and it contains none of PATH_EXCLUDE_KEYS.
    The same rule is applied to train and test files so the two sets cannot
    drift apart.
    """
    included = (
        any(key in path for key in PATH_INCLUDE_KEYS)
        or ('526' in path and 'mean' in path)
    )
    excluded = any(key in path for key in PATH_EXCLUDE_KEYS)
    return included and not excluded


# Candidate files, collected in the same directory order for both splits.
train_paths = (
    glob('../feature/eda_base/*_train.gz')
    + glob('../feature/org_use/526*_train.gz')
    + glob('../feature/raw_use/ker__uid*_train.gz')
)
test_paths = (
    glob('../feature/eda_base/*_test.gz')
    + glob('../feature/org_use/526*_test.gz')
    + glob('../feature/raw_use/ker__uid*_test.gz')
)

train_paths = [path for path in train_paths if _is_selected_feature_path(path)]
test_paths = [path for path in test_paths if _is_selected_feature_path(path)]

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)
# Train rows come first, then test rows, with a fresh 0..n-1 index;
# `train_length` marks the boundary between the two.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
train_length = df_train.shape[0]

In [4]:
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
# Seconds subtracted from every timestamp (4 hours).
# NOTE(review): presumably a timezone correction — confirm.
DT_SHIFT_SECONDS = 14400

# Vectorized conversion: TransactionDT is treated as seconds elapsed since
# START_DATE. Unlike a row-wise timedelta(seconds=x), a missing value becomes
# NaT here instead of raising, so the fill below can actually take effect.
data['datetime'] = pd.Timestamp(startdate) + pd.to_timedelta(
    data['TransactionDT'] - DT_SHIFT_SECONDS, unit='s'
)
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, and fill with a Timestamp (not a datetime.date) so the column
# stays datetime-typed for the `.dt` accessor below.
data['datetime'] = data['datetime'].fillna(pd.Timestamp(2020, 1, 1))
data['date'] = data['datetime'].dt.date

# Registration date = transaction date shifted back by D1 days.
# Rows where `D1 < 999999` is False (values >= 999999, and NaN, which compares
# False) get an offset of 0, i.e. the transaction date itself.
# NOTE(review): 999999 looks like a sentinel for "missing" — confirm.
list_regist = []
for d, diff in tqdm(data[['date', 'D1']].values):
    days_offset = -1 * diff if diff < 999999 else 0
    list_regist.append(str(date_add_days(d, days_offset)))

data['Regist_date'] = list_regist

100%|██████████| 1097231/1097231 [00:07<00:00, 143022.20it/s]


In [12]:
# Saved predictions for all rows (train followed by test); the second column
# is taken positionally.
# NOTE(review): assumes the file's row order matches `data` — confirm.
PRED_PATH = '../output/pred_result/20190927_0221__CV0-9594791704263358__all_preds.gz'
data['pred'] = read_pkl_gzip(PRED_PATH).iloc[:, 1].values

# Explicit copies: later cells add columns to `train`, which on a plain
# `iloc` slice of `data` triggers SettingWithCopyWarning and has
# version-dependent behavior.
train = data.iloc[:train_length].copy()
test = data.iloc[train_length:].copy()

In [13]:
# Distinct registration dates seen in train, in value_counts() order
# (most frequent first). Note this rebinds `list_regist`, which earlier held
# the per-row list of registration-date strings.
regist_date_counts = train['Regist_date'].value_counts()
list_regist = regist_date_counts.index

In [26]:
from sklearn.metrics import roc_auc_score

# Split train by registration date once. The previous approach built a boolean
# mask over the whole frame for every date, i.e. one full scan per date.
rows_by_regist = train.groupby('Regist_date')

# One record per registration date: AUC of `pred` against the target, mean and
# std of `pred`, and the number of rows. Iterating `list_regist` (rather than
# the groupby itself) keeps the rows in the same order as before.
list_df = []
for rd in tqdm(list_regist):
    group = rows_by_regist.get_group(rd)
    try:
        score = roc_auc_score(group[COLUMN_TARGET].values, group['pred'].values)
    except ValueError:
        # Raised when AUC cannot be computed for this group (e.g. only one
        # class present); record the score as missing.
        score = np.nan
    list_df.append([rd, score, group['pred'].mean(), group['pred'].std(), group.shape[0]])

df = pd.DataFrame(list_df, columns=['Regist_date', 'score', 'avg', 'std', 'cnt'])


  0%|          | 0/792 [00:00<?, ?it/s][A
  0%|          | 1/792 [00:00<01:29,  8.80it/s][A
  0%|          | 3/792 [00:00<01:26,  9.07it/s][A
  1%|          | 4/792 [00:00<01:27,  8.99it/s][A
  1%|          | 6/792 [00:00<01:23,  9.40it/s][A
  1%|          | 8/792 [00:00<01:15, 10.44it/s][A
  1%|▏         | 10/792 [00:00<01:08, 11.38it/s][A
  2%|▏         | 12/792 [00:01<01:07, 11.47it/s][A
  2%|▏         | 14/792 [00:01<01:03, 12.23it/s][A
  2%|▏         | 16/792 [00:01<01:03, 12.14it/s][A
  2%|▏         | 18/792 [00:01<01:00, 12.77it/s][A
  3%|▎         | 20/792 [00:01<01:01, 12.64it/s][A
  3%|▎         | 22/792 [00:01<00:58, 13.24it/s][A
  3%|▎         | 24/792 [00:01<00:55, 13.77it/s][A
  3%|▎         | 26/792 [00:02<00:55, 13.68it/s][A
  4%|▎         | 28/792 [00:02<00:53, 14.18it/s][A
  4%|▍         | 30/792 [00:02<00:54, 14.06it/s][A
  4%|▍         | 32/792 [00:02<00:54, 13.95it/s][A
  4%|▍         | 34/792 [00:02<00:53, 14.26it/s][A
  5%|▍         | 36/792 [

In [42]:
# Number of test rows per registration date, as a two-column frame.
tmp = (
    test['Regist_date']
    .value_counts()
    .rename_axis('Regist_date')
    .reset_index(name='test_cnt')
)
# Order the train-side summary by date, then attach the test counts. The outer
# join keeps dates that appear on only one side.
df = df.sort_values(by='Regist_date').merge(tmp, how='outer', on='Regist_date')

In [45]:
# Write the per-registration-date summary to disk without the index column.
regist_score_csv = '../output/0927_ieee__EDA__Regist_date_Score.csv'
df.to_csv(regist_score_csv, index=False)

In [49]:
# Transaction-date distribution of the train rows registered on 2017-12-23.
is_regist_1223 = train['Regist_date'] == '2017-12-23'
tmp = train.loc[is_regist_1223]
tmp['date'].value_counts()

2017-12-23    4680
2017-12-24      70
2017-12-30      34
2017-12-25      31
2018-01-11      27
2017-12-26      26
2018-01-06      26
2017-12-28      24
2018-01-13      21
2018-01-04      20
2017-12-27      20
2018-01-20      20
2018-01-01      18
2018-01-02      15
2018-02-03      15
2018-01-27      15
2018-01-10      14
2017-12-31      14
2018-01-07      14
2018-02-23      13
2018-01-09      13
2018-03-14      13
2018-01-18      13
2018-02-16      11
2018-03-31      11
2018-01-12      11
2018-01-28      11
2018-01-03      10
2018-03-03      10
2017-12-29       9
              ... 
2018-03-19       3
2018-01-22       3
2018-05-16       3
2018-03-16       3
2018-01-25       3
2018-04-24       2
2018-05-19       2
2018-03-13       2
2018-02-15       2
2018-05-06       2
2018-04-09       2
2018-03-18       2
2018-04-21       2
2018-03-10       2
2018-04-27       2
2018-04-15       2
2018-05-18       2
2018-05-14       2
2018-02-14       2
2018-05-23       2
2018-03-12       2
2018-05-20  

In [51]:
# Load the train-side values of feature 532 and show how often each value occurs.
feature_532_train_path = '../submit/re_sub/532__ugr_uid2_ProductCD_Regist_date_D8_agg_V242_263_mean_mean_train.gz'
regist = read_pkl_gzip(feature_532_train_path)
pd.Series(regist).value_counts()

1.000000      71487
0.882353      10819
0.000000       7192
0.823529       3818
1.176471       2866
0.941176       2762
1.235294       1891
1.058824       1793
1.470588       1463
1.117647       1413
1.352941       1331
0.911765       1064
0.500000        822
0.960784        755
0.705882        725
0.970588        714
1.411765        688
1.647059        482
1.294118        467
1.823529        455
0.666667        448
1.588235        442
1.088235        420
0.764706        417
0.980392        401
1.529412        396
1.705882        373
0.955882        358
0.852941        357
1.029412        356
              ...  
29.352942         1
8.529411          1
9.823529          1
16.176470         1
9.705882          1
89.000000         1
0.605042          1
15.529411         1
11.882353         1
32.941177         1
9.294118          1
29.705883         1
8.000000          1
5.800000          1
12.294118         1
196.500000        1
27.294117         1
161.500000        1
11.294118         1


In [52]:
# Reload feature 532 (train part) and attach it to `train` as column '532'.
# NOTE(review): assumes the stored values line up row-for-row with `train` — confirm.
feature_532_train_path = '../submit/re_sub/532__ugr_uid2_ProductCD_Regist_date_D8_agg_V242_263_mean_mean_train.gz'
regist = read_pkl_gzip(feature_532_train_path)
train['532'] = regist

In [58]:
# Cut-off that turns the model score into a 0/1 flag.
# NOTE(review): the origin of 0.023 is not shown in this notebook — confirm.
PRED_THRESHOLD = 0.023
train['bi'] = (train['pred'] > PRED_THRESHOLD) * 1

# For every distinct value of feature '532': share of rows flagged by `bi`,
# and number of rows carrying that value.
bi_rate_by_value = train.groupby('532')['bi'].mean().rename_axis(None)
# Pin the names explicitly: pandas >= 2.0 names the value_counts() result
# 'count' and moves the column name onto the index, which would make
# `by='532'` below stop referring to the counts column.
rows_by_value = train['532'].value_counts().rename('532').rename_axis(None)

# Columns: 'bi' (flag rate) and '532' (row count), most frequent values first.
acc_cnt = pd.concat([bi_rate_by_value, rows_by_value], axis=1).sort_values(
    by='532', ascending=False
)

In [87]:
# Ad-hoc inspection cell. Only the final expression is rendered by the
# notebook; the two expressions before it are evaluated and discarded.
# Alternative window used previously: 1.470588 < '532' < 1.47059

# Registration dates of the rows whose '532' value lies strictly inside a
# narrow window around 1.647059.
in_532_window = (train['532'] < 1.6470591) & (train['532'] > 1.6470589)
train[in_532_window]['Regist_date'].value_counts()

# Share of flagged rows per registration date.
train.groupby('Regist_date')['bi'].mean()

# Sum of the target column per registration date (this is what gets displayed).
train.groupby('Regist_date')[COLUMN_TARGET].sum()

Regist_date
2016-03-08      0.0
2016-03-10      0.0
2016-03-11      0.0
2016-03-16      2.0
2016-03-17      0.0
2016-03-18      0.0
2016-03-20      0.0
2016-03-21      0.0
2016-03-22      0.0
2016-03-24      0.0
2016-03-25      0.0
2016-03-28      0.0
2016-03-30      0.0
2016-03-31      0.0
2016-04-01      0.0
2016-04-02      0.0
2016-04-04      0.0
2016-04-05      0.0
2016-04-06      0.0
2016-04-07      0.0
2016-04-08      0.0
2016-04-09      0.0
2016-04-10      0.0
2016-04-12      0.0
2016-04-13      0.0
2016-04-14      0.0
2016-04-15      0.0
2016-04-16      0.0
2016-04-17      0.0
2016-04-18      0.0
2016-04-19      0.0
2016-04-20      0.0
2016-04-22      0.0
2016-04-23      0.0
2016-04-24      0.0
2016-04-25      0.0
2016-04-26      0.0
2016-04-27      0.0
2016-04-28      0.0
2016-04-30      0.0
2016-05-02      0.0
2016-05-03      0.0
2016-05-04      0.0
2016-05-05      0.0
2016-05-06      0.0
2016-05-07      0.0
2016-05-08      0.0
2016-05-10      0.0
2016-05-11      0.0
2016-05-

In [71]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
pd.set_option('max_rows', 500)

# plt.figure(figsize=(20, 12))
# sns.lineplot(data=acc_cnt[acc_cnt['bi']<0.6], x='532', y='bi')

In [85]:
# Display the per-'532'-value table (flag rate 'bi' and row count '532').
acc_cnt

Unnamed: 0,bi,532
1.000000,0.085050,71487
0.882353,0.097976,10819
0.000000,0.208843,7192
0.823529,0.094290,3818
1.176471,0.224703,2866
0.941176,0.135047,2762
1.235294,0.367002,1891
1.058824,0.162856,1793
1.470588,0.505126,1463
1.117647,0.231423,1413
