In [1]:
import os
import gc
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
#import pandas_profiling
import matplotlib
import warnings
matplotlib.style.use('ggplot')
warnings.filterwarnings("ignore")
from IPython.display import display
import random
import time
from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit, GridSearchCV

## Load Data

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

df = train.merge(test,how='outer')
df = df.sort_values(by='locdt')
del train,test

In [4]:
df=reduce_mem_usage(df)
gc.collect()

Mem. usage decreased to 159.39 Mb (55.2% reduction)


62

### EDA

In [5]:
# one times report 
#train_profiling = pandas_profiling.ProfileReport(train)
#test_profiling = pandas_profiling.ProfileReport(test)
#train_profiling.to_file('train_profiling.html')
#test_profiling.to_file('test_profiling.html')

In [6]:
# NA ratio
print( "NA\n",'--'*20 )
print( (df.isnull().sum()[df.isnull().sum()>0]) /df.shape[0] )

NA
 ----------------------------------------
flbmk        0.008385
flg_3dsmk    0.008385
fraud_ind    0.216967
dtype: float64


In [7]:
# Unique value (all dataset)
#print(f'train data size: {df.shape[0]}\n----------------------')
#print('Unique Value\n----------------------')
#for i in df.columns:
#    if len(df[i].unique()) < 12:
#        print('{}:{}     | {}'.format(i, len(df[i].unique()), df[i].unique()))
#    else:
#        print('{}: {} '.format(i, len(df[i].unique())))

In [8]:
# Unique value (test)
#print(
#    f'test data size: {df[df.fraud_ind.isnull()].shape[0]}\n----------------------'
#)
#print('Unique Value\n----------------------')
#for i in df.columns:
#    if len(df[df.fraud_ind.isnull()][i].unique()) < 12:
#        print('{}:{}     | {}'.format(
#            i, len(df[df.fraud_ind.isnull()][i].unique()),
#            df[df.fraud_ind.isnull()][i].unique()))
#    else:
#        print('{}: {} '.format(i, len(df[df.fraud_ind.isnull()][i].unique())))

In [9]:
# relation between covariate variable and response variable
#exceptlist = []
#for k in df.columns:
#    if (len(df[k].unique()) >7)  & (df.dtypes!='object')[k] & (k!='fraud_ind') :
#        try:
#            fig = plt.figure(figsize=(16,9))
#            ax1 = fig.add_subplot(221)
#            ax2 = fig.add_subplot(222)
#            sns.distplot(df[k][df.fraud_ind==0],label='0',ax=ax1)
#            sns.distplot(df[k][df.fraud_ind==1],label='1',ax=ax1)
#            ax1.legend()
#            ax1.set_title(k+' (train)',zcolor='r')
#            sns.distplot(df[k][df.fraud_ind.isnull()],label='test',ax=ax2)
#            sns.distplot(df[k][df.fraud_ind.notnull()],label='train',ax=ax2)
#            ax2.legend()
#            ax2.set_title(k,color='r')
#        except:
#            print('{}: type error -- {}'.format(k,df[k].dtypes))
#    elif k=='fraud_ind':
#        continue
#    else:
#        exceptlist.append(k)

In [10]:
#for k in exceptlist:
#    display(pd.crosstab(df[df.fraud_ind.notnull()][k],df['fraud_ind'], margins=True, margins_name="Total" ).apply(lambda r: r/df[df.fraud_ind.notnull()].shape[0] , axis=1))
#    #display(pd.crosstab(df[df.fraud_ind.notnull()][k],df['fraud_ind']))
#    print('--'*20)

In [11]:
#viplist = ['cano','conam','csmcu','etymd','locdt','loctm','mcc','mchno','txkey']
#for k in viplist:
#    if len(df[k].unique() ) < 20:
#        
#        display(pd.crosstab(df[df.fraud_ind.notnull()][k],df['fraud_ind']))
#        print('--'*20)

In [12]:
#df[df.fraud_ind.notnull()][['locdt','loctm']].sort_values(by=['locdt','loctm'])
#display(df[df.fraud_ind.notnull()][['locdt','loctm']].describe().round(3))
#display(df[df.fraud_ind.isnull()][['locdt','loctm']].describe().round(3))

In [13]:
#df[['bacno','acqic','conam','ecfg','locdt','loctm','mcc','mchno','txkey','fraud_ind']]\
#    [df.cano==195350].sort_values(by=['bacno','loctm','locdt'])#\
#    #.groupby('ecfg').fraud_ind.sum()

#target_value = 38818
#print('---- fraud_ind count ----')
#display(df[df.bacno==target_value].sort_values(by=['bacno','loctm','locdt'])\
#        .groupby(['cano','ecfg','stscd','acqic']).fraud_ind.sum())
#        #.groupby(['cano','ecfg','stscd','acqic']).fraud_ind.mean())
#print('--'*20)
#df['count_1'] = 1
#print('---- count_1 ----')
#display(df[df.bacno==target_value].sort_values(by=['bacno','loctm','locdt'])\
#        .groupby(['cano','ecfg','stscd','acqic']).count_1.sum())

In [14]:
#target_columns = 'stocn'
#
#analysis = \
#pd.DataFrame({target_columns:df[df.fraud_ind.notnull()].groupby(target_columns).fraud_ind.mean().index,
#              'y_mean':df[df.fraud_ind.notnull()].groupby(target_columns).fraud_ind.mean(),
#              'y_count':(df[df.fraud_ind.notnull()].groupby(target_columns).fraud_ind.sum()).astype('int'),
#              'size_1':df[df.fraud_ind.notnull()].groupby(target_columns).count_1.sum()
#             }).sort_values(by=['size_1','y_mean'],ascending=False)

In [15]:
#analysis.sort_values(by=['y_mean'],ascending=False)

### Feature Extending

In [16]:
# Check train & test different
print('train cano size: ', len(df[df.fraud_ind.notnull()].cano.unique()))
print('test cano size : ', len(df[df.fraud_ind.isnull()].cano.unique()))
print('both cano size : ',len(set(df[df.fraud_ind.notnull()].cano) & \
                              set(df[df.fraud_ind.isnull()].cano)) )
print('diff cano rate : ',len(set(df[df.fraud_ind.notnull()].cano).\
                              difference(set(df[df.fraud_ind.isnull()].cano))) /len(df[df.fraud_ind.notnull()].cano) )
print('--' * 20)
print('train bacno size: ', len(df[df.fraud_ind.notnull()].bacno.unique()))
print('test bacno size : ', len(df[df.fraud_ind.isnull()].bacno.unique()))
print('both bacno size : ',len(set(df[df.fraud_ind.notnull()].bacno) & \
                              set(df[df.fraud_ind.isnull()].bacno)) )
print('diff bacno rate : ',len(set(df[df.fraud_ind.notnull()].bacno).\
                              difference(set(df[df.fraud_ind.isnull()].bacno)))/len(df[df.fraud_ind.notnull()].bacno) )

train cano size:  129413
test cano size :  86808
both cano size :  3763
diff cano rate :  0.08256740266541901
----------------------------------------
train bacno size:  95214
test bacno size :  71099
both bacno size :  3128
diff bacno rate :  0.06051175361597911


In [17]:
# Check train & test different
#target_mean_feature = ['csmcu','etymd','mcc','mchno','scity','stocn','stscd']
#
#for k in target_mean_feature:
#    print('train {} size: '.format(k),len(df[df.fraud_ind.notnull()][k]))
#    print('test {} size : '.format(k),len(df[df.fraud_ind.isnull()][k]))
#    print('both {} size : '.format(k),len(set(df[df.fraud_ind.notnull()][k]) & \
#                                  set(df[df.fraud_ind.isnull()][k])) )
#    print('diff {} rate : '.format(k),len(set(df[df.fraud_ind.notnull()][k]).\
#                                  difference(set(df[df.fraud_ind.isnull()][k]))) /\
#                                  len(df[df.fraud_ind.notnull()][k]) )
#    print('--'*20)

In [19]:
# transfor data
df['cano_diff'] = df.groupby('cano').locdt.shift(0) - df.groupby('cano').locdt.shift(1)
df['bacno_diff'] = df.groupby('bacno').locdt.shift(0) - df.groupby('bacno').locdt.shift(1)

df['locdt_dtran'] = df['locdt'] % 7
df['locdt_mtran'] = df['locdt'] % 30
df['loctm'] = df['loctm'] // 10000 + (df['loctm'] -
                                      (df['loctm'] // 10000) * 10000) / 6000

end = np.array(sorted(df[df.locdt == 120].txkey.values))
df['magic_txkey'] = df['txkey'].apply(lambda x: np.where(x <= end)[0][0])
# quantitle
df['magic_quant'] = df.groupby('magic_txkey').txkey.rank() / df['magic_txkey'].map(df['magic_txkey'].value_counts())
df['magic_tm'] = df.magic_txkey.map(df.groupby('magic_txkey').fraud_ind.mean())
cat_feature = [x for x in df.columns if x not in ['conam', 'loctm', 'txkey', 'fraud_ind', 'magic_quant', 'magic_tm']]

# Check train category not use in test
print(f'Before: {df.shape}')
for i in set(cat_feature).difference(['cano', 'locdt', 'bacno']):
    if len(
            set(df[df.fraud_ind.notnull()][i].unique()).difference(
                set(df[df.fraud_ind.isnull()][i].unique()))) > 0:
        drop_target = list(
            set(df[df.fraud_ind.notnull()][i].unique()).difference(
                set(df[df.fraud_ind.isnull()][i].unique())))
        print(f'{i}: {len(drop_target)}')
        df = df.drop(df[df[i].isin(drop_target)].index)
print(f'After: {df.shape}')

Before: (1943452, 30)
acqic: 3118
cano_diff: 1
csmcu: 10
bacno_diff: 2
scity: 3428
mcc: 62
mchno: 51066
After: (1788693, 30)


In [20]:
#target_mean_feature = ['bacno','cano','acqic','csmcu','etymd','mcc','mchno','scity','stocn','stscd']
target_mean_feature = ['csmcu', 'etymd', 'mcc', 'stocn', 'magic_txkey']
onehot_feature = [
    'contp', 'flbmk', 'ecfg', 'flg_3dsmk', 'hcefg', 'insfg', 'ovrlt', 'stscd',
    'locdt_dtran', 'iterm'
]
freq_feature = [
    'csmcu', 'etymd', 'mcc', 'mchno', 'acqic', 'bacno', 'cano', 'scity',
    'stocn'
]

for k in target_mean_feature:
    df[k + '_tm'] = df[k].map(df.groupby(k).fraud_ind.mean())
    df[k + '_ts'] = df[k].map(df.groupby(k).fraud_ind.std()).fillna(0)
    df[k + '_keym'] = df[k].map(df.groupby(k).txkey.mean())
    df[k + '_keys'] = df[k].map(df.groupby(k).txkey.std()).fillna(0)

for k in freq_feature:
    df[k + '_f'] = df[k].map(df[k].value_counts(normalize=True))

for k in onehot_feature:
    add_dumy = pd.get_dummies(df[k])
    add_dumy.columns = [k + "_{}".format(x) for x in add_dumy.columns]
    if add_dumy.shape[0] < 2:
        add_dumy = add_dumy.iloc[:, 0]
    df = pd.concat([df, add_dumy], axis=1)
# fix loctm
df['loctm'] = df['loctm'] // 10000 + (df['loctm'] -
                                      (df['loctm'] // 10000) * 10000) / 6000

In [21]:
# conam 相關
for k in cat_feature:
    df[k + "_conam_min"] = df[k].map(df.groupby(k).conam.min())
    df[k + "_conam_max"] = df[k].map(df.groupby(k).conam.max())
    df[k + "_conam_med"] = df[k].map(df.groupby(k).conam.median())
    df[k + "_conam_mean"] = df[k].map(df.groupby(k).conam.mean())
    df[k + "_conam_std"] = df[k].map(df.groupby(k).conam.std()).fillna(0)
    df[k + "_conam_mean_ratio"] = df.conam / df[k + "_conam_mean"]
    df[k + "_conam_std_ratio"] = df.conam / df[k + "_conam_std"]
    del df[k+"_conam_mean"] ,df[k+"_conam_std"]

# txkey 相關
for k in cat_feature:
    df[k + "_txkey_min"] = df[k].map(df.groupby(k).magic_txkey.min())
    df[k + "_txkey_max"] = df[k].map(df.groupby(k).magic_txkey.max())
    df[k + "_txkey_med"] = df[k].map(df.groupby(k).magic_txkey.median())
    df[k + "_txkey_mean"] = df[k].map(df.groupby(k).magic_txkey.mean())
    df[k + "_txkey_std"] = df[k].map(df.groupby(k).magic_txkey.std()).fillna(0)
    df[k + "_txkey_mean_ratio"] = df.conam / df[k + "_txkey_mean"]
    df[k + "_txkey_med_ratio"] = df.conam / df[k + "_txkey_med"]
    df[k + "_txkey_std_ratio"] = df.conam / df[k + "_txkey_std"]
    del df[k+"_txkey_mean"] ,df[k+"_txkey_std"],df[k+"_txkey_med"]

df['digital'] = (df.conam - df.conam.astype('int')) * 1000

In [22]:
# Might be overfit feature
df['test1'] = df.cano.map(df.groupby('cano').magic_txkey.median())
df['test2'] = df.bacno.map(df.groupby('bacno').magic_txkey.median())
df['test5'] = df.sort_values(by=['cano', 'locdt']).groupby('cano').locdt.diff()

# Kaggle feature
#df['mean_last'] = df['conam'] - df.groupby('cano')['conam'].transform(lambda x: x.rolling(5, 1).mean())
#df['min_last'] = df.groupby('cano')['conam'].transform(lambda x: x.rolling(5, 1).min())
#df['max_last'] = df.groupby('cano')['conam'].transform(lambda x: x.rolling(5, 1).max())
#df['std_last'] = df['mean_last'] / df.groupby('cano')['conam'].transform(lambda x: x.rolling(5, 1).std())
#df['count_last'] = df.groupby('cano')['conam'].transform(lambda x: x.rolling(10, 1).count())
#df['mean_last'].fillna(0, inplace=True)
#df['std_last'].fillna(0, inplace=True)

# card use ratio 
df['use_card'] = df['cano'].map(df['cano'].value_counts() )/ df['bacno'].map(df['bacno'].value_counts() )

### TS ###
# csmcu target mean  (train:72 , all:76)
df['csmcu_tm1'] = df.csmcu.map(df.groupby('csmcu').fraud_ind.mean())
df['firstcard_fraud']= df.cano.map(df.sort_values(by='locdt').groupby('cano').fraud_ind.first())

df['cano_csmcu_count'] = df.cano.map(df.groupby('cano').csmcu.count())
temp2 = df.groupby('cano').csmcu.value_counts(normalize=True).reset_index(name='cano_csmcu_valuecount')
df = df.merge(temp2,how='left',on=['cano','csmcu'])

In [None]:
temp1 = df.groupby(['insfg', 'iterm']).fraud_ind.mean().reset_index()
temp1.columns.values[2] = 'insfg_iterm_tm'
df = df.merge(temp1, how='left')
df = reduce_mem_usage(df)
del temp1, temp2, add_dumy

In [None]:
#df.to_csv('df.csv',index=False)

### Try EDA

In [None]:
# find valid set split method
#sns.distplot(df[df.fraud_ind==1].locdt,color='r')
#sns.distplot(df[df.fraud_ind==0].locdt,color='b')

In [None]:
#sns.distplot(df[df.fraud_ind.notnull()].locdt,color='b')
#sns.distplot(df[df.fraud_ind.isnull()].locdt,color='r')

In [None]:
# train and test same cano
#cheat_cano = set(df.cano[df.fraud_ind.notnull()]) & set(df.cano[df.fraud_ind.isnull()])

In [None]:
#temp1 = df[df.cano.isin(cheat_cano)].sort_values(by=['cano','loctm','locdt'])[['bacno','cano','conam','loctm','locdt','fraud_ind']]
#temp1[temp1.bacno.isin(temp1.bacno[temp1.fraud_ind==1].unique())]

In [None]:
#sns.countplot( df.stocn[(df.fraud_ind==1) &(df.stocn<50) ],color='r' )

In [None]:
#mat1 = np.zeros([len(df[df.fraud_ind.notnull()].stocn.unique()),5])
#count=0
#for i in df[df.fraud_ind.notnull()].stocn.unique():
#    mat1[count,0] = i
#    mat1[count,1] = len(df.stocn[(df.fraud_ind==1) &(df.stocn==i)]  )
#    mat1[count,2] = len(df.stocn[(df.fraud_ind==0) &(df.stocn==i)]  )
#    mat1[count,3] = len(df.stocn[(df.fraud_ind.isnull()) &(df.stocn==i)]  )
#    mat1[count,4] = len(df.stocn[(df.fraud_ind==1) &(df.stocn==i)]  )/\
#        len(df.stocn[(df.stocn==i) & (df.fraud_ind.notnull())]  )
#    count+=1
#
#mat1 = pd.DataFrame(mat1)
#mat1.columns = ['stocn','isfraud','nofraud','testset_size','ratio']
#mat1.sort_values(by=['testset_size'],ascending=False)

In [None]:
print(f'test set  size : {df[df.fraud_ind.isnull()].shape[0] }')
print(f'test size bcno : {len(df[df.fraud_ind.isnull()].bacno.unique())}')
print(f'test size cano : {len(df[df.fraud_ind.isnull()].cano.unique())}')

### Model

In [None]:
#import lightgbm as lgb
from sklearn.metrics import f1_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV 
from xgboost.sklearn import XGBClassifier

In [None]:
#feature = [x for x in df.columns if x not in \
#            onehot_feature + freq_feature+['count_1','fraud_ind','locdt','txkey']]
#X = df[df.fraud_ind.notnull()][feature]
#y = df[df.fraud_ind.notnull()]['fraud_ind'].values.astype('int') 

In [None]:
# Do oversampling
#new_X = X.copy()
#new_y = y.copy()
#
#add_num = (y==1).sum()
#for i in range(74):
#    new_X = pd.concat([new_X,X[y==1] ],axis=0)
#    new_y = np.append(new_y,[1]*add_num )
#    print(f'{i+1}/74',end='\r')
#print('Oversampling OK ....')

In [None]:
def split_data(method=1):
    # random split (X)
    if method == 1:
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, stratify=y,random_state = 123)
    # by cano (O)
    elif method == 2:
        q_index = df[df.fraud_ind.notnull()].groupby('cano').fraud_ind.mean()
        q_index[q_index>0] = 1
        tt_idx = random.choices(q_index[q_index==0].index , k = np.int_(len(q_index[q_index==0])*0.8)) +\
            random.choices(q_index[q_index==1].index , k = np.int_(len(q_index[q_index==1])*0.8)) 
        vv_idx = list(set(q_index.index).difference(tt_idx))
        X_train = df[(df.fraud_ind.notnull()) & (df.cano.isin(tt_idx)) ][feature]
        y_train = df[(df.fraud_ind.notnull()) & (df.cano.isin(tt_idx)) ]['fraud_ind'].values.astype('int') 
        X_test = df[(df.fraud_ind.notnull()) & (df.cano.isin(vv_idx)) ][feature]
        y_test = df[(df.fraud_ind.notnull()) & (df.cano.isin(vv_idx)) ]['fraud_ind'].values.astype('int') 
    # by locdt day (X)
    elif method==3:
        tt_idx = []
        for i in range(1,91):
            num = df[df.locdt==i].shape[0]
            cand = random.sample( list(df[df.locdt==i].index) , np.int_(num*0.8) )
            tt_idx+= cand
        vv_idx = list(set(df[df.fraud_ind.notnull()].index).difference(tt_idx))
        X_train = df[(df.fraud_ind.notnull()) & (df.index.isin(tt_idx)) ][feature]
        y_train = df[(df.fraud_ind.notnull()) & (df.index.isin(tt_idx)) ]['fraud_ind'].values.astype('int') 
        X_test = df[(df.fraud_ind.notnull()) & (df.index.isin(vv_idx)) ][feature]
        y_test = df[(df.fraud_ind.notnull()) & (df.index.isin(vv_idx)) ]['fraud_ind'].values.astype('int') 
    # sliding windows
    elif method==4: 
        X_train = df[(df.fraud_ind.notnull()) & (df.locdt<=66) ][feature]
        y_train = df[(df.fraud_ind.notnull()) & (df.locdt<=66) ]['fraud_ind'].values.astype('int') 
        X_test = df[(df.fraud_ind.notnull()) & (df.locdt>66) ][feature]
        y_test = df[(df.fraud_ind.notnull()) & (df.locdt>66) ]['fraud_ind'].values.astype('int') 
        
    return X_train, X_test, y_train, y_test

In [None]:
def model_(x_train,y_train,x_test,y_test,boost_type='lgb'):
    tStart = time.time()
    if boost_type=='lgb':
        model = lgb.LGBMClassifier(
            boosting_type='gbdt',
            objective='binary',
            learning_rate=0.01, 
            n_estimators= 9000, 
            max_depth = 8, 
            min_child_weight = 5,       
            scale_pos_weight = 9, # refer: 70
            subsample = 0.7,
            colsample_bytree = 0.7,
            subsample_freq =1,
            n_jobs=-1)
        
    elif boost_type=='xgb':
        model = XGBClassifier(
            learning_rate = 0.03 , 
            tree_method = 'gpu_hist',
            n_estimators=9000, 
            max_depth=12,
            min_child_weight=12, 
            gamma=0, 
            subsample=0.8, 
            colsample_bytree=0.8,
            objective= 'binary:logistic', 
            nthread=-1, 
            scale_pos_weight=75, 
            seed=599
            )
        
    print('Start training ...')
    model.fit(x_train,y_train)
    yp_train = model.predict_proba(x_train)[:,1]
    yp_valid = model.predict_proba(x_test)[:,1]
    print(f'Use time: { np.int_((time.time()-tStart)/60)  } mins\nCaluate prob ...')
    
    ## probability tune
    mat = np.zeros([5,100])
    for threshold in range(100):
        y_pred_train = np.int_( yp_train > threshold*0.01)
        y_pred_valid = np.int_( yp_valid > threshold*0.01)
        mat[0,threshold] = round(threshold*0.01,2)
        mat[1,threshold] = f1_score(y_train,y_pred_train)
        mat[2,threshold] = f1_score(y_test,y_pred_valid) 
        mat[3,threshold] = (y_train==y_pred_train).mean()
        mat[4,threshold] = (y_test==y_pred_valid).mean()
        
    # Fig1 for F1
    sns.pointplot( x= mat[0,:],y= mat[1,:],color='r')
    sns.pointplot( x= mat[0,:],y= mat[2,:],color='b')
    plt.title(f'{boost_type} F1 performance',color='r')
    plt.show()
    
    # Fig2 for acc
    sns.pointplot( x= mat[0,10:],y= mat[3,10:],color='r')
    sns.pointplot( x= mat[0,10:],y= mat[4,10:],color='b')
    plt.title(f'{boost_type} Acc performance',color='r')
    plt.show()
    print('--'*20)
    
    # reult for best probalility
    best_prob = round(np.argmax(mat[2,:])*0.01,2)
    print('Valid Result:\nprob: {}, F1 : {}, acc : {}'.\
          format(best_prob,max(mat[2,:]).round(3), mat[4,:][np.argmax(mat[2,:])].round(3)))
    print('--'*20)
    
    # confusion matrix
    y_pred_train = np.int_( yp_train > best_prob) 
    y_pred_valid = np.int_( yp_valid > best_prob) 
    print('Train confusion matrix')
    display(pd.crosstab(y_train, y_pred_train,margins=True, margins_name="Total" ))
    print('--'*20)
    print('Valid confusion matrix')
    display(pd.crosstab(y_test,y_pred_valid,margins=True, margins_name="Total" ))
    print('--'*20)
    
    print('Feature Importance (Top 10)')
    display(pd.DataFrame({'feature':feature,'gain':model.feature_importances_}).\
        sort_values(by='gain',ascending=False).iloc[0:10,:])
    
    return model,best_prob

In [None]:
# Test ADASYN algorithm
#from sklearn.preprocessing import StandardScaler
#from imblearn.over_sampling import ADASYN
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, stratify=y,random_state = 88)
#std_scale = StandardScaler()
#X_train_scaled = std_scale.fit_transform(X_train)
#X_test_scaled = std_scale.transform(X_test)
#adasyn = ADASYN(random_state=88)
#X_adasyn, y_adasyn = adasyn.fit_resample(X_train_scaled, y_train)

In [None]:
#display(pd.DataFrame({'feature':feature,'gain':model_xgb.feature_importances_}).\
#    sort_values(by='gain',ascending=False))

In [None]:
#df_clone = df[(df.fraud_ind.notnull())].copy().sort_values(by='locdt').reset_index()
#myCViterator = [] 
#for train, test in TimeSeriesSplit(n_splits=5).split(df_clone.index):
#    myCViterator.append( (train, test))
#
#gc.collect()
# Grid search
param_test1 = {
    'max_depth':list(range(12,14,1)),
    'min_child_weight':list(range(12,13,1)),
    'scale_pos_weight':list(range(20,80,5))
    }
gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate = 0.03 , 
                                                  n_estimators=2000, 
                                                  max_depth=5,
                                                  tree_method = 'gpu_hist',
                                                  n_gpus= 1,
                                                  min_child_weight=1, gamma=0, 
                                                  subsample=0.8, 
                                                  colsample_bytree=0.8,
                                                  objective= 'binary:logistic', nthread=-1, 
                                                  scale_pos_weight=1, 
                                                  seed=27), 
                        param_grid = param_test1, scoring='f1',iid=False, cv=myCViterator,verbose=3)
gsearch1.fit( df_clone[feature],df_clone['fraud_ind'].values.astype('int'))
print(gsearch1.best_params_)
#{'max_depth': 12, 'min_child_weight': 11, 'scale_pos_weight': 70}

### CV result

In [None]:
feature = [x for x in df.columns if x not in \
            onehot_feature + freq_feature+['count_1','fraud_ind','locdt','locdt_tran','iterm']]
X = df[df.fraud_ind.notnull()][feature]
y = df[df.fraud_ind.notnull()]['fraud_ind'].values.astype('int')

In [None]:
X_train, X_test, y_train, y_test = split_data(method = 4 )

In [None]:
gc.collect()
final_model,best_prob = model_(x_train=X_train,y_train=y_train,x_test=X_test,y_test=y_test,boost_type='xgb')

### Submit

In [None]:
#final_model=model = XGBClassifier(
#                        learning_rate = 0.03 , 
#                        tree_method = 'gpu_hist',
#                        n_estimators=5000, 
#                        max_depth=9,
#                        min_child_weight=1, 
#                        gamma=0, 
#                        subsample=0.8, 
#                        colsample_bytree=0.8,
#                        objective= 'binary:logistic', 
#                        nthread=-1, 
#                        scale_pos_weight=11, 
#                        seed=27
#                        )

In [None]:
# final model
gc.collect()
tStart = time.time()
final_model.fit(X,y)
#final_model.fit(new_X,new_y)
print(f'Use time: { np.int_((time.time()-tStart)/60)  } mins')

In [None]:
y_prob = final_model.predict_proba( df[df.fraud_ind.isnull()][feature])[:,1]
sns.distplot(y_prob)

In [None]:
submit = pd.read_csv('submission_test.csv')
Mapping = pd.DataFrame({'txkey':df[df.fraud_ind.isnull()]['txkey'],
                        'fraud_ind':np.int_(y_prob > 0.86 )  })
del submit['fraud_ind']
submit = submit.merge(Mapping,how='left',on='txkey')
print(f'best_prob: {best_prob}')
display(submit.fraud_ind.value_counts())
del Mapping

In [None]:
pd.DataFrame([
    (x, y) for (x, y) in zip(X.columns, final_model.feature_importances_)
]).sort_values(by=1, ascending=False).iloc[0:20, :]

In [None]:
submit.to_csv('submit_xgq6288.csv',index = False)

In [None]:
# record 
print(f'# {final_model.learning_rate}, {final_model.max_depth} ,{final_model.scale_pos_weight}   ,{final_model.min_child_weight}    ,{final_model.n_estimators}  ,  ')

In [None]:
# eta ,dep,scale,child,nround, cv result                             , csv          , lb        , meth, feature  
# 0.02,15 ,15   ,1    ,5000  , (prob: 0.77, F1 : 0.781, acc : 0.994) , submit_l7698 , 0.527928  , 1   , test 1,2,5
# 0.02,15 ,15   ,1    ,5000  , (prob: 0.78, F1 : 0.751, acc : 0.993) , submit_l5647 , 0.567554  , 1   , test 5
# 0.03,15 ,10   ,1    ,9000  , (prob: 0.68, F1 : 0.784, acc : 0.994) , submit_l4835 , 0.544638  , 1   , test 5
# 0.01, 5 ,10   ,3    ,9000  , (prob: 0.74, F1 : 0.730, acc : 0.993) , submit_l6260 , 0.566163  , 1   , test 5
# 0.01, 5 ,10   ,3    ,9000  , (prob: 0.77, F1 : 0.714, acc : 0.993) , submit_l6524 , 0.574345  , 1   , -txkey + test 5
# ------------------------------------------------  fix cv split  -------------------------------------------------------------
# 0.01, 5 ,10   ,3    ,9000  , (prob: 0.77, F1 : 0.605, acc : 0.990) , submit_l6524 , 0.574345  , 2   , -txkey + test 5
# 0.03, 5 ,10   ,5    ,3000  , (prob: 0.71, F1 : 0.601, acc : 0.990) , submit_x7809 , 0.579507  , 2   , -txkey + test 5
# 0.01, 5 ,10   ,3    ,9000  , (prob: 0.81, F1 : 0.621, acc : 0.991) , submit_l5831 , 0.578424  , 2   , -txkey + test 5 + new
# 0.01, 7 ,10   ,1    ,5000  , (prob: 0.70, F1 : 0.613, acc : 0.991) , submit_x7161 , 0.587242  , 2   , -txkey + test 5
# -------------------------------------------------  fix loctm  ---------------------------------------------------------------
# 0.01, 5 ,10   ,3    ,9000  , (prob: 0.79, F1 : 0.615, acc : 0.991) , submit_l6246 , 0.582314  , 2   , -txkey + test 5 + new
# 0.01, 7 ,10   ,1    ,5000  , (prob: 0.71, F1 : 0.622, acc : 0.991) , submit_x7084 , 0.584780  , 2   , -txkey + test 5 + new2
# ------------------------------------------------  fix cv split  -------------------------------------------------------------
# 0.01, 5 ,10   ,3    ,9000  , (prob: 0.69, F1 : 0.466, acc : 0.990) , submit_l6246 , 0.582314  , 4   , -txkey + test 5 + new
# 0.01, 7 ,20   ,3    ,9000  , (prob: 0.81, F1 : 0.478, acc : 0.991) , submit_l6556 , 0.562601  , 4   , test 5 + new3
# 0.02, 5 ,10   ,5    ,5000  , (prob: 0.63, F1 : 0.463, acc : 0.989) , submit_x9470 , 0.567734  , 4   , -txkey + test 5 + new2
# 0.01, 15,10   ,3    ,9000  , (prob: 0.63, F1 : 0.492, acc : 0.991) , submit_l7607 , 0.572729  , 4   , test 5 + new2
# 0.01, 10,10   ,1    ,9000  , (prob: 0.22, F1 : 0.466, acc : 0.990) , submit_x9847 , 0.558851  , 4   , -txkey + test 5 + new2
# 0.01, 8 ,10   ,1    ,9000  , (prob: 0.65, F1 : 0.494, acc : 0.991) , submit_l7228 , 0.572884  , 4   , -txkey + test 5 + new2
# 0.01, 8 ,9    ,5    ,9000  , (prob: 0.49, F1 : 0.508, acc : 0.991) , submit_l8952 , 0.574535  , 4   , -txkey + new999
# 0.01, 8 ,9    ,5    ,9000  , (prob: 0.60, F1 : 0.508, acc : 0.991) , submit_lq7189, 0.590527  , 4   , -txkey + new999
# 0.01, 8 ,9    ,5    ,9000  , (prob: 0.68, F1 : 0.508, acc : 0.991) , submit_lq6489, 0.596744  , 4   , -txkey + new999
# 0.03, 9 ,11   ,1    ,2000  , (prob: 0.22, F1 : 0.523, acc : 0.991) , submit_xg8437, 0.570375  , 4   , -txkey + new999 
# 0.03, 9 ,11   ,1    ,5000  , (prob: 0.22, F1 : 0.531, acc : 0.992) , submit_xg6177, 0.592118  , 4   , -txkey + new999 
# 0.03, 9 ,11   ,1    ,5000  , (prob: 0.11, F1 : 0.544, acc : 0.992) , submit_xg9081, 0.548502  , 4   , new
# 0.01,10 ,55   ,11   ,4000  , (prob: 0.46, F1 : 0.555, acc : 0.992) , submit_xg6548, 0.575885  , 4   , new2
# 0.03,10 ,55   ,11   ,4000  , (prob: 0.39, F1 : 0.558, acc : 0.992) , submit_xg5873, 0.581088  , 4   , new2
# 0.03,10 ,55   ,11   ,9000  , (prob: 0.26, F1 : 0.559, acc : 0.992) , submit_xg6316, 0.586641  , 4   , new2


  

In [None]:
# outcome value analysis
a = []
for i in range(1,91):
    a.append(df.fraud_ind[ (df.locdt==i)  ].sum())
    
plt.plot(a)
print(np.array(a).mean()*30)

### Stacking

In [None]:
#submit1 = pd.read_csv('submit_l4835.csv')  #  0.544638
#submit2 = pd.read_csv('submit_l6260.csv')  #  0.566163
#submit3 = pd.read_csv('submit_l6524.csv')  #  0.574345
#submit4 = pd.read_csv('submit_x7084.csv')  #  0.584780
#
#submit = pd.read_csv('submission_test.csv')

In [None]:
#submit.fraud_ind = \
#    (submit1.fraud_ind*0.544638 + submit2.fraud_ind*0.566163 + 
#    submit3.fraud_ind*0.574345 + submit4.fraud_ind*0.584780)  / (0.544638+0.566163+0.574345+0.584780 )
#submit.fraud_ind = np.int_(submit.fraud_ind>=0.5)
#submit.fraud_ind.value_counts()
#submit.to_csv('stack1.csv',index = False)