# 1. Load Libraries

In [1]:
from SlackNotifier.notifier import SlackNotifier

api_url = 'https://hooks.slack.com/services/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'
channel = '#ieee-cis-fraud-detection'
sn = SlackNotifier(api_url, channel)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

import os
print(os.listdir('../../input'))

['sample_submission.csv', 'test_identity.csv', 'test_transaction.csv', 'train_identity.csv', 'train_transaction.csv']


# 2. Prepare Data

## 2.1 Load Datasets

In [3]:
df_train_t = pd.read_csv('../../input/train_transaction.csv', index_col='TransactionID')
df_train_i = pd.read_csv('../../input/train_identity.csv', index_col='TransactionID')
df_test_t = pd.read_csv('../../input/test_transaction.csv', index_col='TransactionID')
df_test_i = pd.read_csv('../../input/test_identity.csv', index_col='TransactionID')
sample_submission = pd.read_csv('../../input/sample_submission.csv')

print('train_transaction의 shape : ', df_train_t.shape)
print('train_identity의 shape : ', df_train_i.shape)
print('test_transaction의 shape : ', df_test_t.shape)
print('test_transaction의 shape : ', df_test_i.shape)

train_transaction의 shape :  (590540, 393)
train_identity의 shape :  (144233, 40)
test_transaction의 shape :  (506691, 392)
test_transaction의 shape :  (141907, 40)


In [4]:
sn.noti('Load Datasets Finished.')

## 2.2 Merge Datasets

In [5]:
df_train = df_train_t.merge(df_train_i, 
                            how='left', 
                            left_index=True, 
                            right_index=True)
df_test = df_test_t.merge(df_test_i, 
                          how='left', 
                          left_index=True, 
                          right_index=True)

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 2987000 to 3577539
Columns: 433 entries, isFraud to DeviceInfo
dtypes: float64(399), int64(3), object(31)
memory usage: 1.9+ GB


In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506691 entries, 3663549 to 4170239
Columns: 432 entries, TransactionDT to DeviceInfo
dtypes: float64(399), int64(2), object(31)
memory usage: 1.7+ GB


## 2.3 Reduce Memory Usage

In [8]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [9]:
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

Mem. usage decreased to 668.22 Mb (66.2% reduction)
Mem. usage decreased to 583.43 Mb (65.6% reduction)


In [10]:
sn.noti('Reduce Memory Finished')

## 2.4 Divide Features by Categorical and Numerical

### 2.4.1 Categorical Features

In [11]:
# cat_col_t : categorical columns in transaction dataset
cat_col_t = ['ProductCD','addr1','addr2', 'P_emaildomain', 'R_emaildomain', 'TransactionDT']
cat_col_t.extend(['card' + str(x) for x in range(1,7)]) # card1 ~ card6
cat_col_t.extend(['M' + str(x) for x in range(1,10)]) # M1 ~ M9

# cat_col_i : categorical columns in identity dataset
cat_col_i = ['DeviceType', 'DeviceInfo']
cat_col_i.extend(['id_' + str(x) for x in range(12, 39)]) # id_12 ~ id_38

# cat_col : categorical columns in transaction and identity dataset
cat_col = [*cat_col_t, *cat_col_i]

### 2.4.2 Numerical Features

In [12]:
# num_col_t : numerical columns in transaction dataset
num_col_t = [col for col in list(df_train_t.columns) if col not in [*cat_col_t, 'isFraud']]

# num_col_i : numerical columns in identity dataset
num_col_i = [col for col in list(df_train_i.columns) if col not in cat_col_i]

# num_col : numerical columns in transaction and identity dataset
num_col =[*num_col_t, *num_col_i]

### 2.4.3 Transaction Dataset

In [13]:
print("number of transaction dataset's columns : ", df_train_t.shape[1])
print("cat_col_t's length : ", len(cat_col_t))
print("num_col_t's length : ", len(num_col_t))
print("sum of boths : ", len(cat_col_t) + len(num_col_t))

number of transaction dataset's columns :  393
cat_col_t's length :  21
num_col_t's length :  371
sum of boths :  392


### 2.4.4 Identity Dataset

In [14]:
print("number of transaction dataset's columns : ", df_train_i.shape[1])
print("cat_col_i's length : ", len(cat_col_i))
print("num_col_i's length : ", len(num_col_i))
print("sum of boths : ", len(cat_col_i) + len(num_col_i))

number of transaction dataset's columns :  40
cat_col_i's length :  29
num_col_i's length :  11
sum of boths :  40


In [15]:
sn.noti('Prepare Data Finished!')

# 3. Summary of Features

## 3.1 Summary of Categorical Features

In [16]:
def make_cat_summary(df, features) :

    summary = df[features].dtypes.reset_index()
    summary.rename(columns={'index': 'Name', 0: 'Dtype'}, inplace=True)
    summary['DataSets'] = ['transaction' if col in cat_col_t else 'identity' for col in features]
    summary['NullCnt'] = [df[col].isnull().sum() for col in features]
    summary['NullRt'] = [np.round((df[col].isnull().sum())/df.shape[0], 2) for col in features]
    summary['UniqueCnt'] = [df[col].nunique() for col in features]
    Values = []
    for col in features :
        if df[col].nunique() <= 5 :
            val = list(df[col].value_counts().reset_index()['index'])
            val.sort()
            Values.append(', '.join(str(v) for v in val))
        else :
            Values.append('-')    
    summary['Values'] = Values
    summary['MinValue'] = [df.loc[df[col].notnull(), col].min() for col in features]
    summary['MaxValue'] = [df.loc[df[col].notnull(), col].max() for col in features]
    
    return summary

In [17]:
cat_summary = make_cat_summary(df_train, cat_col)
cat_summary.head()

Unnamed: 0,Name,Dtype,DataSets,NullCnt,NullRt,UniqueCnt,Values,MinValue,MaxValue
0,ProductCD,object,transaction,0,0.0,5,"C, H, R, S, W",C,W
1,addr1,float16,transaction,65706,0.11,332,-,100,540
2,addr2,float16,transaction,65706,0.11,74,-,10,102
3,P_emaildomain,object,transaction,94456,0.16,59,-,aim.com,ymail.com
4,R_emaildomain,object,transaction,453249,0.77,60,-,aim.com,ymail.com


## 3.2 Summary of Numerical Features

In [18]:
def make_num_summary(df, features) :
    
    summary = df[features].dtypes.reset_index()
    summary.rename(columns={'index': 'Name', 0: 'Dtype'}, inplace=True)
    summary['DataSets'] = ['transaction' if col in num_col_t else 'identity' for col in features]
    summary['NullCnt'] = [df[col].isnull().sum() for col in features]
    summary['NullRt'] = [np.round((df[col].isnull().sum())/df.shape[0], 2) for col in features]
    summary['MinValue'] = [df.loc[df[col].notnull(), col].min() for col in features]
    summary['Q25'] = [df[col].quantile([0.25]).values[0] for col in features]
    summary['Q50'] = [df[col].quantile([0.50]).values[0] for col in features]
    summary['Q75'] = [df[col].quantile([0.75]).values[0] for col in features]
    summary['MaxValue'] = [df.loc[df[col].notnull(), col].max() for col in features]
    summary['Mean'] = [df.loc[df[col].notnull(), col].mean() for col in features]
    summary['Std'] = [df.loc[df[col].notnull(), col].std() for col in features]

    return summary

In [19]:
num_summary = make_num_summary(df_train, num_col)
num_summary.head()

Unnamed: 0,Name,Dtype,DataSets,NullCnt,NullRt,MinValue,Q25,Q50,Q75,MaxValue,Mean,Std
0,TransactionAmt,float16,transaction,0,0.0,0.250977,43.3125,68.75,125.0,31936.0,,
1,dist1,float16,transaction,352271,0.6,0.0,3.0,8.0,24.0,10288.0,,
2,dist2,float16,transaction,552913,0.94,0.0,7.0,37.0,206.0,11624.0,inf,inf
3,C1,float16,transaction,0,0.0,0.0,1.0,1.0,3.0,4684.0,,
4,C2,float16,transaction,0,0.0,0.0,1.0,1.0,3.0,5692.0,,


In [20]:
sn.noti('Summary Finished')

# 4. Feature Engineering

## 4.1 Define Functions

In [21]:
def value_acc_freq(df, col) :
        
    df_vc = df[col].value_counts().reset_index()
    df_vc.rename(columns={'index': 'value', col: 'cnt'}, inplace=True)
    df_vc['accCntRt'] = df_vc['cnt'].cumsum() / len(df[df[col].notnull()])
    
    return df_vc

In [22]:
def replace_to_others(df, col, rate=None, cnt=None) :
    
    df_vc = df[col].value_counts().reset_index()
    df_vc.rename(columns={'index': 'value', col: 'cnt'}, inplace=True)
    df_vc['accCntRt'] = df_vc['cnt'].cumsum() / len(df[df[col].notnull()])
    target_list = []
    
    if rate != None :
        target_list = list(df_vc[df_vc['accCntRt'] >= float(rate)]['value'])
    if cnt != None :
        target_list = list(df_vc[df_vc['cnt'] < int(10)]['value'])
        
    dataType = str(df_vc['value'].values.dtype)
    replace_value = 'OTHERS'
    if dataType.find('int') == 0 :
        replace_value = int(99999)
    elif dataType.find('float') == 0 :
        replace_value = float(99999)
    
    df.loc[df[col].isin(target_list), col] = replace_value

## 4.2 Feature Engineering for Categorical Features

In [23]:
def cat_feature_engineering(df) :
    # addr1
    replace_to_others(df, 'addr1', rate=0.95)

    # addr2
    replace_to_others(df, 'addr2', cnt=10)

    # P_emaildomain
    df.loc[df['P_emaildomain'].str.contains('gmail', na=False), 'P_emaildomain'] = 'GMAIL'
    df.loc[df['P_emaildomain'].str.contains('yahoo', na=False), 'P_emaildomain'] = 'YAHOO'
    df.loc[df['P_emaildomain'].str.contains('hotmail', na=False), 'P_emaildomain'] = 'HOTMAIL'
    df.loc[df['P_emaildomain'].str.contains('live', na=False), 'P_emaildomain'] = 'LIVE'
    df.loc[df['P_emaildomain'].str.contains('netzero', na=False), 'P_emaildomain'] = 'NETZERO'
    df.loc[df['P_emaildomain'].str.contains('outlook', na=False), 'P_emaildomain'] = 'OUTLOOK'
    replace_to_others(df, 'P_emaildomain', cnt=250)

    # R_emaildomain
    df.loc[df['R_emaildomain'].str.contains('gmail', na=False), 'R_emaildomain'] = 'GMAIL'
    df.loc[df['R_emaildomain'].str.contains('yahoo', na=False), 'R_emaildomain'] = 'YAHOO'
    df.loc[df['R_emaildomain'].str.contains('hotmail', na=False), 'R_emaildomain'] = 'HOTMAIL'
    df.loc[df['R_emaildomain'].str.contains('live', na=False), 'R_emaildomain'] = 'LIVE'
    df.loc[df['R_emaildomain'].str.contains('netzero', na=False), 'R_emaildomain'] = 'NETZERO'
    df.loc[df['R_emaildomain'].str.contains('outlook', na=False), 'R_emaildomain'] = 'OUTLOOK'
    replace_to_others(df, 'R_emaildomain', cnt=80)

    # TransactionDT
    # Reference : https://www.kaggle.com/shkim4738/extensive-eda-and-modeling-xgb-hyperopt
    import datetime

    START_DATE = '2017-12-01'
    startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
    df['Date'] = df['TransactionDT'].apply(lambda x : (startdate + datetime.timedelta(seconds=x)))

    df['Weekdays'] = df['Date'].dt.dayofweek
    df['Hours'] = df['Date'].dt.hour
    df['Days'] = df['Date'].dt.day

    df.drop(['Date'], axis=1, inplace=True)

    # DeviceInfo
    df.loc[df['DeviceInfo'].str.contains('SAMSUNG', na=False), 'DeviceInfo'] = 'SAMSUNG'
    df.loc[df['DeviceInfo'].str.contains('SM', na=False), 'DeviceInfo'] = 'SM'
    df.loc[df['DeviceInfo'].str.contains('rv', na=False), 'DeviceInfo'] = 'RV'
    df.loc[df['DeviceInfo'].str.contains('Moto', na=False), 'DeviceInfo'] = 'MOTO'
    df.loc[df['DeviceInfo'].str.contains('HUAWEI', na=False), 'DeviceInfo'] = 'HUAWEI'
    df.loc[df['DeviceInfo'].str.contains('Huawei', na=False), 'DeviceInfo'] = 'HUAWEI'
    df.loc[df['DeviceInfo'].str.contains('LG-', na=False), 'DeviceInfo'] = 'LG'
    df.loc[df['DeviceInfo'].str.contains('Android', na=False), 'DeviceInfo'] = 'ANDROID'
    df.loc[df['DeviceInfo'].str.contains('Linux', na=False), 'DeviceInfo'] = 'LINUX'
    df.loc[df['DeviceInfo'].str.contains('HTC', na=False), 'DeviceInfo'] = 'HTC'
    df.loc[df['DeviceInfo'].str.contains('Hisense', na=False), 'DeviceInfo'] = 'HISENSE'
    df.loc[df['DeviceInfo'].str.contains('Blade', na=False), 'DeviceInfo'] = 'BLADE'
    df.loc[df['DeviceInfo'].str.contains('BLADE', na=False), 'DeviceInfo'] = 'BLADE'
    df.loc[df['DeviceInfo'].str.contains('ASUS', na=False), 'DeviceInfo'] = 'ASUS'
    df.loc[df['DeviceInfo'].str.contains('Redmi', na=False), 'DeviceInfo'] = 'REDMI'
    df.loc[df['DeviceInfo'].str.contains('iOS', na=False), 'DeviceInfo'] = 'iOS'
    df.loc[df['DeviceInfo'].str.contains('MacOS', na=False), 'DeviceInfo'] = 'MacOS'
    device = ['SAMSUNG','SM','RV','MOTO','HUAWEI','LG','ANDROID','LINUX','HTC','HISENSE','BLADE','ASUS','REDMI',
              'Windows','iOS', 'MacOS', 'Trident/7.0']
    df.loc[(~df['DeviceInfo'].isin(device)) & (df['DeviceInfo'].notnull()), 'DeviceInfo'] = 'OTHERS'

    # id_30
    df.loc[df['id_30'].str.contains('Windows', na=False), 'id_30'] = 'WINDOWS'
    df.loc[df['id_30'].str.contains('iOS', na=False), 'id_30'] = 'iOS'
    df.loc[df['id_30'].str.contains('Mac OS X', na=False), 'id_30'] = 'MacOS'
    df.loc[df['id_30'].str.contains('Android', na=False), 'id_30'] = 'ANDROID'

    # id_31
    df.loc[df['id_31'].str.contains('chrome', na=False), 'id_31'] = 'CHROME'
    df.loc[df['id_31'].str.contains('firefox', na=False), 'id_31'] = 'FIREFOX'
    df.loc[df['id_31'].str.contains('edge', na=False), 'id_31'] = 'EDGE'
    df.loc[df['id_31'].str.contains('ie ', na=False), 'id_31'] = 'IE'
    df.loc[df['id_31'].str.contains('safari', na=False), 'id_31'] = 'SAFARI'
    df.loc[df['id_31'].str.contains('opera', na=False), 'id_31'] = 'OPERA'
    df.loc[df['id_31'].str.contains('samsung', na=False), 'id_31'] = 'SAMSUNG'
    df.loc[df['id_31'].str.contains('Samsung', na=False), 'id_31'] = 'SAMSUNG'
    df.loc[df['id_31'].str.contains('android', na=False), 'id_31'] = 'ANDROID'
    df.loc[df['id_31'].str.contains('Android', na=False), 'id_31'] = 'ANDROID'
    device2 = ['CHROME','FIREFOX','EDGE','IE','SAFARI','OPERA','SAMSUNG','ANDROID']
    df.loc[(~df['id_31'].isin(device2)) & (df['id_31'].notnull()), 'id_31'] = 'OTHERS'

    # id_33
    replace_to_others(df, 'id_33', cnt=10)

In [24]:
cat_feature_engineering(df_train)
cat_feature_engineering(df_test)

In [25]:
cat_col_t.extend(['Weekdays','Hours','Days'])
cat_col = [*cat_col_t, *cat_col_i]

## 4.3 Feature Engineering for Numerical Features

생략

## 4.4 Missinig Values

In [26]:
cat_col_null_over90 = list(cat_summary.loc[cat_summary['NullRt'] >= float(0.9), 'Name'].values)
num_col_null_over90 = list(num_summary.loc[num_summary['NullRt'] >= float(0.9), 'Name'].values)
col_null_over90 = [*cat_col_null_over90, *num_col_null_over90]

In [27]:
df_train = df_train.drop(col_null_over90, axis=1)
df_test = df_test.drop(col_null_over90, axis=1)

In [28]:
cat_col = [col for col in cat_col if col not in cat_col_null_over90]
num_col = [col for col in num_col if col not in num_col_null_over90]

## 4.5 Encoding

In [29]:
for col in cat_col :
    if col in df_train.columns :
        le = LabelEncoder()
        le.fit(list(df_train[col].astype(str).values)
               + list(df_test[col].astype(str).values))
        df_train[col] = le.transform(list(df_train[col].astype(str).values))
        df_test[col] = le.transform(list(df_test[col].astype(str).values))

In [30]:
sn.noti('Feature Engineering Finished!')

# 5. Modeling

## 5.1 Set X, y

In [31]:
y_train = df_train['isFraud']
X_train = df_train.drop('isFraud', axis=1)
X_test = df_test

## 5.2 Make Validation set

In [32]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

In [33]:
train_cnt = y_train.count()
valid_cnt = y_valid.count()
print('학습 세트 shape : {0}, 검증 세트 shape : {1}'.format(X_train.shape, X_valid.shape))

print('학습 세트 레이블 값 분포 비율')
print(y_train.value_counts()/train_cnt)
print('\n검증 세트 레이블 값 분포 비율')
print(y_valid.value_counts()/valid_cnt)

학습 세트 shape : (472432, 422), 검증 세트 shape : (118108, 422)
학습 세트 레이블 값 분포 비율
0    0.964839
1    0.035161
Name: isFraud, dtype: float64

검증 세트 레이블 값 분포 비율
0    0.965692
1    0.034308
Name: isFraud, dtype: float64


## 5.3 모델 학습

In [34]:
%%time
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(n_estimators=500, random_state=0)

evals = [(X_valid, y_valid)]
lgbm_clf.fit(X_train, y_train,
             early_stopping_rounds=100,
             eval_metric='auc',
             eval_set=evals,
             verbose=True)

lgbm_auc_score = roc_auc_score(y_valid, lgbm_clf.predict_proba(X_valid)[:,1], average='macro')
print('ROC AUC : {0:.4f}'.format(lgbm_auc_score))
sn.noti('LightGBM ROC AUC Score : {0:.4f}'.format(lgbm_auc_score))

[1]	valid_0's binary_logloss: 0.126366	valid_0's auc: 0.804378
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.120267	valid_0's auc: 0.849366
[3]	valid_0's binary_logloss: 0.116304	valid_0's auc: 0.856252
[4]	valid_0's binary_logloss: 0.113008	valid_0's auc: 0.859701
[5]	valid_0's binary_logloss: 0.110035	valid_0's auc: 0.865047
[6]	valid_0's binary_logloss: 0.107565	valid_0's auc: 0.869407
[7]	valid_0's binary_logloss: 0.105597	valid_0's auc: 0.871918
[8]	valid_0's binary_logloss: 0.103855	valid_0's auc: 0.873486
[9]	valid_0's binary_logloss: 0.10242	valid_0's auc: 0.874717
[10]	valid_0's binary_logloss: 0.100876	valid_0's auc: 0.876362
[11]	valid_0's binary_logloss: 0.0996368	valid_0's auc: 0.877128
[12]	valid_0's binary_logloss: 0.0984714	valid_0's auc: 0.879384
[13]	valid_0's binary_logloss: 0.0974617	valid_0's auc: 0.879942
[14]	valid_0's binary_logloss: 0.0964328	valid_0's auc: 0.882679
[15]	valid_0's binary_logloss: 0.0955687	valid_

# 6. Make Submission

In [35]:
y_pred = lgbm_clf.predict_proba(X_test)[:,1]

sample_submission['isFraud'] = y_pred
sample_submission.to_csv('v011_lightgbm.csv', index=False)