In [129]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import gc
import lightgbm as lgb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterGrid

# Load Data

In [156]:
# Transaction tables for the train and test splits.
train_transaction = pd.read_csv('train_transaction.csv')
test_transaction = pd.read_csv('test_transaction.csv')

# Identity tables; they are joined onto the transactions by TransactionID further down.
train_identity = pd.read_csv('train_identity.csv')
test_identity = pd.read_csv('test_identity.csv')

# Submission template; its isFraud column is filled with predictions at the end.
sub = pd.read_csv('sample_submission.csv')

In [157]:
## Optionally use a smaller sample of the data for quick testing

In [158]:
# Quick-test switch: set SAMPLE_ROWS to an integer (e.g. 100000) to keep only the first
# N rows of every raw table; leave it as None to run on the full data.
SAMPLE_ROWS = None

if SAMPLE_ROWS is not None:
    train_identity = train_identity.head(SAMPLE_ROWS)
    train_transaction = train_transaction.head(SAMPLE_ROWS)
    test_identity = test_identity.head(SAMPLE_ROWS)
    test_transaction = test_transaction.head(SAMPLE_ROWS)

In [159]:
# Left-join the identity table onto the transactions so every transaction row is kept;
# identity columns are left empty for transactions without an identity record.
train = train_transaction.merge(train_identity, how='left', on='TransactionID')
test = test_transaction.merge(test_identity, how='left', on='TransactionID')

In [160]:
# Report the size of each merged frame.
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print(f'Train dataset has {train_rows} rows and {train_cols} columns.')
print(f'Test dataset has {test_rows} rows and {test_cols} columns.')

Train dataset has 590540 rows and 434 columns.
Test dataset has 506691 rows and 433 columns.


In [161]:
# The merged train/test frames replace the four raw tables, so drop the raw references.
del train_identity, test_identity
del train_transaction, test_transaction

In [162]:
# Run a garbage-collection pass now that the raw tables have been deleted; the cell
# displays the value returned by gc.collect().
gc.collect()

185

# Feature Selection and Encoding

In [163]:
# Columns fed to the model, in the same order as the original hand-written list.
# Numbered column families (card*, C*, D*, M*, V*, id_*) are generated from their ranges.
useful_features = (
    ['TransactionDT', 'TransactionAmt', 'ProductCD']
    + [f'card{i}' for i in range(1, 7)]
    + ['addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain']
    + [f'C{i}' for i in range(1, 15)]
    + [f'D{i}' for i in range(1, 16)]
    + [f'M{i}' for i in range(1, 10)]
    + [f'V{i}' for i in range(1, 340)]
    + [f'id_{i:02d}' for i in range(1, 39)]  # zero-padded: id_01 ... id_38
    + ['DeviceType', 'DeviceInfo']
)

In [None]:
# Columns treated as categorical: they are label-encoded and passed to LightGBM as
# categorical features. Same names and order as the original hand-written list.
cat_features = (
    ['ProductCD']
    + [f'card{i}' for i in range(1, 7)]
    + ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain']
    + [f'M{i}' for i in range(1, 10)]
    + [f'id_{i}' for i in range(12, 39)]  # id_12 ... id_38
    + ['DeviceType', 'DeviceInfo']
)

In [None]:
# Label-encode every categorical column that is present in train.
# One encoder per column is fitted on the train and test values together so that both
# frames share the same label mapping. Values are stringified first, so missing values
# are encoded as a label of their own rather than staying missing.
for col in train.columns:
    if col not in cat_features:
        continue

    train_labels = train[col].astype(str)
    test_labels = test[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_labels, test_labels], ignore_index=True))

    # Replace the whole column with `frame[col] = ...` instead of `frame.loc[:, col] = ...`:
    # since pandas 2.0 the `.loc` form writes into the existing column in place and keeps
    # its dtype, which would leave string columns as `object` holding integers instead of
    # the integer dtype the encoder produces.
    train[col] = le.transform(train_labels)
    test[col] = le.transform(test_labels)

## Hyperparameter Tuning

In [None]:
# Hold-out split by row position: the first half of train is used for fitting and the
# last half for evaluation.
half_rows = int(train.shape[0] / 2)

train_X = train[useful_features].head(half_rows)
eva_X = train[useful_features].tail(half_rows)

train_Y = train['isFraud'].head(half_rows)
eva_Y = train['isFraud'].tail(half_rows)

In [None]:
def grid_evaluate_binary(X_train, y_train, X_test, y_test, params_fixed, params_grid, categorical_feature,
                         early_stopping_rounds=20):
    """Grid-search LightGBM classifier parameters and return the best combination.

    Every combination in ``params_grid`` is merged over ``params_fixed`` (grid values win
    when a key appears in both), fitted on the training data with early stopping monitored
    on the hold-out data, and scored by ROC AUC on that same hold-out data. Each candidate
    and the final winner are printed as dicts.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit each candidate model.
    X_test, y_test
        Hold-out features and labels, used both as the early-stopping evaluation set and
        to compute the AUC that ranks the candidates.
    params_fixed : dict
        Parameters shared by every candidate.
    params_grid : dict
        Mapping of parameter name to the list of values to try (``ParameterGrid`` input).
    categorical_feature
        Forwarded unchanged to ``LGBMClassifier.fit``.
    early_stopping_rounds : int, default 20
        Number of rounds without improvement on the hold-out set before training stops.

    Returns
    -------
    tuple
        ``(optimal_params, optimal_auc)``: the merged parameter dict with the highest AUC
        and that AUC. ``({}, 0)`` is returned if no candidate scores above 0.
    """
    optimal_auc = 0
    optimal_params = {}

    for param in ParameterGrid(params_grid):
        local_params = {**params_fixed, **param}
        gbdt_model = lgb.LGBMClassifier(**local_params)
        # Early stopping is requested through a callback: the `early_stopping_rounds`
        # keyword of `fit` was removed in LightGBM 4.0.
        gbdt_model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds)],
            categorical_feature=categorical_feature,
        )

        # Score the candidate on the hold-out set using the second predict_proba column.
        y_pred = gbdt_model.predict_proba(X_test)[:, 1]
        local_auc = roc_auc_score(y_test, y_pred)

        print({
            "type": "hyperparameter_tuning_candidate",
            "local_params": local_params,
            "local_auc": local_auc})

        if local_auc > optimal_auc:
            optimal_auc = local_auc
            optimal_params = local_params

    print({
        "type": "hyperparameter_tuning_optimal",
        "optimal_auc": optimal_auc,
        "optimal_params": optimal_params})

    return optimal_params, optimal_auc

In [None]:
# Settings shared by every candidate model during tuning.
params_fixed = dict(
    max_depth=-1,            # no depth limit; tree size is controlled by num_leaves
    objective='binary',      # binary log-loss classification
    nthread=-1,
    metric='auc',
    boosting_type='gbdt',
    min_data_in_leaf=20,
    zero_as_missing=False,
)

# First search pass: number of trees and leaves per tree.
main_params_grid = dict(
    n_estimators=[100, 200, 400, 800],
    num_leaves=[31, 60, 120, 240],
)

# Second search pass: learning rate and L2 regularisation.
minor_params_grid = dict(
    learning_rate=[0.005, 0.01, 0.02, 0.05, 0.1],
    lambda_l2=[0.001, 0.01, 0.1],
)

In [None]:
# Two-stage grid search, left disabled: stage 1 fits one model per combination of
# main_params_grid (4 x 4 = 16 fits) and stage 2 fits one per combination of
# minor_params_grid (5 x 3 = 15 fits) on top of the stage-1 winner.
# A hard-coded optimal_params2 is defined further down and used for the final training.
# optimal_params, _ = grid_evaluate_binary(train_X, train_Y, eva_X, eva_Y, params_fixed, main_params_grid, categorical_feature=cat_features)
# optimal_params2, _ = grid_evaluate_binary(train_X, train_Y, eva_X, eva_Y, optimal_params, minor_params_grid, categorical_feature=cat_features)

In [None]:
# Hard-coded tuned parameters, so the notebook runs top to bottom without the (disabled)
# grid search. The definition comes before the print so a fresh kernel does not hit a
# NameError on `optimal_params2`.
optimal_params2 = {
    'max_depth': -1,
    'objective': 'binary',
    'nthread': -1,
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'min_data_in_leaf': 20,
    'zero_as_missing': False,
    'n_estimators': 200,
    'num_leaves': 31,
    'lambda_l2': 0.01,
    'learning_rate': 0.05,
}
print(optimal_params2)

# Train LightGBM

In [None]:
# Fit on the first half of train and monitor AUC on the held-out half.
# Early stopping is requested through a callback because the `early_stopping_rounds`
# keyword of `fit` was removed in LightGBM 4.0; `eval_set` is passed as a list of
# (X, y) pairs, the documented form.
model = lgb.LGBMClassifier(**optimal_params2)
model.fit(
    train_X, train_Y,
    eval_set=[(eva_X, eva_Y)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
    categorical_feature=cat_features,
)

In [None]:
# Refit on all of train with the tree count found by the early-stopped model above.
# `best_iteration_` is the early-stopping optimum; `booster_.num_trees()` also counts the
# rounds trained after the best one, so it is only the fallback when no best iteration
# was recorded.
best_n_estimators = model.best_iteration_ or model.booster_.num_trees()

# Build `params` from a copy of the tuned parameters: the name was previously used
# without ever being defined, and copying leaves `optimal_params2` untouched on re-runs.
params = {**optimal_params2, 'n_estimators': best_n_estimators}

model = lgb.LGBMClassifier(**params)
model.fit(train[useful_features], train['isFraud'], eval_metric='auc', categorical_feature=cat_features)

# Prediction

In [None]:
# Keep the second predict_proba column as the fraud score
# (presumably the isFraud == 1 class; assumes labels are 0/1).
test_probabilities = model.predict_proba(test[useful_features])
y_preds = test_probabilities[:, 1]

In [None]:
# Put the predictions into the submission template and save it without the index column.
# Values are assigned by position, so `sub` is assumed to be in the same row order as `test`.
submission_file = "submission2.csv"
sub['isFraud'] = y_preds
sub.to_csv(submission_file, index=False)