In [24]:
import gc
import multiprocessing
from datetime import datetime
from multiprocessing.pool import ThreadPool

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook

In [25]:
# loading preprocessed data
# Every split lives in the same directory, so the paths are built from one
# shared prefix. Order matters: it is the order the loaded frames are
# unpacked in (X_train, X_val, X_test, y_train, y_val).
files = [
    '../Desktop/train_val_uid/' + split + '.csv'
    for split in ('X_train', 'X_val', 'X_test', 'y_train', 'y_val')
]

def load_data(file):
    """Read one CSV file into a DataFrame, discarding stray saved-index columns.

    When a frame is written with ``to_csv`` and its index is not suppressed,
    reading it back yields an extra column named ``Unnamed: 0``. That column
    is just the old row number; left in place it is fed to the model as a
    feature. Any column whose name starts with ``Unnamed:`` is dropped, so the
    function behaves the same whether or not the file was saved with an index.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The file contents without ``Unnamed: N`` columns.
    """
    frame = pd.read_csv(file)
    # pandas names header-less columns "Unnamed: <position>".
    stray_cols = [col for col in frame.columns if str(col).startswith('Unnamed:')]
    return frame.drop(columns=stray_cols)

# Read the CSVs concurrently with threads instead of worker processes.
# A process pool has to pickle the notebook-defined load_data (which fails
# under the "spawn" start method) and pickle every resulting DataFrame back
# to the parent. Threads share memory, so neither step is needed.
with ThreadPool(len(files)) as pool:
    X_train, X_val, X_test, y_train, y_val = pool.map(load_data, files)

In [20]:
# Display the training feature frame loaded above.
X_train

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card5,card6,addr1,addr2,...,D9_card1_addr1_mean,D9_card1_addr1_std,D9_card1_addr1_P_emaildomain_mean,D9_card1_addr1_P_emaildomain_std,D11_card1_mean,D11_card1_std,D11_card1_addr1_mean,D11_card1_addr1_std,D11_card1_addr1_P_emaildomain_mean,D11_card1_addr1_P_emaildomain_std
0,2989413,99.982002,1,16541.0,-1.0,50.0,126.0,2,77.0,77.0,...,-0.861111,0.470550,-0.166667,0.962250,225.069138,210.206573,224.931900,215.723770,-1.000000,0.000000
1,2989407,99.982002,3,2821.0,11.0,50.0,119.0,1,84.0,77.0,...,-0.567100,0.740934,-0.825071,0.504442,189.949524,247.391190,149.771957,244.241592,265.942017,269.591278
2,2989400,86.982002,4,893.0,414.0,50.0,124.0,1,215.0,77.0,...,-0.819932,0.517438,-1.000000,0.000000,223.007462,237.794708,282.277863,301.959015,326.402771,325.760376
3,2989416,14.982000,4,17132.0,467.0,50.0,17.0,2,172.0,77.0,...,-0.962403,0.244122,-1.000000,0.000000,288.241089,270.982452,281.910431,251.790894,393.544769,303.444946
4,2989408,76.982002,4,8839.0,455.0,50.0,126.0,2,91.0,77.0,...,-0.818287,0.420477,-1.000000,0.000000,164.347809,189.385956,163.715469,192.032791,151.664856,215.900711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442900,3418690,47.931999,4,12037.0,62.0,50.0,126.0,2,369.0,77.0,...,-0.802632,0.591354,-1.000000,0.000000,316.987671,251.406036,334.453339,253.530151,536.871216,132.490433
442901,3418706,129.931992,4,13479.0,287.0,50.0,126.0,1,385.0,77.0,...,-0.920543,0.364130,-0.825071,0.504442,239.293686,293.568848,268.567566,287.425598,265.942017,269.591278
442902,3418691,391.011993,4,7394.0,390.0,50.0,126.0,2,26.0,77.0,...,-0.891129,0.401650,-0.886752,0.403628,269.672394,267.732178,261.397064,251.452393,224.914795,232.125229
442903,3418707,58.411999,4,1455.0,221.0,50.0,126.0,1,58.0,77.0,...,-1.000000,0.000000,-1.000000,0.000000,189.429337,234.728836,165.174835,244.114044,316.234070,257.019592


In [26]:
# Keep only the target column of the training labels.
# The isinstance guard makes the cell safe to re-run: once y_train has been
# reduced to a Series, indexing it with 'isFraud' again would raise KeyError.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['isFraud']
y_train

0         0
1         0
2         0
3         0
4         0
         ..
442900    0
442901    0
442902    0
442903    0
442904    0
Name: isFraud, Length: 442905, dtype: int64

In [27]:
# Keep only the target column of the validation labels.
# Guarded so that re-running the cell (when y_val is already a Series) does
# not raise KeyError.
if isinstance(y_val, pd.DataFrame):
    y_val = y_val['isFraud']

## XGBoost with CPU & GPU

In [8]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.4.1-py3-none-macosx_10_14_x86_64.macosx_10_15_x86_64.macosx_11_0_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 6.0 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.4.1
You should consider upgrading via the '/Users/bellalyu/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [28]:
import xgboost as xgb
import time

In [31]:
# train 75% val 25%

print("XGBoost version:", xgb.__version__)

# Hyperparameters are collected in one mapping and unpacked into the estimator,
# so they can be read or tweaked separately from model construction.
xgb_params = {
    'n_estimators': 2000,
    'max_depth': 12,
    'learning_rate': 0.02,
    'subsample': 0.8,
    'colsample_bytree': 0.4,
    'missing': -1,
    'eval_metric': 'auc',
    # USE CPU
    'nthread': 4,
    'tree_method': 'hist',
    # USE GPU
    # 'tree_method': 'gpu_hist',
}

start = time.time()
clf = xgb.XGBClassifier(**xgb_params)

# Validation AUC on (X_val, y_val) is logged every 50 rounds; fitting halts
# early when it has not improved for 100 consecutive rounds.
fit_options = {
    'eval_set': [(X_val, y_val)],
    'verbose': 50,
    'early_stopping_rounds': 100,
}
h = clf.fit(X_train, y_train, **fit_options)
print('run time is', time.time()-start)

XGBoost version: 1.4.1
[0]	validation_0-auc:0.80108
[50]	validation_0-auc:0.87348
[100]	validation_0-auc:0.88902
[150]	validation_0-auc:0.90582
[200]	validation_0-auc:0.91656
[250]	validation_0-auc:0.92431
[300]	validation_0-auc:0.92959
[350]	validation_0-auc:0.93285
[400]	validation_0-auc:0.93443
[450]	validation_0-auc:0.93563
[500]	validation_0-auc:0.93610
[550]	validation_0-auc:0.93670
[600]	validation_0-auc:0.93700
[650]	validation_0-auc:0.93732
[700]	validation_0-auc:0.93748
[750]	validation_0-auc:0.93749
[800]	validation_0-auc:0.93737
[822]	validation_0-auc:0.93730
run time is 187.66033697128296
