## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os

from cuml import ForestInference

from sklearn.metrics import mean_squared_error, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier



## Reading data

In [2]:
%%time
df = pd.read_csv('PS_20174392719_1491204439457_log.csv')
pd.set_option('display.float_format', '{:.2f}'.format)
df = df.rename(columns={'oldbalanceOrg': 'oldbalanceOrig'})

CPU times: user 5.53 s, sys: 1.01 s, total: 6.54 s
Wall time: 6.54 s


In [3]:
# First glance
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


The dataset consists of 10 variables:

* step: represents a unit of time where 1 step equals 1 hour
* type: type of online transaction
* amount: the amount of the transaction
* nameOrig: customer starting the transaction
* oldbalanceOrg: balance before the transaction
* newbalanceOrig: balance after the transaction
* nameDest: recipient of the transaction
* oldbalanceDest: initial balance of recipient before the transaction
* newbalanceDest: the new balance of recipient after the transaction
* isFraud: fraud transaction 

## Data Preparation

In [4]:
# discard missing data
df = df.dropna()

# Dropping unnecessary features and text features
df = df.drop(['newbalanceOrig', 'newbalanceDest', 'isFlaggedFraud', 'nameOrig', 'nameDest'], axis=1)

# One-hot encoding for "type" feature
df = pd.concat([df, 1*pd.get_dummies(df['type'])], axis=1).drop('type', axis=1)

In [5]:
X = df.drop('isFraud', axis = 1)
y = df.isFraud

In [6]:
# Splitting data into train, validation and test samples

X_not_test, X_test, y_not_test, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42, stratify = y)
X_train, X_eval, y_train, y_eval = train_test_split(X_not_test, y_not_test, test_size = 0.2, stratify = y_not_test, random_state = 42)

In [7]:
# Convert data to numpy array
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_eval = X_eval.to_numpy()
y_eval = y_eval.to_numpy()
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

## Run XGBoost without GPU

In [8]:
%%time
best_model = XGBClassifier(tree_method='hist', random_state=0, device='cpu')
best_model.fit(X_train, y_train,
                        eval_set = [(X_eval, y_eval)],
                        verbose=25)

[0]	validation_0-logloss:0.09437
[25]	validation_0-logloss:0.00238
[50]	validation_0-logloss:0.00160
[75]	validation_0-logloss:0.00134
[99]	validation_0-logloss:0.00121
CPU times: user 2min 28s, sys: 1.8 s, total: 2min 30s
Wall time: 10.3 s


In [9]:
%%time
pred_xgb = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:,1]

CPU times: user 2.35 s, sys: 4.02 ms, total: 2.35 s
Wall time: 127 ms


In [10]:
precision_xgb = precision_score(pred_xgb, y_test)
recall_xgb = recall_score(pred_xgb, y_test)
f1_xgb = f1_score(pred_xgb, y_test)
auroc_xgb = roc_auc_score(y_test, y_pred_proba)

print(f'Precision on test: {precision_xgb}\n'
        f'Recall on test: {recall_xgb}\n'
        f'F1 on test {f1_xgb}\n'
        f'ROC AUC on test: {auroc_xgb}')

Precision on test: 0.7405602923264312
Recall on test: 0.9074626865671642
F1 on test 0.8155600268276325
ROC AUC on test: 0.9991244439845522
