# XGBoost

## Importing packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

pd.set_option('display.max_columns', None)

## Importing data

In [2]:
train_data = pd.read_csv('../data/5_train_dataset.csv')
test_data = pd.read_csv('../data/4_test_dataset.csv')

## XGBoost

Getting X & y

In [3]:
X_train = train_data.drop('is_fraud', axis=1)
y_train = train_data['is_fraud']
X_test = test_data.drop('is_fraud', axis=1)
y_test = test_data['is_fraud']

Initialise base XGBoost instance

In [4]:
random_forest = RandomForestClassifier()

Initialise parameter grid for randomized search

In [5]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
parameter_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

Run randomized search on Random Forest

In [6]:
rf_random = RandomizedSearchCV(estimator= random_forest,
                  param_distributions= parameter_grid,
                  scoring= "recall",
                  n_iter = 500,
                  verbose= 4)

In [7]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV 1/5] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=1600;, score=0.862 total time=  11.8s
[CV 2/5] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=1600;, score=0.847 total time=  11.7s
[CV 3/5] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=1600;, score=0.878 total time=  11.9s
[CV 4/5] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=1600;, score=0.901 total time=  11.9s
[CV 5/5] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=1600;, score=0.847 total time=  11.7s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000;, score=0.862 total time=  10.4s
[C

[CV 4/5] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=400;, score=0.905 total time=   3.1s
[CV 5/5] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=400;, score=0.843 total time=   3.0s
[CV 1/5] END bootstrap=False, max_depth=40, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1600;, score=0.862 total time=  13.4s
[CV 2/5] END bootstrap=False, max_depth=40, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1600;, score=0.862 total time=  12.9s
[CV 3/5] END bootstrap=False, max_depth=40, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1600;, score=0.882 total time=  12.8s
[CV 4/5] END bootstrap=False, max_depth=40, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1600;, score=0.912 total time=  12.9s
[CV 5/5] END bootstrap=False, max_depth=40, max_features=log2, min

[CV 3/5] END bootstrap=False, max_depth=40, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=1400;, score=0.901 total time=  11.9s
[CV 4/5] END bootstrap=False, max_depth=40, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=1400;, score=0.920 total time=  11.4s
[CV 5/5] END bootstrap=False, max_depth=40, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=1400;, score=0.858 total time=  11.7s
[CV 1/5] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=800;, score=0.862 total time=   6.5s
[CV 2/5] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=800;, score=0.877 total time=   6.8s
[CV 3/5] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=800;, score=0.889 total time=   6.4s
[CV 4/5] END bootstrap=False, max_depth=None, max_features=log2, mi

[CV 2/5] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.851 total time=   1.5s
[CV 3/5] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.885 total time=   1.5s
[CV 4/5] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.912 total time=   1.5s
[CV 5/5] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.854 total time=   1.6s
[CV 1/5] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=400;, score=0.866 total time=   3.2s
[CV 2/5] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=400;, score=0.851 total time=   3.3s
[CV 3/5] END bootstrap=False, max_depth=70, max_features=log2, min_sampl

[CV 5/5] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=1800;, score=0.858 total time=10.8min
[CV 1/5] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600;, score=0.874 total time=   4.8s
[CV 2/5] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600;, score=0.858 total time=   4.7s
[CV 3/5] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600;, score=0.870 total time=   4.6s
[CV 4/5] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600;, score=0.908 total time=   4.6s
[CV 5/5] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600;, score=0.851 total time=   4.6s
[CV 1/5] END bootstrap=False, max_depth=100, max_features=log2, min_sa

[CV 3/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=1000;, score=0.855 total time=   4.8s
[CV 4/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=1000;, score=0.878 total time=   4.9s
[CV 5/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=1000;, score=0.797 total time=   5.2s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1200;, score=0.862 total time=   6.3s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1200;, score=0.843 total time=   6.2s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1200;, score=0.882 total time=   6.3s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_le

[CV 2/5] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.866 total time=  15.2s
[CV 3/5] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.882 total time=  15.2s
[CV 4/5] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.916 total time=  15.4s
[CV 5/5] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.851 total time=  15.2s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=2000;, score=0.862 total time=  10.7s
[CV 2/5] END bootstrap=True, max_depth=80, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=2000;, score=0.858 total time=  10.4s
[CV 3/5] END bootstrap=True, max_depth=80, max_features=sqrt, min_sa

[CV 1/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1400;, score=0.835 total time=   6.8s
[CV 2/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1400;, score=0.820 total time=   6.7s
[CV 3/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1400;, score=0.851 total time=   6.8s
[CV 4/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1400;, score=0.882 total time=   6.8s
[CV 5/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1400;, score=0.805 total time=   6.8s
[CV 1/5] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1000;, score=0.870 total time=   7.6s
[CV 2/5] END bootstrap=False, max_depth=70, max_features=log2, min_sa

[CV 4/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=400;, score=0.897 total time=   2.1s
[CV 5/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=400;, score=0.839 total time=   2.0s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.851 total time=   2.0s
[CV 2/5] END bootstrap=True, max_depth=80, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.847 total time=   2.0s
[CV 3/5] END bootstrap=True, max_depth=80, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.874 total time=   2.1s
[CV 4/5] END bootstrap=True, max_depth=80, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.908 total time=   2.1s
[CV 5/5] END bootstrap=True, max_depth=80, max_features=log2, min_samples_leaf=2

[CV 3/5] END bootstrap=True, max_depth=80, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1200;, score=0.866 total time=   6.0s
[CV 4/5] END bootstrap=True, max_depth=80, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1200;, score=0.897 total time=   6.0s
[CV 5/5] END bootstrap=True, max_depth=80, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1200;, score=0.831 total time=   5.9s
[CV 1/5] END bootstrap=False, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=1200;, score=0.866 total time=   9.0s
[CV 2/5] END bootstrap=False, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=1200;, score=0.866 total time=   9.0s
[CV 3/5] END bootstrap=False, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=1200;, score=0.885 total time=   9.0s
[CV 4/5] END bootstrap=False, max_depth=50, max_features=log2, min_sample

[CV 1/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.858 total time=  10.4s
[CV 2/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.843 total time=  10.2s
[CV 3/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.874 total time=  10.3s
[CV 4/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.897 total time=  10.3s
[CV 5/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.839 total time=  10.3s
[CV 1/5] END bootstrap=True, max_depth=70, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=1400;, score=0.862 total time=   7.3s
[CV 2/5] END bootstrap=True, max_depth=70, max_features=log2, min_sampl

[CV 4/5] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.905 total time=   1.5s
[CV 5/5] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.847 total time=   1.5s
[CV 1/5] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=2000;, score=0.824 total time=   9.7s
[CV 2/5] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=2000;, score=0.820 total time=   9.7s
[CV 3/5] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=2000;, score=0.855 total time=   9.7s
[CV 4/5] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=2000;, score=0.878 total time=   9.7s
[CV 5/5] END bootstrap=True, max_depth=10, max_features=log2, min_samples_le

[CV 3/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1200;, score=0.878 total time=   6.1s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1200;, score=0.908 total time=   6.2s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1200;, score=0.843 total time=   6.1s
[CV 1/5] END bootstrap=True, max_depth=100, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=600;, score=0.858 total time=   3.1s
[CV 2/5] END bootstrap=True, max_depth=100, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=600;, score=0.839 total time=   3.1s
[CV 3/5] END bootstrap=True, max_depth=100, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=600;, score=0.874 total time=   3.1s
[CV 4/5] END bootstrap=True, max_depth=100, max_features=log2, min_

[CV 1/5] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1400;, score=0.866 total time=  11.0s
[CV 2/5] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1400;, score=0.870 total time=  10.8s
[CV 3/5] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1400;, score=0.885 total time=  10.6s
[CV 4/5] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1400;, score=0.916 total time=  10.6s
[CV 5/5] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1400;, score=0.854 total time=  10.6s
[CV 1/5] END bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.862 total time=   1.0s
[CV 2/5] END bootstrap=True, max_depth=60, max_features=sqrt, min_s

[CV 4/5] END bootstrap=True, max_depth=110, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000;, score=0.901 total time=   5.1s
[CV 5/5] END bootstrap=True, max_depth=110, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000;, score=0.828 total time=   5.0s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=1600;, score=0.847 total time=   7.9s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=1600;, score=0.835 total time=   8.0s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=1600;, score=0.870 total time=   7.9s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=1600;, score=0.897 total time=   8.0s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=log2, min_sample

[CV 3/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=800;, score=0.893 total time=   6.1s
[CV 4/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=800;, score=0.920 total time=   6.3s
[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=800;, score=0.858 total time=   6.1s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000;, score=0.862 total time=   5.2s
[CV 2/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000;, score=0.862 total time=   5.3s
[CV 3/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000;, score=0.889 total time=   5.2s
[CV 4/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_le

[CV 1/5] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=2000;, score=0.862 total time=  14.9s
[CV 2/5] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=2000;, score=0.851 total time=  14.8s
[CV 3/5] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=2000;, score=0.874 total time=  15.3s
[CV 4/5] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=2000;, score=0.908 total time=48.3min
[CV 5/5] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=2000;, score=0.847 total time=50.1min
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1800;, score=0.854 total time=31.4min
[CV 2/5] END bootstrap=True, max_depth=None, max_features=

[CV 4/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1800;, score=0.908 total time=   9.3s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1800;, score=0.847 total time=   9.3s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=1800;, score=0.847 total time=   9.1s
[CV 2/5] END bootstrap=True, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=1800;, score=0.835 total time=   9.1s
[CV 3/5] END bootstrap=True, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=1800;, score=0.866 total time=   9.0s


KeyboardInterrupt: 

Get results from grid search

In [None]:
print(f'Best hyper-parameters are: {rf_random.best_params_}\nRecall is: {round(rf_random.best_score_ * 100, 2)}%')

## Ideal model

Initalising XGBoost with ideal hyper-parameters 

In [None]:
random_forest = rf_random

Training and getting predictions

In [None]:
y_pred = random_forest.predict(X_test)

Getting results

In [None]:
acc_result = accuracy_score(y_test, y_pred)
recall_result = recall_score(y_test, y_pred)
precision_result = precision_score(y_test, y_pred)
f1_result = f1_score(y_test, y_pred)
print(f'Accuracy score is : {round(acc_result*100, 2)}%')
print(f'Recall score is : {round(recall_result*100, 2)}%')
print(f'Precision score is : {round(precision_result*100, 2)}%')
print(f'F1 score is : {round(f1_result*100, 2)}%')

In [None]:
import joblib
joblib.dump(random_forest, '../models/random_forest.joblib')