# XGBoost

## Importing packages

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

pd.set_option('display.max_columns', None)

## Importing data

In [2]:
data = pd.read_csv('../data/5_train_dataset.csv')

## XGBoost

Getting X & y

In [3]:
X = data.drop('is_fraud', axis=1)
y = data['is_fraud']

Initialise base XGBoost instance

In [4]:
xgboost = XGBClassifier(objective='binary:logistic')

Initialise parameter grid for grid search

In [5]:
parameter_grid = {
    "learning_rate": [0.01, 0.05, 0.1, 0.5, 0.9],
    "n_estimators": [100, 500, 1000],
    "max_depth": [1, 5, 9],
    "subsample": [0.3, 0.5, 0.9],
    "colsample_bytree": [0.3, 0.5, 0.9]
                  }

Run grid search on XGBoost

In [6]:
gs = GridSearchCV(estimator= xgboost,
                  param_grid= parameter_grid,
                  scoring= "recall",
                  verbose= 4)

In [7]:
gs.fit(X, y)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits


[CV 1/5] END colsample_bytree=0.3, learning_rate=0.01, max_depth=1, n_estimators=100, subsample=0.3;, score=0.249 total time=   0.2s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.01, max_depth=1, n_estimators=100, subsample=0.3;, score=0.287 total time=   0.1s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.01, max_depth=1, n_estimators=100, subsample=0.3;, score=0.282 total time=   0.0s
[CV 4/5] END colsample_bytree=0.3, learning_rate=0.01, max_depth=1, n_estimators=100, subsample=0.3;, score=0.321 total time=   0.0s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.01, max_depth=1, n_estimators=100, subsample=0.3;, score=0.268 total time=   0.0s
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.01, max_depth=1, n_estimators=100, subsample=0.5;, score=0.303 total time=   0.0s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.01, max_depth=1, n_estimators=100, subsample=0.5;, score=0.287 total time=   0.0s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.01, max_depth=1, n

Get results from grid search

In [8]:
print(f'Best hyper-parameters are: {gs.best_params_}\nRecall is: {round(gs.best_score_ * 100, 2)}%')

Best hyper-parameters are: {'colsample_bytree': 0.5, 'learning_rate': 0.9, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.9}
Recall is: 90.82%


## Ideal model

Initalising XGBoost with ideal hyper-parameters 

In [9]:
xgboost_ideal = XGBClassifier(objective='binary:logistic', colsample_bytree=0.5, learning_rate=0.9, max_depth=5, n_estimators=100, subsample=0.9)

Training and getting predictions

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
model = xgboost_ideal.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

Getting results

In [11]:
acc_result = accuracy_score(y_test, y_test_pred)
recall_result = recall_score(y_test, y_test_pred)
precision_result = precision_score(y_test, y_test_pred)
f1_result = f1_score(y_test, y_test_pred)
print(f'Accuracy score is : {round(acc_result*100, 2)}%')
print(f'Recall score is : {round(recall_result*100, 2)}%')
print(f'Precision score is : {round(precision_result*100, 2)}%')
print(f'F1 score is : {round(f1_result*100, 2)}%')

Accuracy score is : 97.28%
Recall score is : 91.44%
Precision score is : 95.53%
F1 score is : 93.44%
