In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, average_precision_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [11]:
# Load the raw table from disk and separate the 'Class' label column from the
# remaining feature columns.
data = pd.read_csv('dataset/creditcard.csv')
x = data.drop('Class', axis=1)
y = data['Class']

# Seed both splits: without random_state every Restart & Run All draws a
# different partition, so the metrics printed further down cannot be reproduced.
SPLIT_SEED = 42

# First hold out 20% of all rows as the test set ...
x_train_val, x_test, y_train_val, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED
)
# ... then take 20% of the remainder (16% of all rows) as the validation set.
# stratify= keeps the label proportions the same in every partition.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val,
    y_train_val,
    test_size=0.2,
    stratify=y_train_val,
    random_state=SPLIT_SEED,
)

In [12]:
# Hyperparameter candidates: 3 depths x 2 learning rates x 4 tree counts
# = 24 combinations, each cross-validated below.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01],
    n_estimators=[100, 200, 500, 800],
)

# Exhaustive search scored by average precision with 3-fold cross-validation
# on the training partition only (validation/test rows are never seen here).
# NOTE(review): device='cuda:1' selects a specific GPU ordinal; confirm it
# exists on the machine running this notebook.
grid = GridSearchCV(
    estimator=XGBClassifier(
        tree_method='hist',
        device='cuda:1',
        verbosity=0,
    ),
    param_grid=param_grid,
    scoring='average_precision',
    cv=3,
)
grid.fit(x_train, y_train)

print(f"best parameters: {grid.best_params_}")

best parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 800}


In [18]:
# Negative-to-positive count ratio in the training labels, used to up-weight
# the positive class (label 1) during boosting.
n_neg = int((y_train == 0).sum())
n_pos = int((y_train == 1).sum())
ratio = n_neg / n_pos

# NOTE(review): these hyperparameters are set by hand and differ from the grid
# search's printed best parameters (learning_rate=0.01, n_estimators=800);
# confirm that is intentional.
model = XGBClassifier(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=500,        # upper bound; early stopping may end training sooner
    scale_pos_weight=ratio,
    eval_metric='aucpr',     # metric reported on eval_set and watched by early stopping
    early_stopping_rounds=50,  # stop once the metric has not improved for 50 rounds
)

# The validation partition is the only eval_set entry, so it drives early
# stopping; verbose=True logs the metric after every boosting round.
model.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

[0]	validation_0-aucpr:0.45293
[1]	validation_0-aucpr:0.64651
[2]	validation_0-aucpr:0.64187
[3]	validation_0-aucpr:0.63923
[4]	validation_0-aucpr:0.64703
[5]	validation_0-aucpr:0.64950
[6]	validation_0-aucpr:0.66289
[7]	validation_0-aucpr:0.66210
[8]	validation_0-aucpr:0.66401
[9]	validation_0-aucpr:0.70131
[10]	validation_0-aucpr:0.70001
[11]	validation_0-aucpr:0.70152
[12]	validation_0-aucpr:0.70250
[13]	validation_0-aucpr:0.70232
[14]	validation_0-aucpr:0.72307
[15]	validation_0-aucpr:0.72463
[16]	validation_0-aucpr:0.72497
[17]	validation_0-aucpr:0.72579
[18]	validation_0-aucpr:0.72713
[19]	validation_0-aucpr:0.73676
[20]	validation_0-aucpr:0.73712
[21]	validation_0-aucpr:0.73543
[22]	validation_0-aucpr:0.73396
[23]	validation_0-aucpr:0.73208
[24]	validation_0-aucpr:0.73326
[25]	validation_0-aucpr:0.73302
[26]	validation_0-aucpr:0.73341
[27]	validation_0-aucpr:0.73307
[28]	validation_0-aucpr:0.73219
[29]	validation_0-aucpr:0.73248
[30]	validation_0-aucpr:0.73196
[31]	validation_0-

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'binary:logistic'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",50
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [22]:

# Score the held-out test partition: the second predict_proba column (class 1)
# feeds the threshold-free average-precision score, while the hard labels from
# predict() feed the per-class report.
y_probs = model.predict_proba(x_test)[:, 1]
y_pred = model.predict(x_test)

print(f"auprc: {average_precision_score(y_test, y_probs)}\n")

# classification_report() returns a multi-line table whose header row is
# space-padded to sit above the value columns. Print the label on its own line
# so the prefix no longer pushes the header out of alignment.
print("classification report:")
print(classification_report(y_test, y_pred))

auprc: 0.8784853524874567

classification report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.85      0.80      0.82        98

    accuracy                           1.00     56962
   macro avg       0.92      0.90      0.91     56962
weighted avg       1.00      1.00      1.00     56962



In [20]:
# Write the underlying Booster of the fitted classifier to fraud_model.json
# in the current working directory.
booster = model.get_booster()
booster.save_model("fraud_model.json")