In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve, f1_score, confusion_matrix, average_precision_score
from fraud_detection.data.loader import DataHandler
from fraud_detection.core.settings import settings

In [2]:
# Test_original Data
test_original = DataHandler.from_registry(
    "DATA", "processed_dir", "test_original.parquet").load()

# Train_original Data
train_original = DataHandler.from_registry(
    "DATA", "processed_dir", "train_original.parquet").load()

# Train_resampled Data
train_resampled = DataHandler.from_registry(
    "DATA", "processed_dir", "train_resampled.parquet").load()

In [3]:
print("Train (original):", train_original.shape)
print("Train (resampled):", train_resampled.shape)
print("Test (original):", test_original.shape)

print("\nClass distribution (original train):")
print(train_original["class"].value_counts(normalize=True))

print("\nClass distribution (resampled train):")
print(train_resampled["class"].value_counts(normalize=True))

Train (original): (120889, 200)
Train (resampled): (219136, 200)
Test (original): (30223, 200)

Class distribution (original train):
class
0    0.906352
1    0.093648
Name: proportion, dtype: float64

Class distribution (resampled train):
class
0    0.5
1    0.5
Name: proportion, dtype: float64


In [4]:
FEATURES = settings.get("features")

TARGET = FEATURES["target"]


X_train_orig = train_original.drop(columns=[TARGET])
y_train_orig = train_original[TARGET]

X_train_res = train_resampled.drop(columns=[TARGET])
y_train_res = train_resampled[TARGET]

X_test = test_original.drop(columns=[TARGET])
y_test = test_original[TARGET]

In [8]:
from sklearn.linear_model import LogisticRegression

baseline_model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",  # extra safety even after SMOTE
    n_jobs=-1,
    random_state=42
)

baseline_model.fit(X_train_res, y_train_res)



0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",42
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [10]:
y_pred_lr = baseline_model.predict(X_test)
y_proba_lr = baseline_model.predict_proba(X_test)[:, 1]

print("Logistic Regression Evaluation")
print("F1-score:", f1_score(y_test, y_pred_lr))
print("AUC-PR:", average_precision_score(y_test, y_proba_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))


Logistic Regression Evaluation
F1-score: 0.15924413168643275
AUC-PR: 0.09468831053931057
Confusion Matrix:
 [[11520 15873]
 [ 1212  1618]]


In [11]:
# Predict fraud probabilities
y_proba_lr = baseline_model.predict_proba(X_test)[:, 1]

# Precision–Recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_lr)

# Remove last point (no threshold associated)
precision = precision[:-1]
recall = recall[:-1]
thresholds = thresholds

# Compute F1-score for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)

# Select best threshold
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

# Apply optimized threshold
y_pred_lr_opt = (y_proba_lr >= best_threshold).astype(int)

# Evaluation
print("Logistic Regression Evaluation (Optimized Threshold)")
print(f"Best threshold: {best_threshold:.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_lr_opt):.4f}")
print(f"AUC-PR: {average_precision_score(y_test, y_proba_lr):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr_opt))

Logistic Regression Evaluation (Optimized Threshold)
Best threshold: 0.3904
F1-score: 0.1720
AUC-PR: 0.0947
Confusion Matrix:
 [[  792 26601]
 [   61  2769]]


In [12]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=400,              # more trees → stabler probabilities
    max_depth=18,                  # allow slightly more expressiveness
    min_samples_leaf=10,           # smooth leaf probabilities
    min_samples_split=20,          # reduce overfitting after SMOTE
    max_features="sqrt",           # standard + strong default
    bootstrap=True,
    class_weight=None,             # IMPORTANT: no weighting after SMOTE
    n_jobs=-1,
    random_state=42
)

rf_model.fit(X_train_res, y_train_res)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",400
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",18
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",20
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",10
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [13]:
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

print("Random Forest Evaluation")
print("F1-score:", f1_score(y_test, y_pred_rf))
print("AUC-PR:", average_precision_score(y_test, y_proba_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Random Forest Evaluation
F1-score: 0.13416308227326806
AUC-PR: 0.09165506769755731
Confusion Matrix:
 [[20029  7364]
 [ 2097   733]]


In [15]:
from sklearn.model_selection import StratifiedKFold, cross_validate

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    "f1": "f1",
    "auc_pr": "average_precision",
}

In [25]:
import xgboost as xgb
from sklearn.metrics import f1_score, average_precision_score, confusion_matrix, precision_recall_curve

# -----------------------------
# 1. Initialize XGBoost classifier
# -----------------------------
xgb_model = xgb.XGBClassifier(
    n_estimators=500,       # More trees for better learning
    max_depth=6,            # Typical starting point
    learning_rate=0.1,      # Step size
    subsample=0.8,          # Random sampling for robustness
    colsample_bytree=0.8,   # Feature sampling
    scale_pos_weight=(y_train_res == 0).sum() / (y_train_res ==
                                                 1).sum(),  # Balance classes after SMOTE
    n_jobs=-1,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
)

# -----------------------------
# 2. Train on resampled data
# -----------------------------
xgb_model.fit(X_train_res, y_train_res)

# -----------------------------
# 3. Predict probabilities on test set
# -----------------------------
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# -----------------------------
# 4. Optimize threshold based on F1
# -----------------------------
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_xgb)
f1_scores = 2 * (precision[:-1] * recall[:-1]) / \
    (precision[:-1] + recall[:-1] + 1e-9)

best_idx = f1_scores.argmax()
best_threshold = thresholds[best_idx]

y_pred_xgb_opt = (y_proba_xgb >= best_threshold).astype(int)

# -----------------------------
# 5. Evaluation
# -----------------------------
print("XGBoost Evaluation")
print(f"Best threshold: {best_threshold:.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_xgb_opt):.4f}")
print(f"AUC-PR: {average_precision_score(y_test, y_proba_xgb):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb_opt))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Evaluation
Best threshold: 0.0162
F1-score: 0.1712
AUC-PR: 0.0948
Confusion Matrix:
 [[    0 27393]
 [    0  2830]]


In [26]:
pos_weight = (y_train_orig == 0).sum() / (y_train_orig == 1).sum()

xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=pos_weight,  # balance original imbalance
    n_jobs=-1,
    random_state=42,
    eval_metric='logloss'
)

xgb_model.fit(X_train_orig, y_train_orig)

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'binary:logistic'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.8
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [27]:
# -----------------------------
# 3. Predict probabilities on test set
# -----------------------------
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# -----------------------------
# 4. Optimize threshold based on F1
# -----------------------------
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_xgb)
f1_scores = 2 * (precision[:-1] * recall[:-1]) / \
    (precision[:-1] + recall[:-1] + 1e-9)

best_idx = f1_scores.argmax()
best_threshold = thresholds[best_idx]

y_pred_xgb_opt = (y_proba_xgb >= best_threshold).astype(int)

# -----------------------------
# 5. Evaluation
# -----------------------------
print("XGBoost Evaluation")
print(f"Best threshold: {best_threshold:.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_xgb_opt):.4f}")
print(f"AUC-PR: {average_precision_score(y_test, y_proba_xgb):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb_opt))

XGBoost Evaluation
Best threshold: 0.0887
F1-score: 0.1714
AUC-PR: 0.0963
Confusion Matrix:
 [[   66 27327]
 [    4  2826]]
