## Feature Engineering & Model Testing
This notebook is used for prototyping models and experimenting with feature engineering techniques.

In [None]:
import os
import mlflow
from util.data_access import load_baseline_data
from util.preprocess import get_preprocessed_data, train_test_split_by_step
from util.tracking import (
    get_classification_metrics,
    HGBM_PARAMS,
    get_experiment_id,
)
from util import columns
from skopt import BayesSearchCV
from sklearn.experimental import enable_hist_gradient_boosting  # noreorder
from sklearn.ensemble import HistGradientBoostingClassifier
from dotenv import load_dotenv
import shap


# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Point MLflow at the tracking server; a missing variable raises KeyError here.
_tracking_uri = os.environ["MLFLOW_TRACKING_URI"]
mlflow.set_tracking_uri(_tracking_uri)

# Resolve the id of the experiment named below via the project helper.
EXPERIMENT_NAME = "Fraud Model Feature Engineering Loop"
EXPERIMENT_ID = get_experiment_id(EXPERIMENT_NAME)

In [None]:
# Fetch the baseline dataset and pass it through the shared preprocessing step.
df_raw = load_baseline_data()
df = get_preprocessed_data(df_raw)

# Split into train/validation parts with the project helper, using the "step"
# column as the split key and "fraud" as the target (train_size=0.8).
# NOTE(review): presumably an ordered split on "step" — confirm in util.preprocess.
_split_options = {"step": "step", "target": "fraud", "train_size": 0.8}
X_train, X_valid, y_train, y_valid = train_test_split_by_step(
    data=df, **_split_options
)

In [None]:
# Sanity check: every FRAUD_COMMITED_MEAN column must also be listed in
# NUMERICAL, because only the NUMERICAL columns are selected for modelling.
_missing_columns = [
    name
    for name in columns.FRAUD_COMMITED_MEAN
    if name not in columns.NUMERICAL
]
assert not _missing_columns

In [None]:
# Flip to True to run a Bayesian hyper-parameter search instead of fitting the
# baseline configuration from HGBM_PARAMS directly.
SEARCH = False

# Only the numerical feature columns are fed to the model.
train_data = X_train.loc[:, columns.NUMERICAL]
valid_data = X_valid.loc[:, columns.NUMERICAL]

model = HistGradientBoostingClassifier(**HGBM_PARAMS)

if SEARCH:
    # Search ranges given as (low, high[, prior]) tuples.
    param_dist = {
        "min_samples_leaf": (30, 80),
        "max_leaf_nodes": (30, 80),
        "learning_rate": (1e-3, 1e-0, "log-uniform"),
        "l2_regularization": (50, 1000),
        "max_bins": (40, 150),
    }
    # NOTE(review): no `scoring` is passed, so the search optimises the
    # estimator's default score — confirm that is the intended objective for
    # the fraud target.
    estimator = BayesSearchCV(
        model, search_spaces=param_dist, n_iter=50, cv=5, n_jobs=-1
    )
else:
    estimator = model

# Both paths fit on the same training data.
estimator.fit(train_data, y_train)

# `best_params_` only contains the tuned keys; layer it over the baseline so
# the logged parameters describe the full configuration of the final model.
params = {**HGBM_PARAMS, **estimator.best_params_} if SEARCH else HGBM_PARAMS

In [None]:
# Record one MLflow run: dataset summary, model parameters, and train /
# validation metrics for the estimator fitted above.
with mlflow.start_run(experiment_id=EXPERIMENT_ID) as run:
    # Dataset summary: shapes and the mean of the target in each split.
    mlflow.log_param("Train Data Dimension", train_data.shape)
    mlflow.log_param("Train Target Bad Rate", y_train.mean())
    mlflow.log_param("Valid Data Dimension", valid_data.shape)
    # This value is a rate, not a dimension; label it like its train counterpart.
    mlflow.log_param("Valid Target Bad Rate", y_valid.mean())

    mlflow.log_param("Model Type", model.__class__.__name__)
    mlflow.log_params(params)

    # Training-set metrics.
    y_pred_train = estimator.predict(train_data)
    # Use class probabilities here rather than hard labels. predict_proba
    # returns one column per class; column 1 is the second class.
    # NOTE(review): assumes get_classification_metrics expects 1-D scores for
    # the positive ("fraud") class — confirm against util.tracking.
    y_pred_proba_train = estimator.predict_proba(train_data)[:, 1]
    train_metrics = get_classification_metrics(
        y_train, y_pred_train, y_pred_proba_train
    )
    mlflow.log_metrics(
        {f"Train {key}": val for key, val in train_metrics.items()}
    )

    # Validation-set metrics, kept in their own variable so the training
    # metrics are not overwritten.
    y_pred_valid = estimator.predict(valid_data)
    y_pred_proba_valid = estimator.predict_proba(valid_data)[:, 1]
    valid_metrics = get_classification_metrics(
        y_valid, y_pred_valid, y_pred_proba_valid
    )
    mlflow.log_metrics(
        {f"Validation {key}": val for key, val in valid_metrics.items()}
    )

In [None]:

# TreeExplainer needs the fitted tree model itself. When a hyper-parameter
# search was run, `estimator` is the search wrapper, so unwrap it to the refit
# best estimator; otherwise `estimator` is already the model.
_fitted_model = getattr(estimator, "best_estimator_", estimator)
explainer = shap.TreeExplainer(_fitted_model)

# Explain a random 10% sample of the training rows (presumably to keep the
# SHAP computation fast; the sample is not seeded, so it differs between runs).
explain_data = train_data.sample(frac=0.1)
shap_values = explainer.shap_values(explain_data)

In [None]:
# Summary plot of the SHAP values computed for the sampled training rows.
shap.summary_plot(shap_values, features=explain_data)