## Feature Engineering & Model Testing

In [1]:
import pandas as pd
import numpy as np
from util.data_access import load_data
from util.preprocess import get_preprocessed_data, get_column_indices, train_test_split_by_step, SEED
from sklearn.model_selection import RandomizedSearchCV
from util.tracking import get_classification_metrics
from util import columns
from skopt import BayesSearchCV
import lightgbm as lgb
from dotenv import load_dotenv
load_dotenv()
import datetime

import os
import mlflow
mlflow.set_tracking_uri('http://35.246.127.179')


EXPERIMENT_NAME = 'Fraud Model Feature Engineering Loop'

try:
    EXPERIMENT_ID = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
except Exception as e:
    print(e)
    EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)

In [2]:
df_raw = load_data()

df = get_preprocessed_data(df_raw)

X_train, X_valid, y_train, y_valid = train_test_split_by_step(
    data=df,
    step='step',
    target='fraud',
    train_size=0.8
)


In [3]:
assert all(i in columns.NUMERICAL for i in columns.FRAUD_COMMITED_MEAN)

In [4]:
SEARCH = True
train_data = X_train.loc[:,columns.MODEL]
valid_data = X_valid.loc[:,columns.MODEL]
_cat_columns_idx = get_column_indices(train_data,columns.CATEGORICAL)

params = {
    'max_depth': -1,
    'n_estimators' : 200,
    'num_leaves': 45,
    'scale_pos_weight': 1
}
model = lgb.LGBMClassifier(
    **params,
    random_state=SEED,
    n_jobs=-1
)

if SEARCH:
    param_dist = {
        'max_depth': (3, 100),
        'n_estimators': (100, 300),
        'num_leaves': (40, 60),
        'learning_rate': (1e-4, 1e-1, 'log-uniform'),
    }
    estimator = BayesSearchCV(
        model,
        search_spaces=param_dist,
        n_iter=50,
        cv=5,
        n_jobs=-1 
        )
    estimator.fit(train_data, y_train, categorical_feature=_cat_columns_idx)
    params = estimator.best_params_
else:
    estimator = model
    estimator.fit(train_data,y_train,categorical_feature=_cat_columns_idx)



In [5]:
with mlflow.start_run(experiment_id='3') as run:
    mlflow.log_param('Train Data Dimension', train_data.shape)     
    mlflow.log_param('Train Target Bad Rate', y_train.mean())     
    mlflow.log_param('Valid Data Dimension', valid_data.shape)     
    mlflow.log_param('Valid Target Dimension', y_valid.mean())     

    mlflow.log_param('Model Type', model.__class__.__name__)
    y_pred_train = estimator.predict(train_data)
    y_pred_proba_train = estimator.predict(train_data)
    train_metrics = get_classification_metrics(y_train,y_pred_train,y_pred_proba_train)

    for key, val in params.items():
        mlflow.log_param(key,val)

    for key, val in train_metrics.items():
        mlflow.log_metric(f'Train {key}', val)

    y_pred_valid = estimator.predict(valid_data)
    y_pred_proba_valid = estimator.predict(valid_data)
    train_metrics = get_classification_metrics(y_valid,y_pred_valid,y_pred_proba_valid)

    for key, val in train_metrics.items():
        mlflow.log_metric(f'Validation {key}', val)

In [6]:
import shap
explainer = shap.TreeExplainer(estimator)
explain_data = train_data.sample(frac=0.1)
shap_values = explainer.shap_values(explain_data)


Exception: Model type not yet supported by TreeExplainer: <class 'skopt.searchcv.BayesSearchCV'>

In [None]:
shap.summary_plot(shap_values, explain_data)