In [289]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn

from sklearn import set_config
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score

from feature_engine.selection import DropFeatures

from xgboost import XGBClassifier

from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import ADASYN, SMOTE

from hyperopt import hp, tpe, fmin, STATUS_OK, Trials

import logging

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

In [290]:
# helper function
def prep_train_val_test(df_train, df_val, df_test, target_col):
    X_train = df_train.drop(target_col, axis=1)
    y_train = df_train[target_col]

    X_val = df_val.drop(target_col, axis=1)
    y_val = df_val[target_col]

    X_test = df_test.drop(target_col, axis=1)
    y_test = df_test[target_col]
    
    return X_train, y_train, X_val, y_val, X_test, y_test

### Build preprocessing pipeline

Build preprocessing steps based on findings from EDA.

In [299]:
simple_imputer = SimpleImputer(add_indicator=True)
knn_imputer = KNNImputer(add_indicator=True)

numerical_pipeline = Pipeline(steps=[
    ("imputer", knn_imputer),
    ("scaler", StandardScaler())
])

numerical_transformers = [
    ("numerical", numerical_pipeline, make_column_selector(dtype_include=np.number))
]

encoder = OneHotEncoder(min_frequency=0.05, sparse=False, handle_unknown="infrequent_if_exist")
categorical_transformers = [
    ("categorical", encoder, make_column_selector(dtype_include=object)),
]

transformers = numerical_transformers + categorical_transformers
preprocessor = ColumnTransformer(transformers, remainder="passthrough", sparse_threshold=0)

### Define the objective and optimize function
The `objective` function is used to find the optimal hyperparameters. The `optimize` function will then use this
function's return value to search the space to minimize the loss.

From baseline experiments with `Decision Tree`, `Random Forest`, `LightGBM`, and `XGBoost`, `XGBoost` performs the best. Therefore, choosing `XGBoost` to further tune hyperparameters.

In [292]:
def objective(params):
    with mlflow.start_run(run_name="xgboost") as mlflow_run:
        model = imbPipeline([
            ("drop_column", DropFeatures(['Vicuna'])),
            ("preprocessor", preprocessor),
            ("resampler", None),
            ("classifier", XGBClassifier(n_jobs=4, verbosity=0))
        ])

        model.set_params(**params)

        mlflow.sklearn.autolog(
            log_input_examples=True,
            silent=True
        )

        model.fit(X_train, y_train)

        train_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_train, y_train, prefix="train_", pos_label=1)
        val_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_val, y_val, prefix="val_", pos_label=1)
        test_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_test, y_test, prefix="test_", pos_label=1)

        loss = val_metrics["val_roc_auc_score"]

        val_metrics = {k.replace("val_", ""): v for k, v in val_metrics.items()}
        test_metrics = {k.replace("test_", ""): v for k, v in test_metrics.items()}

        return {
          "loss": loss,
          "status": STATUS_OK,
          "val_metrics": val_metrics,
          "test_metrics": test_metrics,
          "model": model,
          "run": mlflow_run
        }


def optimize(trials, space, max_evals=20):
    fmin(
        objective,
        space=space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials
    )
    
    return trials.best_trial["result"]

### Configure hyperparameter search space

In [293]:
space = {
    "preprocessor__numerical__imputer": hp.choice("imputer", [simple_imputer, knn_imputer]),
    "preprocessor__numerical__scaler": hp.choice("scaler", [StandardScaler(), MinMaxScaler()]),
    "preprocessor__categorical__min_frequency": hp.choice("min_freq", [0.1, 0.05, None]),
    "resampler": hp.choice("oversampler", [SMOTE(), ADASYN(), None]),
    "classifier__n_estimaters": hp.quniform(536),
    "classifier__max_depth": hp.quniform(3, 10),
    "classifier__min_child_weight": 10,
    "classifier__eta": hp.uniform(0, 1),
    "classifier__colsample_bytree": 0.21620329526101392,
    "classifier__learning_rate": 0.050065586051749114,
    "classifier__subsample": 0.595439112017179
}

### Run trials and output best performing model result

In [294]:
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    np.random.seed(42)

    try:
        df = pd.read_csv("../data/raw/train.csv")
        df_test = pd.read_csv("../data/raw/test.csv")
    except Exception as e:
        logger.exception(f"Error reading raw dataset. Error: {e}")
    

    # split original train data into training and validation sets
    df_train, df_val = train_test_split(df, test_size=0.2)

    X_train, y_train, X_val, y_val, X_test, y_test = prep_train_val_test(df_train, df_val, df_test, "target")
    
    trials = Trials()
    best_result = optimize(trials, space, max_evals=100)

100%|█████████████████████████████████████████████████| 100/100 [11:38<00:00,  6.98s/trial, best loss: 0.9536970662418692]


In [295]:
pd.DataFrame(
    [best_result["val_metrics"], best_result["test_metrics"]],
    index=["validation", "test"]
)

Unnamed: 0,precision_score,recall_score,f1_score,accuracy_score,log_loss,roc_auc_score,score
validation,0.806897,0.629032,0.706949,0.930764,0.220476,0.953697,0.930764
test,0.93007,0.272541,0.421553,0.645975,0.620904,0.874041,0.645975


In [298]:
set_config(display="diagram")
best_result["model"]