In [1]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn

from sklearn import set_config
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from feature_engine.selection import DropFeatures

from xgboost import XGBClassifier

from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import ADASYN, SMOTE

from hyperopt import hp, tpe, fmin, STATUS_OK, Trials
from hyperopt.pyll.base import scope

import logging

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

In [2]:
# helper function
def prep_train_val_test(df_train, df_val, df_test, target_col):
    X_train = df_train.drop(target_col, axis=1)
    y_train = df_train[target_col]

    X_val = df_val.drop(target_col, axis=1)
    y_val = df_val[target_col]

    X_test = df_test.drop(target_col, axis=1)
    y_test = df_test[target_col]
    
    return X_train, y_train, X_val, y_val, X_test, y_test

### Build preprocessing pipeline

Build preprocessing steps based on findings from EDA.

In [3]:
simple_imputer = SimpleImputer(add_indicator=True)
knn_imputer = KNNImputer(add_indicator=True)

numerical_pipeline = Pipeline(steps=[
    ("imputer", knn_imputer),
    ("scaler", StandardScaler())
])

numerical_transformers = [
    ("numerical", numerical_pipeline, make_column_selector(dtype_include=np.number))
]

encoder = OneHotEncoder(min_frequency=0.05, sparse=False, handle_unknown="infrequent_if_exist")
categorical_transformers = [
    ("categorical", encoder, make_column_selector(dtype_include=object)),
]

transformers = numerical_transformers + categorical_transformers
preprocessor = ColumnTransformer(transformers, remainder="passthrough", sparse_threshold=0)

### Define the objective and optimize function
The `objective` function is used to find the optimal hyperparameters. The `optimize` function will then use this
function's return value to search the space to minimize the loss.

From baseline experiments with `Decision Tree`, `Random Forest`, `LightGBM`, and `XGBoost`, `XGBoost` performs the best. Therefore, choosing `XGBoost` to further tune hyperparameters.

In [4]:
def objective(params):
    with mlflow.start_run(run_name="xgboost") as mlflow_run:
        model = imbPipeline([
            ("drop_column", DropFeatures(['Vicuna'])),
            ("preprocessor", preprocessor),
            ("resampler", None),
            ("classifier", XGBClassifier(n_jobs=4, verbosity=0))
        ])

        model.set_params(**params)

        mlflow.sklearn.autolog(
            log_input_examples=True,
            silent=True
        )

        model.fit(X_train, y_train)

        train_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_train, y_train, prefix="train_", pos_label=1)
        val_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_val, y_val, prefix="val_", pos_label=1)
        test_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_test, y_test, prefix="test_", pos_label=1)

        loss = val_metrics["val_roc_auc_score"]
        
        train_metrics = {k.replace("train_", ""): v for k, v in train_metrics.items()}
        val_metrics = {k.replace("val_", ""): v for k, v in val_metrics.items()}
        test_metrics = {k.replace("test_", ""): v for k, v in test_metrics.items()}

        return {
            "loss": loss,
            "status": STATUS_OK,
            "train_metrics": train_metrics,
            "val_metrics": val_metrics,
            "test_metrics": test_metrics,
            "model": model,
            "run": mlflow_run
        }


def optimize(trials, space, max_evals=20):
    fmin(
        objective,
        space=space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials
    )
    
    return trials.best_trial["result"]

### Configure hyperparameter search space
Hard coded best hyperparameters for quick reproducible result, but left the original hyperopt search space in comments.

In [5]:
space = {
    "preprocessor__numerical__imputer": simple_imputer, # hp.choice("imputer", [simple_imputer, knn_imputer]),
    "preprocessor__numerical__scaler": StandardScaler(), # hp.choice("scaler", [StandardScaler(), MinMaxScaler()]),
    "resampler": None, # hp.choice("oversampler", [SMOTE(), ADASYN(), None]),
    "classifier__n_estimaters": 1388, # hp.randint("n_estimators", 500, 2000),
    "classifier__max_depth": 11, # scope.int(hp.quniform("max_depth", 5, 12, 1)),
    "classifier__min_child_weight": 2, # scope.int(hp.quniform("min_child_weight", 1, 10, 1)),
    "classifier__colsample_bytree": 0.36326869776836995, # hp.uniform("colsample_bytree", 0.2, 0.4),
    "classifier__subsample": 0.5715025025179864, # hp.uniform("subsample", 0.5, 0.9),
    "classifier__learning_rate": 0.04304893649824187 # hp.loguniform('learning_rate', np.log(0.01), np.log(0.2))
}

### Run trials and output best performing model result

In [6]:
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    np.random.seed(631784649)

    try:
        df = pd.read_csv("../data/raw/train.csv")
        df_test = pd.read_csv("../data/raw/test.csv")
    except Exception as e:
        logger.exception(f"Error reading raw dataset. Error: {e}")
    

    # split original train data into training and validation sets
    df_train, df_val = train_test_split(df, test_size=0.2)

    X_train, y_train, X_val, y_val, X_test, y_test = prep_train_val_test(df_train, df_val, df_test, "target")
    
    trials = Trials()
    best_result = optimize(trials, space, max_evals=1)

100%|██████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.97s/trial, best loss: 0.968448684075078]


In [7]:
metric = "roc_auc_score"

pd.DataFrame(
    {
        "train": best_result["train_metrics"][metric],
        "validation": best_result["val_metrics"][metric],
        "test": best_result["test_metrics"][metric]
    },
    index=[metric]
)

Unnamed: 0,train,validation,test
roc_auc_score,0.998439,0.968449,0.906685


In [8]:
# comment out to properly display on GitHub, uncomment to show interactive model graph
# set_config(display="diagram")
# best_result["model"]

Best model pipeline

<img src="../docs/images/best_model.png" alt="best model" width="500"/>