In [None]:
import numpy as np
import pandas as pd
import ray
from ray import tune
from ray.tune.schedulers import HyperBandForBOHB
from ray.tune.suggest.bohb import TuneBOHB
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tune_sklearn import TuneSearchCV
from xgboost import XGBClassifier

# URL of the safe_driver.csv dataset that the loading cell reads.
# The two adjacent literals are concatenated into a single string.
FILE_URL = (
    "https://ray-ci-higgs.s3.us-west-2.amazonaws.com/"
    "safe_driver.csv"
)

In [None]:
# Read the CSV behind FILE_URL, pinning compact dtypes for the two columns
# whose types are known up front.
train_df = pd.read_csv(FILE_URL, dtype={'id': np.int32, 'target': np.int8})

# 'target' supplies the labels; it is dropped together with 'id' so that X
# holds only the remaining columns.
y = train_df['target'].values
X = train_df.drop(['target', 'id'], axis=1)

# Stratified 80/20 hold-out split with a fixed seed so reruns give the same
# partition. train_test_split is imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=7)

In [None]:
# Baseline classifier: binary logistic objective, AUC as the evaluation
# metric and a single thread; no other hyperparameter is set here.
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    nthread=1,
)

In [None]:
def train_model(model, X_train, y_train, X_test, y_test, n_jobs=8):
    """Cross-validate ``model`` on the training split and report its Gini score.

    The estimator is wrapped in a ``RandomizedSearchCV`` with an empty search
    space and ``n_iter=1``, so no hyperparameters are varied: the search only
    serves to run 3-fold stratified cross-validation (scored by ROC AUC) on
    the estimator as configured. The fitted search object is then used to
    score the held-out split.

    Args:
        model: Estimator to evaluate. ``predict_proba`` is called on the
            fitted search object, so the estimator has to support it.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.
        n_jobs: Number of parallel workers handed to ``RandomizedSearchCV``.
            Defaults to 8, the value that was previously hard-coded.

    Returns:
        The Gini score on the held-out split, computed as ``2 * AUC - 1``.
        The same value is also printed.
    """
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)
    run_cv = RandomizedSearchCV(
        model,
        param_distributions={},
        n_iter=1,
        scoring='roc_auc',
        n_jobs=n_jobs,
        # Pass the splitter itself rather than a one-shot generator from
        # skf.split(...); the seed is fixed, so the folds are reproducible.
        cv=skf,
        verbose=3,
        random_state=1001,
    )
    run_cv.fit(X_train, y_train)
    print("====run_cv_result====")
    print(run_cv.cv_results_)

    # Column 1 of the probability matrix is used as the positive-class score.
    y_pred = run_cv.predict_proba(X_test)
    roc_auc = roc_auc_score(y_test, y_pred[:, 1])
    gini_score = 2 * roc_auc - 1
    print("====gini score: {}====".format(gini_score))
    return gini_score

In [None]:
# Baseline run: evaluate the default classifier with no hyperparameter
# optimisation, simulating training on a laptop.
train_model(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Search space used to simulate hyperparameter optimisation on a cluster.
# Each call takes its bounds as (lower, upper).
new_params = {
    # "learning_rate": tune.choice([0.02, 0.1, 0.5]),
    "max_depth": tune.randint(1, 10),
    "min_child_weight": tune.loguniform(0.001, 128),
    "subsample": tune.uniform(0.1, 1.0),
    "colsample_bylevel": tune.uniform(0.01, 1.0),
    "colsample_bytree": tune.uniform(0.01, 1.0),
    "reg_alpha": tune.loguniform(1 / 1024, 10.0),
    "reg_lambda": tune.loguniform(1 / 1024, 10.0),
    # 26 was calculated as #negative_y / #positive_y.
    "scale_pos_weight": tune.choice([1, 26]),
}

In [None]:
# For comparison, here is what a search looks like with GridSearchCV from
# scikit-learn's API. Grid search becomes very inefficient in a
# high-dimensional search space, so only a few parameters can be covered,
# and each of them has to be discrete.

# run_cv = GridSearchCV(
#     model,
#     {
#         "learning_rate": [0.5, 1],
#         "n_estimators": [50, 100],
#     },
#     cv=3,
#     scoring='roc_auc',
#     n_jobs=-1,
#     verbose=2
# )

# TuneSearchCV's API closely resembles GridSearchCV's, which makes it easy
# to adopt.
hpo = TuneSearchCV(
    model,
    new_params,
    # Evaluation protocol.
    scoring="roc_auc",
    cv=3,
    # Search strategy and its scheduler.
    search_optimization="bohb",
    early_stopping="HyperBandForBOHB",
    # Budget for the search.
    n_trials=100,
    max_iters=10,
    max_concurrent=40,
    verbose=2,
)

In [None]:
# Connect to the Ray cluster at address="auto".
# ignore_reinit_error=True keeps this cell re-runnable: without it, calling
# ray.init() a second time in the same kernel raises instead of carrying on
# with the existing connection.
ray.init(address="auto", ignore_reinit_error=True)

In [None]:
# Run the hyperparameter search on the training split.
hpo.fit(X=X_train, y=y_train)

In [None]:
# Take the best estimator found by the search and evaluate it with the same
# routine used for the default classifier.
best_model = hpo.best_estimator_
train_model(
    model=best_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)