In [None]:
from io import BytesIO

import boto3
import joblib
import numpy as np
import pandas as pd
import ray
from ray import tune
from ray.tune.suggest.bohb import TuneBOHB
from ray.tune.schedulers import HyperBandForBOHB
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tune_sklearn import TuneSearchCV
from xgboost import XGBClassifier

In [None]:
def load_from_s3(path="s3://ray-ci-higgs/hpo_best_estimator.csv"):
    """Download an S3 object into memory and deserialize it with joblib.

    Args:
        path: Object location written as ``s3://<bucket>/<key>``.

    Returns:
        Whatever ``joblib.load`` reconstructs from the downloaded bytes.

    Raises:
        AssertionError: If ``path`` does not begin with ``s3://``.
    """
    scheme = "s3://"
    assert path[:len(scheme)] == scheme
    # The text up to the first "/" after the scheme is the bucket; whatever
    # follows (possibly nothing) is the object key.
    bucket_name, _, object_key = path[len(scheme):].partition("/")
    with BytesIO() as buffer:
        boto3.client("s3").download_fileobj(
            Bucket=bucket_name, Key=object_key, Fileobj=buffer
        )
        # Rewind so joblib reads the payload from its first byte.
        buffer.seek(0)
        # NOTE(review): joblib.load unpickles its input, so this should only
        # be pointed at trusted buckets. The default key ends in ".csv" even
        # though it is read as a joblib artifact -- confirm the file format.
        return joblib.load(buffer)

In [None]:
# Public HTTPS locations of the artifacts this notebook refers to.
HPO_CV_RESULTS_URL = (
    "https://ray-ci-higgs.s3.us-west-2.amazonaws.com/"
    "hpo_cv_results_table.csv"
)

DATA_URL = (
    "https://ray-ci-higgs.s3.us-west-2.amazonaws.com/"
    "safe_driver.csv"
)

# Read the dataset over HTTPS, pinning compact dtypes for the id and label columns.
train_df = pd.read_csv(DATA_URL, dtype={'id': np.int32, 'target': np.int8})

# `target` is the label; `id` is dropped together with it so that it is not
# fed to the model as a feature.
y = train_df['target'].values
X = train_df.drop(['target', 'id'], axis=1)

# Hold out 20% of the rows for final evaluation. The split is stratified on
# the label and seeded so that it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=7
)

In [None]:
# Baseline classifier: every hyperparameter not listed here is left at the
# library default. It also serves as the estimator handed to the HPO search.
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    nthread=1,
)

In [None]:
def print_roc_auc_score(model, X_test, y_test):
    """Print the ROC AUC that ``model`` achieves on the given data.

    Args:
        model: Estimator exposing ``predict_proba``.
        X_test: Features to score.
        y_test: True labels corresponding to ``X_test``.
    """
    # Only the second probability column (index 1) is scored -- presumably
    # the positive class of the binary problem.
    class_one_scores = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, class_one_scores)
    print("====roc_auc score: {}====".format(auc))

In [None]:
def train_model(model, X_train, y_train, X_test, y_test, n_splits=3, n_jobs=8):
    """Cross-validate ``model`` with its current parameters and report ROC AUC.

    The search space is empty, so ``RandomizedSearchCV`` is used purely as a
    cross-validation and refit harness: its single iteration evaluates
    ``model`` exactly as configured by the caller.

    Args:
        model: Estimator to evaluate; passed to ``RandomizedSearchCV``.
        X_train: Training features.
        y_train: Training labels, also used to stratify the folds.
        X_test: Held-out features scored after the refit.
        y_test: Held-out labels.
        n_splits: Number of stratified folds (default 3).
        n_jobs: Number of parallel workers for cross-validation (default 8).

    Returns:
        None. The CV results table and the held-out ROC AUC are printed.
    """
    # Seeded and shuffled, so the folds are identical on every call.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1234)
    run_cv = RandomizedSearchCV(
        model,
        param_distributions={},  # nothing to sample: evaluate `model` as given
        n_iter=1,
        scoring='roc_auc',
        n_jobs=n_jobs,
        # Pass the splitter itself rather than a one-shot generator of splits;
        # the fixed random_state yields the same folds either way.
        cv=skf,
        verbose=3,
        random_state=1001,
    )
    run_cv.fit(X_train, y_train)
    print("====run_cv_result====")
    print(run_cv.cv_results_)
    # best_estimator_ is the estimator the search refit after cross-validation.
    print_roc_auc_score(run_cv.best_estimator_, X_test, y_test)

In [None]:
# Baseline run: cross-validate and score the default classifier with no
# hyperparameter optimisation, simulating what one would do on a laptop.
train_model(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Search space for the HPO run, which is meant to simulate tuning on a cluster.
new_params = {
    # NOTE(review): Ray Tune documents randint's upper bound as exclusive, so
    # depth 5 would never be sampled -- confirm that is intended.
    "max_depth": tune.randint(1, 5),
    "min_child_weight": tune.loguniform(0.001, 128),
    # Sampling fractions.
    "subsample": tune.uniform(0.1, 1.0),
    "colsample_bylevel": tune.uniform(0.01, 1.0),
    "colsample_bytree": tune.uniform(0.01, 1.0),
    # Regularisation strengths, sampled on a log scale.
    "reg_alpha": tune.loguniform(1 / 1024, 10.0),
    "reg_lambda": tune.loguniform(1 / 1024, 10.0),
    # 26 was calculated as #negative_y / #positive_y.
    "scale_pos_weight": tune.choice([1, 26]),
}

In [None]:
# Example of GridSearchCV as offered by scikit-learn's API, kept for comparison.
# GridSearchCV becomes very inefficient in a high-dimensional search space,
# so only a few parameters can be covered, and each must take discrete values.

# grid_search = GridSearchCV(
#     model, 
#     {
#       "max_depth": [1, 5],
#       "subsample": [0.5, 1],
#     },
#     cv=3,
#     scoring='roc_auc', 
#     n_jobs=-1, 
#     verbose=2
# )

# TuneSearchCV exposes an API very similar to GridSearchCV's, which makes it easy to adopt.
hpo = TuneSearchCV(
    model,
    param_distributions=new_params,
    # Search strategy: BOHB suggestions paired with its HyperBand scheduler.
    search_optimization="bohb",
    early_stopping="HyperBandForBOHB",
    # Budget for the search.
    n_trials=100,
    max_iters=10,
    max_concurrent=40,
    # Evaluation protocol.
    cv=3,
    scoring="roc_auc",
    verbose=2,
)

# hpo = TuneSearchCV(
#     model,
#     param_distributions=new_params,
#     cv=3,
#     n_trials=100,
#     scoring="roc_auc",
#     early_stopping=True,  # defaults to ASHA
#     max_iters=10,
#     max_concurrent=40,
#     search_optimization="optuna",
#     verbose=2,
# )

In [None]:
# Attach to the already-running Ray cluster. ignore_reinit_error lets this
# cell be re-executed in a live kernel without raising because Ray was
# already initialised by an earlier run of the same cell.
ray.init(address="auto", ignore_reinit_error=True)

In [None]:
# Run the hyperparameter search on the training split only; the held-out
# test split is not passed to the search.
hpo.fit(X=X_train, y=y_train)

In [None]:
# Take the estimator selected by the search and push it through the same
# cross-validation and held-out scoring routine used for the baseline model.
best_model = hpo.best_estimator_
train_model(
    model=best_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)