In [1]:
import modelgym
import numpy as np
import os

from modelgym.tracker import ProgressTrackerFile
from modelgym.trainer import Trainer
from modelgym.util import split_and_preprocess
from modelgym.util import TASK_CLASSIFICATION

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split



We will be working with the <a href="https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)">Breast Cancer Wisconsin (Diagnostic) Data Set</a>.

First, let's define some parameters.

In [2]:
# Fraction of the dataset (20%) held out for the final evaluation and model comparison
TEST_SIZE = 0.2
# Number of cross-validation splits; passed as n_splits to split_and_preprocess
N_CV_SPLITS = 3

# Passed to Trainer as n_estimators
# (presumably the upper bound on the number of boosting rounds -- TODO confirm in modelgym)
N_ESTIMATORS = 1000
# Passed to Trainer as opt_evals
# (presumably the number of hyperparameter configurations the optimizer tries -- TODO confirm)
N_PROBES = 100

# We are using the tree-structured Parzen estimator (TPE) for hyperparameter optimization.
# Another option is 'random'
OPTIMIZER = 'tpe'

## Dataset

Then, let's load the dataset itself

In [3]:
from sklearn.datasets import load_breast_cancer

In [4]:
# Load the bundled dataset and pull the feature matrix and the labels out of it
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

Splitting the data into train and test sets

In [5]:
# A fixed random_state makes the train/test split identical on every
# Restart & Run All; stratify=y keeps the class ratio of the full dataset
# in both the train and the test part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    random_state=42,
    stratify=y,
)

This code transforms the data into modelgym's internal format. For example, it would transform categorical features, if we had any.

In [6]:
# Copies are handed over so the original train/test arrays are left untouched.
# There are no categorical columns in this dataset, hence the empty cat_cols.
preprocessed = split_and_preprocess(
    X_train.copy(), y_train,
    X_test.copy(), y_test,
    cat_cols=[],
    n_splits=N_CV_SPLITS,
)
cv_pairs, (dtrain, dtest) = preprocessed

## Training

Here comes our trainer

In [7]:
# The same trainer is used below for the hyperparameter search and for the final fit
trainer_settings = dict(opt_evals=N_PROBES, n_estimators=N_ESTIMATORS)
trainer = Trainer(**trainer_settings)

In [8]:
model = modelgym.XGBModel(TASK_CLASSIFICATION)

# Progress is persisted in the 'result' folder; <config_key> and <model_name>
# only contribute to the name of the saved file.
tracker = ProgressTrackerFile(
    'result',
    config_key='example',
    model_name='XGBoost',
)

# Hyperparameter tuning with the chosen optimizer, evaluated on the cross-validation folds
metrics = trainer.crossval_optimize_params(
    model,
    cv_pairs,
    algo_name=OPTIMIZER,
    tracker=tracker,
    verbose=False,
)

saved state to result/tracker_example_XGBoost.pickle
saved state to result/tracker_example_XGBoost.pickle
saved state to result/tracker_example_XGBoost.pickle
saved state to result/tracker_example_XGBoost.pickle
saved state to result/tracker_example_XGBoost.pickle
saved state to result/tracker_example_XGBoost.pickle
saved state to result/tracker_example_XGBoost.pickle
saved state to result/tracker_example_XGBoost.pickle
saved state to result/tracker_example_XGBoost.pickle
saved state to result/tracker_example_XGBoost.pickle


We will also compute a custom metric, ROC AUC, at the end of training.

In [10]:
# Metric name -> scoring callable; the name is the key used to look the score up afterwards
custom_metric = dict(roc_auc=roc_auc_score)

In [11]:
# Reuse what the cross-validated search found: the tuned parameters and
# the best number of estimators, plus our extra metric.
final_fit_kwargs = {
    'params': metrics['params'],
    'n_estimators': metrics['best_n_estimators'],
    'custom_metric': custom_metric,
}
test_metrics = trainer.fit_eval(model, dtrain, dtest, **final_fit_kwargs)

In [12]:
# Display the score stored under the 'roc_auc' key of custom_metric
# (presumably computed on dtest -- TODO confirm against Trainer.fit_eval)
test_metrics['roc_auc']

0.98276972624798709

In [13]:
# Keep the object returned under the 'bst' key for later use
# (presumably the fitted booster -- TODO confirm against modelgym)
best_model = test_metrics['bst']

Now we can go and conquer the world with our best model!