This classification example shows how to use Oboe for training and testing in an AutoML setting: pipeline selection is performed on the training set, and the selected model is then evaluated on the test set.

In [1]:
# Notebook-level configuration: which Oboe variant to run and the kind of task.
method = 'tensoroboe'  # 'Oboe' or 'TensorOboe'
problem_type = 'classification'

from oboe import AutoLearner, error  # This may take around 15 seconds at first run.

import time
import warnings

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fixed seed so that the train/test partition is the same on every run;
# without it each Restart & Run All evaluates on a different test set.
SEED = 0

# Load the iris dataset and hold out 20% of the samples as the test set.
data = load_iris()
x = np.array(data['data'])
y = np.array(data['target'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=SEED)

# disable warnings
warnings.filterwarnings('ignore')

# Example 1: a no-brainer use

The default running mode of `TensorOboe` is `warm`, which means that meta-training is warm-started with a pre-imputed error tensor.

In [2]:
# Initialize the autolearner class.
# `p_type` and `method` come from the notebook-level settings in the first cell,
# so changing `problem_type` there actually takes effect here.
m = AutoLearner(p_type=problem_type, runtime_limit=50, method=method, verbose=True)

rank for EM-Tucker imputation: (20, 4, 2, 2, 8, 20)
shape of the error tensor: (551, 4, 2, 2, 8, 183)
Loading latent factors from storage ...
Loading saved runtime predictors ...


In [4]:
# Fit the autolearner on the training set and record the runtime.
# perf_counter() is a monotonic clock meant for measuring intervals, so the
# measurement is not affected by wall-clock adjustments during the fit.
start = time.perf_counter()
m.fit(x_train, y_train, categorical=None) # TensorOboe accepts the list of feature types
elapsed_time = time.perf_counter() - start


Shape of training dataset: 120 data points, 4 features
Splitting training set into training and validation ..
Predicting pipeline running time ..
runtime limit of initial round: 32.0 seconds
fitting and kfold_fit_validating the best-on-average pipeline
Pipeline fitting completed.
Fitted an ensemble with size 1
having a capped running time of 32 seconds
Fitted an ensemble with size 1
Fitted an ensemble with size 1
Fitted an ensemble with size 1
Fitted an ensemble with size 1
Fitted an ensemble with size 1
Doubling process started ...
Fitting with ranks=(20, 4, 2, 2, 8, 18), t=32.0

Single round runtime target: 32.0
Fitting AutoLearner with maximum runtime 32.0 seconds
Selecting an initial set of models to evaluate ...
greedy_initialization
[0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]
Sampling 8 entries of new row...
pool fitting completed
<oboe.pipeline.PipelineObject object at 0x7faca980bcd0>
<oboe.pipeline.PipelineObject object at 0x7faca980bc70>
<oboe.pipeline.PipelineObject object at 0x7faca9

In [5]:
# Use the fitted autolearner for prediction on the test set, then report the
# test error and the fitting time measured in the previous cell.
y_predicted = m.predict(x_test)
# The error metric is chosen from the notebook-level `problem_type` setting
# rather than a re-typed string literal, so the two cannot drift apart.
print("prediction error: {}".format(error(y_test, y_predicted, problem_type)))
print("elapsed time: {}".format(elapsed_time))

prediction error: 0.025000000000000022
elapsed time: 17.90514302253723


In [6]:
# Inspect the selected models: the result is a dict with the ensemble method
# and, for each base learner, the algorithm and hyperparameters of every
# pipeline stage (imputer, encoder, standardizer, dim_reducer, estimator).
m.get_models()

{'ensemble method': 'select at most 5 pipelines with smallest cv error',
 'base learners': [{'imputer': {'algorithm': 'SimpleImputer',
    'hyperparameters': {'strategy': 'median'}},
   'encoder': {'algorithm': 'OneHotEncoder',
    'hyperparameters': {'handle_unknown': 'ignore', 'sparse': 0}},
   'standardizer': {'algorithm': 'StandardScaler', 'hyperparameters': {}},
   'dim_reducer': {'algorithm': 'SelectKBest', 'hyperparameters': {'k': 3}},
   'estimator': {'algorithm': 'ExtraTrees',
    'hyperparameters': {'min_samples_split': 1e-05, 'criterion': 'entropy'}}},
  {'imputer': {'algorithm': 'SimpleImputer',
    'hyperparameters': {'strategy': 'most_frequent'}},
   'encoder': {'algorithm': None},
   'standardizer': {'algorithm': 'StandardScaler', 'hyperparameters': {}},
   'dim_reducer': {'algorithm': 'SelectKBest', 'hyperparameters': {'k': 1}},
   'estimator': {'algorithm': 'GBT',
    'hyperparameters': {'learning_rate': 0.1,
     'max_depth': 3,
     'max_features': 'log2'}}},
  {'imp

# Example 2: build an ensemble of models with given configurations

In [7]:
# Experimental settings; they are forwarded to AutoLearner through `autolearner_kwargs`.
VERBOSE = False  # whether to print out information indicating current fitting progress
N_CORES = 1  # number of cores
RUNTIME_BUDGET = 15  # passed as `runtime_limit`; presumably in seconds -- TODO confirm

In [8]:
# Optional: limit the types of algorithms (not yet supported)
# s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']

In [9]:
# Keyword arguments for the AutoLearner constructor, collected in one dict so
# the whole configuration is visible in a single place.
autolearner_kwargs = {
    'p_type': problem_type,  # notebook-level setting instead of a second hard-coded literal
    'method': method,
    'runtime_limit': RUNTIME_BUDGET,
    'verbose': VERBOSE,
    'selection_method': 'min_variance',
    'stacking_alg': 'greedy',
    'n_cores': N_CORES,
    'build_ensemble': True,
}

In [10]:
# Initialize the autolearner class from the keyword arguments in `autolearner_kwargs`.
# NOTE: this rebinds `m`, replacing the learner fitted in Example 1.
m = AutoLearner(**autolearner_kwargs)

In [12]:
# Fit the autolearner on the training set and record the runtime.
# perf_counter() is a monotonic clock meant for measuring intervals, so the
# measurement is not affected by wall-clock adjustments during the fit.
start = time.perf_counter()
m.fit(x_train, y_train, categorical=None)
elapsed_time = time.perf_counter() - start

In [13]:
# Use the fitted autolearner for prediction on the test set, then report the
# test error and the fitting time measured in the previous cell.
y_predicted = m.predict(x_test)
# The error metric is chosen from the notebook-level `problem_type` setting
# rather than a re-typed string literal, so the two cannot drift apart.
print("prediction error: {}".format(error(y_test, y_predicted, problem_type)))
print("elapsed time: {}".format(elapsed_time))

prediction error: 0.025000000000000022
elapsed time: 7.4095470905303955


In [14]:
# Inspect the selected models: the result is a dict with the ensemble method
# and, for each base learner, the algorithm and hyperparameters of every
# pipeline stage (imputer, encoder, standardizer, dim_reducer, estimator).
m.get_models()

{'ensemble method': 'select at most 5 pipelines with smallest cv error',
 'base learners': [{'imputer': {'algorithm': 'SimpleImputer',
    'hyperparameters': {'strategy': 'constant'}},
   'encoder': {'algorithm': None},
   'standardizer': {'algorithm': None},
   'dim_reducer': {'algorithm': 'SelectKBest', 'hyperparameters': {'k': 3}},
   'estimator': {'algorithm': 'GBT',
    'hyperparameters': {'learning_rate': 0.025,
     'max_depth': 3,
     'max_features': 'log2'}}}]}