This classification example shows how to use Oboe for training and testing in an AutoML setting: model selection is performed on the training set, and the selected model is then evaluated on the test set.

In [1]:
# Name of the AutoML method; forwarded as the `method` argument of every AutoLearner below.
method = 'Oboe'
# Kind of learning task solved in this notebook.
problem_type = 'classification'

from oboe import AutoLearner, error  # This may take around 15 seconds at first run.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time

# Load the Iris dataset and hold out 20% of the samples as a test set.
# The split is seeded so that "Restart & Run All" always trains and evaluates
# on the same partition. The outputs recorded in this notebook may come from
# an unseeded run, so re-running can produce slightly different numbers.
RANDOM_SEED = 0

data = load_iris()
x = np.array(data['data'])
y = np.array(data['target'])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_SEED
)

# Silence every Python warning for the rest of the session so that the cell
# outputs stay short. Comment this out when debugging: it also hides any
# warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Example 1: a no-brainer use

In [2]:
# Create an AutoLearner with a minimal set of arguments. The task type and the
# method name are read from the configuration at the top of the notebook, so
# the task is defined in exactly one place.
m = AutoLearner(
    p_type=problem_type,
    runtime_limit=30,  # fitting budget; presumably seconds -- TODO confirm
    method=method,
    verbose=False,
)

In [3]:
# Fit the AutoLearner on the training set and measure how long fitting takes.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; unlike time.time(), it is unaffected by system clock
# adjustments. The result is still a float number of seconds.
start = time.perf_counter()
m.fit(x_train, y_train)
elapsed_time = time.perf_counter() - start

In [4]:
# Predict on the held-out test set, then report the error of those predictions
# together with the fitting time measured in the previous cell. The task type
# passed to error() comes from the notebook configuration (problem_type).
y_predicted = m.predict(x_test)
print("prediction error: {}".format(error(y_test, y_predicted, problem_type)))
print("elapsed time: {}".format(elapsed_time))

prediction error: 0.04444444444444442
elapsed time: 27.44637703895569


In [5]:
# Show which models (and which hyperparameter settings) the fitted AutoLearner selected.
m.get_models()

{'ensemble method': 'select at most 5 pipelines with smallest cv error',
 'base learners': {'kSVM': [{'C': 0.5, 'kernel': 'poly', 'coef0': 10},
   {'C': 0.25, 'kernel': 'poly', 'coef0': 10},
   {'C': 0.125, 'kernel': 'poly', 'coef0': 10},
   {'C': 0.125, 'kernel': 'poly', 'coef0': 10},
   {'C': 0.125, 'kernel': 'poly', 'coef0': 10}]}}

# Example 2: build an ensemble of models

In [6]:
# Experimental settings for Example 2.
VERBOSE = False  # whether to print out information indicating current fitting progress
N_CORES = 1  # number of cores
RUNTIME_BUDGET = 30  # passed to AutoLearner as runtime_limit; presumably seconds -- TODO confirm

In [7]:
# Optional: limit the types of algorithms considered (passed to AutoLearner as
# `algorithms`). The abbreviations match the keys reported by get_models(),
# e.g. 'KNN' and 'DT'.
s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']

In [8]:
# Keyword arguments for the AutoLearner of Example 2 (ensemble is built).
autolearner_kwargs = {
    'p_type': problem_type,  # task type from the notebook configuration
    'method': method,
    'runtime_limit': RUNTIME_BUDGET,
    'verbose': VERBOSE,
    'selection_method': 'ED',  # presumably "experiment design" -- TODO confirm
    'algorithms': s,  # candidate algorithm types chosen in the previous cell
    'stacking_alg': 'greedy',
    'n_cores': N_CORES,
    'build_ensemble': True,  # combine the selected models into an ensemble
}

In [9]:
# Initialize the AutoLearner class with the keyword arguments assembled above.
m = AutoLearner(**autolearner_kwargs)

In [10]:
# Fit the AutoLearner on the training set and measure how long fitting takes.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; unlike time.time(), it is unaffected by system clock
# adjustments. The result is still a float number of seconds.
start = time.perf_counter()
m.fit(x_train, y_train)
elapsed_time = time.perf_counter() - start

In [11]:
# Predict on the held-out test set with the fitted ensemble, then report the
# ensemble's prediction error, the fitting time measured in the previous cell,
# and the score of each selected base model. The task type passed to error()
# comes from the notebook configuration (problem_type).
y_predicted = m.predict(x_test)
print("prediction error: {}".format(error(y_test, y_predicted, problem_type)))
print("elapsed time: {}".format(elapsed_time))
# NOTE(review): the recorded values below (0.07 each) are close to the ensemble's
# prediction error, so get_model_accuracies() appears to report error rates rather
# than accuracies -- confirm against the Oboe implementation before relying on the label.
print("individual accuracies of selected models: {}".format(m.get_model_accuracies(y_test)))

prediction error: 0.07037037037037036
elapsed time: 27.255645036697388
individual accuracies of selected models: [0.07, 0.07, 0.07, 0.07, 0.07]


In [12]:
# Show the ensemble method and the base learners (with hyperparameters) that were selected.
m.get_models()

{'ensemble method': 'select at most 5 pipelines with smallest cv error',
 'base learners': {'KNN': [{'n_neighbors': 1, 'p': 1},
   {'n_neighbors': 7, 'p': 2},
   {'n_neighbors': 3, 'p': 1},
   {'n_neighbors': 9, 'p': 1},
   {'n_neighbors': 3, 'p': 2}]}}

# Example 3: just select a collection of promising models without building an ensemble afterwards

In [13]:
# Experimental settings for Example 3. These reuse (and overwrite) the constant
# names from Example 2: verbose logging is switched on and the budget is halved.
VERBOSE = True  # whether to print out information indicating current fitting progress
N_CORES = 1  # number of cores
RUNTIME_BUDGET = 15  # passed to AutoLearner as runtime_limit; presumably seconds -- TODO confirm

In [14]:
# Optional: limit the types of algorithms considered (passed to AutoLearner as
# `algorithms`). Same candidate list as in Example 2.
s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']

In [15]:
# Keyword arguments for the AutoLearner of Example 3 (no ensemble is built).
autolearner_kwargs = {
    'p_type': problem_type,  # task type from the notebook configuration
    'method': method,
    'runtime_limit': RUNTIME_BUDGET,
    'verbose': VERBOSE,
    'selection_method': 'ED',  # presumably "experiment design" -- TODO confirm
    'algorithms': s,  # candidate algorithm types chosen in the previous cell
    'stacking_alg': 'greedy',
    'n_cores': N_CORES,
    'build_ensemble': False,  # only select promising models; do not combine them
}

In [16]:
# Initialize the AutoLearner class with the keyword arguments assembled above.
m = AutoLearner(**autolearner_kwargs)

In [17]:
# Fit the AutoLearner on the training set and measure how long fitting takes.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; unlike time.time(), it is unaffected by system clock
# adjustments. The result is still a float number of seconds.
start = time.perf_counter()
m.fit(x_train, y_train)
elapsed_time = time.perf_counter() - start


Shape of training dataset: 120 data points, 4 features
Fitting with k=7, t=7.5

Single round runtime target: 7.5
Fitting AutoLearner with max runtime 7.5s
Sampling 39 entries of new row...
Time limit reached.
KNN {'n_neighbors': 1, 'p': 2} complete.
KNN {'n_neighbors': 1, 'p': 1} complete.
KNN {'n_neighbors': 3, 'p': 1} complete.
DT {'min_samples_split': 64} complete.


In [18]:
# Run the fitted AutoLearner on the test set, then report the fitting time and
# the score of every selected model. No ensemble was built in this example, so
# no single prediction error is printed here.
y_predicted = m.predict(x_test)
print("elapsed time: {}".format(elapsed_time))
pipeline_accuracies = m.get_pipeline_accuracies(y_test)
print("accuracies of selected models: {}".format(pipeline_accuracies))

elapsed time: 12.52166485786438
accuracies of selected models: [0.07, 0.07, 0.07, 0.07, 0.096]


Note that when we do not build an ensemble, there is no single accuracy value. Instead, we get a collection of fitted models, each reported with its own individual accuracy.

In [19]:
# Show the selected models (with hyperparameters); without an ensemble they are
# listed under 'selected models'.
m.get_models()

{'selected models': {'KNN': [{'n_neighbors': 1, 'p': 2},
   {'n_neighbors': 1, 'p': 2},
   {'n_neighbors': 1, 'p': 1},
   {'n_neighbors': 3, 'p': 1}],
  'DT': [{'min_samples_split': 64}]}}