This classification example shows how to use Oboe for training and testing in an AutoML setting: pipeline selection is performed on the training set, and the performance of the selected model is then evaluated on the test set.

In [1]:
# necessary modules
# NOTE(review): pandas, os, multiprocessing and accuracy_score are not referenced
# in the visible cells of this notebook -- confirm before pruning.
import sys
import pandas as pd
import os
import time
import numpy as np
import multiprocessing

#Oboe modules; this will be simplified when Oboe becomes pip installable
# The path is relative, so it is resolved against the current working directory:
# the notebook has to be run from a directory that has ../automl/ next to it.
automl_path = '../automl/'
sys.path.append(automl_path)
# auto_learner and util are found through the sys.path entry appended above
from auto_learner import AutoLearner
import util

#import scikit-learn modules
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# disable warnings
# (no category is given, so every warning is silenced for the rest of the session)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the iris dataset and split it into training and test folds
# A fixed seed makes the 80/20 split identical on every run, so reported errors
# can be compared across runs; without it each execution draws a new split.
RANDOM_SEED = 0
data = load_iris()
x = np.array(data['data'])
y = np.array(data['target'])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_SEED
)

# Example 1: a no-brainer use

In [3]:
# build the AutoLearner used in Example 1 (classification task, verbose output on)
m = AutoLearner(
    p_type='classification',
    runtime_limit=100,
    load_imputed=True,
    verbose=True,
)

In [4]:
# fit autolearner on training set and record runtime
# time.perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments while fitting runs.
start = time.perf_counter()
m.fit(x_train, y_train)
elapsed_time = time.perf_counter() - start


Shape of training dataset: 120 data points, 4 features
Splitting training set into training and validation ..
Predicting pipeline running time ..
Runtime limit of initial round: 8.0
Doubling process started ...

Single round runtime target: 8.0
Fitting AutoLearner with maximum runtime 8.0 seconds
Selecting an initial set of models to evaluate ...
[0.01243822 0.04170117 0.05424547 0.03708994 0.05440154 0.01510171
 0.01148718 0.05553925 0.01       0.01060516 0.01074114 0.01
 0.01       0.01       0.01       0.01       0.01       0.01
 0.01034928 0.01       0.01       0.01       0.01       0.01068285
 0.0120408  0.02765299 0.01       0.01       0.01       0.01
 0.01       0.01       0.01       0.01       0.01198053 0.01
 0.01       0.01       0.01       0.01       0.01017439 0.0106675
 0.01039751 0.01       0.01       0.01       0.02802525 0.01328719
 0.01       0.01       0.01215969 0.01       0.01       0.01
 0.01       0.01       0.01040664 0.01       0.01100383 0.01234148
 0.01      

having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 seconds
having a capped running time of 4 

having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 seconds
having a capped running time of 8 

In [7]:
# predict on the test set with the fitted autolearner, then report the
# classification error and the fitting time recorded earlier
y_predicted = m.predict(x_test)
test_error = util.error(y_test, y_predicted, 'classification')
print("prediction error: {}".format(test_error))
print("elapsed time: {}".format(elapsed_time))

prediction error: 0.07711161387631972
elapsed time: 38.970279693603516


In [8]:
# get names of the selected machine learning models
# (bare last expression: the notebook displays the returned description of the
# ensemble method and its base learners)
m.get_models()

{'ensemble method': 'select at most 5 pipelines with smallest cv error',
 'base learners': [{'imputer': {'algorithm': 'SimpleImputer',
    'hyperparameters': {'strategy': 'median'}},
   'encoder': {'algorithm': None},
   'standardizer': {'algorithm': None},
   'dim_reducer': {'algorithm': 'VarianceThreshold', 'hyperparameters': {}},
   'estimator': {'algorithm': 'MLP',
    'hyperparameters': {'learning_rate_init': 0.01,
     'learning_rate': 'adaptive',
     'solver': 'sgd',
     'alpha': 0.0001}}},
  {'imputer': {'algorithm': 'SimpleImputer',
    'hyperparameters': {'strategy': 'mean'}},
   'encoder': {'algorithm': None},
   'standardizer': {'algorithm': 'StandardScaler', 'hyperparameters': {}},
   'dim_reducer': {'algorithm': 'PCA', 'hyperparameters': {'n_components': 3}},
   'estimator': {'algorithm': 'ExtraTrees',
    'hyperparameters': {'min_samples_split': 0.1, 'criterion': 'entropy'}}},
  {'imputer': {'algorithm': 'SimpleImputer',
    'hyperparameters': {'strategy': 'constant'}}

# Example 2: build an ensemble of models

In [9]:
# experimental settings
# value passed to the autolearner as its runtime limit
RUNTIME_BUDGET = 15
# number of cores
N_CORES = 1
# whether to print out information indicating current fitting progress
VERBOSE = False

In [8]:
# #optional: limit the types of algorithms (not yet supported)
# s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']

In [10]:
# keyword arguments forwarded to the AutoLearner constructor
autolearner_kwargs = dict(
    p_type='classification',
    runtime_limit=RUNTIME_BUDGET,
    verbose=VERBOSE,
    selection_method='min_variance',
    stacking_alg='greedy',
    n_cores=N_CORES,
    build_ensemble=True,
)

In [11]:
# initialize the autolearner class from the keyword arguments collected in autolearner_kwargs
m = AutoLearner(**autolearner_kwargs)

In [12]:
# fit autolearner on training set and record runtime
# time.perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments while fitting runs.
start = time.perf_counter()
m.fit(x_train, y_train)
elapsed_time = time.perf_counter() - start

In [14]:
# predict on the test set with the fitted autolearner, then report the
# classification error and the fitting time recorded earlier
y_predicted = m.predict(x_test)
test_error = util.error(y_test, y_predicted, 'classification')
print("prediction error: {}".format(test_error))
print("elapsed time: {}".format(elapsed_time))

prediction error: 0.057346656611362457
elapsed time: 12.30546522140503


In [15]:
# get names of the selected machine learning models
# (bare last expression: the notebook displays the returned description of the
# ensemble method and its base learners)
m.get_models()

{'ensemble method': 'select at most 5 pipelines with smallest cv error',
 'base learners': [{'imputer': {'algorithm': 'SimpleImputer',
    'hyperparameters': {'strategy': 'constant'}},
   'encoder': {'algorithm': None},
   'standardizer': {'algorithm': None},
   'dim_reducer': {'algorithm': None},
   'estimator': {'algorithm': 'KNN',
    'hyperparameters': {'n_neighbors': 11, 'p': 2}}},
  {'imputer': {'algorithm': 'SimpleImputer',
    'hyperparameters': {'strategy': 'median'}},
   'encoder': {'algorithm': None},
   'standardizer': {'algorithm': None},
   'dim_reducer': {'algorithm': 'SelectKBest', 'hyperparameters': {'k': 2}},
   'estimator': {'algorithm': 'Logit',
    'hyperparameters': {'C': 1.5, 'solver': 'saga', 'penalty': 'l2'}}},
  {'imputer': {'algorithm': 'SimpleImputer',
    'hyperparameters': {'strategy': 'median'}},
   'encoder': {'algorithm': None},
   'standardizer': {'algorithm': None},
   'dim_reducer': {'algorithm': 'SelectKBest', 'hyperparameters': {'k': 2}},
   'estim

# Example 3: just select a collection of promising models without building an ensemble afterwards (not yet supported)

In [23]:
# #experimental settings
# VERBOSE = False #whether to print out information indicating current fitting progress
# N_CORES = 1 #number of cores
# RUNTIME_BUDGET = 15

# #autolearner arguments
# autolearner_kwargs = {
#     'p_type': 'classification',
#     'runtime_limit': RUNTIME_BUDGET,
#     'verbose': VERBOSE,
#     'selection_method': 'min_variance',
#     'stacking_alg': 'greedy',
#     'n_cores': N_CORES,
#     'build_ensemble': False,
# }

In [24]:
# #initialize the autolearner class
# m = AutoLearner(**autolearner_kwargs)

In [25]:
# # fit autolearner on training set and record runtime
# start = time.time()
# m.fit(x_train, y_train)
# elapsed_time = time.time() - start

In [26]:
# # use the fitted autolearner for prediction on test set
# y_predicted = m.predict(x_test)
 
# print("elapsed time: {}".format(elapsed_time))
# print("accuracies of selected models: {}".format(m.get_model_accuracy(y_test)))

Note that we do not have a single accuracy value here if we do not build an ensemble; instead, we just have a collection of fitted models, each with its own reported accuracy.

In [27]:
# # get names of the selected machine learning models
# m.get_models()